diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 130e72de9dd..d49d1d603c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,7 +61,7 @@ jobs: poetry install --without dev poetry run pip install tensorflow==2.11.0 - name: Test basic import - run: poetry run python -c 'from docarray import DocArray, BaseDoc' + run: poetry run python -c 'from docarray import DocList, BaseDoc' check-mypy: diff --git a/README.md b/README.md index 8d4b45ae264..af497cb5b3e 100644 --- a/README.md +++ b/README.md @@ -77,9 +77,10 @@ doc = MultiModalDocument( ) ``` -### Collect multiple `Documents` into a `DocArray`: +### Collect multiple `Documents` into a `DocList`: + ```python -from docarray import DocArray, BaseDoc +from docarray import DocList, BaseDoc from docarray.typing import AnyTensor, ImageUrl import numpy as np @@ -90,9 +91,9 @@ class Image(BaseDoc): ``` ```python -from docarray import DocArray +from docarray import DocList -da = DocArray[Image]( +da = DocList[Image]( [ Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", @@ -150,16 +151,16 @@ Image.from_protobuf(doc.to_protobuf()) ```python # NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocArray +from docarray import DocList from docarray.documents import ImageDoc from docarray.stores import DocumentStore import numpy as np -da = DocArray([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) +da = DocList([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) store = DocumentStore[ImageDoc]( storage='qdrant' ) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocArray into the DocumentStore +store.insert(da) # insert the DocList into the DocumentStore # find the 10 most similar images based on the 'embedding' field match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10) ``` @@ -233,7 +234,7 @@ Not very easy on the eyes if you ask us. 
And even worse, if you need to add one So, now let's see what the same code looks like with DocArray: ```python -from docarray import DocArray, BaseDoc +from docarray import DocList, BaseDoc from docarray.documents import ImageDoc, TextDoc, AudioDoc from docarray.typing import TorchTensor @@ -258,18 +259,18 @@ class MyPodcastModel(nn.Module): self.image_encoder = ImageEncoder() self.text_encoder = TextEncoder() - def forward_podcast(self, da: DocArray[Podcast]) -> DocArray[Podcast]: - da.audio.embedding = self.audio_encoder(da.audio.tensor) - da.text.embedding = self.text_encoder(da.text.tensor) - da.image.embedding = self.image_encoder(da.image.tensor) + def forward_podcast(self, docs: DocList[Podcast]) -> DocList[Podcast]: + docs.audio.embedding = self.audio_encoder(docs.audio.tensor) + docs.text.embedding = self.text_encoder(docs.text.tensor) + docs.image.embedding = self.image_encoder(docs.image.tensor) - return da + return docs - def forward(self, da: DocArray[PairPodcast]) -> DocArray[PairPodcast]: - da.left = self.forward_podcast(da.left) - da.right = self.forward_podcast(da.right) + def forward(self, docs: DocList[PairPodcast]) -> DocList[PairPodcast]: + docs.left = self.forward_podcast(docs.left) + docs.right = self.forward_podcast(docs.right) - return da + return docs ``` Looks much better, doesn't it? 
@@ -297,7 +298,7 @@ This would look like the following: ```python from typing import Optional -from docarray import DocArray, BaseDoc +from docarray import DocList, BaseDoc import tensorflow as tf @@ -312,7 +313,7 @@ class MyPodcastModel(tf.keras.Model): super().__init__() self.audio_encoder = AudioEncoder() - def call(self, inputs: DocArray[Podcast]) -> DocArray[Podcast]: + def call(self, inputs: DocList[Podcast]) -> DocList[Podcast]: inputs.audio_tensor.embedding = self.audio_encoder( inputs.audio_tensor.tensor ) # access audio_tensor's .tensor attribute @@ -407,7 +408,7 @@ store it there, and thus make it searchable: ```python # NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocArray, BaseDoc +from docarray import DocList, BaseDoc from docarray.stores import DocumentStore from docarray.documents import ImageDoc, TextDoc import numpy as np @@ -427,7 +428,7 @@ def _random_my_doc(): ) -da = DocArray([_random_my_doc() for _ in range(1000)]) # create some data +da = DocList([_random_my_doc() for _ in range(1000)]) # create some data store = DocumentStore[MyDoc]( storage='qdrant' ) # create a DocumentStore with Qdrant as backend diff --git a/docarray/__init__.py b/docarray/__init__.py index d8b6bae90a5..2bffdc80803 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -2,10 +2,10 @@ import logging -from docarray.array import DocArray, DocArrayStacked +from docarray.array import DocList, DocVec from docarray.base_doc.doc import BaseDoc -__all__ = ['BaseDoc', 'DocArray', 'DocArrayStacked'] +__all__ = ['BaseDoc', 'DocList', 'DocVec'] logger = logging.getLogger('docarray') diff --git a/docarray/array/__init__.py b/docarray/array/__init__.py index 9c0176426e2..16e1274c1e3 100644 --- a/docarray/array/__init__.py +++ b/docarray/array/__init__.py @@ -1,4 +1,5 @@ -from docarray.array.array.array import DocArray -from docarray.array.stacked.array_stacked import DocArrayStacked +from docarray.array.any_array import AnyDocArray +from 
docarray.array.doc_list.doc_list import DocList +from docarray.array.doc_vec.doc_vec import DocVec -__all__ = ['DocArray', 'DocArrayStacked'] +__all__ = ['DocList', 'DocVec', 'AnyDocArray'] diff --git a/docarray/array/abstract_array.py b/docarray/array/any_array.py similarity index 79% rename from docarray/array/abstract_array.py rename to docarray/array/any_array.py index f3a94b5b6fa..d156da9ea8c 100644 --- a/docarray/array/abstract_array.py +++ b/docarray/array/any_array.py @@ -25,7 +25,7 @@ from docarray.utils._internal._typing import change_cls_name if TYPE_CHECKING: - from docarray.proto import DocumentArrayProto, NodeProto + from docarray.proto import DocListProto, NodeProto from docarray.typing.tensor.abstract_tensor import AbstractTensor T = TypeVar('T', bound='AnyDocArray') @@ -34,7 +34,7 @@ class AnyDocArray(Sequence[T_doc], Generic[T_doc], AbstractType): - document_type: Type[BaseDoc] + doc_type: Type[BaseDoc] __typed_da__: Dict[Type['AnyDocArray'], Dict[Type[BaseDoc], Type]] = {} def __repr__(self): @@ -58,9 +58,9 @@ def __class_getitem__(cls, item: Union[Type[BaseDoc], TypeVar, str]): global _DocArrayTyped class _DocArrayTyped(cls): # type: ignore - document_type: Type[BaseDoc] = cast(Type[BaseDoc], item) + doc_type: Type[BaseDoc] = cast(Type[BaseDoc], item) - for field in _DocArrayTyped.document_type.__fields__.keys(): + for field in _DocArrayTyped.doc_type.__fields__.keys(): def _property_generator(val: str): def _getter(self): @@ -121,34 +121,34 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this DocArray using the passed values + """Set all Documents in this DocList using the passed values :param field: name of the fields to extract - :values: the values to set at the DocArray level + :values: the values to set at the DocList level """ ... 
@classmethod @abstractmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message""" ... @abstractmethod - def to_protobuf(self) -> 'DocumentArrayProto': - """Convert DocArray into a Protobuf message""" + def to_protobuf(self) -> 'DocListProto': + """Convert DocList into a Protobuf message""" ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a DocArray into a NodeProto protobuf message. - This function should be called when a DocArray + """Convert a DocList into a NodeProto protobuf message. + This function should be called when a DocList is nested into another Document that need to be converted into a protobuf :return: the nested item protobuf message """ from docarray.proto import NodeProto - return NodeProto(document_array=self.to_protobuf()) + return NodeProto(doc_array=self.to_protobuf()) @abstractmethod def traverse_flat( @@ -157,7 +157,7 @@ def traverse_flat( ) -> Union[List[Any], 'AbstractTensor']: """ Return a List of the accessed objects when applying the `access_path`. If this - results in a nested list or list of DocArrays, the list will be flattened + results in a nested list or list of DocLists, the list will be flattened on the first level. The access path is a string that consists of attribute names, concatenated and "__"-separated. It describes the path from the first level to an arbitrary one, e.g. 'content__image__url'. @@ -167,7 +167,7 @@ def traverse_flat( EXAMPLE USAGE .. 
code-block:: python - from docarray import BaseDoc, DocArray, Text + from docarray import BaseDoc, DocList, Text class Author(BaseDoc): @@ -179,20 +179,20 @@ class Book(BaseDoc): content: Text - da = DocArray[Book]( + docs = DocList[Book]( Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}')) for i in range(10) # noqa: E501 ) - books = da.traverse_flat(access_path='content') # list of 10 Text objs + books = docs.traverse_flat(access_path='content') # list of 10 Text objs - authors = da.traverse_flat(access_path='author__name') # list of 10 strings + authors = docs.traverse_flat(access_path='author__name') # list of 10 strings If the resulting list is a nested list, it will be flattened: EXAMPLE USAGE .. code-block:: python - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList class Chapter(BaseDoc): @@ -200,20 +200,18 @@ class Chapter(BaseDoc): class Book(BaseDoc): - chapters: DocArray[Chapter] + chapters: DocList[Chapter] - da = DocArray[Book]( - Book( - chapters=DocArray[Chapter]([Chapter(content='some_content') for _ in range(3)]) - ) + docs = DocList[Book]( + Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)])) for _ in range(10) ) - chapters = da.traverse_flat(access_path='chapters') # list of 30 strings + chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings - If your DocArray is in stacked mode, and you want to access a field of - type AnyTensor, the stacked tensor will be returned instead of a list: + If your DocList is in doc_vec mode, and you want to access a field of + type AnyTensor, the doc_vec tensor will be returned instead of a list: EXAMPLE USAGE .. 
code-block:: python @@ -221,7 +219,7 @@ class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[Image]( + batch = DocList[Image]( [ Image( tensor=torch.zeros(3, 224, 224), @@ -243,9 +241,9 @@ def _traverse(node: Any, access_path: str): if access_path: curr_attr, _, path_attrs = access_path.partition('__') - from docarray.array import DocArray + from docarray.array import DocList - if isinstance(node, (DocArray, list)): + if isinstance(node, (DocList, list)): for n in node: x = getattr(n, curr_attr) yield from AnyDocArray._traverse(x, path_attrs) @@ -257,16 +255,16 @@ def _traverse(node: Any, access_path: str): @staticmethod def _flatten_one_level(sequence: List[Any]) -> List[Any]: - from docarray import DocArray + from docarray import DocList - if len(sequence) == 0 or not isinstance(sequence[0], (list, DocArray)): + if len(sequence) == 0 or not isinstance(sequence[0], (list, DocList)): return sequence else: return [item for sublist in sequence for item in sublist] def summary(self): """ - Print a summary of this DocArray object and a summary of the schema of its + Print a summary of this DocList object and a summary of the schema of its Document type. """ DocArraySummary(self).summary() @@ -278,13 +276,13 @@ def _batch( show_progress: bool = False, ) -> Generator[T, None, None]: """ - Creates a `Generator` that yields `DocArray` of size `batch_size`. + Creates a `Generator` that yields `DocList` of size `batch_size`. Note, that the last batch might be smaller than `batch_size`. :param batch_size: Size of each generated batch. :param shuffle: If set, shuffle the Documents before dividing into minibatches. :param show_progress: if set, show a progress bar when batching documents. 
- :yield: a Generator of `DocArray`, each in the length of `batch_size` + :yield: a Generator of `DocList`, each in the length of `batch_size` """ from rich.progress import track diff --git a/docarray/array/array/__init__.py b/docarray/array/doc_list/__init__.py similarity index 100% rename from docarray/array/array/__init__.py rename to docarray/array/doc_list/__init__.py diff --git a/docarray/array/array/array.py b/docarray/array/doc_list/doc_list.py similarity index 62% rename from docarray/array/array/array.py rename to docarray/array/doc_list/doc_list.py index e3f56e74fda..89364ff4842 100644 --- a/docarray/array/array/array.py +++ b/docarray/array/doc_list/doc_list.py @@ -17,10 +17,10 @@ from typing_inspect import is_union_type -from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.io import IOMixinArray -from docarray.array.array.pushpull import PushPullMixin -from docarray.array.array.sequence_indexing_mixin import ( +from docarray.array.any_array import AnyDocArray +from docarray.array.doc_list.io import IOMixinArray +from docarray.array.doc_list.pushpull import PushPullMixin +from docarray.array.doc_list.sequence_indexing_mixin import ( IndexingSequenceMixin, IndexIterType, ) @@ -31,19 +31,19 @@ from pydantic import BaseConfig from pydantic.fields import ModelField - from docarray.array.stacked.array_stacked import DocArrayStacked - from docarray.proto import DocumentArrayProto + from docarray.array.doc_vec.doc_vec import DocVec + from docarray.proto import DocListProto from docarray.typing import TorchTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor -T = TypeVar('T', bound='DocArray') +T = TypeVar('T', bound='DocList') T_doc = TypeVar('T_doc', bound=BaseDoc) def _delegate_meth_to_data(meth_name: str) -> Callable: """ create a function that mimic a function call to the data attribute of the - DocArray + DocList :param meth_name: name of the method :return: a method that mimic the meth_name @@ -57,23 +57,23 
@@ def _delegate_meth(self, *args, **kwargs): return _delegate_meth -class DocArray( +class DocList( IndexingSequenceMixin[T_doc], PushPullMixin, IOMixinArray, AnyDocArray[T_doc] ): """ - DocArray is a container of Documents. + DocList is a container of Documents. - A DocArray is a list of Documents of any schema. However, many - DocArray features are only available if these Documents are + A DocList is a list of Documents of any schema. However, many + DocList features are only available if these Documents are homogeneous and follow the same schema. To precise this schema you can use - the `DocArray[MyDocument]` syntax where MyDocument is a Document class - (i.e. schema). This creates a DocArray that can only contains Documents of + the `DocList[MyDocument]` syntax where MyDocument is a Document class + (i.e. schema). This creates a DocList that can only contains Documents of the type 'MyDocument'. --- ```python - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList from docarray.typing import NdArray, ImageUrl from typing import Optional @@ -83,7 +83,7 @@ class Image(BaseDoc): url: ImageUrl - da = DocArray[Image]( + docs = DocList[Image]( Image(url='http://url.com/foo.png') for _ in range(10) ) # noqa: E510 ``` @@ -91,37 +91,37 @@ class Image(BaseDoc): --- - If your DocArray is homogeneous (i.e. follows the same schema), you can access - fields at the DocArray level (for example `da.tensor` or `da.url`). - You can also set fields, with `da.tensor = np.random.random([10, 100])`: + If your DocList is homogeneous (i.e. follows the same schema), you can access + fields at the DocList level (for example `docs.tensor` or `docs.url`). + You can also set fields, with `docs.tensor = np.random.random([10, 100])`: - print(da.url) + print(docs.url) # [ImageUrl('http://url.com/foo.png', host_type='domain'), ...] 
import numpy as np - da.tensor = np.random.random([10, 100]) - print(da.tensor) + docs.tensor = np.random.random([10, 100]) + print(docs.tensor) # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] - You can index into a DocArray like a numpy array or torch tensor: + You can index into a DocList like a numpy doc_list or torch tensor: - da[0] # index by position - da[0:5:2] # index by slice - da[[0, 2, 3]] # index by list of indices - da[True, False, True, True, ...] # index by boolean mask + docs[0] # index by position + docs[0:5:2] # index by slice + docs[[0, 2, 3]] # index by list of indices + docs[True, False, True, True, ...] # index by boolean mask - You can delete items from a DocArray like a Python List + You can delete items from a DocList like a Python List - del da[0] # remove first element from DocArray - del da[0:5] # remove elements for 0 to 5 from DocArray + del docs[0] # remove first element from DocList + del docs[0:5] # remove elements for 0 to 5 from DocList :param docs: iterable of Document """ - document_type: Type[BaseDoc] = AnyDoc + doc_type: Type[BaseDoc] = AnyDoc def __init__( self, @@ -135,14 +135,14 @@ def construct( docs: Sequence[T_doc], ) -> T: """ - Create a DocArray without validation any data. The data must come from a + Create a DocList without validation any data. 
The data must come from a trusted source :param docs: a Sequence (list) of Document with the same schema :return: """ - da = cls.__new__(cls) - da._data = docs if isinstance(docs, list) else list(docs) - return da + new_docs = cls.__new__(cls) + new_docs._data = docs if isinstance(docs, list) else list(docs) + return new_docs def __eq__(self, other: Any) -> bool: if self.__len__() != other.__len__(): @@ -154,17 +154,15 @@ def __eq__(self, other: Any) -> bool: def _validate_docs(self, docs: Iterable[T_doc]) -> Iterable[T_doc]: """ - Validate if an Iterable of Document are compatible with this DocArray + Validate if an Iterable of Document are compatible with this DocList """ for doc in docs: yield self._validate_one_doc(doc) def _validate_one_doc(self, doc: T_doc) -> T_doc: - """Validate if a Document is compatible with this DocArray""" - if not issubclass(self.document_type, AnyDoc) and not isinstance( - doc, self.document_type - ): - raise ValueError(f'{doc} is not a {self.document_type}') + """Validate if a Document is compatible with this DocList""" + if not issubclass(self.doc_type, AnyDoc) and not isinstance(doc, self.doc_type): + raise ValueError(f'{doc} is not a {self.doc_type}') return doc def __len__(self): @@ -180,16 +178,16 @@ def __bytes__(self) -> bytes: def append(self, doc: T_doc): """ - Append a Document to the DocArray. The Document must be from the same class - as the document_type of this DocArray otherwise it will fail. + Append a Document to the DocList. The Document must be from the same class + as the doc_type of this DocList otherwise it will fail. :param doc: A Document """ self._data.append(self._validate_one_doc(doc)) def extend(self, docs: Iterable[T_doc]): """ - Extend a DocArray with an Iterable of Document. The Documents must be from - the same class as the document_type of this DocArray otherwise it will + Extend a DocList with an Iterable of Document. 
The Documents must be from + the same class as the doc_type of this DocList otherwise it will fail. :param docs: Iterable of Documents """ @@ -197,8 +195,8 @@ def extend(self, docs: Iterable[T_doc]): def insert(self, i: int, doc: T_doc): """ - Insert a Document to the DocArray. The Document must be from the same - class as the document_type of this DocArray otherwise it will fail. + Insert a Document to the DocList. The Document must be from the same + class as the doc_type of this DocList otherwise it will fail. :param i: index to insert :param doc: A Document """ @@ -213,13 +211,13 @@ def _get_data_column( self: T, field: str, ) -> Union[MutableSequence, T, 'TorchTensor', 'NdArray']: - """Return all values of the fields from all docs this array contains + """Return all values of the fields from all docs this doc_list contains :param field: name of the fields to extract :return: Returns a list of the field value for each document - in the array like container + in the doc_list like container """ - field_type = self.__class__.document_type._get_field_type(field) + field_type = self.__class__.doc_type._get_field_type(field) if ( not is_union_type(field_type) @@ -229,7 +227,7 @@ def _get_data_column( # calling __class_getitem__ ourselves is a hack otherwise mypy complain # most likely a bug in mypy though # bug reported here https://github.com/python/mypy/issues/14111 - return DocArray.__class_getitem__(field_type)( + return DocList.__class_getitem__(field_type)( (getattr(doc, field) for doc in self), ) else: @@ -240,10 +238,10 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this DocArray using the passed values + """Set all Documents in this DocList using the passed values :param field: name of the fields to set - :values: the values to set at the DocArray level + :values: the values to set at the DocList level """ ... 
@@ -253,19 +251,17 @@ def _set_data_column( def stack( self, tensor_type: Type['AbstractTensor'] = NdArray, - ) -> 'DocArrayStacked': + ) -> 'DocVec': """ - Convert the DocArray into a DocArrayStacked. `Self` cannot be used + Convert the DocList into a DocVec. `Self` cannot be used afterwards - :param tensor_type: Tensor Class used to wrap the stacked tensors. This is useful + :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor - :return: A DocArrayStacked of the same document type as self + :return: A DocVec of the same document type as self """ - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.doc_vec.doc_vec import DocVec - return DocArrayStacked.__class_getitem__(self.document_type)( - self, tensor_type=tensor_type - ) + return DocVec.__class_getitem__(self.doc_type)(self, tensor_type=tensor_type) @classmethod def validate( @@ -274,17 +270,17 @@ def validate( field: 'ModelField', config: 'BaseConfig', ): - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.doc_vec.doc_vec import DocVec - if isinstance(value, (cls, DocArrayStacked)): + if isinstance(value, (cls, DocVec)): return value elif isinstance(value, Iterable): return cls(value) else: - raise TypeError(f'Expecting an Iterable of {cls.document_type}') + raise TypeError(f'Expecting an Iterable of {cls.doc_type}') def traverse_flat( - self: 'DocArray', + self: 'DocList', access_path: str, ) -> List[Any]: nodes = list(AnyDocArray._traverse(node=self, access_path=access_path)) @@ -293,9 +289,9 @@ def traverse_flat( return flattened @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message - :param pb_msg: The protobuf message from where to construct the DocArray + :param pb_msg: 
The protobuf message from where to construct the DocList """ return super().from_protobuf(pb_msg) diff --git a/docarray/array/array/io.py b/docarray/array/doc_list/io.py similarity index 87% rename from docarray/array/array/io.py rename to docarray/array/doc_list/io.py index 02b250fad4e..40d3486699f 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/doc_list/io.py @@ -39,8 +39,8 @@ if TYPE_CHECKING: import pandas as pd - from docarray import DocArray - from docarray.proto import DocumentArrayProto + from docarray import DocList + from docarray.proto import DocListProto T = TypeVar('T', bound='IOMixinArray') T_doc = TypeVar('T_doc', bound=BaseDoc) @@ -97,7 +97,7 @@ def __getitem__(self, item: slice): class IOMixinArray(Iterable[T_doc]): - document_type: Type[T_doc] + doc_type: Type[T_doc] _data: List[T_doc] @abstractmethod @@ -112,19 +112,17 @@ def __init__( ... @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message - :param pb_msg: The protobuf message from where to construct the DocArray + :param pb_msg: The protobuf message from where to construct the DocList """ - return cls( - cls.document_type.from_protobuf(doc_proto) for doc_proto in pb_msg.docs - ) + return cls(cls.doc_type.from_protobuf(doc_proto) for doc_proto in pb_msg.docs) - def to_protobuf(self) -> 'DocumentArrayProto': - """Convert DocArray into a Protobuf message""" - from docarray.proto import DocumentArrayProto + def to_protobuf(self) -> 'DocListProto': + """Convert DocList into a Protobuf message""" + from docarray.proto import DocListProto - da_proto = DocumentArrayProto() + da_proto = DocListProto() for doc in self: da_proto.docs.append(doc.to_protobuf()) @@ -138,13 +136,13 @@ def from_bytes( compress: Optional[str] = None, show_progress: bool = False, ) -> T: - """Deserialize bytes into a DocArray. + """Deserialize bytes into a DocList. 
:param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize :param compress: compress algorithm that was used to serialize :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocArray + :return: the deserialized DocList """ return cls._load_binary_all( file_ctx=nullcontext(data), @@ -274,13 +272,13 @@ def from_base64( compress: Optional[str] = None, show_progress: bool = False, ) -> T: - """Deserialize base64 strings into a DocArray. + """Deserialize base64 strings into a DocList. :param data: Base64 string to deserialize :param protocol: protocol that was used to serialize :param compress: compress algorithm that was used to serialize :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocArray + :return: the deserialized DocList """ return cls._load_binary_all( file_ctx=nullcontext(base64.b64decode(data)), @@ -316,17 +314,17 @@ def from_json( cls: Type[T], file: Union[str, bytes, bytearray], ) -> T: - """Deserialize JSON strings or bytes into a DocArray. + """Deserialize JSON strings or bytes into a DocList. - :param file: JSON object from where to deserialize a DocArray - :return: the deserialized DocArray + :param file: JSON object from where to deserialize a DocList + :return: the deserialized DocList """ json_docs = orjson.loads(file) - return cls([cls.document_type(**v) for v in json_docs]) + return cls([cls.doc_type(**v) for v in json_docs]) def to_json(self) -> bytes: """Convert the object into JSON bytes. Can be loaded via :meth:`.from_json`. 
- :return: JSON serialization of DocArray + :return: JSON serialization of DocList """ return orjson_dumps(self._data) @@ -343,36 +341,36 @@ def from_csv( file_path: str, encoding: str = 'utf-8', dialect: Union[str, csv.Dialect] = 'excel', - ) -> 'DocArray': + ) -> 'DocList': """ - Load a DocArray from a csv file following the schema defined in the - :attr:`~docarray.DocArray.document_type` attribute. - Every row of the csv file will be mapped to one document in the array. + Load a DocList from a csv file following the schema defined in the + :attr:`~docarray.DocList.doc_type` attribute. + Every row of the csv file will be mapped to one document in the doc_list. The column names (defined in the first row) have to match the field names of the Document type. For nested fields use "__"-separated access paths, such as 'image__url'. - List-like fields (including field of type DocArray) are not supported. + List-like fields (including field of type DocList) are not supported. - :param file_path: path to csv file to load DocArray from. + :param file_path: path to csv file to load DocList from. :param encoding: encoding used to read the csv file. Defaults to 'utf-8'. :param dialect: defines separator and how to handle whitespaces etc. Can be a csv.Dialect instance or one string of: 'excel' (for comma seperated values), 'excel-tab' (for tab separated values), 'unix' (for csv file generated on UNIX systems). - :return: DocArray + :return: DocList """ - from docarray import DocArray + from docarray import DocList - if cls.document_type == AnyDoc: + if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' + 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' 
) - doc_type = cls.document_type - da = DocArray.__class_getitem__(doc_type)() + doc_type = cls.doc_type + docs = DocList.__class_getitem__(doc_type)() with open(file_path, 'r', encoding=encoding) as fp: rows = csv.DictReader(fp, dialect=dialect) @@ -387,8 +385,8 @@ def from_csv( ) if not all(valid_paths): raise ValueError( - f'Column names do not match the schema of the DocArray\'s ' - f'document type ({cls.document_type.__name__}): ' + f'Column names do not match the schema of the DocList\'s ' + f'document type ({cls.doc_type.__name__}): ' f'{list(compress(field_names, [not v for v in valid_paths]))}' ) @@ -396,15 +394,15 @@ def from_csv( doc_dict: Dict[Any, Any] = _access_path_dict_to_nested_dict( access_path2val ) - da.append(doc_type.parse_obj(doc_dict)) + docs.append(doc_type.parse_obj(doc_dict)) - return da + return docs def to_csv( self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel' ) -> None: """ - Save a DocArray to a csv file. + Save a DocList to a csv file. The field names will be stored in the first row. Each row corresponds to the information of one Document. Columns for nested fields will be named after the "__"-seperated access paths, @@ -417,7 +415,7 @@ def to_csv( 'excel-tab' (for tab separated values), 'unix' (for csv file generated on UNIX systems). """ - fields = self.document_type._get_access_paths() + fields = self.doc_type._get_access_paths() with open(file_path, 'w') as csv_file: writer = csv.DictWriter(csv_file, fieldnames=fields, dialect=dialect) @@ -428,17 +426,17 @@ def to_csv( writer.writerow(doc_dict) @classmethod - def from_pandas(cls, df: 'pd.DataFrame') -> 'DocArray': + def from_pandas(cls, df: 'pd.DataFrame') -> 'DocList': """ - Load a DocArray from a `pandas.DataFrame` following the schema - defined in the :attr:`~docarray.DocArray.document_type` attribute. - Every row of the dataframe will be mapped to one Document in the array. 
+ Load a DocList from a `pandas.DataFrame` following the schema + defined in the :attr:`~docarray.DocList.doc_type` attribute. + Every row of the dataframe will be mapped to one Document in the doc_list. The column names of the dataframe have to match the field names of the Document type. For nested fields use "__"-separated access paths as column names, such as 'image__url'. - List-like fields (including field of type DocArray) are not supported. + List-like fields (including field of type DocList) are not supported. EXAMPLE USAGE: @@ -446,7 +444,7 @@ def from_pandas(cls, df: 'pd.DataFrame') -> 'DocArray': import pandas as pd - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList class Person(BaseDoc): @@ -458,26 +456,26 @@ class Person(BaseDoc): data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] ) - da = DocArray[Person].from_pandas(df) + docs = DocList[Person].from_pandas(df) - assert da.name == ['Maria', 'Jake'] - assert da.follower == [12345, 54321] + assert docs.name == ['Maria', 'Jake'] + assert docs.follower == [12345, 54321] :param df: pandas.DataFrame to extract Document's information from - :return: DocArray where each Document contains the information of one + :return: DocList where each Document contains the information of one corresponding row of the `pandas.DataFrame`. """ - from docarray import DocArray + from docarray import DocList - if cls.document_type == AnyDoc: + if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' + 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' 
) - doc_type = cls.document_type - da = DocArray.__class_getitem__(doc_type)() + doc_type = cls.doc_type + docs = DocList.__class_getitem__(doc_type)() field_names = df.columns.tolist() if field_names is None or len(field_names) == 0: @@ -488,8 +486,8 @@ class Person(BaseDoc): ) if not all(valid_paths): raise ValueError( - f'Column names do not match the schema of the DocArray\'s ' - f'document type ({cls.document_type.__name__}): ' + f'Column names do not match the schema of the DocList\'s ' + f'document type ({cls.doc_type.__name__}): ' f'{list(compress(field_names, [not v for v in valid_paths]))}' ) @@ -497,13 +495,13 @@ class Person(BaseDoc): access_path2val = row._asdict() access_path2val.pop('index', None) doc_dict = _access_path_dict_to_nested_dict(access_path2val) - da.append(doc_type.parse_obj(doc_dict)) + docs.append(doc_type.parse_obj(doc_dict)) - return da + return docs def to_pandas(self) -> 'pd.DataFrame': """ - Save a DocArray to a `pandas.DataFrame`. + Save a DocList to a `pandas.DataFrame`. The field names will be stored as column names. Each row of the dataframe corresponds to the information of one Document. Columns for nested fields will be named after the "__"-seperated access paths, @@ -516,7 +514,7 @@ def to_pandas(self) -> 'pd.DataFrame': else: pd = import_library('pandas', raise_error=True) - fields = self.document_type._get_access_paths() + fields = self.doc_type._get_access_paths() df = pd.DataFrame(columns=fields) for doc in self: @@ -530,7 +528,7 @@ def to_pandas(self) -> 'pd.DataFrame': def _stream_header(self) -> bytes: # Binary format for streaming case - # V1 DocArray streaming serialization format + # V1 DocList streaming serialization format # | 1 byte | 8 bytes | 4 bytes | variable | 4 bytes | variable ... 
# 1 byte (uint8) @@ -547,11 +545,11 @@ def _load_binary_all( compress: Optional[str], show_progress: bool, ): - """Read a `DocArray` object from a binary file + """Read a `DocList` object from a binary file :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' :param compress: compress algorithm to use :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: a `DocArray` + :return: a `DocList` """ with file_ctx as fp: if isinstance(fp, bytes): @@ -565,9 +563,9 @@ def _load_binary_all( compress = None if protocol is not None and protocol == 'protobuf-array': - from docarray.proto import DocumentArrayProto + from docarray.proto import DocListProto - dap = DocumentArrayProto() + dap = DocListProto() dap.ParseFromString(d) return cls.from_protobuf(dap) @@ -606,7 +604,7 @@ def _load_binary_all( # variable length bytes doc load_protocol: str = protocol or 'protobuf' - doc = cls.document_type.from_bytes( + doc = cls.doc_type.from_bytes( d[start_doc_pos:end_doc_pos], protocol=load_protocol, compress=compress, @@ -663,7 +661,7 @@ def _load_binary_stream( f.read(4), 'big', signed=False ) load_protocol: str = protocol - yield cls.document_type.from_bytes( + yield cls.doc_type.from_bytes( f.read(len_current_doc_in_bytes), protocol=load_protocol, compress=compress, @@ -683,7 +681,7 @@ def load_binary( show_progress: bool = False, streaming: bool = False, ) -> Union[T, Generator['T_doc', None, None]]: - """Load array elements from a compressed binary file. + """Load doc_list elements from a compressed binary file. :param file: File or filename or serialized bytes where the data is stored. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' @@ -691,7 +689,7 @@ def load_binary( :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :param streaming: if `True` returns a generator over `Document` objects. 
In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a DocArray object + :return: a DocList object .. note:: If `file` is `str` it can specify `protocol` and `compress` as file extensions. @@ -738,12 +736,12 @@ def save_binary( compress: Optional[str] = None, show_progress: bool = False, ) -> None: - """Save DocArray into a binary file. + """Save DocList into a binary file. - It will use the protocol to pick how to save the DocArray. - If used 'picke-array` and `protobuf-array` the DocArray will be stored + It will use the protocol to pick how to save the DocList. + If used 'pickle-array` and `protobuf-array` the DocList will be stored and compressed at complete level using `pickle` or `protobuf`. - When using `protobuf` or `pickle` as protocol each Document in DocArray + When using `protobuf` or `pickle` as protocol each Document in DocList will be stored individually and this would make it available for streaming. :param file: File or filename to which the data is saved. 
diff --git a/docarray/array/array/pushpull.py b/docarray/array/doc_list/pushpull.py similarity index 80% rename from docarray/array/array/pushpull.py rename to docarray/array/doc_list/pushpull.py index ee306620f4d..baa9c0439da 100644 --- a/docarray/array/array/pushpull.py +++ b/docarray/array/doc_list/pushpull.py @@ -19,7 +19,7 @@ SUPPORTED_PUSH_PULL_PROTOCOLS = get_args(PUSH_PULL_PROTOCOL) if TYPE_CHECKING: # pragma: no cover - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList from docarray.store.abstract_doc_store import AbstractDocStore @@ -30,7 +30,7 @@ class PushPullMixin(Iterable['BaseDoc']): """Mixin class for push/pull functionality.""" __backends__: Dict[str, Type['AbstractDocStore']] = {} - document_type: Type['BaseDoc'] + doc_type: Type['BaseDoc'] @abstractmethod def __len__(self) -> int: @@ -86,10 +86,10 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocArray object to the specified url. + """Push this DocList object to the specified url. - :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocArray if they know its name. + :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocList if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. 
{"icon": "emoji", "background": "#fff"} @@ -112,8 +112,8 @@ def push_stream( """Push a stream of documents to the specified url. :param docs: a stream of documents - :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocArray if they know its name. + :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocList if they know its name. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. {"icon": "emoji", "background": "#fff"} """ @@ -129,20 +129,20 @@ def pull( url: str, show_progress: bool = False, local_cache: bool = True, - ) -> 'DocArray': - """Pull a :class:`DocArray` from the specified url. + ) -> 'DocList': + """Pull a :class:`DocList` from the specified url. - :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. 
- :param local_cache: store the downloaded DocArray to local folder - :return: a :class:`DocArray` object + :param local_cache: store the downloaded DocList to local folder + :return: a :class:`DocList` object """ from docarray.base_doc import AnyDoc - if cls.document_type == AnyDoc: + if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' + 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' ) logging.info(f'Pulling {url}') @@ -160,17 +160,17 @@ def pull_stream( ) -> Iterator['BaseDoc']: """Pull a stream of Documents from the specified url. - :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local folder + :param local_cache: store the downloaded DocList to local folder :return: Iterator of Documents """ from docarray.base_doc import AnyDoc - if cls.document_type == AnyDoc: + if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' + 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' 
) logging.info(f'Pulling Document stream from {url}') diff --git a/docarray/array/array/sequence_indexing_mixin.py b/docarray/array/doc_list/sequence_indexing_mixin.py similarity index 96% rename from docarray/array/array/sequence_indexing_mixin.py rename to docarray/array/doc_list/sequence_indexing_mixin.py index ac07359f3b0..85bad64429f 100644 --- a/docarray/array/array/sequence_indexing_mixin.py +++ b/docarray/array/doc_list/sequence_indexing_mixin.py @@ -39,13 +39,13 @@ class IndexingSequenceMixin(Iterable[T_item]): This mixin allow sto extend a list into an object that can be indexed a la numpy/pytorch. - You can index into, delete from, and set items in a IndexingSequenceMixin like a numpy array or torch tensor: + You can index into, delete from, and set items in a IndexingSequenceMixin like a numpy array or torch tensor: .. code-block:: python - da[0] # index by position - da[0:5:2] # index by slice - da[[0, 2, 3]] # index by list of indices - da[True, False, True, True, ...] # index by boolean mask + docs[0] # index by position + docs[0:5:2] # index by slice + docs[[0, 2, 3]] # index by list of indices + docs[True, False, True, True, ...] 
# index by boolean mask """ diff --git a/docarray/array/stacked/__init__.py b/docarray/array/doc_vec/__init__.py similarity index 100% rename from docarray/array/stacked/__init__.py rename to docarray/array/doc_vec/__init__.py diff --git a/docarray/array/stacked/column_storage.py b/docarray/array/doc_vec/column_storage.py similarity index 79% rename from docarray/array/stacked/column_storage.py rename to docarray/array/doc_vec/column_storage.py index 80129cfcdfd..42c67c96b3b 100644 --- a/docarray/array/stacked/column_storage.py +++ b/docarray/array/doc_vec/column_storage.py @@ -10,12 +10,12 @@ Union, ) -from docarray.array.stacked.list_advance_indexing import ListAdvancedIndexing +from docarray.array.doc_vec.list_advance_indexing import ListAdvancedIndexing from docarray.typing import NdArray from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.doc_vec.doc_vec import DocVec IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] @@ -26,26 +26,26 @@ class ColumnStorage: """ ColumnStorage is a container to store the columns of the - :class:`~docarray.array.stacked.DocArrayStacked`. + :class:`~docarray.array.doc_vec.DocVec`. 
:param tensor_columns: a Dict of AbstractTensor - :param doc_columns: a Dict of :class:`~docarray.array.stacked.DocArrayStacked` - :param da_columns: a Dict of List of :class:`~docarray.array.stacked.DocArrayStacked` + :param doc_columns: a Dict of :class:`~docarray.array.doc_vec.DocVec` + :param docs_vec_columns: a Dict of List of :class:`~docarray.array.doc_vec.DocVec` :param any_columns: a Dict of List - :param tensor_type: Class used to wrap the stacked tensors + :param tensor_type: Class used to wrap the doc_vec tensors """ def __init__( self, tensor_columns: Dict[str, AbstractTensor], - doc_columns: Dict[str, 'DocArrayStacked'], - da_columns: Dict[str, ListAdvancedIndexing['DocArrayStacked']], + doc_columns: Dict[str, 'DocVec'], + docs_vec_columns: Dict[str, ListAdvancedIndexing['DocVec']], any_columns: Dict[str, ListAdvancedIndexing], tensor_type: Type[AbstractTensor] = NdArray, ): self.tensor_columns = tensor_columns self.doc_columns = doc_columns - self.da_columns = da_columns + self.docs_vec_columns = docs_vec_columns self.any_columns = any_columns self.tensor_type = tensor_type @@ -53,7 +53,7 @@ def __init__( self.columns = ChainMap( # type: ignore self.tensor_columns, # type: ignore self.doc_columns, # type: ignore - self.da_columns, # type: ignore + self.docs_vec_columns, # type: ignore self.any_columns, # type: ignore ) # type: ignore @@ -65,13 +65,15 @@ def __getitem__(self: T, item: IndexIterType) -> T: item = list(item) tensor_columns = {key: col[item] for key, col in self.tensor_columns.items()} doc_columns = {key: col[item] for key, col in self.doc_columns.items()} - da_columns = {key: col[item] for key, col in self.da_columns.items()} + docs_vec_columns = { + key: col[item] for key, col in self.docs_vec_columns.items() + } any_columns = {key: col[item] for key, col in self.any_columns.items()} return self.__class__( tensor_columns, doc_columns, - da_columns, + docs_vec_columns, any_columns, self.tensor_type, ) diff --git 
a/docarray/array/stacked/array_stacked.py b/docarray/array/doc_vec/doc_vec.py similarity index 72% rename from docarray/array/stacked/array_stacked.py rename to docarray/array/doc_vec/doc_vec.py index bc6ff8965a5..899ccef9da5 100644 --- a/docarray/array/stacked/array_stacked.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -18,10 +18,10 @@ from pydantic import BaseConfig, parse_obj_as -from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocArray -from docarray.array.stacked.column_storage import ColumnStorage, ColumnStorageView -from docarray.array.stacked.list_advance_indexing import ListAdvancedIndexing +from docarray.array.any_array import AnyDocArray +from docarray.array.doc_list.doc_list import DocList +from docarray.array.doc_vec.column_storage import ColumnStorage, ColumnStorageView +from docarray.array.doc_vec.list_advance_indexing import ListAdvancedIndexing from docarray.base_doc import BaseDoc from docarray.base_doc.mixins.io import _type_to_protobuf from docarray.typing import NdArray @@ -32,7 +32,7 @@ if TYPE_CHECKING: from pydantic.fields import ModelField - from docarray.proto import DocArrayStackedProto + from docarray.proto import DocVecProto torch_available = is_torch_available() if torch_available: @@ -49,45 +49,45 @@ TensorFlowTensor = None # type: ignore T_doc = TypeVar('T_doc', bound=BaseDoc) -T = TypeVar('T', bound='DocArrayStacked') +T = TypeVar('T', bound='DocVec') IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] -class DocArrayStacked(AnyDocArray[T_doc]): +class DocVec(AnyDocArray[T_doc]): """ - DocArrayStacked is a container of Documents appropriates to perform + DocVec is a container of Documents appropriates to perform computation that require batches of data (ex: matrix multiplication, distance calculation, deep learning forward pass) - A DocArrayStacked has a similar interface as - {class}`~docarray.array.DocArray` but with an underlying implementation that is + A DocVec has a 
similar interface as + {class}`~docarray.array.DocList` but with an underlying implementation that is column based instead of row based. Each field of the schema of the DocArrayStack - (the :attr:`~docarray.array.stacked.DocArrayStacked.document_type` which is a + (the :attr:`~docarray.array.doc_vec.DocVec.doc_type` which is a `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, stacked (torch/np/tf) tensor. If the tensor field is `AnyTensor` or a Union of tensor types, the - :attr:`~docarray.array.stacked.DocArrayStacked.tensor_type` will be used to determine - the type of the stacked column. + :attr:`~docarray.array.doc_vec.DocVec.tensor_type` will be used to determine + the type of the stacked column. - If the field is another `BasedDocument` the column will be another DocArrayStacked that follows the + If the field is another `BasedDoc` the column will be another DocVec that follows the schema of the nested Document. - If the field is a `DocArray` or - `DocArrayStacked` then the column will be a list of `DocArrayStacked`. + If the field is a `DocList` or + `DocVec` then the column will be a list of `DocVec`. For any other type the column is a Python list. - Every `Document` inside a `DocArrayStacked` is a view into the data columns stored at the `DocArrayStacked` level. The `Document` does + Every `Document` inside a `DocVec` is a view into the data columns stored at the `DocVec` level. The `BaseDoc` does not hold any data itself. The behavior of this Document "view" is similar to the behavior of `view = tensor[i]` in numpy/PyTorch. - :param docs: a DocArray - :param tensor_type: Tensor Class used to wrap the stacked tensors. 
This is useful - if the BaseDoc of this DocArrayStacked has some undefined tensor type like + :param docs: a homogeneous sequence of BaseDoc + :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful + if the BaseDoc of this DocVec has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor """ - document_type: Type[T_doc] + doc_type: Type[T_doc] def __init__( self: T, @@ -97,22 +97,22 @@ def __init__( self.tensor_type = tensor_type tensor_columns: Dict[str, AbstractTensor] = dict() - doc_columns: Dict[str, 'DocArrayStacked'] = dict() - da_columns: Dict[str, ListAdvancedIndexing['DocArrayStacked']] = dict() + doc_columns: Dict[str, 'DocVec'] = dict() + docs_vec_columns: Dict[str, ListAdvancedIndexing['DocVec']] = dict() any_columns: Dict[str, ListAdvancedIndexing] = dict() if len(docs) == 0: raise ValueError(f'docs {docs}: should not be empty') docs = ( docs - if isinstance(docs, DocArray) - else DocArray.__class_getitem__(self.document_type)(docs) + if isinstance(docs, DocList) + else DocList.__class_getitem__(self.doc_type)(docs) ) - for field_name, field in self.document_type.__fields__.items(): - # here we iterate over the field of the da schema, and we collect the data + for field_name, field in self.doc_type.__fields__.items(): + # here we iterate over the field of the docs schema, and we collect the data # from each document and put them in the corresponding column - field_type = self.document_type._get_field_type(field_name) + field_type = self.doc_type._get_field_type(field_name) if is_tensor_union(field_type): field_type = tensor_type @@ -166,11 +166,13 @@ def __init__( elif issubclass(field_type, AnyDocArray): docs_list = list() for doc in docs: - da = getattr(doc, field_name) - if isinstance(da, DocArray): - da = da.stack(tensor_type=self.tensor_type) - docs_list.append(da) - da_columns[field_name] = ListAdvancedIndexing(docs_list) + docs_nested = getattr(doc, field_name) + if isinstance(docs_nested, 
DocList): + docs_nested = docs_nested.stack( + tensor_type=self.tensor_type + ) + docs_list.append(docs_nested) + docs_vec_columns[field_name] = ListAdvancedIndexing(docs_list) else: any_columns[field_name] = ListAdvancedIndexing( getattr(docs, field_name) @@ -183,7 +185,7 @@ def __init__( self._storage = ColumnStorage( tensor_columns, doc_columns, - da_columns, + docs_vec_columns, any_columns, tensor_type, ) @@ -191,14 +193,14 @@ def __init__( @classmethod def from_columns_storage(cls: Type[T], storage: ColumnStorage) -> T: """ - Create a DocArrayStacked directly from a storage object + Create a DocVec directly from a storage object :param storage: the underlying storage. :return: a DocArrayStack """ - da = cls.__new__(cls) - da.tensor_type = storage.tensor_type - da._storage = storage - return da + docs = cls.__new__(cls) + docs.tensor_type = storage.tensor_type + docs._storage = storage + return docs @classmethod def validate( @@ -209,17 +211,17 @@ def validate( ) -> T: if isinstance(value, cls): return value - elif isinstance(value, DocArray.__class_getitem__(cls.document_type)): + elif isinstance(value, DocList.__class_getitem__(cls.doc_type)): return cast(T, value.stack()) elif isinstance(value, Sequence): return cls(value) elif isinstance(value, Iterable): return cls(list(value)) else: - raise TypeError(f'Expecting an Iterable of {cls.document_type}') + raise TypeError(f'Expecting an Iterable of {cls.doc_type}') def to(self: T, device: str) -> T: - """Move all tensors of this DocArrayStacked to the given device + """Move all tensors of this DocVec to the given device :param device: the device to move the data to """ @@ -230,9 +232,9 @@ def to(self: T, device: str) -> T: for field, col_doc in self._storage.doc_columns.items(): self._storage.doc_columns[field] = col_doc.to(device) - for _, col_da in self._storage.da_columns.items(): - for da in col_da: - da.to(device) + for _, col_da in self._storage.docs_vec_columns.items(): + for docs in col_da: + 
docs.to(device) return self @@ -255,12 +257,12 @@ def __getitem__(self: T, item: Union[int, IndexIterType]) -> Union[T_doc, T]: if isinstance(item, (slice, Iterable)): return self.__class__.from_columns_storage(self._storage[item]) # single doc case - return self.document_type.from_view(ColumnStorageView(item, self._storage)) + return self.doc_type.from_view(ColumnStorageView(item, self._storage)) def _get_data_column( self: T, field: str, - ) -> Union[MutableSequence, 'DocArrayStacked', AbstractTensor]: + ) -> Union[MutableSequence, 'DocVec', AbstractTensor]: """Return one column of the data :param field: name of the fields to extract @@ -269,8 +271,8 @@ def _get_data_column( """ if field in self._storage.any_columns.keys(): return self._storage.any_columns[field].data - elif field in self._storage.da_columns.keys(): - return self._storage.da_columns[field].data + elif field in self._storage.docs_vec_columns.keys(): + return self._storage.docs_vec_columns[field].data elif field in self._storage.columns.keys(): return self._storage.columns[field] else: @@ -292,8 +294,8 @@ def __setitem__(self: T, key: IndexIterType, value: T): def __setitem__(self: T, key, value): # single doc case if not isinstance(key, (slice, Iterable)): - if not isinstance(value, self.document_type): - raise ValueError(f'{value} is not a {self.document_type}') + if not isinstance(value, self.doc_type): + raise ValueError(f'{value} is not a {self.doc_type}') for field, value in value.dict().items(): self._storage.columns[field][key] = value # todo we might want to @@ -305,12 +307,12 @@ def __setitem__(self: T, key, value): def _set_data_and_columns( self: T, index_item: Union[Tuple, Iterable, slice], - value: Union[T, DocArray[T_doc]], + value: Union[T, DocList[T_doc]], ) -> None: """Delegates the setting to the data and the columns. :param index_item: the key used as index. 
Needs to be a valid index for both - DocArray (data) and column types (torch/tensorflow/numpy tensors) + DocList (data) and column types (torch/tensorflow/numpy tensors) :value: the value to set at the `key` location """ if isinstance(index_item, tuple): @@ -318,25 +320,25 @@ def _set_data_and_columns( # set data and prepare columns processed_value: T - if isinstance(value, DocArray): - if not issubclass(value.document_type, self.document_type): + if isinstance(value, DocList): + if not issubclass(value.doc_type, self.doc_type): raise TypeError( - f'{value} schema : {value.document_type} is not compatible with ' - f'this DocArrayStacked schema : {self.document_type}' + f'{value} schema : {value.doc_type} is not compatible with ' + f'this DocVec schema : {self.doc_type}' ) processed_value = cast( T, value.stack(tensor_type=self.tensor_type) ) # we need to copy data here - elif isinstance(value, DocArrayStacked): - if not issubclass(value.document_type, self.document_type): + elif isinstance(value, DocVec): + if not issubclass(value.doc_type, self.doc_type): raise TypeError( - f'{value} schema : {value.document_type} is not compatible with ' - f'this DocArrayStacked schema : {self.document_type}' + f'{value} schema : {value.doc_type} is not compatible with ' + f'this DocVec schema : {self.doc_type}' ) processed_value = value else: - raise TypeError(f'Can not set a DocArrayStacked with {type(value)}') + raise TypeError(f'Can not set a DocVec with {type(value)}') for field, col in self._storage.columns.items(): col[index_item] = processed_value._storage.columns[field] @@ -345,17 +347,17 @@ def _set_data_column( self: T, field: str, values: Union[ - Sequence[DocArray[T_doc]], + Sequence[DocList[T_doc]], Sequence[Any], T, - DocArray, + DocList, AbstractTensor, ], ) -> None: - """Set all Documents in this DocArray using the passed values + """Set all Documents in this DocList using the passed values :param field: name of the fields to set - :values: the values to set at 
the DocArray level + :values: the values to set at the DocList level """ if len(values) != len(self._storage): @@ -376,23 +378,21 @@ def _set_data_column( elif field in self._storage.doc_columns.keys(): values_ = parse_obj_as( - DocArrayStacked.__class_getitem__( - self._storage.doc_columns[field].document_type - ), + DocVec.__class_getitem__(self._storage.doc_columns[field].doc_type), values, ) self._storage.doc_columns[field] = values_ - elif field in self._storage.da_columns.keys(): - values_ = cast(Sequence[DocArray[T_doc]], values) + elif field in self._storage.docs_vec_columns.keys(): + values_ = cast(Sequence[DocList[T_doc]], values) # TODO here we should actually check if this is correct - self._storage.da_columns[field] = values_ + self._storage.docs_vec_columns[field] = values_ elif field in self._storage.any_columns.keys(): # TODO here we should actually check if this is correct values_ = cast(Sequence, values) self._storage.any_columns[field] = values_ else: - raise KeyError(f'{field} is not a valid field for this DocArray') + raise KeyError(f'{field} is not a valid field for this DocList') #################### # Deleting data # @@ -422,32 +422,32 @@ def __len__(self): #################### @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocArrayStackedProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocVecProto') -> T: """create a Document from a protobuf message""" storage = ColumnStorage( pb_msg.tensor_columns, pb_msg.doc_columns, - pb_msg.da_columns, + pb_msg.docs_vec_columns, pb_msg.any_columns, ) return cls.from_columns_storage(storage) - def to_protobuf(self) -> 'DocArrayStackedProto': - """Convert DocArray into a Protobuf message""" + def to_protobuf(self) -> 'DocVecProto': + """Convert DocList into a Protobuf message""" from docarray.proto import ( - DocArrayStackedProto, - DocumentArrayProto, + DocListProto, + DocVecProto, ListOfAnyProto, ListOfDocArrayProto, NdArrayProto, ) - da_proto = DocumentArrayProto() + da_proto = 
DocListProto() for doc in self: da_proto.docs.append(doc.to_protobuf()) - doc_columns_proto: Dict[str, DocArrayStackedProto] = dict() + doc_columns_proto: Dict[str, DocVecProto] = dict() tensor_columns_proto: Dict[str, NdArrayProto] = dict() da_columns_proto: Dict[str, ListOfDocArrayProto] = dict() any_columns_proto: Dict[str, ListOfAnyProto] = dict() @@ -456,10 +456,10 @@ def to_protobuf(self) -> 'DocArrayStackedProto': doc_columns_proto[field] = col_doc.to_protobuf() for field, col_tens in self._storage.tensor_columns.items(): tensor_columns_proto[field] = col_tens.to_protobuf() - for field, col_da in self._storage.da_columns.items(): + for field, col_da in self._storage.docs_vec_columns.items(): list_proto = ListOfDocArrayProto() - for da in col_da: - list_proto.data.append(da.to_protobuf()) + for docs in col_da: + list_proto.data.append(docs.to_protobuf()) da_columns_proto[field] = list_proto for field, col_any in self._storage.any_columns.items(): list_proto = ListOfAnyProto() @@ -467,29 +467,29 @@ def to_protobuf(self) -> 'DocArrayStackedProto': list_proto.data.append(_type_to_protobuf(data)) any_columns_proto[field] = list_proto - return DocArrayStackedProto( + return DocVecProto( doc_columns=doc_columns_proto, tensor_columns=tensor_columns_proto, - da_columns=da_columns_proto, + docs_vec_columns=da_columns_proto, any_columns=any_columns_proto, ) - def unstack(self: T) -> DocArray[T_doc]: - """Convert DocArrayStacked into a DocArray. + def unstack(self: T) -> DocList[T_doc]: + """Convert DocVec into a DocList. 
- Note this destroys the arguments and returns a new DocArray + Note this destroys the arguments and returns a new DocList """ - unstacked_doc_column: Dict[str, DocArray] = dict() - unstacked_da_column: Dict[str, List[DocArray]] = dict() + unstacked_doc_column: Dict[str, DocList] = dict() + unstacked_da_column: Dict[str, List[DocList]] = dict() unstacked_tensor_column: Dict[str, List[AbstractTensor]] = dict() unstacked_any_column = self._storage.any_columns for field, doc_col in self._storage.doc_columns.items(): unstacked_doc_column[field] = doc_col.unstack() - for field, da_col in self._storage.da_columns.items(): - unstacked_da_column[field] = [da.unstack() for da in da_col] + for field, da_col in self._storage.docs_vec_columns.items(): + unstacked_da_column[field] = [docs.unstack() for docs in da_col] for field, tensor_col in list(self._storage.tensor_columns.items()): # list is needed here otherwise we cannot delete the column @@ -511,11 +511,11 @@ def unstack(self: T) -> DocArray[T_doc]: for i in range(len(self)): data = {field: col[i] for field, col in unstacked_column.items()} - docs.append(self.document_type.construct(**data)) + docs.append(self.doc_type.construct(**data)) del self._storage - return DocArray.__class_getitem__(self.document_type).construct(docs) + return DocList.__class_getitem__(self.doc_type).construct(docs) def traverse_flat( self, diff --git a/docarray/array/stacked/list_advance_indexing.py b/docarray/array/doc_vec/list_advance_indexing.py similarity index 71% rename from docarray/array/stacked/list_advance_indexing.py rename to docarray/array/doc_vec/list_advance_indexing.py index 545c634a4aa..e0eaf2e970c 100644 --- a/docarray/array/stacked/list_advance_indexing.py +++ b/docarray/array/doc_vec/list_advance_indexing.py @@ -1,6 +1,6 @@ from typing import Iterator, MutableSequence, TypeVar -from docarray.array.array.sequence_indexing_mixin import IndexingSequenceMixin +from docarray.array.doc_list.sequence_indexing_mixin import 
IndexingSequenceMixin T_item = TypeVar('T_item') @@ -12,10 +12,10 @@ class ListAdvancedIndexing(IndexingSequenceMixin[T_item]): You can index into a ListAdvanceIndex like a numpy array or torch tensor: .. code-block:: python - da[0] # index by position - da[0:5:2] # index by slice - da[[0, 2, 3]] # index by list of indices - da[True, False, True, True, ...] # index by boolean mask + docs[0] # index by position + docs[0:5:2] # index by slice + docs[[0, 2, 3]] # index by list of indices + docs[True, False, True, True, ...] # index by boolean mask """ diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index 48afbe6eddd..a5c42a82ee4 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -1,5 +1,5 @@ import os -from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar, Dict +from typing import TYPE_CHECKING, Any, Dict, Optional, Type, TypeVar import orjson from pydantic import BaseModel, Field @@ -12,7 +12,7 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array.stacked.column_storage import ColumnStorageView + from docarray.array.doc_vec.column_storage import ColumnStorageView _console: Console = Console() @@ -79,7 +79,7 @@ def _ipython_display_(self) -> None: self.summary() def is_view(self) -> bool: - from docarray.array.stacked.column_storage import ColumnStorageView + from docarray.array.doc_vec.column_storage import ColumnStorageView return isinstance(self.__dict__, ColumnStorageView) diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index 9654ae03d41..b2a64e8082b 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -28,7 +28,7 @@ import torch from pydantic.fields import ModelField - from docarray.proto import DocumentProto, NodeProto + from docarray.proto import DocProto, NodeProto from docarray.typing import TensorFlowTensor, TorchTensor else: tf = import_library('tensorflow', raise_error=False) @@ -171,9 +171,9 @@ 
def from_bytes( if protocol == 'pickle': return pickle.loads(bstr) elif protocol == 'protobuf': - from docarray.proto import DocumentProto + from docarray.proto import DocProto - pb_msg = DocumentProto() + pb_msg = DocProto() pb_msg.ParseFromString(bstr) return cls.from_protobuf(pb_msg) else: @@ -209,7 +209,7 @@ def from_base64( return cls.from_bytes(base64.b64decode(data), protocol, compress) @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T: """create a Document from a protobuf message :param pb_msg: the proto message of the Document @@ -254,10 +254,10 @@ def _get_content_from_node_proto( return_field = content_type_dict[docarray_type].from_protobuf( getattr(value, content_key) ) - elif content_key in ['document', 'document_array']: + elif content_key in ['doc', 'doc_array']: if field_name is None: raise ValueError( - 'field_name cannot be None when trying to deseriliaze a Document or a DocArray' + 'field_name cannot be None when trying to deseriliaze a Document or a DocList' ) return_field = cls._get_field_type(field_name).from_protobuf( getattr(value, content_key) @@ -299,12 +299,12 @@ def _get_content_from_node_proto( return return_field - def to_protobuf(self: T) -> 'DocumentProto': + def to_protobuf(self: T) -> 'DocProto': """Convert Document into a Protobuf message. 
:return: the protobuf message """ - from docarray.proto import DocumentProto + from docarray.proto import DocProto data = {} for field, value in self: @@ -324,7 +324,7 @@ def to_protobuf(self: T) -> 'DocumentProto': ex.args = (f'Field `{field}` is problematic',) + ex.args raise ex - return DocumentProto(data=data) + return DocProto(data=data) def _to_node_protobuf(self) -> 'NodeProto': from docarray.proto import NodeProto @@ -335,7 +335,7 @@ def _to_node_protobuf(self) -> 'NodeProto': :return: the nested item protobuf message """ - return NodeProto(document=self.to_protobuf()) + return NodeProto(doc=self.to_protobuf()) @classmethod def _get_access_paths(cls) -> List[str]: diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index 1fe37015c90..99fdbc2bf8e 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -74,7 +74,7 @@ class MyDocument(BaseDoc): ) from collections import namedtuple - from docarray import DocArray + from docarray import DocList from docarray.utils.reduce import reduce # Declaring namedtuple() @@ -104,9 +104,7 @@ def _group_fields(doc: 'UpdateMixin') -> _FieldGroups: if field_name not in FORBIDDEN_FIELDS_TO_UPDATE: field_type = doc._get_field_type(field_name) - if isinstance(field_type, type) and issubclass( - field_type, DocArray - ): + if isinstance(field_type, type) and issubclass(field_type, DocList): nested_docarray_fields.append(field_name) else: origin = get_origin(field_type) diff --git a/docarray/computation/abstract_comp_backend.py b/docarray/computation/abstract_comp_backend.py index cfe525cc932..da80ad9f841 100644 --- a/docarray/computation/abstract_comp_backend.py +++ b/docarray/computation/abstract_comp_backend.py @@ -16,7 +16,7 @@ class AbstractComputationalBackend(ABC, typing.Generic[TTensor]): Abstract base class for computational backends. Every supported tensor/ML framework (numpy, torch etc.) 
should define its own computational backend exposing common functionality expressed in that framework. - That way, DocArray can leverage native implementations from all frameworks. + That way, DocList can leverage native implementations from all frameworks. """ @classmethod diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index dd58035cd33..25fbb9a9a6a 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -2,7 +2,7 @@ from torch.utils.data import Dataset -from docarray import BaseDoc, DocArray, DocArrayStacked +from docarray import BaseDoc, DocList, DocVec from docarray.typing import TorchTensor from docarray.utils._internal._typing import change_cls_name @@ -14,7 +14,7 @@ class MultiModalDataset(Dataset, Generic[T_doc]): A dataset that can be used inside a PyTorch DataLoader. In other words, it implements the PyTorch Dataset interface. - :param da: the DocArray to be used as the dataset + :param docs: the DocList to be used as the dataset :param preprocessing: a dictionary of field names and preprocessing functions The preprocessing dictionary passed to the constructor consists of keys that are @@ -24,7 +24,7 @@ class MultiModalDataset(Dataset, Generic[T_doc]): EXAMPLE USAGE .. code-block:: python from torch.utils.data import DataLoader - from docarray import DocArray + from docarray import DocList from docarray.data import MultiModalDataset from docarray.documents import Text @@ -33,8 +33,8 @@ def prepend_number(text: str): return f"Number {text}" - da = DocArray[Text](Text(text=str(i)) for i in range(16)) - ds = MultiModalDataset[Text](da, preprocessing={'text': prepend_number}) + docs = DocList[Text](Text(text=str(i)) for i in range(16)) + ds = MultiModalDataset[Text](docs, preprocessing={'text': prepend_number}) loader = DataLoader(ds, batch_size=4, collate_fn=MultiModalDataset[Text].collate_fn) for batch in loader: print(batch.text) @@ -51,7 +51,7 @@ def prepend_number(text: str): .. 
code-block:: python import torch from torch.utils.data import DataLoader - from docarray import DocArray, BaseDoc + from docarray import DocList, BaseDoc from docarray.data import MultiModalDataset from docarray.documents import Text @@ -78,9 +78,9 @@ def add_nonsense(student: Student): ) - da = DocArray[Student](Student(thesis=Thesis(title=str(i))) for i in range(16)) + docs = DocList[Student](Student(thesis=Thesis(title=str(i))) for i in range(16)) ds = MultiModalDataset[Student]( - da, + docs, preprocessing={ "thesis.title": embed_title, "thesis": normalize_embedding, @@ -92,20 +92,20 @@ def add_nonsense(student: Student): print(batch.thesis.title.embedding) """ - document_type: Optional[Type[BaseDoc]] = None + doc_type: Optional[Type[BaseDoc]] = None __typed_ds__: Dict[Type[BaseDoc], Type['MultiModalDataset']] = {} def __init__( - self, da: 'DocArray[T_doc]', preprocessing: Dict[str, Callable] + self, docs: 'DocList[T_doc]', preprocessing: Dict[str, Callable] ) -> None: - self.da = da + self.docs = docs self._preprocessing = preprocessing def __len__(self): - return len(self.da) + return len(self.docs) def __getitem__(self, item: int): - doc = self.da[item].copy(deep=True) + doc = self.docs[item].copy(deep=True) for field, preprocess in self._preprocessing.items(): if len(field) == 0: doc = preprocess(doc) or doc @@ -121,14 +121,14 @@ def __getitem__(self, item: int): @classmethod def collate_fn(cls, batch: List[T_doc]): - doc_type = cls.document_type + doc_type = cls.doc_type if doc_type: - batch_da = DocArrayStacked[doc_type]( # type: ignore + batch_da = DocVec[doc_type]( # type: ignore batch, tensor_type=TorchTensor, ) else: - batch_da = DocArrayStacked(batch, tensor_type=TorchTensor) + batch_da = DocVec(batch, tensor_type=TorchTensor) return batch_da @classmethod @@ -142,7 +142,7 @@ def __class_getitem__(cls, item: Type[BaseDoc]) -> Type['MultiModalDataset']: global _TypedDataset class _TypedDataset(cls): # type: ignore - document_type = item + doc_type = 
item change_cls_name( _TypedDataset, f'{cls.__name__}[{item.__name__}]', globals() diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index 401ee570a95..60d2bc7445d 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -3,17 +3,17 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array import DocArrayStacked - from docarray.array.abstract_array import AnyDocArray + from docarray.array import DocVec + from docarray.array.any_array import AnyDocArray class DocArraySummary: - def __init__(self, da: 'AnyDocArray'): - self.da = da + def __init__(self, docs: 'AnyDocArray'): + self.docs = docs def summary(self) -> None: """ - Print a summary of this DocArray object and a summary of the schema of its + Print a summary of this DocList object and a summary of the schema of its Document type. """ from rich import box @@ -21,18 +21,18 @@ def summary(self) -> None: from rich.panel import Panel from rich.table import Table - from docarray.array import DocArrayStacked + from docarray.array import DocVec table = Table(box=box.SIMPLE, highlight=True) table.show_header = False - table.add_row('Type', self.da.__class__.__name__) - table.add_row('Length', str(len(self.da)), end_section=True) + table.add_row('Type', self.docs.__class__.__name__) + table.add_row('Length', str(len(self.docs)), end_section=True) - if isinstance(self.da, DocArrayStacked): + if isinstance(self.docs, DocVec): table.add_row('Stacked columns:') - stacked_fields = self._get_stacked_fields(da=self.da) + stacked_fields = self._get_stacked_fields(docs=self.docs) for field_name in stacked_fields: - val = self.da + val = self.docs for attr in field_name.split('.'): val = getattr(val, attr) @@ -50,25 +50,25 @@ def summary(self) -> None: table.add_row(f' • {field_name}:', col_2) - Console().print(Panel(table, title='DocArray Summary', expand=False)) - 
self.da.document_type.schema_summary() + Console().print(Panel(table, title='DocList Summary', expand=False)) + self.docs.doc_type.schema_summary() @staticmethod - def _get_stacked_fields(da: 'DocArrayStacked') -> List[str]: # TODO this might + def _get_stacked_fields(docs: 'DocVec') -> List[str]: # TODO this might # broken """ - Return a list of the field names of a DocArrayStacked instance that are - stacked, i.e. all the fields that are of type AbstractTensor. Nested field + Return a list of the field names of a DocVec instance that are + doc_vec, i.e. all the fields that are of type AbstractTensor. Nested field paths are separated by dot, such as: 'attr.nested_attr'. """ fields = [] - for field_name, value_tens in da._storage.tensor_columns.items(): + for field_name, value_tens in docs._storage.tensor_columns.items(): fields.append(field_name) - for field_name, value_doc in da._storage.doc_columns.items(): + for field_name, value_doc in docs._storage.doc_columns.items(): fields.extend( [ f'{field_name}.{x}' - for x in DocArraySummary._get_stacked_fields(da=value_doc) + for x in DocArraySummary._get_stacked_fields(docs=value_doc) ] ) diff --git a/docarray/display/document_summary.py b/docarray/display/document_summary.py index a7fe5009e9b..349829b6e0e 100644 --- a/docarray/display/document_summary.py +++ b/docarray/display/document_summary.py @@ -55,7 +55,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: from rich.tree import Tree - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList root = cls.__name__ if doc_name is None else f'{doc_name}: {cls.__name__}' tree = Tree(root, highlight=True) @@ -76,10 +76,8 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: for arg in field_type.__args__: if issubclass(arg, BaseDoc): sub_tree.add(DocumentSummary._get_schema(cls=arg)) - elif issubclass(arg, DocArray): - sub_tree.add( - DocumentSummary._get_schema(cls=arg.document_type) - ) 
+ elif issubclass(arg, DocList): + sub_tree.add(DocumentSummary._get_schema(cls=arg.doc_type)) tree.add(sub_tree) elif issubclass(field_type, BaseDoc): @@ -87,11 +85,9 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: DocumentSummary._get_schema(cls=field_type, doc_name=field_name) ) - elif issubclass(field_type, DocArray): + elif issubclass(field_type, DocList): sub_tree = Tree(node_name, highlight=True) - sub_tree.add( - DocumentSummary._get_schema(cls=field_type.document_type) - ) + sub_tree.add(DocumentSummary._get_schema(cls=field_type.doc_type)) tree.add(sub_tree) else: @@ -112,7 +108,7 @@ def __rich_console__( from rich import box, text from rich.table import Table - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList table = Table( 'Attribute', @@ -125,7 +121,7 @@ def __rich_console__( for field_name, value in self.doc.__dict__.items(): col_1 = f'{field_name}: {value.__class__.__name__}' if ( - isinstance(value, (ID, DocArray, BaseDoc)) + isinstance(value, (ID, DocList, BaseDoc)) or field_name.startswith('_') or value is None ): @@ -177,7 +173,7 @@ def _plot_recursion( :return: Tree with all children. 
""" - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList tree = Tree(node) if tree is None else tree.add(node) # type: ignore @@ -185,7 +181,7 @@ def _plot_recursion( nested_attrs = [ k for k, v in node.doc.__dict__.items() - if isinstance(v, (DocArray, BaseDoc)) + if isinstance(v, (DocList, BaseDoc)) ] for attr in nested_attrs: value = getattr(node.doc, attr) diff --git a/docarray/documents/legacy/legacy_document.py b/docarray/documents/legacy/legacy_document.py index e550a97c800..96e2ee1e758 100644 --- a/docarray/documents/legacy/legacy_document.py +++ b/docarray/documents/legacy/legacy_document.py @@ -2,13 +2,13 @@ from typing import Any, Dict, Optional -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import AnyEmbedding, AnyTensor class LegacyDocument(BaseDoc): """ - This Document is the LegacyDocument. It follows the same schema as in DocArray v1. + This Document is the LegacyDocument. It follows the same schema as in DocList v1. It can be useful to start migrating a codebase from v1 to v2. Nevertheless, the API is not totally compatible with DocAray v1 `Document`. @@ -16,7 +16,7 @@ class LegacyDocument(BaseDoc): of the data is similar. .. 
code-block:: python - from docarray import DocArray + from docarray import DocList from docarray.documents.legacy import LegacyDocument import numpy as np @@ -27,15 +27,15 @@ class LegacyDocument(BaseDoc): doc.tags['price'] = 10 - doc.chunks = DocArray[Document]([Document() for _ in range(10)]) + doc.chunks = DocList[Document]([Document() for _ in range(10)]) - doc.chunks = DocArray[Document]([Document() for _ in range(10)]) + doc.chunks = DocList[Document]([Document() for _ in range(10)]) """ tensor: Optional[AnyTensor] - chunks: Optional[DocArray[LegacyDocument]] - matches: Optional[DocArray[LegacyDocument]] + chunks: Optional[DocList[LegacyDocument]] + matches: Optional[DocList[LegacyDocument]] blob: Optional[bytes] text: Optional[str] url: Optional[str] diff --git a/docarray/helper.py b/docarray/helper.py index 3cf74379e8d..7c8972b4735 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -135,7 +135,7 @@ def _get_field_type_by_access_path( :param access_path: "__"-separated access path :return: field type of accessed attribute. If access path is invalid, return None. """ - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList field, _, remaining = access_path.partition('__') field_valid = field in doc_type.__fields__.keys() @@ -145,8 +145,8 @@ def _get_field_type_by_access_path( return doc_type._get_field_type(field) else: d = doc_type._get_field_type(field) - if issubclass(d, DocArray): - return _get_field_type_by_access_path(d.document_type, remaining) + if issubclass(d, DocList): + return _get_field_type_by_access_path(d.doc_type, remaining) elif issubclass(d, BaseDoc): return _get_field_type_by_access_path(d, remaining) else: @@ -180,7 +180,7 @@ def get_paths( .. 
code-block:: python from typing import Optional - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList from docarray.helper import get_paths from docarray.typing import TextUrl, ImageUrl @@ -191,12 +191,12 @@ class Banner(BaseDoc): # you can call it in the constructor - da = DocArray[Banner]([Banner(text_url=url) for url in get_paths(patterns='*.txt')]) + docs = DocList[Banner]([Banner(text_url=url) for url in get_paths(patterns='*.txt')]) # and call it after construction to set the urls - da.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test')) + docs.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test')) - for doc in da: + for doc in docs: assert doc.image_url.endswith('.txt') assert doc.text_url.endswith('.jpg') diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 11c130086b4..03ab7361f62 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -24,8 +24,8 @@ from pydantic.error_wrappers import ValidationError from typing_inspect import get_args, is_optional_type, is_union_type -from docarray import BaseDoc, DocArray -from docarray.array.abstract_array import AnyDocArray +from docarray import BaseDoc, DocList +from docarray.array.any_array import AnyDocArray from docarray.typing import AnyTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils._internal._typing import is_tensor_union @@ -48,12 +48,12 @@ class FindResultBatched(NamedTuple): - documents: List[DocArray] + documents: List[DocList] scores: np.ndarray class _FindResultBatched(NamedTuple): - documents: Union[List[DocArray], List[List[Dict[str, Any]]]] + documents: Union[List[DocList], List[List[Dict[str, Any]]]] scores: np.ndarray @@ -254,12 +254,12 @@ def _filter( self, filter_query: Any, limit: int, - ) -> Union[DocArray, List[Dict]]: + ) -> Union[DocList, List[Dict]]: """Find documents in the index based on a filter query :param filter_query: the DB specific filter query 
to execute :param limit: maximum number of documents to return - :return: a DocArray containing the documents that match the filter query + :return: a DocList containing the documents that match the filter query """ ... @@ -268,7 +268,7 @@ def _filter_batched( self, filter_queries: Any, limit: int, - ) -> Union[List[DocArray], List[List[Dict]]]: + ) -> Union[List[DocList], List[List[Dict]]]: """Find documents in the index based on multiple filter queries. Each query is considered individually, and results are returned per query. @@ -322,7 +322,7 @@ def _text_search_batched( def __getitem__( self, key: Union[str, Sequence[str]] - ) -> Union[TSchema, DocArray[TSchema]]: + ) -> Union[TSchema, DocList[TSchema]]: """Get one or multiple Documents into the index, by `id`. If no document is found, a KeyError is raised. @@ -341,15 +341,15 @@ def __getitem__( raise KeyError(f'No document with id {key} found') # cast output - if isinstance(doc_sequence, DocArray): - out_da: DocArray[TSchema] = doc_sequence + if isinstance(doc_sequence, DocList): + out_docs: DocList[TSchema] = doc_sequence elif isinstance(doc_sequence[0], Dict): - out_da = self._dict_list_to_docarray(doc_sequence) # type: ignore + out_docs = self._dict_list_to_docarray(doc_sequence) # type: ignore else: - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) - out_da = da_cls(doc_sequence) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + out_docs = docs_cls(doc_sequence) - return out_da[0] if return_singleton else out_da + return out_docs[0] if return_singleton else out_docs def __delitem__(self, key: Union[str, Sequence[str]]): """Delete one or multiple Documents from the index, by `id`. @@ -385,9 +385,9 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): :param docs: Documents to index. 
""" - if not isinstance(docs, (BaseDoc, DocArray)): + if not isinstance(docs, (BaseDoc, DocList)): self._logger.warning( - 'Passing a sequence of Documents that is not a DocArray comes at ' + 'Passing a sequence of Documents that is not a DocList comes at ' 'a performance penalty, since compatibility with the schema of Index ' 'needs to be checked for every Document individually.' ) @@ -431,7 +431,7 @@ def find( def find_batched( self, - queries: Union[AnyTensor, DocArray], + queries: Union[AnyTensor, DocList], search_field: str = 'embedding', limit: int = 10, **kwargs, @@ -440,7 +440,7 @@ def find_batched( :param queries: query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, - or a DocArray. + or a DocList. If a tensor-like is passed, it should have shape (batch_size, vector_dim) :param search_field: name of the field to search on. Documents in the index are retrieved based on this similarity @@ -471,12 +471,12 @@ def filter( filter_query: Any, limit: int = 10, **kwargs, - ) -> DocArray: + ) -> DocList: """Find documents in the index based on a filter query :param filter_query: the DB specific filter query to execute :param limit: maximum number of documents to return - :return: a DocArray containing the documents that match the filter query + :return: a DocList containing the documents that match the filter query """ self._logger.debug(f'Executing `filter` for the query {filter_query}') docs = self._filter(filter_query, limit=limit, **kwargs) @@ -491,12 +491,12 @@ def filter_batched( filter_queries: Any, limit: int = 10, **kwargs, - ) -> List[DocArray]: + ) -> List[DocList]: """Find documents in the index based on multiple filter queries. 
:param filter_queries: the DB specific filter query to execute :param limit: maximum number of documents to return - :return: a DocArray containing the documents that match the filter query + :return: a DocList containing the documents that match the filter query """ self._logger.debug( f'Executing `filter_batched` for the queries {filter_queries}' @@ -577,7 +577,7 @@ def text_search_batched( def _get_values_by_column(docs: Sequence[BaseDoc], col_name: str) -> List[Any]: """Get the value of a column of a document. - :param docs: The DocArray to get the values from + :param docs: The DocList to get the values from :param col_name: The name of the column, e.g. 'text' or 'image__tensor' :return: The value of the column of `doc` """ @@ -600,7 +600,7 @@ def _transpose_col_value_dict( """'Transpose' the output of `_get_col_value_dict()`: Yield rows of columns, where each row represent one Document. Since a generator is returned, this process comes at negligible cost. - :param docs: The DocArray to get the values from + :param docs: The DocList to get the values from :return: The `docs` flattened out as rows. Each row is a dictionary mapping from column name to value """ return (dict(zip(col_value_dict, row)) for row in zip(*col_value_dict.values())) @@ -726,8 +726,7 @@ def _create_column_infos(self, schema: Type[BaseDoc]) -> Dict[str, _ColumnInfo]: # Union types are handle in _flatten_schema if issubclass(type_, AnyDocArray): raise ValueError( - 'Indexing field of DocArray type (=subindex)' - 'is not yet supported.' + 'Indexing field of DocList type (=subindex)' 'is not yet supported.' ) else: column_infos[field_name] = self._create_single_column(field_, type_) @@ -764,29 +763,29 @@ def _create_single_column(self, field: 'ModelField', type_: Type) -> _ColumnInfo def _validate_docs( self, docs: Union[BaseDoc, Sequence[BaseDoc]] - ) -> DocArray[BaseDoc]: + ) -> DocList[BaseDoc]: """Validates Document against the schema of the Document Index. 
For validation to pass, the schema of `docs` and the schema of the Document Index need to evaluate to the same flattened columns. If Validation fails, a ValueError is raised. - :param docs: Document to evaluate. If this is a DocArray, validation is + :param docs: Document to evaluate. If this is a DocList, validation is performed using its `doc_type` (parametrization), without having to check ever Document in `docs`. If this check fails, or if `docs` is not a - DocArray, evaluation is performed for every Document in `docs`. - :return: A DocArray containing the Documents in `docs` + DocList, evaluation is performed for every Document in `docs`. + :return: A DocList containing the Documents in `docs` """ if isinstance(docs, BaseDoc): docs = [docs] - if isinstance(docs, DocArray): - # validation shortcut for DocArray; only look at the schema + if isinstance(docs, DocList): + # validation shortcut for DocList; only look at the schema reference_schema_flat = self._flatten_schema( cast(Type[BaseDoc], self._schema) ) reference_names = [name for (name, _, _) in reference_schema_flat] reference_types = [t_ for (_, t_, _) in reference_schema_flat] try: - input_schema_flat = self._flatten_schema(docs.document_type) + input_schema_flat = self._flatten_schema(docs.doc_type) except ValueError: pass else: @@ -814,7 +813,7 @@ def _validate_docs( ' and that the types of your data match the types of the Document Index schema.' 
) - return DocArray[BaseDoc].construct(out_docs) + return DocList[BaseDoc].construct(out_docs) def _to_numpy(self, val: Any, allow_passthrough=False) -> Any: """ @@ -871,9 +870,9 @@ def _convert_dict_to_doc( schema_cls = cast(Type[BaseDoc], schema) return schema_cls(**doc_dict) - def _dict_list_to_docarray(self, dict_list: Sequence[Dict[str, Any]]) -> DocArray: - """Convert a list of docs in dict type to a DocArray of the schema type.""" + def _dict_list_to_docarray(self, dict_list: Sequence[Dict[str, Any]]) -> DocList: + """Convert a list of docs in dict type to a DocList of the schema type.""" doc_list = [self._convert_dict_to_doc(doc_dict, self._schema) for doc_dict in dict_list] # type: ignore - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) - return da_cls(doc_list) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + return docs_cls(doc_list) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index b7555012db8..d0e11e7e959 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -20,7 +20,7 @@ import numpy as np -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.index.abstract import ( BaseDocIndex, _ColumnInfo, @@ -28,7 +28,7 @@ _raise_not_composable, _raise_not_supported, ) -from docarray.proto import DocumentProto +from docarray.proto import DocProto from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils._internal.misc import import_library, is_np_int from docarray.utils.filter import filter_docs @@ -214,7 +214,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: f'args and kwargs not supported for `execute_query` on {type(self)}' ) - ann_docs = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) + ann_docs = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) filter_conditions = [] doc_to_score: Dict[BaseDoc, 
Any] = {} for op, op_kwargs in query: @@ -228,8 +228,8 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: self._logger.debug(f'Executing query {query}') docs_filtered = ann_docs for cond in filter_conditions: - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) - docs_filtered = da_cls(filter_docs(docs_filtered, cond)) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + docs_filtered = docs_cls(filter_docs(docs_filtered, cond)) self._logger.debug(f'{len(docs_filtered)} results found') docs_and_scores = zip( @@ -268,7 +268,7 @@ def _filter( self, filter_query: Any, limit: int, - ) -> DocArray: + ) -> DocList: raise NotImplementedError( f'{type(self)} does not support filter-only queries.' f' To perform post-filtering on a query, use' @@ -279,7 +279,7 @@ def _filter_batched( self, filter_queries: Any, limit: int, - ) -> List[DocArray]: + ) -> List[DocList]: raise NotImplementedError( f'{type(self)} does not support filter-only queries.' 
f' To perform post-filtering on a query, use' @@ -387,23 +387,23 @@ def _get_docs_sqlite_unsorted(self, univ_ids: Sequence[int]): 'SELECT data FROM docs WHERE doc_id IN %s' % sql_id_list, ) rows = self._sqlite_cursor.fetchall() - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) - return da_cls([self._doc_from_bytes(row[0]) for row in rows]) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + return docs_cls([self._doc_from_bytes(row[0]) for row in rows]) - def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocArray[TSchema]: + def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocList[TSchema]: hashed_ids = tuple(self._to_hashed_id(id_) for id_ in doc_ids) docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) - return da_cls(sorted(docs_unsorted, key=lambda doc: doc_ids.index(doc.id))) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + return docs_cls(sorted(docs_unsorted, key=lambda doc: doc_ids.index(doc.id))) - def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocArray: + def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocList: docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) def _in_position(doc): return hashed_ids.index(self._to_hashed_id(doc.id)) - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) - return da_cls(sorted(docs_unsorted, key=_in_position)) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + return docs_cls(sorted(docs_unsorted, key=_in_position)) def _delete_docs_from_sqlite(self, doc_ids: Sequence[Union[str, int]]): ids = tuple( @@ -424,4 +424,4 @@ def _doc_to_bytes(self, doc: BaseDoc) -> bytes: def _doc_from_bytes(self, data: bytes) -> BaseDoc: schema_cls = cast(Type[BaseDoc], self._schema) - return schema_cls.from_protobuf(DocumentProto.FromString(data)) + return 
schema_cls.from_protobuf(DocProto.FromString(data)) diff --git a/docarray/proto/__init__.py b/docarray/proto/__init__.py index 1b04df23fe6..b1a201b6e2f 100644 --- a/docarray/proto/__init__.py +++ b/docarray/proto/__init__.py @@ -12,9 +12,9 @@ if __pb__version__.startswith('4'): from docarray.proto.pb.docarray_pb2 import ( DictOfAnyProto, - DocArrayStackedProto, - DocumentArrayProto, - DocumentProto, + DocListProto, + DocProto, + DocVecProto, ListOfAnyProto, ListOfDocArrayProto, NdArrayProto, @@ -23,9 +23,9 @@ else: from docarray.proto.pb2.docarray_pb2 import ( DictOfAnyProto, - DocArrayStackedProto, - DocumentArrayProto, - DocumentProto, + DocListProto, + DocProto, + DocVecProto, ListOfAnyProto, ListOfDocArrayProto, NdArrayProto, @@ -33,12 +33,12 @@ ) __all__ = [ - 'DocumentArrayProto', - 'DocumentProto', + 'DocListProto', + 'DocProto', 'NdArrayProto', 'NodeProto', - 'DocArrayStackedProto', - 'DocumentArrayProto', + 'DocVecProto', + 'DocListProto', 'ListOfDocArrayProto', 'ListOfAnyProto', 'DictOfAnyProto', diff --git a/docarray/proto/docarray.proto b/docarray/proto/docarray.proto index 2b1d557da52..19a33ccbc22 100644 --- a/docarray/proto/docarray.proto +++ b/docarray/proto/docarray.proto @@ -53,9 +53,9 @@ message NodeProto { // the ndarray of the image/audio/video document NdArrayProto ndarray = 6; // a sub Document - DocumentProto document = 7; + DocProto doc = 7; // a sub DocArray - DocumentArrayProto document_array = 8; + DocListProto doc_array = 8; //any list ListOfAnyProto list = 9; //any set @@ -75,7 +75,7 @@ message NodeProto { /** * Represents a Document */ -message DocumentProto { +message DocProto { map data = 1; @@ -91,18 +91,18 @@ message ListOfAnyProto { repeated NodeProto data = 1; } -message DocumentArrayProto { - repeated DocumentProto docs = 1; // a list of Documents +message DocListProto { + repeated DocProto docs = 1; // a list of Documents } message ListOfDocArrayProto { - repeated DocumentArrayProto data = 1; + repeated DocListProto data = 1; } 
-message DocArrayStackedProto{ +message DocVecProto{ map tensor_columns = 1; // a dict of document columns - map doc_columns = 2; // a dict of tensor columns - map da_columns = 3; // a dict of document array columns + map doc_columns = 2; // a dict of tensor columns + map docs_vec_columns = 3; // a dict of document array columns map any_columns = 4; // a dict of any columns. Used for the rest of the data } \ No newline at end of file diff --git a/docarray/proto/pb/docarray_pb2.py b/docarray/proto/pb/docarray_pb2.py index a830f17ddc4..8ff91a9f5e8 100644 --- a/docarray/proto/pb/docarray_pb2.py +++ b/docarray/proto/pb/docarray_pb2.py @@ -6,7 +6,6 @@ from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -15,25 +14,25 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xc6\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 
\x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"A\n\x13ListOfDocArrayProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocumentArrayProto\"\xed\x04\n\x14\x44ocArrayStackedProto\x12I\n\x0etensor_columns\x18\x01 \x03(\x0b\x32\x31.docarray.DocArrayStackedProto.TensorColumnsEntry\x12\x43\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32..docarray.DocArrayStackedProto.DocColumnsEntry\x12\x41\n\nda_columns\x18\x03 \x03(\x0b\x32-.docarray.DocArrayStackedProto.DaColumnsEntry\x12\x43\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32..docarray.DocArrayStackedProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aQ\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12-\n\x05value\x18\x02 
\x01(\x0b\x32\x1e.docarray.DocArrayStackedProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xb1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12!\n\x03\x64oc\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12+\n\tdoc_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 
\x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xc7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x43\n\x10\x64ocs_vec_columns\x18\x03 \x03(\x0b\x32).docarray.DocVecProto.DocsVecColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aT\n\x13\x44ocsVecColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docarray_pb2', globals()) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None - _DOCUMENTPROTO_DATAENTRY._options = None - _DOCUMENTPROTO_DATAENTRY._serialized_options = b'8\001' + _DOCPROTO_DATAENTRY._options 
= None + _DOCPROTO_DATAENTRY._serialized_options = b'8\001' _DICTOFANYPROTO_DATAENTRY._options = None _DICTOFANYPROTO_DATAENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_TENSORCOLUMNSENTRY._options = None + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DOCCOLUMNSENTRY._options = None + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._options = None + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_ANYCOLUMNSENTRY._options = None + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start=58 _DENSENDARRAYPROTO._serialized_end=123 _NDARRAYPROTO._serialized_start=125 @@ -43,29 +42,29 @@ _GENERICDICTVALUE._serialized_start=322 _GENERICDICTVALUE._serialized_end=381 _NODEPROTO._serialized_start=384 - _NODEPROTO._serialized_end=838 - _DOCUMENTPROTO._serialized_start=841 - _DOCUMENTPROTO._serialized_end=971 - _DOCUMENTPROTO_DATAENTRY._serialized_start=907 - _DOCUMENTPROTO_DATAENTRY._serialized_end=971 - _DICTOFANYPROTO._serialized_start=974 - _DICTOFANYPROTO._serialized_end=1106 - _DICTOFANYPROTO_DATAENTRY._serialized_start=907 - _DICTOFANYPROTO_DATAENTRY._serialized_end=971 - _LISTOFANYPROTO._serialized_start=1108 - _LISTOFANYPROTO._serialized_end=1159 - _DOCUMENTARRAYPROTO._serialized_start=1161 - _DOCUMENTARRAYPROTO._serialized_end=1220 - _LISTOFDOCARRAYPROTO._serialized_start=1222 - 
_LISTOFDOCARRAYPROTO._serialized_end=1287 - _DOCARRAYSTACKEDPROTO._serialized_start=1290 - _DOCARRAYSTACKEDPROTO._serialized_end=1911 - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start=1594 - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end=1670 - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start=1672 - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end=1753 - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_start=1755 - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_end=1834 - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_start=1836 - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_end=1911 + _NODEPROTO._serialized_end=817 + _DOCPROTO._serialized_start=819 + _DOCPROTO._serialized_end=939 + _DOCPROTO_DATAENTRY._serialized_start=875 + _DOCPROTO_DATAENTRY._serialized_end=939 + _DICTOFANYPROTO._serialized_start=942 + _DICTOFANYPROTO._serialized_end=1074 + _DICTOFANYPROTO_DATAENTRY._serialized_start=875 + _DICTOFANYPROTO_DATAENTRY._serialized_end=939 + _LISTOFANYPROTO._serialized_start=1076 + _LISTOFANYPROTO._serialized_end=1127 + _DOCLISTPROTO._serialized_start=1129 + _DOCLISTPROTO._serialized_end=1177 + _LISTOFDOCARRAYPROTO._serialized_start=1179 + _LISTOFDOCARRAYPROTO._serialized_end=1238 + _DOCVECPROTO._serialized_start=1241 + _DOCVECPROTO._serialized_end=1824 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start=1511 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end=1587 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start=1589 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end=1661 + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_start=1663 + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_end=1747 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start=1749 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end=1824 # @@protoc_insertion_point(module_scope) diff --git a/docarray/proto/pb2/docarray_pb2.py b/docarray/proto/pb2/docarray_pb2.py index 0ea41987658..9fbbbadf342 100644 --- a/docarray/proto/pb2/docarray_pb2.py +++ b/docarray/proto/pb2/docarray_pb2.py 
@@ -12,10 +12,11 @@ _sym_db = _symbol_database.Default() + from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xc6\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 
\x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"A\n\x13ListOfDocArrayProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocumentArrayProto\"\xed\x04\n\x14\x44ocArrayStackedProto\x12I\n\x0etensor_columns\x18\x01 \x03(\x0b\x32\x31.docarray.DocArrayStackedProto.TensorColumnsEntry\x12\x43\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32..docarray.DocArrayStackedProto.DocColumnsEntry\x12\x41\n\nda_columns\x18\x03 \x03(\x0b\x32-.docarray.DocArrayStackedProto.DaColumnsEntry\x12\x43\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32..docarray.DocArrayStackedProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aQ\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12-\n\x05value\x18\x02 \x01(\x0b\x32\x1e.docarray.DocArrayStackedProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' + b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 
\x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xb1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12!\n\x03\x64oc\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12+\n\tdoc_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xc7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x43\n\x10\x64ocs_vec_columns\x18\x03 
\x03(\x0b\x32).docarray.DocVecProto.DocsVecColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aT\n\x13\x44ocsVecColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' ) @@ -24,26 +25,22 @@ _KEYVALUEPAIR = DESCRIPTOR.message_types_by_name['KeyValuePair'] _GENERICDICTVALUE = DESCRIPTOR.message_types_by_name['GenericDictValue'] _NODEPROTO = DESCRIPTOR.message_types_by_name['NodeProto'] -_DOCUMENTPROTO = DESCRIPTOR.message_types_by_name['DocumentProto'] -_DOCUMENTPROTO_DATAENTRY = _DOCUMENTPROTO.nested_types_by_name['DataEntry'] +_DOCPROTO = DESCRIPTOR.message_types_by_name['DocProto'] +_DOCPROTO_DATAENTRY = _DOCPROTO.nested_types_by_name['DataEntry'] _DICTOFANYPROTO = DESCRIPTOR.message_types_by_name['DictOfAnyProto'] _DICTOFANYPROTO_DATAENTRY = _DICTOFANYPROTO.nested_types_by_name['DataEntry'] _LISTOFANYPROTO = DESCRIPTOR.message_types_by_name['ListOfAnyProto'] -_DOCUMENTARRAYPROTO = DESCRIPTOR.message_types_by_name['DocumentArrayProto'] +_DOCLISTPROTO = DESCRIPTOR.message_types_by_name['DocListProto'] _LISTOFDOCARRAYPROTO = DESCRIPTOR.message_types_by_name['ListOfDocArrayProto'] -_DOCARRAYSTACKEDPROTO = DESCRIPTOR.message_types_by_name['DocArrayStackedProto'] -_DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ +_DOCVECPROTO = DESCRIPTOR.message_types_by_name['DocVecProto'] +_DOCVECPROTO_TENSORCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name[ 'TensorColumnsEntry' ] 
-_DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ - 'DocColumnsEntry' -] -_DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ - 'DaColumnsEntry' -] -_DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ - 'AnyColumnsEntry' +_DOCVECPROTO_DOCCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name['DocColumnsEntry'] +_DOCVECPROTO_DOCSVECCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name[ + 'DocsVecColumnsEntry' ] +_DOCVECPROTO_ANYCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name['AnyColumnsEntry'] DenseNdArrayProto = _reflection.GeneratedProtocolMessageType( 'DenseNdArrayProto', (_message.Message,), @@ -99,26 +96,26 @@ ) _sym_db.RegisterMessage(NodeProto) -DocumentProto = _reflection.GeneratedProtocolMessageType( - 'DocumentProto', +DocProto = _reflection.GeneratedProtocolMessageType( + 'DocProto', (_message.Message,), { 'DataEntry': _reflection.GeneratedProtocolMessageType( 'DataEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCUMENTPROTO_DATAENTRY, + 'DESCRIPTOR': _DOCPROTO_DATAENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentProto.DataEntry) + # @@protoc_insertion_point(class_scope:docarray.DocProto.DataEntry) }, ), - 'DESCRIPTOR': _DOCUMENTPROTO, + 'DESCRIPTOR': _DOCPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentProto) + # @@protoc_insertion_point(class_scope:docarray.DocProto) }, ) -_sym_db.RegisterMessage(DocumentProto) -_sym_db.RegisterMessage(DocumentProto.DataEntry) +_sym_db.RegisterMessage(DocProto) +_sym_db.RegisterMessage(DocProto.DataEntry) DictOfAnyProto = _reflection.GeneratedProtocolMessageType( 'DictOfAnyProto', @@ -152,16 +149,16 @@ ) _sym_db.RegisterMessage(ListOfAnyProto) -DocumentArrayProto = _reflection.GeneratedProtocolMessageType( - 'DocumentArrayProto', +DocListProto = _reflection.GeneratedProtocolMessageType( + 'DocListProto', (_message.Message,), { - 'DESCRIPTOR': 
_DOCUMENTARRAYPROTO, + 'DESCRIPTOR': _DOCLISTPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayProto) + # @@protoc_insertion_point(class_scope:docarray.DocListProto) }, ) -_sym_db.RegisterMessage(DocumentArrayProto) +_sym_db.RegisterMessage(DocListProto) ListOfDocArrayProto = _reflection.GeneratedProtocolMessageType( 'ListOfDocArrayProto', @@ -174,72 +171,72 @@ ) _sym_db.RegisterMessage(ListOfDocArrayProto) -DocArrayStackedProto = _reflection.GeneratedProtocolMessageType( - 'DocArrayStackedProto', +DocVecProto = _reflection.GeneratedProtocolMessageType( + 'DocVecProto', (_message.Message,), { 'TensorColumnsEntry': _reflection.GeneratedProtocolMessageType( 'TensorColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_TENSORCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.TensorColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.TensorColumnsEntry) }, ), 'DocColumnsEntry': _reflection.GeneratedProtocolMessageType( 'DocColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_DOCCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.DocColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.DocColumnsEntry) }, ), - 'DaColumnsEntry': _reflection.GeneratedProtocolMessageType( - 'DaColumnsEntry', + 'DocsVecColumnsEntry': _reflection.GeneratedProtocolMessageType( + 'DocsVecColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_DOCSVECCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.DaColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.DocsVecColumnsEntry) }, ), 
'AnyColumnsEntry': _reflection.GeneratedProtocolMessageType( 'AnyColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_ANYCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.AnyColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.AnyColumnsEntry) }, ), - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO, + 'DESCRIPTOR': _DOCVECPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto) }, ) -_sym_db.RegisterMessage(DocArrayStackedProto) -_sym_db.RegisterMessage(DocArrayStackedProto.TensorColumnsEntry) -_sym_db.RegisterMessage(DocArrayStackedProto.DocColumnsEntry) -_sym_db.RegisterMessage(DocArrayStackedProto.DaColumnsEntry) -_sym_db.RegisterMessage(DocArrayStackedProto.AnyColumnsEntry) +_sym_db.RegisterMessage(DocVecProto) +_sym_db.RegisterMessage(DocVecProto.TensorColumnsEntry) +_sym_db.RegisterMessage(DocVecProto.DocColumnsEntry) +_sym_db.RegisterMessage(DocVecProto.DocsVecColumnsEntry) +_sym_db.RegisterMessage(DocVecProto.AnyColumnsEntry) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None - _DOCUMENTPROTO_DATAENTRY._options = None - _DOCUMENTPROTO_DATAENTRY._serialized_options = b'8\001' + _DOCPROTO_DATAENTRY._options = None + _DOCPROTO_DATAENTRY._serialized_options = b'8\001' _DICTOFANYPROTO_DATAENTRY._options = None _DICTOFANYPROTO_DATAENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' - 
_DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_TENSORCOLUMNSENTRY._options = None + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DOCCOLUMNSENTRY._options = None + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._options = None + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_ANYCOLUMNSENTRY._options = None + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start = 58 _DENSENDARRAYPROTO._serialized_end = 123 _NDARRAYPROTO._serialized_start = 125 @@ -249,29 +246,29 @@ _GENERICDICTVALUE._serialized_start = 322 _GENERICDICTVALUE._serialized_end = 381 _NODEPROTO._serialized_start = 384 - _NODEPROTO._serialized_end = 838 - _DOCUMENTPROTO._serialized_start = 841 - _DOCUMENTPROTO._serialized_end = 971 - _DOCUMENTPROTO_DATAENTRY._serialized_start = 907 - _DOCUMENTPROTO_DATAENTRY._serialized_end = 971 - _DICTOFANYPROTO._serialized_start = 974 - _DICTOFANYPROTO._serialized_end = 1106 - _DICTOFANYPROTO_DATAENTRY._serialized_start = 907 - _DICTOFANYPROTO_DATAENTRY._serialized_end = 971 - _LISTOFANYPROTO._serialized_start = 1108 - _LISTOFANYPROTO._serialized_end = 1159 - _DOCUMENTARRAYPROTO._serialized_start = 1161 - _DOCUMENTARRAYPROTO._serialized_end = 1220 - _LISTOFDOCARRAYPROTO._serialized_start = 1222 - _LISTOFDOCARRAYPROTO._serialized_end = 1287 - _DOCARRAYSTACKEDPROTO._serialized_start = 1290 - _DOCARRAYSTACKEDPROTO._serialized_end = 1911 - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start = 1594 - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end = 1670 - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start = 1672 - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end = 1753 - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_start = 1755 - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_end = 1834 - 
_DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_start = 1836 - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_end = 1911 + _NODEPROTO._serialized_end = 817 + _DOCPROTO._serialized_start = 819 + _DOCPROTO._serialized_end = 939 + _DOCPROTO_DATAENTRY._serialized_start = 875 + _DOCPROTO_DATAENTRY._serialized_end = 939 + _DICTOFANYPROTO._serialized_start = 942 + _DICTOFANYPROTO._serialized_end = 1074 + _DICTOFANYPROTO_DATAENTRY._serialized_start = 875 + _DICTOFANYPROTO_DATAENTRY._serialized_end = 939 + _LISTOFANYPROTO._serialized_start = 1076 + _LISTOFANYPROTO._serialized_end = 1127 + _DOCLISTPROTO._serialized_start = 1129 + _DOCLISTPROTO._serialized_end = 1177 + _LISTOFDOCARRAYPROTO._serialized_start = 1179 + _LISTOFDOCARRAYPROTO._serialized_end = 1238 + _DOCVECPROTO._serialized_start = 1241 + _DOCVECPROTO._serialized_end = 1824 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start = 1511 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end = 1587 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start = 1589 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end = 1661 + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_start = 1663 + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_end = 1747 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start = 1749 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end = 1824 # @@protoc_insertion_point(module_scope) diff --git a/docarray/store/abstract_doc_store.py b/docarray/store/abstract_doc_store.py index c5c152499c2..16c17227a64 100644 --- a/docarray/store/abstract_doc_store.py +++ b/docarray/store/abstract_doc_store.py @@ -4,7 +4,7 @@ from typing_extensions import TYPE_CHECKING if TYPE_CHECKING: - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList class AbstractDocStore(ABC): @@ -15,37 +15,37 @@ def list(namespace: str, show_table: bool) -> List[str]: :param namespace: The namespace to list :param show_table: If true, a table is printed to the console - :return: A list of DocArray names + :return: A list of DocList names """ ... 
@staticmethod @abstractmethod def delete(name: str, missing_ok: bool) -> bool: - """Delete the DocArray object at the specified name + """Delete the DocList object at the specified name - :param name: The name of the DocArray to delete - :param missing_ok: If true, no error will be raised if the DocArray does not exist. - :return: True if the DocArray was deleted, False if it did not exist. + :param name: The name of the DocList to delete + :param missing_ok: If true, no error will be raised if the DocList does not exist. + :return: True if the DocList was deleted, False if it did not exist. """ ... @staticmethod @abstractmethod def push( - da: 'DocArray', + docs: 'DocList', name: str, public: bool, show_progress: bool, branding: Optional[Dict], ) -> Dict: - """Push this DocArray to the specified name. + """Push this DocList to the specified name. - :param da: The DocArray to push + :param docs: The DocList to push :param name: The name to push to - :param public: Whether the DocArray should be publicly accessible + :param public: Whether the DocList should be publicly accessible :param show_progress: If true, a progress bar will be displayed. - :param branding: Branding information to be stored with the DocArray + :param branding: Branding information to be stored with the DocList """ ... @@ -62,43 +62,43 @@ def push_stream( :param docs: a stream of documents :param url: The name to push to - :param public: Whether the DocArray should be publicly accessible + :param public: Whether the DocList should be publicly accessible :param show_progress: If true, a progress bar will be displayed. - :param branding: Branding information to be stored with the DocArray + :param branding: Branding information to be stored with the DocList """ ... @staticmethod @abstractmethod def pull( - da_cls: Type['DocArray'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, - ) -> 'DocArray': - """Pull a DocArray from the specified name. 
+ ) -> 'DocList': + """Pull a DocList from the specified name. - :param da_cls: The DocArray class to instantiate + :param docs_cls: The DocList class to instantiate :param name: The name to pull from :param show_progress: If true, a progress bar will be displayed. - :param local_cache: If true, the DocArray will be cached locally - :return: A DocArray + :param local_cache: If true, the DocList will be cached locally + :return: A DocList """ ... @staticmethod @abstractmethod def pull_stream( - da_cls: Type['DocArray'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, ) -> Iterator['BaseDoc']: """Pull a stream of documents from the specified name. - :param da_cls: The DocArray class to instantiate + :param docs_cls: The DocList class to instantiate :param name: The name to pull from :param show_progress: If true, a progress bar will be displayed. - :param local_cache: If true, the DocArray will be cached locally + :param local_cache: If true, the DocList will be cached locally :return: An iterator of documents""" ... diff --git a/docarray/store/file.py b/docarray/store/file.py index bb79162109b..b649864478a 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -10,7 +10,7 @@ from docarray.utils._internal.cache import _get_cache_path if TYPE_CHECKING: - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList SelfFileDocStore = TypeVar('SelfFileDocStore', bound='FileDocStore') @@ -41,7 +41,7 @@ def list( namespace_dir = cls._abs_filepath(namespace) if not namespace_dir.exists(): raise FileNotFoundError(f'Directory {namespace} does not exist') - da_files = [dafile for dafile in namespace_dir.glob('*.da')] + da_files = [dafile for dafile in namespace_dir.glob('*.docs')] if show_table: from datetime import datetime @@ -74,15 +74,15 @@ def list( def delete( cls: Type[SelfFileDocStore], name: str, missing_ok: bool = False ) -> bool: - """Delete a DocArray from the local filesystem. 
+ """Delete a DocList from the local filesystem. - :param name: The name of the DocArray to delete. + :param name: The name of the DocList to delete. :param missing_ok: If True, do not raise an exception if the file does not exist. Defaults to False. :return: True if the file was deleted, False if it did not exist. """ path = cls._abs_filepath(name) try: - path.with_suffix('.da').unlink() + path.with_suffix('.docs').unlink() return True except FileNotFoundError: if not missing_ok: @@ -92,20 +92,20 @@ def delete( @classmethod def push( cls: Type[SelfFileDocStore], - da: 'DocArray', + docs: 'DocList', name: str, public: bool, show_progress: bool, branding: Optional[Dict], ) -> Dict: - """Push this DocArray object to the specified file path. + """Push this DocList object to the specified file path. :param name: The file path to push to. :param public: Not used by the ``file`` protocol. :param show_progress: If true, a progress bar will be displayed. :param branding: Not used by the ``file`` protocol. """ - return cls.push_stream(iter(da), name, public, show_progress, branding) + return cls.push_stream(iter(docs), name, public, show_progress, branding) @classmethod def push_stream( @@ -130,7 +130,7 @@ def push_stream( source = _to_binary_stream( docs, protocol='protobuf', compress='gzip', show_progress=show_progress ) - path = cls._abs_filepath(name).with_suffix('.da.tmp') + path = cls._abs_filepath(name).with_suffix('.docs.tmp') if path.exists(): raise ConcurrentPushException(f'File {path} already exists.') with open(path, 'wb') as f: @@ -145,29 +145,29 @@ def push_stream( @classmethod def pull( cls: Type[SelfFileDocStore], - da_cls: Type['DocArray'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, - ) -> 'DocArray': - """Pull a :class:`DocArray` from the specified url. + ) -> 'DocList': + """Pull a :class:`DocList` from the specified url. :param name: The file path to pull from. :param show_progress: if true, display a progress bar. 
- :param local_cache: store the downloaded DocArray to local folder - :return: a :class:`DocArray` object + :param local_cache: store the downloaded DocList to local folder + :return: a :class:`DocList` object """ - return da_cls( + return docs_cls( cls.pull_stream( - da_cls, name, show_progress=show_progress, local_cache=local_cache + docs_cls, name, show_progress=show_progress, local_cache=local_cache ) ) @classmethod def pull_stream( cls: Type[SelfFileDocStore], - da_cls: Type['DocArray'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, @@ -183,10 +183,10 @@ def pull_stream( if local_cache: logging.warning('local_cache is not supported for "file" protocol') - path = cls._abs_filepath(name).with_suffix('.da') + path = cls._abs_filepath(name).with_suffix('.docs') source = open(path, 'rb') return _from_binary_stream( - da_cls.document_type, + docs_cls.doc_type, source, protocol='protobuf', compress='gzip', diff --git a/docarray/store/jac.py b/docarray/store/jac.py index b2b2564a91e..7838e3c26c8 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: # pragma: no cover import io - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList if TYPE_CHECKING: import hubble @@ -46,17 +46,17 @@ def _get_length_from_summary(summary: List[Dict]) -> Optional[int]: raise ValueError('Length not found in summary') -def _get_raw_summary(self: 'DocArray') -> List[Dict[str, Any]]: +def _get_raw_summary(self: 'DocList') -> List[Dict[str, Any]]: items: List[Dict[str, Any]] = [ dict( name='Type', value=self.__class__.__name__, - description='The type of the DocArray', + description='The type of the DocList', ), dict( name='Length', value=len(self), - description='The length of the DocArray', + description='The length of the DocList', ), dict( name='Homogenous Documents', @@ -82,7 +82,7 @@ def _get_raw_summary(self: 'DocArray') -> List[Dict[str, Any]]: class JACDocStore(AbstractDocStore): - 
"""Class to push and pull DocArray to and from Jina AI Cloud.""" + """Class to push and pull DocList to and from Jina AI Cloud.""" @staticmethod @hubble.login_required @@ -91,7 +91,7 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: :param namespace: Not supported for Jina AI Cloud. :param show_table: if true, show the table of the arrays. - :returns: List of available DocArray's names. + :returns: List of available DocList's names. """ if len(namespace) > 0: logging.warning('Namespace is not supported for Jina AI Cloud.') @@ -102,11 +102,11 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: from rich.table import Table resp = HubbleClient(jsonify=True).list_artifacts( - filter={'type': 'DocArray'}, sort={'createdAt': 1} + filter={'type': 'DocumentArray'}, sort={'createdAt': 1} ) table = Table( - title=f'You have {resp["meta"]["total"]} DocArray on the cloud', + title=f'You have {resp["meta"]["total"]} DocList on the cloud', box=box.SIMPLE, highlight=True, ) @@ -116,15 +116,15 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: table.add_column('Created at', justify='center') table.add_column('Updated at', justify='center') - for da in resp['data']: - result.append(da['name']) + for docs in resp['data']: + result.append(docs['name']) table.add_row( - da['name'], - str(_get_length_from_summary(da['metaData'].get('summary', []))), - da['visibility'], - da['createdAt'], - da['updatedAt'], + docs['name'], + str(_get_length_from_summary(docs['metaData'].get('summary', []))), + docs['visibility'], + docs['createdAt'], + docs['updatedAt'], ) if show_table: @@ -135,10 +135,10 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: @hubble.login_required def delete(name: str, missing_ok: bool = True) -> bool: """ - Delete a DocArray from the cloud. - :param name: the name of the DocArray to delete. - :param missing_ok: if true, do not raise an error if the DocArray does not exist. 
- :return: True if the DocArray was deleted, False if it did not exist. + Delete a DocList from the cloud. + :param name: the name of the DocList to delete. + :param missing_ok: if true, do not raise an error if the DocList does not exist. + :return: True if the DocList was deleted, False if it did not exist. """ try: HubbleClient(jsonify=True).delete_artifact(name=name) @@ -152,13 +152,13 @@ def delete(name: str, missing_ok: bool = True) -> bool: @staticmethod @hubble.login_required def push( - da: 'DocArray', + docs: 'DocList', name: str, public: bool = True, show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocArray object to Jina AI Cloud + """Push this DocList object to Jina AI Cloud .. note:: - Push with the same ``name`` will override the existing content. @@ -167,8 +167,8 @@ def push( - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocArray`. - :param public: By default, anyone can pull a DocArray if they know its name. + :param name: A name that can later be used to retrieve this :class:`DocList`. + :param public: By default, anyone can pull a DocList if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. 
{"icon": "emoji", "background": "#fff"} @@ -181,15 +181,15 @@ def push( data, ctype = urllib3.filepost.encode_multipart_formdata( { 'file': ( - 'DocArray', + 'DocumentArray', delimiter, ), 'name': name, - 'type': 'DocArray', + 'type': 'DocumentArray', 'public': public, 'metaData': json.dumps( { - 'summary': _get_raw_summary(da), + 'summary': _get_raw_summary(docs), 'branding': branding, 'version': get_version_info(), }, @@ -210,7 +210,7 @@ def push( def gen(): yield _head - binary_stream = da.to_binary_stream( + binary_stream = docs.to_binary_stream( protocol='protobuf', compress='gzip', show_progress=show_progress ) while True: @@ -252,58 +252,58 @@ def push_stream( - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocArray`. - :param public: By default, anyone can pull a DocArray if they know its name. + :param name: A name that can later be used to retrieve this :class:`DocList`. + :param public: By default, anyone can pull a DocList if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. 
{"icon": "emoji", "background": "#fff"} """ - from docarray import DocArray + from docarray import DocList # This is a temporary solution to push a stream of documents # The memory footprint is not ideal - # But it must be done this way for now because Hubble expects to know the length of the DocArray + # But it must be done this way for now because Hubble expects to know the length of the DocList # before it starts receiving the documents first_doc = next(docs) - da = DocArray[first_doc.__class__]([first_doc]) # type: ignore + docs = DocList[first_doc.__class__]([first_doc]) # type: ignore for doc in docs: - da.append(doc) - return cls.push(da, name, public, show_progress, branding) + docs.append(doc) + return cls.push(docs, name, public, show_progress, branding) @staticmethod @hubble.login_required def pull( - cls: Type['DocArray'], + cls: Type['DocList'], name: str, show_progress: bool = False, local_cache: bool = True, - ) -> 'DocArray': - """Pull a :class:`DocArray` from Jina AI Cloud to local. + ) -> 'DocList': + """Pull a :class:`DocList` from Jina AI Cloud to local. :param name: the upload name set during :meth:`.push` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local folder - :return: a :class:`DocArray` object + :param local_cache: store the downloaded DocList to local folder + :return: a :class:`DocList` object """ - from docarray import DocArray + from docarray import DocList - return DocArray[cls.document_type]( # type: ignore + return DocList[cls.doc_type]( # type: ignore JACDocStore.pull_stream(cls, name, show_progress, local_cache) ) @staticmethod @hubble.login_required def pull_stream( - cls: Type['DocArray'], + cls: Type['DocList'], name: str, show_progress: bool = False, local_cache: bool = False, ) -> Iterator['BaseDoc']: - """Pull a :class:`DocArray` from Jina AI Cloud to local. + """Pull a :class:`DocList` from Jina AI Cloud to local. 
:param name: the upload name set during :meth:`.push` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local folder + :param local_cache: store the downloaded DocList to local folder :return: An iterator of Documents """ import requests @@ -332,12 +332,12 @@ def pull_stream( r.raise_for_status() save_name = name.replace('/', '_') - tmp_cache_file = Path(f'/tmp/{save_name}.da') + tmp_cache_file = Path(f'/tmp/{save_name}.docs') _source: Union[ _BufferedCachingRequestReader, io.BufferedReader ] = _BufferedCachingRequestReader(r, tmp_cache_file) - cache_file = _get_cache_path() / f'{save_name}.da' + cache_file = _get_cache_path() / f'{save_name}.docs' if local_cache and cache_file.exists(): _cache_len = cache_file.stat().st_size if _cache_len == int(r.headers['Content-length']): diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 23534d556fd..936a261396f 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -9,10 +9,11 @@ from docarray.utils._internal.misc import import_library if TYPE_CHECKING: # pragma: no cover - from docarray import BaseDoc, DocArray import boto3 import botocore from smart_open import open + + from docarray import BaseDoc, DocList else: open = import_library('smart_open', raise_error=True).open boto3 = import_library('boto3', raise_error=True) @@ -42,12 +43,12 @@ def read(self, size: Optional[int] = -1) -> bytes: def close(self): if not self.closed and self._cache: - self._cache_path.rename(self._cache_path.with_suffix('.da')) + self._cache_path.rename(self._cache_path.with_suffix('.docs')) self._cache.close() class S3DocStore(AbstractDocStore): - """Class to push and pull DocArray to and from S3.""" + """Class to push and pull DocList to and from S3.""" @staticmethod def list(namespace: str, show_table: bool = False) -> List[str]: @@ -55,7 +56,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: :param namespace: The bucket and namespace to list. 
e.g. my_bucket/my_namespace :param show_table: If true, a rich table will be printed to the console. - :return: A list of DocArray names. + :return: A list of DocList names. """ bucket, namespace = namespace.split('/', 1) s3 = boto3.resource('s3') @@ -63,7 +64,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: da_files = [ obj for obj in s3_bucket.objects.all() - if obj.key.startswith(namespace) and obj.key.endswith('.da') + if obj.key.startswith(namespace) and obj.key.endswith('.docs') ] da_names = [f.key.split('/')[-1].split('.')[0] for f in da_files] @@ -93,7 +94,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: @staticmethod def delete(name: str, missing_ok: bool = True) -> bool: - """Delete the DocArray object at the specified bucket and key. + """Delete the DocList object at the specified bucket and key. :param name: The bucket and key to delete. e.g. my_bucket/my_key :param missing_ok: If true, no error will be raised if the object does not exist. @@ -101,7 +102,7 @@ def delete(name: str, missing_ok: bool = True) -> bool: """ bucket, name = name.split('/', 1) s3 = boto3.resource('s3') - object = s3.Object(bucket, name + '.da') + object = s3.Object(bucket, name + '.docs') try: object.load() except botocore.exceptions.ClientError as e: @@ -118,21 +119,21 @@ def delete(name: str, missing_ok: bool = True) -> bool: @classmethod def push( cls: Type[SelfS3DocStore], - da: 'DocArray', + docs: 'DocList', name: str, public: bool = False, show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocArray object to the specified bucket and key. + """Push this DocList object to the specified bucket and key. - :param da: The DocArray to push. + :param docs: The DocList to push. :param name: The bucket and key to push to. e.g. my_bucket/my_key :param public: Not used by the ``s3`` protocol. :param show_progress: If true, a progress bar will be displayed. :param branding: Not used by the ``s3`` protocol. 
""" - return cls.push_stream(iter(da), name, public, show_progress, branding) + return cls.push_stream(iter(docs), name, public, show_progress, branding) @staticmethod def push_stream( @@ -160,7 +161,7 @@ def push_stream( # Upload to S3 with open( - f"s3://{bucket}/{name}.da", + f"s3://{bucket}/{name}.docs", 'wb', compression='.gz', transport_params={'multipart_upload': False}, @@ -176,29 +177,29 @@ def push_stream( @classmethod def pull( cls: Type[SelfS3DocStore], - da_cls: Type['DocArray'], + docs_cls: Type['DocList'], name: str, show_progress: bool = False, local_cache: bool = False, - ) -> 'DocArray': - """Pull a :class:`DocArray` from the specified bucket and key. + ) -> 'DocList': + """Pull a :class:`DocList` from the specified bucket and key. :param name: The bucket and key to pull from. e.g. my_bucket/my_key :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local cache - :return: a :class:`DocArray` object + :param local_cache: store the downloaded DocList to local cache + :return: a :class:`DocList` object """ - da = da_cls( # type: ignore + docs = docs_cls( # type: ignore cls.pull_stream( - da_cls, name, show_progress=show_progress, local_cache=local_cache + docs_cls, name, show_progress=show_progress, local_cache=local_cache ) ) - return da + return docs @classmethod def pull_stream( cls: Type[SelfS3DocStore], - da_cls: Type['DocArray'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, @@ -208,24 +209,24 @@ def pull_stream( :param name: The bucket and key to pull from. e.g. my_bucket/my_key :param show_progress: if true, display a progress bar. 
- :param local_cache: store the downloaded DocArray to local cache + :param local_cache: store the downloaded DocList to local cache :return: An iterator of Documents """ bucket, name = name.split('/', 1) save_name = name.replace('/', '_') - cache_path = _get_cache_path() / f'{save_name}.da' + cache_path = _get_cache_path() / f'{save_name}.docs' source = _BufferedCachingReader( - open(f"s3://{bucket}/{name}.da", 'rb', compression='.gz'), + open(f"s3://{bucket}/{name}.docs", 'rb', compression='.gz'), cache_path=cache_path if local_cache else None, ) if local_cache: if cache_path.exists(): object_header = boto3.client('s3').head_object( - Bucket=bucket, Key=name + '.da' + Bucket=bucket, Key=name + '.docs' ) if cache_path.stat().st_size == object_header['ContentLength']: logging.info( @@ -234,7 +235,7 @@ def pull_stream( source = open(cache_path, 'rb') return _from_binary_stream( - da_cls.document_type, + docs_cls.doc_type, source, protocol='pickle', compress=None, diff --git a/docarray/typing/tensor/abstract_tensor.py b/docarray/typing/tensor/abstract_tensor.py index f9814b429e4..b74cc06697f 100644 --- a/docarray/typing/tensor/abstract_tensor.py +++ b/docarray/typing/tensor/abstract_tensor.py @@ -265,7 +265,7 @@ def _docarray_stack(cls: Type[T], seq: Union[List[T], Tuple[T]]) -> T: @abc.abstractmethod def _docarray_from_native(cls: Type[T], value: Any) -> T: """ - Create a DocArray tensor from a tensor that is native to the given framework, + Create a DocList tensor from a tensor that is native to the given framework, e.g. from numpy.ndarray or torch.Tensor. """ ... @@ -293,11 +293,11 @@ def __iter__(self): @abc.abstractmethod def to_protobuf(self) -> 'NdArrayProto': - """Convert DocArray into a Protobuf message""" + """Convert DocList into a Protobuf message""" ... 
def unwrap(self): - """Return the native tensor object that this DocArray tensor wraps.""" + """Return the native tensor object that this DocList tensor wraps.""" @abc.abstractmethod def _docarray_to_json_compatible(self): diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index 773cbbe815d..5b7daa1e6f2 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -3,8 +3,8 @@ import json from typing import Dict, List, Union -from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocArray +from docarray.array.any_array import AnyDocArray +from docarray.array.doc_list.doc_list import DocList def filter_docs( @@ -19,7 +19,7 @@ def filter_docs( --- ```python - from docarray import DocArray, BaseDoc + from docarray import DocList, BaseDoc from docarray.documents import TextDoc, ImageDoc from docarray.utils.filter import filter_docs @@ -30,7 +30,7 @@ class MyDocument(BaseDoc): price: int - docs = DocArray[MyDocument]( + docs = DocList[MyDocument]( [ MyDocument( caption='A tiger in the jungle', @@ -65,9 +65,9 @@ class MyDocument(BaseDoc): --- - :param docs: the DocArray where to apply the filter + :param docs: the DocList where to apply the filter :param query: the query to filter by - :return: A DocArray containing the Documents + :return: A DocList containing the Documents in `docs` that fulfill the filter conditions in the `query` """ from docarray.utils._internal.query_language.query_parser import QueryParser @@ -75,7 +75,7 @@ class MyDocument(BaseDoc): if query: query = query if not isinstance(query, str) else json.loads(query) parser = QueryParser(query) - return DocArray.__class_getitem__(docs.document_type)( + return DocList.__class_getitem__(docs.doc_type)( d for d in docs if parser.evaluate(d) ) else: diff --git a/docarray/utils/find.py b/docarray/utils/find.py index a626134d1b6..405f3e75f15 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -4,9 +4,9 @@ from typing_inspect import 
is_union_type -from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocArray -from docarray.array.stacked.array_stacked import DocArrayStacked +from docarray.array.any_array import AnyDocArray +from docarray.array.doc_list.doc_list import DocList +from docarray.array.doc_vec.doc_vec import DocVec from docarray.base_doc import BaseDoc from docarray.helper import _get_field_type_by_access_path from docarray.typing import AnyTensor @@ -14,12 +14,12 @@ class FindResult(NamedTuple): - documents: DocArray + documents: DocList scores: AnyTensor class _FindResult(NamedTuple): - documents: Union[DocArray, List[Dict[str, Any]]] + documents: Union[DocList, List[Dict[str, Any]]] scores: AnyTensor @@ -44,7 +44,7 @@ def find( --- ```python - from docarray import DocArray, BaseDoc + from docarray import DocList, BaseDoc from docarray.typing import TorchTensor from docarray.utils.find import find import torch @@ -54,9 +54,7 @@ class MyDocument(BaseDoc): embedding: TorchTensor - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) + index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)]) # use Document as query query = MyDocument(embedding=torch.rand(128)) @@ -92,7 +90,7 @@ class MyDocument(BaseDoc): can be either `cpu` or a `cuda` device. :param descending: sort the results in descending order. Per default, this is chosen based on the `metric` argument. - :return: A named tuple of the form (DocArray, AnyTensor), + :return: A named tuple of the form (DocList, AnyTensor), where the first element contains the closes matches for the query, and the second element contains the corresponding scores. 
""" @@ -110,7 +108,7 @@ class MyDocument(BaseDoc): def find_batched( index: AnyDocArray, - query: Union[AnyTensor, DocArray], + query: Union[AnyTensor, DocList], embedding_field: str = 'embedding', metric: str = 'cosine_sim', limit: int = 10, @@ -130,7 +128,7 @@ def find_batched( --- ```python - from docarray import DocArray, BaseDoc + from docarray import DocList, BaseDoc from docarray.typing import TorchTensor from docarray.utils.find import find_batched import torch @@ -140,12 +138,10 @@ class MyDocument(BaseDoc): embedding: TorchTensor - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) + index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)]) - # use DocArray as query - query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) + # use DocList as query + query = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) results = find_batched( index=index, query=query, @@ -180,7 +176,7 @@ class MyDocument(BaseDoc): can be either `cpu` or a `cuda` device. :param descending: sort the results in descending order. Per default, this is chosen based on the `metric` argument. - :return: a list of named tuples of the form (DocArray, AnyTensor), + :return: a list of named tuples of the form (DocList, AnyTensor), where the first element contains the closes matches for each query, and the second element contains the corresponding scores. 
""" @@ -203,16 +199,16 @@ class MyDocument(BaseDoc): results = [] for indices_per_query, scores_per_query in zip(top_indices, top_scores): - docs_per_query: DocArray = DocArray([]) + docs_per_query: DocList = DocList([]) for idx in indices_per_query: # workaround until #930 is fixed docs_per_query.append(index[idx]) - docs_per_query = DocArray(docs_per_query) + docs_per_query = DocList(docs_per_query) results.append(FindResult(scores=scores_per_query, documents=docs_per_query)) return results def _extract_embedding_single( - data: Union[DocArray, BaseDoc, AnyTensor], + data: Union[DocList, BaseDoc, AnyTensor], embedding_field: str, ) -> AnyTensor: """Extract the embeddings from a single query, @@ -247,10 +243,10 @@ def _extract_embeddings( :return: the embeddings """ emb: AnyTensor - if isinstance(data, DocArray): + if isinstance(data, DocList): emb_list = list(AnyDocArray._traverse(data, embedding_field)) emb = embedding_type._docarray_stack(emb_list) - elif isinstance(data, (DocArrayStacked, BaseDoc)): + elif isinstance(data, (DocVec, BaseDoc)): emb = next(AnyDocArray._traverse(data, embedding_field)) else: # treat data as tensor emb = cast(AnyTensor, data) @@ -260,23 +256,23 @@ def _extract_embeddings( return emb -def _da_attr_type(da: AnyDocArray, access_path: str) -> Type[AnyTensor]: +def _da_attr_type(docs: AnyDocArray, access_path: str) -> Type[AnyTensor]: """Get the type of the attribute according to the Document type - (schema) of the DocArray. + (schema) of the DocList. 
- :param da: the DocArray + :param docs: the DocList :param access_path: the "__"-separated access path :return: the type of the attribute """ field_type: Optional[Type] = _get_field_type_by_access_path( - da.document_type, access_path + docs.doc_type, access_path ) if field_type is None: raise ValueError(f"Access path is not valid: {access_path}") if is_union_type(field_type): # determine type based on the fist element - field_type = type(next(AnyDocArray._traverse(da[0], access_path))) + field_type = type(next(AnyDocArray._traverse(docs[0], access_path))) if not issubclass(field_type, AbstractTensor): raise ValueError( diff --git a/docarray/utils/map.py b/docarray/utils/map.py index 31e93bc2175..09c7b1ae2ef 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -7,7 +7,7 @@ from rich.progress import track from docarray import BaseDoc -from docarray.array.abstract_array import AnyDocArray +from docarray.array.any_array import AnyDocArray from docarray.helper import _is_lambda_or_partial_or_local_function T = TypeVar('T', bound=AnyDocArray) @@ -15,7 +15,7 @@ def map_docs( - da: T, + docs: T, func: Callable[[T_doc], T_doc], backend: str = 'thread', num_worker: Optional[int] = None, @@ -23,13 +23,13 @@ def map_docs( show_progress: bool = False, ) -> Generator[T_doc, None, None]: """ - Return an iterator that applies `func` to every Document in `da` in parallel, + Return an iterator that applies `func` to every Document in `docs` in parallel, yielding the results. 
--- ```python - from docarray import DocArray + from docarray import DocList from docarray.documents import ImageDoc from docarray.utils.map import map_docs @@ -44,19 +44,19 @@ def load_url_to_tensor(img: ImageDoc) -> ImageDoc: 'Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg' ) - da = DocArray[ImageDoc]([ImageDoc(url=url) for _ in range(100)]) - da = DocArray[ImageDoc]( - list(map_docs(da, load_url_to_tensor, backend='thread')) + docs = DocList[ImageDoc]([ImageDoc(url=url) for _ in range(100)]) + docs = DocList[ImageDoc]( + list(map_docs(docs, load_url_to_tensor, backend='thread')) ) # threading is usually a good option for IO-bound tasks such as loading an # ImageDoc from url - for doc in da: + for doc in docs: assert doc.tensor is not None ``` --- - :param da: DocArray to apply function to + :param docs: DocList to apply function to :param func: a function that takes a :class:`BaseDoc` as input and outputs a :class:`BaseDoc`. :param backend: `thread` for multithreading and `process` for multiprocessing. 
@@ -98,13 +98,13 @@ def load_url_to_tensor(img: ImageDoc) -> ImageDoc: context_pool = p with context_pool: - imap = p.imap(func, da) - for x in track(imap, total=len(da), disable=not show_progress): + imap = p.imap(func, docs) + for x in track(imap, total=len(docs), disable=not show_progress): yield x def map_docs_batched( - da: T, + docs: T, func: Callable[[T], Union[T, T_doc]], batch_size: int, backend: str = 'thread', @@ -121,7 +121,7 @@ def map_docs_batched( --- ```python - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList from docarray.utils.map import map_docs_batched @@ -129,19 +129,19 @@ class MyDoc(BaseDoc): name: str - def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: - da.name = [n.upper() for n in da.name] - return da + def upper_case_name(docs: DocList[MyDoc]) -> DocList[MyDoc]: + docs.name = [n.upper() for n in docs.name] + return docs batch_size = 16 - da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) - it = map_docs_batched(da, upper_case_name, batch_size=batch_size) + docs = DocList[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) + it = map_docs_batched(docs, upper_case_name, batch_size=batch_size) for i, d in enumerate(it): - da[i * batch_size : (i + 1) * batch_size] = d + docs[i * batch_size : (i + 1) * batch_size] = d - assert len(da) == 100 - print(da.name[:3]) + assert len(docs) == 100 + print(docs.name[:3]) ``` --- @@ -152,7 +152,7 @@ def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: --- - :param da: DocArray to apply function to + :param docs: DocList to apply function to :param batch_size: Size of each generated batch (except the last one, which might be smaller). :param shuffle: If set, shuffle the Documents before dividing into minibatches. @@ -180,7 +180,7 @@ def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: :param pool: use an existing/external pool. If given, `backend` is ignored and you will be responsible for closing the pool. 
- :return: yield DocArrays returned from `func` + :return: yield DocLists returned from `func` """ if backend == 'process' and _is_lambda_or_partial_or_local_function(func): raise ValueError( @@ -196,9 +196,9 @@ def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: context_pool = p with context_pool: - imap = p.imap(func, da._batch(batch_size=batch_size, shuffle=shuffle)) + imap = p.imap(func, docs._batch(batch_size=batch_size, shuffle=shuffle)) for x in track( - imap, total=ceil(len(da) / batch_size), disable=not show_progress + imap, total=ceil(len(docs) / batch_size), disable=not show_progress ): yield x diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index abf677b7cc9..f60ad0a1671 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -2,28 +2,28 @@ from typing import Dict, List, Optional -from docarray import DocArray +from docarray import DocList def reduce( - left: DocArray, right: DocArray, left_id_map: Optional[Dict] = None -) -> 'DocArray': + left: DocList, right: DocList, left_id_map: Optional[Dict] = None +) -> 'DocList': """ - Reduces left and right DocArray into one DocArray in-place. - Changes are applied to the left DocArray. - Reducing 2 DocArrays consists in adding Documents in the second DocArray - to the first DocArray if they do not exist. - If a Document exists in both DocArrays (identified by ID), + Reduces left and right DocList into one DocList in-place. + Changes are applied to the left DocList. + Reducing 2 DocLists consists in adding Documents in the second DocList + to the first DocList if they do not exist. + If a Document exists in both DocLists (identified by ID), the data properties are merged with priority to the left Document. - Nested DocArrays are also reduced in the same way. - :param left: First DocArray to be reduced. Changes will be applied to it + Nested DocLists are also reduced in the same way. + :param left: First DocList to be reduced. 
Changes will be applied to it in-place - :param right: Second DocArray to be reduced + :param right: Second DocList to be reduced :param left_id_map: Optional parameter to be passed in repeated calls for optimizations, keeping a map of the Document ID to its offset - in the DocArray - :return: Reduced DocArray + in the DocList + :return: Reduced DocList """ left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)} @@ -36,35 +36,35 @@ def reduce( return left -def reduce_all(docarrays: List[DocArray]) -> DocArray: +def reduce_all(docarrays: List[DocList]) -> DocList: """ - Reduces a list of DocArrays into one DocArray. - Changes are applied to the first DocArray in-place. + Reduces a list of DocLists into one DocList. + Changes are applied to the first DocList in-place. - The resulting DocArray contains Documents of all DocArrays. - If a Document exists (identified by their ID) in many DocArrays, + The resulting DocList contains Documents of all DocLists. + If a Document exists (identified by their ID) in many DocLists, data properties are merged with priority to the left-most - DocArrays (that is, if a data attribute is set in a Document - belonging to many DocArrays, the attribute value of the left-most - DocArray is kept). - Nested DocArrays belonging to many DocArrays + DocLists (that is, if a data attribute is set in a Document + belonging to many DocLists, the attribute value of the left-most + DocList is kept). + Nested DocLists belonging to many DocLists are also reduced in the same way. .. note:: - - Nested DocArrays order does not follow any specific rule. + - Nested DocLists order does not follow any specific rule. You might want to re-sort them in a later step. - - The final result depends on the order of DocArrays + - The final result depends on the order of DocLists when applying reduction. 
- :param docarrays: List of DocArrays to be reduced - :return: the resulting DocArray + :param docarrays: List of DocLists to be reduced + :return: the resulting DocList """ if len(docarrays) <= 1: raise Exception( - 'In order to reduce DocArrays' ' we should have more than one DocArray' + 'In order to reduce DocLists' ' we should have more than one DocList' ) left = docarrays[0] others = docarrays[1:] left_id_map = {doc.id: i for i, doc in enumerate(left)} - for da in others: - reduce(left, da, left_id_map) + for docs in others: + reduce(left, docs, left_id_map) return left diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index d44a4913864..21a206a9537 100644 --- a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -1,3 +1,3 @@ -# DocArray +# DocList -::: docarray.array.array.array.DocArray +::: docarray.array.doc_list.doc_list.DocList diff --git a/docs/api_references/array/da_stack.md b/docs/api_references/array/da_stack.md index 7f5f9e51a86..c0709f2e084 100644 --- a/docs/api_references/array/da_stack.md +++ b/docs/api_references/array/da_stack.md @@ -1,3 +1,3 @@ -# DocArrayStacked +# DocVec -::: docarray.array.array.array.DocArrayStacked +::: docarray.array.doc_vec.doc_vec.DocVec diff --git a/docs/how_to/multimodal_training_and_serving.md b/docs/how_to/multimodal_training_and_serving.md index fd4421beb0f..9c30cbeffba 100644 --- a/docs/how_to/multimodal_training_and_serving.md +++ b/docs/how_to/multimodal_training_and_serving.md @@ -12,9 +12,9 @@ jupyter: name: python3 --- -# Multi-Modal Deep learning with DocArray +# Multi-Modal Deep learning with DocList -DocArray is a library for representing, sending, and storing multi-modal data that can be used for a variety of different +DocList is a library for representing, sending, and storing multi-modal data that can be used for a variety of different use cases. 
 Here we will focus on a workflow familiar to many ML Engineers: Building and training a model, and then serving it to
@@ -22,10 +22,10 @@ users.
 
 This notebook contains two parts:
 
-1. **Representing**: We will use DocArray to represent multi-modal data while **building and training a PyTorch model**.
-We will see how DocArray can help to organize and group your modalities and tensors and make clear what methods expect as inputs and return as outputs.
+1. **Representing**: We will use DocArray to represent multi-modal data while **building and training a PyTorch model**.
+We will see how DocArray can help to organize and group your modalities and tensors and make clear what methods expect as inputs and return as outputs.
 2. **Sending**: We will take the model that we built and trained in part 1, and **serve it using FastAPI**.
-We will see how DocArray narrows the gap between model development and model deployment, and how the same data models can be
+We will see how DocArray narrows the gap between model development and model deployment, and how the same data models can be
 reused in both contexts. That part will be very short, but that's the point!
 
 So without further ado, let's dive into it!
@@ -39,11 +39,11 @@ We train the CLIP-like model on the [flickr8k](https://www.kaggle.com/datasets/a
 
 To run this notebook you need to download and unzip the data into the same folder as the notebook.
 
 Note that in this notebook by no means we aim at reproduce any CLIP results (our dataset is way too small anyways),
-but we rather want to show how DocArray datastructures help researchers and practitioners to write beautiful and
+but we rather want to show how DocArray data structures help researchers and practitioners to write beautiful and
 pythonic multi-modal PyTorch code.
 ```python tags=[]
-#!pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarray[torch,image]"
+#!pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarray[torch,image]"
 #!pip install torchvision
 #!pip install transformers
 #!pip install fastapi
@@ -56,7 +56,7 @@ from typing import Callable, Dict, List, Optional
 ```
 
 ```python
-import docarray
+import docarray
 import torch
 ```
 
@@ -74,23 +74,23 @@ DEVICE = "cuda:0" # change to your favourite device
 
 ## Create the Documents for handling the Muti-Modal data
 
-The first thing we are trying to achieve when using DocArray is to clearly model our data so that we never get confused
+The first thing we are trying to achieve when using DocArray is to clearly model our data so that we never get confused
 about which tensors are supposed to represent what.
 
-To do that we are using a concept that is at the core of DocArray. The `Document`, a collection of multi-modal data.
+To do that we are using a concept that is at the core of DocArray. The `Document`, a collection of multi-modal data.
 The `BaseDoc` class allows users to define their own (nested, multi-modal) Document schema to represent any kind of complex data.
 
 Let's start by defining a few Documents to handle the different modalities that we will use during our training:
 
 ```python
-from docarray import BaseDoc, DocArray
-from docarray.typing import TorchTensor, ImageUrl
+from docarray import BaseDoc, DocList
+from docarray.typing import TorchTensor, ImageUrl
 ```
 
 Let's first create a Document for our Text modality. It will contain a number of `Tokens`, which we also define:
 
 ```python
-from docarray.documents import TextDoc as BaseText
+from docarray.documents import TextDoc as BaseText
 
 class Tokens(BaseDoc):
@@ -106,10 +106,10 @@ Notice the `TorchTensor` type. It is a thin wrapper around `torch.Tensor` that c
 but also enables additional features.
 One such feature is shape parametrization (`TorchTensor[48]`), which lets you hint and even enforce the desired shape of any tensor!
 
-To represent our image data, we use the `Image` Document that is included in DocArray:
+To represent our image data, we use the `Image` Document that is included in DocArray:
 
 ```python
-from docarray.documents import ImageDoc
+from docarray.documents import ImageDoc
 ```
 
 Under the hood, an `Image` looks something like this (with the only main difference that it can take tensors from any
@@ -136,9 +136,9 @@ class PairTextImage(BaseDoc):
 
 ## Create the Dataset
 
-In this section we will create a multi-modal pytorch dataset around the Flick8k dataset using DocArray.
+In this section we will create a multi-modal PyTorch dataset around the Flick8k dataset using DocArray.
 
-We will use DocArray data loading functionality to load the data and use Torchvision and Transformers to preprocess the data before feeding it to our deep learning model:
+We will use DocArray data loading functionality to load the data and use Torchvision and Transformers to preprocess the data before feeding it to our deep learning model:
 
 ```python
 from torch.utils.data import DataLoader, Dataset
@@ -184,14 +184,14 @@ import pandas as pd
 
 def get_flickr8k_da(file: str = "captions.txt", N: Optional[int] = None):
     df = pd.read_csv(file, nrows=N)
-    da = DocArray[PairTextImage](
+    da = DocList[PairTextImage](
         PairTextImage(text=Text(text=i.caption), image=Image(url=f"Images/{i.image}"))
         for i in df.itertuples()
     )
     return da
 ```
 
-In the `get_flickr8k_da` method we process the Flickr8k dataset into a `DocArray`.
+In the `get_flickr8k_da` method we process the Flickr8k dataset into a `DocList`.
 
 Now let's instantiate this dataset using the `MultiModalDataset` class.
 The constructor takes in the `da` and a dictionary of preprocessing transformations:
@@ -201,7 +201,7 @@ preprocessing = {"image": VisionPreprocess(), "text": TextPreprocess()}
 ```
 
 ```python
-from docarray.data import MultiModalDataset
+from docarray.data import MultiModalDataset
 
 dataset = MultiModalDataset[PairTextImage](da=da, preprocessing=preprocessing)
 loader = DataLoader(
@@ -214,11 +214,11 @@ loader = DataLoader(
 )
 ```
 
-## Create the Pytorch model that works on DocArray
+## Create the PyTorch model that works on DocList
 
 In this section we create two encoders, one per modality (Text and Image). These encoders are normal PyTorch `nn.Module`s.
-The only difference is that they operate on DocArray rather that on torch.Tensor:
+The only difference is that they operate on DocList rather than on torch.Tensor:
 
 ```python
 class TextEncoder(nn.Module):
@@ -226,7 +226,7 @@ class TextEncoder(nn.Module):
         super().__init__()
         self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
 
-    def forward(self, texts: DocArray[Text]) -> TorchTensor:
+    def forward(self, texts: DocList[Text]) -> TorchTensor:
         last_hidden_state = self.bert(
             input_ids=texts.tokens.input_ids, attention_mask=texts.tokens.attention_mask
         ).last_hidden_state
@@ -240,8 +240,8 @@ class TextEncoder(nn.Module):
         return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True)
 ```
 
-The `TextEncoder` takes a `DocArray` of `Text`s as input, and returns an embedding `TorchTensor` as output.
-`DocArray` can be seen as a list of `Text` documents, and the encoder will treat it as one batch.
+The `TextEncoder` takes a `DocList` of `Text`s as input, and returns an embedding `TorchTensor` as output.
+`DocList` can be seen as a list of `Text` documents, and the encoder will treat it as one batch.
 ```python
@@ -251,12 +251,12 @@ class VisionEncoder(nn.Module):
         self.backbone = torchvision.models.resnet18(pretrained=True)
         self.linear = nn.LazyLinear(out_features=768)
 
-    def forward(self, images: DocArray[Image]) -> TorchTensor:
+    def forward(self, images: DocList[Image]) -> TorchTensor:
         x = self.backbone(images.tensor)
         return self.linear(x)
 ```
 
-Similarly, the `VisionEncoder` also takes a `DocArray` of `Image`s as input, and returns an embedding `TorchTensor` as output.
+Similarly, the `VisionEncoder` also takes a `DocList` of `Image`s as input, and returns an embedding `TorchTensor` as output.
 However, it operates on the `image` attribute of each Document.
 
 Now we can instantiate our encoders:
@@ -266,7 +266,7 @@ vision_encoder = VisionEncoder().to(DEVICE)
 text_encoder = TextEncoder().to(DEVICE)
 ```
 
-As you can see, DocArray helps us to clearly convey what data is expected as input and output for each method, all through Python type hints.
+As you can see, DocArray helps us to clearly convey what data is expected as input and output for each method, all through Python type hints.
## Train the model in a contrastive way between Text and Image (CLIP) @@ -289,7 +289,7 @@ def cosine_sim(x_mat: TorchTensor, y_mat: TorchTensor) -> TorchTensor: ``` ```python -def clip_loss(image: DocArray[Image], text: DocArray[Text]) -> TorchTensor: +def clip_loss(image: DocList[Image], text: DocList[Text]) -> TorchTensor: sims = cosine_sim(image.embedding, text.embedding) return torch.norm(sims - torch.eye(sims.shape[0], device=DEVICE)) ``` @@ -301,7 +301,7 @@ In the type hints of `cosine_sim` and `clip_loss` you can again notice that we c num_epoch = 1 # here you should do more epochs to really learn something ``` -One things to notice here is that our dataloader does not return a `torch.Tensor` but a `DocArray[PairTextImage]`, +One things to notice here is that our dataloader does not return a `torch.Tensor` but a `DocList[PairTextImage]`, which is exactly what our model can operate on. So let's write a training loop and train our encoders: @@ -312,7 +312,7 @@ from tqdm import tqdm with torch.autocast(device_type="cuda", dtype=torch.float16): for epoch in range(num_epoch): for i, batch in tqdm(enumerate(loader), total=len(loader), desc=f"Epoch {epoch}"): - batch.to(DEVICE) # DocArray can be moved to device + batch.to(DEVICE) # DocList can be moved to device optim.zero_grad() # FORWARD PASS: @@ -337,12 +337,12 @@ Let's use our beloved [FastAPI](https://fastapi.tiangolo.com/) for that! FastAPI is powerful because it allows you to define your Rest API data schema in pure Python. 
-And DocArray is fully compatible with FastAPI and Pydantic, which means that as long as you have a function that takes a Document as input,
+And DocArray is fully compatible with FastAPI and Pydantic, which means that as long as you have a function that takes a Document as input,
 FastAPI will be able to automatically translate it into a fully fledged API with documentation, openAPI specification and more:
 
 ```python
 from fastapi import FastAPI
-from docarray.base_doc import DocumentResponse
+from docarray.base_doc import DocumentResponse
 ```
 
 ```python
@@ -366,7 +366,7 @@ async def embed_text(doc: Text) -> Text:
     with torch.autocast(device_type="cuda", dtype=torch.float16):
         with torch.inference_mode():
             text_preprocess(doc)
-            da = DocArray[Text]([doc], tensor_type=TorchTensor).stack()
+            da = DocList[Text]([doc], tensor_type=TorchTensor).stack()
             da.to(DEVICE)
             doc.embedding = text_encoder(da)[0].to('cpu')
             return doc
@@ -400,4 +400,4 @@ doc_resp = Text.parse_raw(response.content.decode())
 doc_resp.embedding.shape
 ```
 
-And we're done! You have trained and served a mulit-modal ML model, with zero headache and a lot of DocArray!
+And we're done! You have trained and served a multi-modal ML model, with zero headache and a lot of DocArray!
diff --git a/tests/benchmark_tests/test_map.py b/tests/benchmark_tests/test_map.py index d6018b9fdb0..e5c664a408b 100644 --- a/tests/benchmark_tests/test_map.py +++ b/tests/benchmark_tests/test_map.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import NdArray from docarray.utils.map import map_docs, map_docs_batched @@ -32,11 +32,14 @@ def time_multiprocessing(num_workers: int) -> float: n_docs = 5 rng = np.random.RandomState(0) matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)] - da = DocArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) + da = DocList[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) start_time = time() list( map_docs( - da=da, func=cpu_intensive, backend='process', num_worker=num_workers + docs=da, + func=cpu_intensive, + backend='process', + num_worker=num_workers, ) ) return time() - start_time @@ -47,7 +50,7 @@ def time_multiprocessing(num_workers: int) -> float: assert time_2_cpu < time_1_cpu -def cpu_intensive_batch(da: DocArray[MyMatrix]) -> DocArray[MyMatrix]: +def cpu_intensive_batch(da: DocList[MyMatrix]) -> DocList[MyMatrix]: # some cpu intensive function for doc in da: for i in range(3000): @@ -63,11 +66,11 @@ def time_multiprocessing(num_workers: int) -> float: n_docs = 16 rng = np.random.RandomState(0) matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)] - da = DocArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) + da = DocList[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) start_time = time() list( map_docs_batched( - da=da, + docs=da, func=cpu_intensive_batch, batch_size=8, backend='process', @@ -91,12 +94,14 @@ def io_intensive(img: ImageDoc) -> ImageDoc: def test_map_docs_multithreading(): def time_multithreading(num_workers: int) -> float: n_docs = 100 - da = DocArray[ImageDoc]( + da = DocList[ImageDoc]( [ImageDoc(url=IMAGE_PATHS['png']) for _ 
in range(n_docs)] ) start_time = time() list( - map_docs(da=da, func=io_intensive, backend='thread', num_worker=num_workers) + map_docs( + docs=da, func=io_intensive, backend='thread', num_worker=num_workers + ) ) return time() - start_time @@ -106,7 +111,7 @@ def time_multithreading(num_workers: int) -> float: assert time_2_thread < time_1_thread -def io_intensive_batch(da: DocArray[ImageDoc]) -> DocArray[ImageDoc]: +def io_intensive_batch(da: DocList[ImageDoc]) -> DocList[ImageDoc]: # some io intensive function: load and set image url for doc in da: doc.tensor = doc.url.load() @@ -116,13 +121,13 @@ def io_intensive_batch(da: DocArray[ImageDoc]) -> DocArray[ImageDoc]: def test_map_docs_batched_multithreading(): def time_multithreading_batch(num_workers: int) -> float: n_docs = 100 - da = DocArray[ImageDoc]( + da = DocList[ImageDoc]( [ImageDoc(url=IMAGE_PATHS['png']) for _ in range(n_docs)] ) start_time = time() list( map_docs_batched( - da=da, + docs=da, func=io_intensive_batch, backend='thread', num_worker=num_workers, diff --git a/tests/index/base_classes/test_base_doc_store.py b/tests/index/base_classes/test_base_doc_store.py index b5774020524..8e4764f5a88 100644 --- a/tests/index/base_classes/test_base_doc_store.py +++ b/tests/index/base_classes/test_base_doc_store.py @@ -5,7 +5,7 @@ import pytest from pydantic import Field -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.index.abstract import BaseDocIndex, _raise_not_composable from docarray.typing import ID, ImageBytes, ImageUrl, NdArray @@ -262,12 +262,12 @@ class OtherNestedDoc(NestedDoc): # SIMPLE store = DummyDocIndex[SimpleDoc]() in_list = [SimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[SimpleDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[SimpleDoc](in_list) assert 
store._validate_docs(in_da) == in_da in_other_list = [OtherSimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) - in_other_da = DocArray[OtherSimpleDoc](in_other_list) + assert isinstance(store._validate_docs(in_other_list), DocList[BaseDoc]) + in_other_da = DocList[OtherSimpleDoc](in_other_list) assert store._validate_docs(in_other_da) == in_other_da with pytest.raises(ValueError): @@ -280,7 +280,7 @@ class OtherNestedDoc(NestedDoc): ) with pytest.raises(ValueError): store._validate_docs( - DocArray[FlatDoc]( + DocList[FlatDoc]( [ FlatDoc( tens_one=np.random.random((10,)), @@ -295,16 +295,16 @@ class OtherNestedDoc(NestedDoc): in_list = [ FlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,))) ] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[FlatDoc]( + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[FlatDoc]( [FlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,)))] ) assert store._validate_docs(in_da) == in_da in_other_list = [ OtherFlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,))) ] - assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) - in_other_da = DocArray[OtherFlatDoc]( + assert isinstance(store._validate_docs(in_other_list), DocList[BaseDoc]) + in_other_da = DocList[OtherFlatDoc]( [ OtherFlatDoc( tens_one=np.random.random((10,)), tens_two=np.random.random((50,)) @@ -316,18 +316,18 @@ class OtherNestedDoc(NestedDoc): store._validate_docs([SimpleDoc(tens=np.random.random((10,)))]) with pytest.raises(ValueError): assert not store._validate_docs( - DocArray[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) + DocList[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) ) # NESTED store = DummyDocIndex[NestedDoc]() in_list = [NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))] - assert isinstance(store._validate_docs(in_list), 
DocArray[BaseDoc]) - in_da = DocArray[NestedDoc]([NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))]) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[NestedDoc]([NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))]) assert store._validate_docs(in_da) == in_da in_other_list = [OtherNestedDoc(d=OtherSimpleDoc(tens=np.random.random((10,))))] - assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) - in_other_da = DocArray[OtherNestedDoc]( + assert isinstance(store._validate_docs(in_other_list), DocList[BaseDoc]) + in_other_da = DocList[OtherNestedDoc]( [OtherNestedDoc(d=OtherSimpleDoc(tens=np.random.random((10,))))] ) @@ -336,7 +336,7 @@ class OtherNestedDoc(NestedDoc): store._validate_docs([SimpleDoc(tens=np.random.random((10,)))]) with pytest.raises(ValueError): store._validate_docs( - DocArray[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) + DocList[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) ) @@ -353,8 +353,8 @@ class TensorUnionDoc(BaseDoc): # OPTIONAL store = DummyDocIndex[SimpleDoc]() in_list = [OptionalDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[OptionalDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[OptionalDoc](in_list) assert store._validate_docs(in_da) == in_da with pytest.raises(ValueError): @@ -363,9 +363,9 @@ class TensorUnionDoc(BaseDoc): # MIXED UNION store = DummyDocIndex[SimpleDoc]() in_list = [MixedUnionDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[MixedUnionDoc](in_list) - assert isinstance(store._validate_docs(in_da), DocArray[BaseDoc]) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[MixedUnionDoc](in_list) + assert isinstance(store._validate_docs(in_da), DocList[BaseDoc]) with pytest.raises(ValueError): 
store._validate_docs([MixedUnionDoc(tens='hello')]) @@ -373,14 +373,14 @@ class TensorUnionDoc(BaseDoc): # TENSOR UNION store = DummyDocIndex[TensorUnionDoc]() in_list = [SimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[SimpleDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[SimpleDoc](in_list) assert store._validate_docs(in_da) == in_da store = DummyDocIndex[SimpleDoc]() in_list = [TensorUnionDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[TensorUnionDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[TensorUnionDoc](in_list) assert store._validate_docs(in_da) == in_da diff --git a/tests/index/hnswlib/test_index_get_del.py b/tests/index/hnswlib/test_index_get_del.py index d8336e0ed6d..d9437878698 100644 --- a/tests/index/hnswlib/test_index_get_del.py +++ b/tests/index/hnswlib/test_index_get_del.py @@ -6,7 +6,7 @@ import torch from pydantic import Field -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.index import HnswDocumentIndex from docarray.typing import NdArray, NdArrayEmbedding, TorchTensor @@ -57,7 +57,7 @@ def ten_nested_docs(): def test_index_simple_schema(ten_simple_docs, tmp_path, use_docarray): store = HnswDocumentIndex[SimpleDoc](work_dir=str(tmp_path)) if use_docarray: - ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) + ten_simple_docs = DocList[SimpleDoc](ten_simple_docs) store.index(ten_simple_docs) assert store.num_docs() == 10 @@ -77,7 +77,7 @@ class MyDoc(BaseDoc): def test_index_flat_schema(ten_flat_docs, tmp_path, use_docarray): store = HnswDocumentIndex[FlatDoc](work_dir=str(tmp_path)) if use_docarray: - ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) + ten_flat_docs = 
DocList[FlatDoc](ten_flat_docs) store.index(ten_flat_docs) assert store.num_docs() == 10 @@ -89,7 +89,7 @@ def test_index_flat_schema(ten_flat_docs, tmp_path, use_docarray): def test_index_nested_schema(ten_nested_docs, tmp_path, use_docarray): store = HnswDocumentIndex[NestedDoc](work_dir=str(tmp_path)) if use_docarray: - ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) + ten_nested_docs = DocList[NestedDoc](ten_nested_docs) store.index(ten_nested_docs) assert store.num_docs() == 10 @@ -137,7 +137,7 @@ class TextSchema(TextDoc): store = HnswDocumentIndex[TextSchema](work_dir=str(tmp_path)) store.index( - DocArray[TextDoc]( + DocList[TextDoc]( [TextDoc(embedding=np.random.randn(10), text=f'{i}') for i in range(10)] ) ) @@ -154,7 +154,7 @@ class ImageSchema(ImageDoc): ) store.index( - DocArray[ImageDoc]( + DocList[ImageDoc]( [ ImageDoc( embedding=np.random.randn(10), tensor=np.random.randn(3, 224, 224) diff --git a/tests/integrations/array/test_torch_train.py b/tests/integrations/array/test_torch_train.py index 930f237b0a1..e269659462a 100644 --- a/tests/integrations/array/test_torch_train.py +++ b/tests/integrations/array/test_torch_train.py @@ -2,7 +2,7 @@ import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import TorchTensor @@ -13,7 +13,7 @@ class Mmdoc(BaseDoc): N = 10 - batch = DocArray[Mmdoc](Mmdoc(text=f'hello{i}') for i in range(N)) + batch = DocList[Mmdoc](Mmdoc(text=f'hello{i}') for i in range(N)) batch.tensor = torch.zeros(N, 3, 224, 224) batch = batch.stack() diff --git a/tests/integrations/doc_index/elastic/v7/test_index_get_del.py b/tests/integrations/doc_index/elastic/v7/test_index_get_del.py index d5394a7925b..40779116c4e 100644 --- a/tests/integrations/doc_index/elastic/v7/test_index_get_del.py +++ b/tests/integrations/doc_index/elastic/v7/test_index_get_del.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, 
DocList from docarray.documents import ImageDoc, TextDoc from docarray.index import ElasticV7DocIndex from docarray.typing import NdArray @@ -48,7 +48,7 @@ def ten_deep_nested_docs(): def test_index_simple_schema(ten_simple_docs, use_docarray): store = ElasticV7DocIndex[SimpleDoc]() if use_docarray: - ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) + ten_simple_docs = DocList[SimpleDoc](ten_simple_docs) store.index(ten_simple_docs) assert store.num_docs() == 10 @@ -58,7 +58,7 @@ def test_index_simple_schema(ten_simple_docs, use_docarray): def test_index_flat_schema(ten_flat_docs, use_docarray): store = ElasticV7DocIndex[FlatDoc]() if use_docarray: - ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) + ten_flat_docs = DocList[FlatDoc](ten_flat_docs) store.index(ten_flat_docs) assert store.num_docs() == 10 @@ -68,7 +68,7 @@ def test_index_flat_schema(ten_flat_docs, use_docarray): def test_index_nested_schema(ten_nested_docs, use_docarray): store = ElasticV7DocIndex[NestedDoc]() if use_docarray: - ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) + ten_nested_docs = DocList[NestedDoc](ten_nested_docs) store.index(ten_nested_docs) assert store.num_docs() == 10 @@ -78,7 +78,7 @@ def test_index_nested_schema(ten_nested_docs, use_docarray): def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): store = ElasticV7DocIndex[DeepNestedDoc]() if use_docarray: - ten_deep_nested_docs = DocArray[DeepNestedDoc](ten_deep_nested_docs) + ten_deep_nested_docs = DocList[DeepNestedDoc](ten_deep_nested_docs) store.index(ten_deep_nested_docs) assert store.num_docs() == 10 diff --git a/tests/integrations/document/test_document.py b/tests/integrations/document/test_document.py index 9d8b85f260d..6d3d44fd270 100644 --- a/tests/integrations/document/test_document.py +++ b/tests/integrations/document/test_document.py @@ -5,7 +5,7 @@ from pydantic import BaseModel, ValidationError from typing_extensions import TypedDict -from docarray import BaseDoc, DocArray +from 
docarray import BaseDoc, DocList from docarray.documents import AudioDoc, ImageDoc, TextDoc from docarray.documents.helper import ( create_doc, @@ -35,14 +35,14 @@ class MyMultiModalDoc(BaseDoc): def test_nested_chunks_document(): class ChunksDocument(BaseDoc): text: str - images: DocArray[ImageDoc] + images: DocList[ImageDoc] doc = ChunksDocument( text='hello', - images=DocArray[ImageDoc]([ImageDoc() for _ in range(10)]), + images=DocList[ImageDoc]([ImageDoc() for _ in range(10)]), ) - assert isinstance(doc.images, DocArray) + assert isinstance(doc.images, DocList) def test_create_doc(): diff --git a/tests/integrations/document/test_proto.py b/tests/integrations/document/test_proto.py index 2f656e6b4b4..add031f066e 100644 --- a/tests/integrations/document/test_proto.py +++ b/tests/integrations/document/test_proto.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.typing import ( AnyEmbedding, @@ -61,7 +61,7 @@ class MyDoc(BaseDoc): embedding: AnyEmbedding torch_embedding: TorchEmbedding[128] np_embedding: NdArrayEmbedding[128] - nested_docs: DocArray[NestedDoc] + nested_docs: DocList[NestedDoc] bytes_: bytes img_bytes: ImageBytes @@ -80,7 +80,7 @@ class MyDoc(BaseDoc): embedding=np.zeros((3, 224, 224)), torch_embedding=torch.zeros((128,)), np_embedding=np.zeros((128,)), - nested_docs=DocArray[NestedDoc]([NestedDoc(tensor=np.zeros((128,)))]), + nested_docs=DocList[NestedDoc]([NestedDoc(tensor=np.zeros((128,)))]), bytes_=b'hello', img_bytes=b'img', ) @@ -136,7 +136,7 @@ class MyDoc(BaseDoc): generic_tf_tensor: AnyTensor embedding: AnyEmbedding tf_embedding: TensorFlowEmbedding[128] - nested_docs: DocArray[NestedDoc] + nested_docs: DocList[NestedDoc] doc = MyDoc( tf_tensor=tf.zeros((3, 224, 224)), @@ -144,7 +144,7 @@ class MyDoc(BaseDoc): generic_tf_tensor=tf.zeros((3, 224, 224)), embedding=tf.zeros((3, 224, 224)), 
tf_embedding=tf.zeros((128,)), - nested_docs=DocArray[NestedDoc]([NestedDoc(tensor=tf.zeros((128,)))]), + nested_docs=DocList[NestedDoc]([NestedDoc(tensor=tf.zeros((128,)))]), ) doc = doc.to_protobuf() doc = MyDoc.from_protobuf(doc) diff --git a/tests/integrations/externals/test_fastapi.py b/tests/integrations/externals/test_fastapi.py index 438d2a86402..02967a07cd0 100644 --- a/tests/integrations/externals/test_fastapi.py +++ b/tests/integrations/externals/test_fastapi.py @@ -5,7 +5,7 @@ from fastapi import FastAPI from httpx import AsyncClient -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.base_doc import DocArrayResponse from docarray.documents import ImageDoc, TextDoc from docarray.typing import NdArray @@ -114,13 +114,13 @@ async def create_item(doc: InputDoc) -> OutputDoc: @pytest.mark.asyncio async def test_docarray(): doc = ImageDoc(tensor=np.zeros((3, 224, 224))) - docs = DocArray[ImageDoc]([doc, doc]) + docs = DocList[ImageDoc]([doc, doc]) app = FastAPI() @app.post("/doc/", response_class=DocArrayResponse) async def func(fastapi_docs: List[ImageDoc]) -> List[ImageDoc]: - docarray_docs = DocArray[ImageDoc].construct(fastapi_docs) + docarray_docs = DocList[ImageDoc].construct(fastapi_docs) return list(docarray_docs) async with AsyncClient(app=app, base_url="http://test") as ac: @@ -132,6 +132,6 @@ async def func(fastapi_docs: List[ImageDoc]) -> List[ImageDoc]: assert resp_doc.status_code == 200 assert resp_redoc.status_code == 200 - docs = DocArray[ImageDoc].from_json(response.content.decode()) + docs = DocList[ImageDoc].from_json(response.content.decode()) assert len(docs) == 2 assert docs[0].tensor.shape == (3, 224, 224) diff --git a/tests/integrations/store/__init__.py b/tests/integrations/store/__init__.py index 1191c403140..6dc05e16a11 100644 --- a/tests/integrations/store/__init__.py +++ b/tests/integrations/store/__init__.py @@ -1,12 +1,12 @@ import tracemalloc from functools import wraps -from docarray 
import DocArray +from docarray import DocList from docarray.documents import TextDoc def get_test_da(n: int): - return DocArray[TextDoc](gen_text_docs(n)) + return DocList[TextDoc](gen_text_docs(n)) def gen_text_docs(n: int): diff --git a/tests/integrations/store/test_file.py b/tests/integrations/store/test_file.py index 4b6a72c5b62..c57e90d529d 100644 --- a/tests/integrations/store/test_file.py +++ b/tests/integrations/store/test_file.py @@ -3,7 +3,7 @@ import pytest -from docarray import DocArray +from docarray import DocList from docarray.documents import TextDoc from docarray.store.file import ConcurrentPushException, FileDocStore from docarray.utils._internal.cache import _get_cache_path @@ -28,7 +28,7 @@ def test_pushpull_correct(capsys, tmp_path: Path): # Verbose da1.push(f'file://{namespace_dir}/meow', show_progress=True) - da2 = DocArray[TextDoc].pull(f'file://{namespace_dir}/meow', show_progress=True) + da2 = DocList[TextDoc].pull(f'file://{namespace_dir}/meow', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -39,7 +39,7 @@ def test_pushpull_correct(capsys, tmp_path: Path): # Quiet da2.push(f'file://{namespace_dir}/meow') - da1 = DocArray[TextDoc].pull(f'file://{namespace_dir}/meow') + da1 = DocList[TextDoc].pull(f'file://{namespace_dir}/meow') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -55,10 +55,10 @@ def test_pushpull_stream_correct(capsys, tmp_path: Path): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( iter(da1), f'file://{namespace_dir}/meow', show_progress=True ) - doc_stream2 = DocArray[TextDoc].pull_stream( + doc_stream2 = DocList[TextDoc].pull_stream( f'file://{namespace_dir}/meow', show_progress=True ) @@ -71,10 +71,10 @@ def 
test_pushpull_stream_correct(capsys, tmp_path: Path): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocArray[TextDoc].pull_stream( + doc_stream = DocList[TextDoc].pull_stream( f'file://{namespace_dir}/meow', show_progress=False ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( doc_stream, f'file://{namespace_dir}/meow2', show_progress=False ) @@ -87,12 +87,12 @@ def test_pushpull_stream_correct(capsys, tmp_path: Path): def test_pull_stream_vs_pull_full(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f'file://{namespace_dir}/meow-short', show_progress=False, ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f'file://{namespace_dir}/meow-long', show_progress=False, @@ -101,14 +101,12 @@ def test_pull_stream_vs_pull_full(tmp_path: Path): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) for d in DocArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocList[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): - return sum( - len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) - ) + return sum(len(d.text) for d in DocList[TextDoc].pull(url, show_progress=False)) # A warmup is needed to get accurate memory usage comparison _ = get_total_stream(f'file://{namespace_dir}/meow-short') @@ -149,12 +147,12 @@ def test_list_and_delete(tmp_path: Path): da_names = FileDocStore.list(namespace_dir, show_table=False) assert len(da_names) == 0 - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/meow', show_progress=False ) da_names = FileDocStore.list(namespace_dir, show_table=False) assert set(da_names) == {'meow'} - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), 
f'file://{namespace_dir}/woof', show_progress=False ) da_names = FileDocStore.list(namespace_dir, show_table=False) @@ -181,7 +179,7 @@ def test_concurrent_push_pull(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, @@ -191,14 +189,14 @@ def test_concurrent_push_pull(tmp_path: Path): def _task(choice: str): if choice == 'push': - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 1 for _ in DocArray[TextDoc].pull_stream(f'file://{namespace_dir}/da0') + 1 for _ in DocList[TextDoc].pull_stream(f'file://{namespace_dir}/da0') ) assert pull_len == DA_LEN else: @@ -216,7 +214,7 @@ def test_concurrent_push(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, @@ -232,7 +230,7 @@ def _slowdown_iterator(iterator): def _push(choice: str): if choice == 'slow': - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( _slowdown_iterator(gen_text_docs(DA_LEN)), f'file://{namespace_dir}/da0', show_progress=False, @@ -241,7 +239,7 @@ def _push(choice: str): elif choice == 'cold_start': try: time.sleep(0.1) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, diff --git a/tests/integrations/store/test_jac.py b/tests/integrations/store/test_jac.py index 3e070b6de2b..63dcdc33b15 100644 --- a/tests/integrations/store/test_jac.py +++ b/tests/integrations/store/test_jac.py @@ -4,7 +4,7 @@ import hubble import pytest -from docarray import DocArray +from docarray import DocList from docarray.documents import TextDoc from 
docarray.store import JACDocStore from tests.integrations.store import gen_text_docs, get_test_da, profile_memory @@ -45,7 +45,7 @@ def test_pushpull_correct(capsys): # Verbose da1.push(f'jac://{DA_NAME}', show_progress=True) - da2 = DocArray[TextDoc].pull(f'jac://{DA_NAME}', show_progress=True) + da2 = DocList[TextDoc].pull(f'jac://{DA_NAME}', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -56,7 +56,7 @@ def test_pushpull_correct(capsys): # Quiet da2.push(f'jac://{DA_NAME}') - da1 = DocArray[TextDoc].pull(f'jac://{DA_NAME}') + da1 = DocList[TextDoc].pull(f'jac://{DA_NAME}') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -77,10 +77,8 @@ def test_pushpull_stream_correct(capsys): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocArray[TextDoc].push_stream(iter(da1), f'jac://{DA_NAME_1}', show_progress=True) - doc_stream2 = DocArray[TextDoc].pull_stream( - f'jac://{DA_NAME_1}', show_progress=True - ) + DocList[TextDoc].push_stream(iter(da1), f'jac://{DA_NAME_1}', show_progress=True) + doc_stream2 = DocList[TextDoc].pull_stream(f'jac://{DA_NAME_1}', show_progress=True) assert all(d1.id == d2.id for d1, d2 in zip(da1, doc_stream2)) with pytest.raises(StopIteration): @@ -91,10 +89,8 @@ def test_pushpull_stream_correct(capsys): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocArray[TextDoc].pull_stream( - f'jac://{DA_NAME_1}', show_progress=False - ) - DocArray[TextDoc].push_stream(doc_stream, f'jac://{DA_NAME_2}', show_progress=False) + doc_stream = DocList[TextDoc].pull_stream(f'jac://{DA_NAME_1}', show_progress=False) + DocList[TextDoc].push_stream(doc_stream, f'jac://{DA_NAME_2}', show_progress=False) captured = capsys.readouterr() assert ( @@ -112,12 +108,12 @@ def test_pull_stream_vs_pull_full(): DA_NAME_SHORT: str = 
f'test{RANDOM}-pull-stream-vs-pull-full-short' DA_NAME_LONG: str = f'test{RANDOM}-pull-stream-vs-pull-full-long' - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f'jac://{DA_NAME_SHORT}', show_progress=False, ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f'jac://{DA_NAME_LONG}', show_progress=False, @@ -126,14 +122,12 @@ def test_pull_stream_vs_pull_full(): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) for d in DocArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocList[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): - return sum( - len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) - ) + return sum(len(d.text) for d in DocList[TextDoc].pull(url, show_progress=False)) # A warmup is needed to get accurate memory usage comparison _ = get_total_stream(f'jac://{DA_NAME_SHORT}') @@ -176,7 +170,7 @@ def test_list_and_delete(): ) assert len(da_names) == 0 - DocArray[TextDoc].push( + DocList[TextDoc].push( get_test_da(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False ) da_names = list( @@ -186,7 +180,7 @@ def test_list_and_delete(): ) ) assert set(da_names) == {DA_NAME_0} - DocArray[TextDoc].push( + DocList[TextDoc].push( get_test_da(DA_LEN), f'jac://{DA_NAME_1}', show_progress=False ) da_names = list( @@ -224,7 +218,7 @@ def test_concurrent_push_pull(): # Push to DA that is being pulled should not mess up the pull DA_NAME_0 = f'test{RANDOM}-concurrent-push-pull-da0' - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False, @@ -234,14 +228,14 @@ def test_concurrent_push_pull(): def _task(choice: str): if choice == 'push': - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 
1 for _ in DocArray[TextDoc].pull_stream(f'jac://{DA_NAME_0}') + 1 for _ in DocList[TextDoc].pull_stream(f'jac://{DA_NAME_0}') ) assert pull_len == DA_LEN else: diff --git a/tests/integrations/store/test_s3.py b/tests/integrations/store/test_s3.py index ebe51b8c223..373a4d89663 100644 --- a/tests/integrations/store/test_s3.py +++ b/tests/integrations/store/test_s3.py @@ -5,7 +5,7 @@ import pytest -from docarray import DocArray +from docarray import DocList from docarray.documents import TextDoc from docarray.store import S3DocStore from tests.integrations.store import gen_text_docs, get_test_da, profile_memory @@ -72,7 +72,7 @@ def test_pushpull_correct(capsys): # Verbose da1.push(f's3://{namespace_dir}/meow', show_progress=True) - da2 = DocArray[TextDoc].pull(f's3://{namespace_dir}/meow', show_progress=True) + da2 = DocList[TextDoc].pull(f's3://{namespace_dir}/meow', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -83,7 +83,7 @@ def test_pushpull_correct(capsys): # Quiet da2.push(f's3://{namespace_dir}/meow') - da1 = DocArray[TextDoc].pull(f's3://{namespace_dir}/meow') + da1 = DocList[TextDoc].pull(f's3://{namespace_dir}/meow') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -99,10 +99,10 @@ def test_pushpull_stream_correct(capsys): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( iter(da1), f's3://{namespace_dir}/meow', show_progress=True ) - doc_stream2 = DocArray[TextDoc].pull_stream( + doc_stream2 = DocList[TextDoc].pull_stream( f's3://{namespace_dir}/meow', show_progress=True ) @@ -115,10 +115,10 @@ def test_pushpull_stream_correct(capsys): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocArray[TextDoc].pull_stream( + doc_stream = DocList[TextDoc].pull_stream( 
f's3://{namespace_dir}/meow', show_progress=False ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( doc_stream, f's3://{namespace_dir}/meow2', show_progress=False ) @@ -130,12 +130,12 @@ def test_pushpull_stream_correct(capsys): @pytest.mark.slow def test_pull_stream_vs_pull_full(): namespace_dir = f'{BUCKET}/test{RANDOM}/pull-stream-vs-pull-full' - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f's3://{namespace_dir}/meow-short', show_progress=False, ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f's3://{namespace_dir}/meow-long', show_progress=False, @@ -144,14 +144,12 @@ def test_pull_stream_vs_pull_full(): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) for d in DocArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocList[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): - return sum( - len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) - ) + return sum(len(d.text) for d in DocList[TextDoc].pull(url, show_progress=False)) # A warmup is needed to get accurate memory usage comparison _ = get_total_stream(f's3://{namespace_dir}/meow-short') @@ -192,12 +190,12 @@ def test_list_and_delete(): da_names = S3DocStore.list(namespace_dir, show_table=False) assert len(da_names) == 0 - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/meow', show_progress=False ) da_names = S3DocStore.list(f'{namespace_dir}', show_table=False) assert set(da_names) == {'meow'} - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/woof', show_progress=False ) da_names = S3DocStore.list(f'{namespace_dir}', show_table=False) @@ -224,7 +222,7 @@ def test_concurrent_push_pull(): # Push to DA that is being pulled should not mess up the pull namespace_dir = 
f'{BUCKET}/test{RANDOM}/concurrent-push-pull' - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/da0', show_progress=False, @@ -234,14 +232,14 @@ def test_concurrent_push_pull(): def _task(choice: str): if choice == 'push': - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/da0', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 1 for _ in DocArray[TextDoc].pull_stream(f's3://{namespace_dir}/da0') + 1 for _ in DocList[TextDoc].pull_stream(f's3://{namespace_dir}/da0') ) assert pull_len == DA_LEN else: diff --git a/tests/integrations/torch/data/test_torch_dataset.py b/tests/integrations/torch/data/test_torch_dataset.py index 238e05e8ac2..f358f1c16b8 100644 --- a/tests/integrations/torch/data/test_torch_dataset.py +++ b/tests/integrations/torch/data/test_torch_dataset.py @@ -2,7 +2,7 @@ import torch from torch.utils.data import DataLoader -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.data import MultiModalDataset from docarray.documents import ImageDoc, TextDoc @@ -34,10 +34,10 @@ def __call__(self, text: str) -> None: @pytest.fixture -def captions_da() -> DocArray[PairTextImage]: +def captions_da() -> DocList[PairTextImage]: with open("tests/toydata/captions.csv", "r") as f: f.readline() - da = DocArray[PairTextImage]( + da = DocList[PairTextImage]( PairTextImage( text=TextDoc(text=i[1]), image=ImageDoc(url=f"tests/toydata/image-data/{i[0]}"), @@ -47,7 +47,7 @@ def captions_da() -> DocArray[PairTextImage]: return da -def test_torch_dataset(captions_da: DocArray[PairTextImage]): +def test_torch_dataset(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} @@ -56,16 +56,16 @@ def test_torch_dataset(captions_da: DocArray[PairTextImage]): dataset, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True ) - from 
docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.doc_vec.doc_vec import DocVec batch_lens = [] for batch in loader: - assert isinstance(batch, DocArrayStacked[PairTextImage]) + assert isinstance(batch, DocVec[PairTextImage]) batch_lens.append(len(batch)) assert all(x == BATCH_SIZE for x in batch_lens[:-1]) -def test_primitives(captions_da: DocArray[PairTextImage]): +def test_primitives(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"text": Meowification()} @@ -78,7 +78,7 @@ def test_primitives(captions_da: DocArray[PairTextImage]): assert all(t.endswith(' meow') for t in batch.text) -def test_root_field(captions_da: DocArray[TextDoc]): +def test_root_field(captions_da: DocList[TextDoc]): BATCH_SIZE = 32 preprocessing = {"": TextPreprocess()} @@ -91,7 +91,7 @@ def test_root_field(captions_da: DocArray[TextDoc]): assert batch.embedding.shape[1] == 64 -def test_nested_field(captions_da: DocArray[PairTextImage]): +def test_nested_field(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = { @@ -122,7 +122,7 @@ def test_nested_field(captions_da: DocArray[PairTextImage]): @pytest.mark.slow -def test_torch_dl_multiprocessing(captions_da: DocArray[PairTextImage]): +def test_torch_dl_multiprocessing(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} @@ -136,17 +136,17 @@ def test_torch_dl_multiprocessing(captions_da: DocArray[PairTextImage]): multiprocessing_context='fork', ) - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.doc_vec.doc_vec import DocVec batch_lens = [] for batch in loader: - assert isinstance(batch, DocArrayStacked[PairTextImage]) + assert isinstance(batch, DocVec[PairTextImage]) batch_lens.append(len(batch)) assert all(x == BATCH_SIZE for x in batch_lens[:-1]) @pytest.mark.skip(reason="UNRESOLVED BUG") -def test_torch_dl_pin_memory(captions_da: DocArray[PairTextImage]): 
+def test_torch_dl_pin_memory(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} @@ -164,10 +164,10 @@ def test_torch_dl_pin_memory(captions_da: DocArray[PairTextImage]): multiprocessing_context='fork', ) - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.doc_vec.doc_vec import DocVec batch_lens = [] for batch in loader: - assert isinstance(batch, DocArrayStacked[PairTextImage]) + assert isinstance(batch, DocVec[PairTextImage]) batch_lens.append(len(batch)) assert all(x == BATCH_SIZE for x in batch_lens[:-1]) diff --git a/tests/units/array/stack/storage/test_storage.py b/tests/units/array/stack/storage/test_storage.py index 591c2057d8b..7fdb8133bef 100644 --- a/tests/units/array/stack/storage/test_storage.py +++ b/tests/units/array/stack/storage/test_storage.py @@ -1,8 +1,8 @@ import numpy as np from docarray import BaseDoc -from docarray.array import DocArrayStacked -from docarray.array.stacked.column_storage import ColumnStorageView +from docarray.array import DocVec +from docarray.array.doc_vec.column_storage import ColumnStorageView from docarray.typing import AnyTensor @@ -20,13 +20,13 @@ class MyDoc(BaseDoc): for i in range(4) ] - storage = DocArrayStacked[MyDoc](docs)._storage + storage = DocVec[MyDoc](docs)._storage assert (storage.tensor_columns['tensor'] == np.zeros((4, 10))).all() for name in storage.any_columns['name']: assert name == 'hello' inner_docs = storage.doc_columns['doc'] - assert isinstance(inner_docs, DocArrayStacked[InnerDoc]) + assert isinstance(inner_docs, DocVec[InnerDoc]) for i, doc in enumerate(inner_docs): assert doc.price == i @@ -38,7 +38,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=i) for i in range(4)] - storage = DocArrayStacked[MyDoc](docs)._storage + storage = DocVec[MyDoc](docs)._storage view = ColumnStorageView(0, storage) diff --git 
a/tests/units/array/stack/test_array_stacked.py b/tests/units/array/stack/test_array_stacked.py index 95cbf58c150..14f5238873a 100644 --- a/tests/units/array/stack/test_array_stacked.py +++ b/tests/units/array/stack/test_array_stacked.py @@ -5,8 +5,8 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDoc, DocArray -from docarray.array import DocArrayStacked +from docarray import BaseDoc, DocList +from docarray.array import DocVec from docarray.documents import ImageDoc from docarray.typing import AnyEmbedding, AnyTensor, NdArray, TorchTensor @@ -16,7 +16,7 @@ def batch(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArrayStacked[ImageDoc]( + batch = DocVec[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -29,12 +29,12 @@ class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] class MMdoc(BaseDoc): - img: DocArray[ImageDoc] + img: DocList[ImageDoc] - batch = DocArray[MMdoc]( + batch = DocList[MMdoc]( [ MMdoc( - img=DocArray[ImageDoc]( + img=DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) ) @@ -47,7 +47,7 @@ class MMdoc(BaseDoc): def test_create_from_list_docs(): list_ = [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] - da_stacked = DocArrayStacked[ImageDoc](docs=list_, tensor_type=TorchTensor) + da_stacked = DocVec[ImageDoc](docs=list_, tensor_type=TorchTensor) assert len(da_stacked) == 10 assert da_stacked.tensor.shape == tuple([10, 3, 224, 224]) @@ -58,7 +58,7 @@ def test_len(batch): def test_create_from_None(): with pytest.raises(ValueError): - DocArrayStacked[ImageDoc]([]) + DocVec[ImageDoc]([]) def test_getitem(batch): @@ -75,7 +75,7 @@ def test_stack_setter(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -92,7 +92,7 @@ def test_stack_setter_np(): class ImageDoc(BaseDoc): tensor: NdArray[3, 
224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) @@ -116,7 +116,7 @@ def test_stack_numpy(): class ImageDoc(BaseDoc): tensor: NdArray[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) @@ -152,7 +152,7 @@ class ImageDoc(BaseDoc): class MMdoc(BaseDoc): img: ImageDoc - batch = DocArray[MMdoc]( + batch = DocList[MMdoc]( [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] ) @@ -188,7 +188,7 @@ def test_convert_to_da(batch): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -206,7 +206,7 @@ class ImageDoc(BaseDoc): class MMdoc(BaseDoc): img: ImageDoc - batch = DocArray[MMdoc]( + batch = DocList[MMdoc]( [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] ) @@ -221,7 +221,7 @@ class MMdoc(BaseDoc): def test_unstack_nested_DocArray(nested_batch): batch = nested_batch.unstack() for i in range(len(batch)): - assert isinstance(batch[i].img, DocArray) + assert isinstance(batch[i].img, DocList) for doc in batch[i].img: assert (doc.tensor == torch.zeros(3, 224, 224)).all() @@ -230,7 +230,7 @@ def test_stack_call(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - da = DocArray[ImageDoc]( + da = DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -245,12 +245,12 @@ def test_stack_union(): class ImageDoc(BaseDoc): tensor: Union[NdArray[3, 224, 224], TorchTensor[3, 224, 224]] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) batch[3].tensor = np.zeros((3, 224, 224)) - # union fields aren't actually stacked + # union fields aren't actually doc_vec # just checking that there is no error batch.stack() @@ -263,7 +263,7 @@ def 
test_any_tensor_with_torch(tensor_type, tensor): class ImageDoc(BaseDoc): tensor: AnyTensor - da = DocArrayStacked[ImageDoc]( + da = DocVec[ImageDoc]( [ImageDoc(tensor=tensor) for _ in range(10)], tensor_type=tensor_type, ) @@ -284,7 +284,7 @@ class ImageDoc(BaseDoc): class TopDoc(BaseDoc): img: ImageDoc - da = DocArrayStacked[TopDoc]( + da = DocVec[TopDoc]( [TopDoc(img=ImageDoc(tensor=tensor)) for _ in range(10)], tensor_type=TorchTensor, ) @@ -300,7 +300,7 @@ def test_dict_stack(): class MyDoc(BaseDoc): my_dict: Dict[str, int] - da = DocArrayStacked[MyDoc]([MyDoc(my_dict={'a': 1, 'b': 2}) for _ in range(10)]) + da = DocVec[MyDoc]([MyDoc(my_dict={'a': 1, 'b': 2}) for _ in range(10)]) da.my_dict @@ -312,12 +312,12 @@ class Doc(BaseDoc): N = 10 - da = DocArrayStacked[Doc]( + da = DocVec[Doc]( [Doc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)] ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocArrayStacked) + assert isinstance(da_sliced, DocVec) tensors = da_sliced.tensor assert tensors.shape == (5, 3, 224, 224) @@ -332,7 +332,7 @@ def test_stack_embedding(): class MyDoc(BaseDoc): embedding: AnyEmbedding - da = DocArrayStacked[MyDoc]([MyDoc(embedding=np.zeros(10)) for _ in range(10)]) + da = DocVec[MyDoc]([MyDoc(embedding=np.zeros(10)) for _ in range(10)]) assert 'embedding' in da._storage.tensor_columns.keys() assert (da.embedding == np.zeros((10, 10))).all() @@ -343,7 +343,7 @@ def test_stack_none(tensor_backend): class MyDoc(BaseDoc): tensor: Optional[AnyTensor] - da = DocArrayStacked[MyDoc]( + da = DocVec[MyDoc]( [MyDoc(tensor=None) for _ in range(10)], tensor_type=tensor_backend ) @@ -351,9 +351,7 @@ class MyDoc(BaseDoc): def test_to_device(): - da = DocArrayStacked[ImageDoc]( - [ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor - ) + da = DocVec[ImageDoc]([ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor) assert da.tensor.device == torch.device('cpu') da.to('meta') assert da.tensor.device == 
torch.device('meta') @@ -361,13 +359,13 @@ def test_to_device(): def test_to_device_with_nested_da(): class Video(BaseDoc): - images: DocArray[ImageDoc] + images: DocList[ImageDoc] - da_image = DocArrayStacked[ImageDoc]( + da_image = DocVec[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor ) - da = DocArrayStacked[Video]([Video(images=da_image)]) + da = DocVec[Video]([Video(images=da_image)]) assert da.images[0].tensor.device == torch.device('cpu') da.to('meta') assert da.images[0].tensor.device == torch.device('meta') @@ -378,7 +376,7 @@ class MyDoc(BaseDoc): tensor: TorchTensor docs: ImageDoc - da = DocArrayStacked[MyDoc]( + da = DocVec[MyDoc]( [MyDoc(tensor=torch.zeros(3, 5), docs=ImageDoc(tensor=torch.zeros(3, 5)))], tensor_type=TorchTensor, ) @@ -390,9 +388,7 @@ class MyDoc(BaseDoc): def test_to_device_numpy(): - da = DocArrayStacked[ImageDoc]( - [ImageDoc(tensor=np.zeros((3, 5)))], tensor_type=NdArray - ) + da = DocVec[ImageDoc]([ImageDoc(tensor=np.zeros((3, 5)))], tensor_type=NdArray) with pytest.raises(NotImplementedError): da.to('meta') @@ -401,7 +397,7 @@ def test_keep_dtype_torch(): class MyDoc(BaseDoc): tensor: TorchTensor - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [MyDoc(tensor=torch.zeros([2, 4], dtype=torch.int32)) for _ in range(3)] ) assert da[0].tensor.dtype == torch.int32 @@ -415,7 +411,7 @@ def test_keep_dtype_np(): class MyDoc(BaseDoc): tensor: NdArray - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [MyDoc(tensor=np.zeros([2, 4], dtype=np.int32)) for _ in range(3)] ) assert da[0].tensor.dtype == np.int32 @@ -436,7 +432,7 @@ def test_np_scalar(): class MyDoc(BaseDoc): scalar: NdArray - da = DocArray[MyDoc]([MyDoc(scalar=np.array(2.0)) for _ in range(3)]) + da = DocList[MyDoc]([MyDoc(scalar=np.array(2.0)) for _ in range(3)]) assert all(doc.scalar.ndim == 0 for doc in da) assert all(doc.scalar == 2.0 for doc in da) @@ -456,7 +452,7 @@ def test_torch_scalar(): class MyDoc(BaseDoc): scalar: TorchTensor - da = 
DocArray[MyDoc]( + da = DocList[MyDoc]( [MyDoc(scalar=torch.tensor(2.0)) for _ in range(3)], ) assert all(doc.scalar.ndim == 0 for doc in da) @@ -476,7 +472,7 @@ def test_np_nan(): class MyDoc(BaseDoc): scalar: Optional[NdArray] - da = DocArray[MyDoc]([MyDoc() for _ in range(3)]) + da = DocList[MyDoc]([MyDoc() for _ in range(3)]) assert all(doc.scalar is None for doc in da) assert all(doc.scalar == doc.scalar for doc in da) stacked_da = da.stack() @@ -495,7 +491,7 @@ def test_torch_nan(): class MyDoc(BaseDoc): scalar: Optional[TorchTensor] - da = DocArray[MyDoc]([MyDoc() for _ in range(3)]) + da = DocList[MyDoc]([MyDoc() for _ in range(3)]) assert all(doc.scalar is None for doc in da) assert all(doc.scalar == doc.scalar for doc in da) stacked_da = da.stack(tensor_type=TorchTensor) @@ -515,24 +511,24 @@ def test_from_storage(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArrayStacked[ImageDoc]( + batch = DocVec[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - DocArrayStacked[ImageDoc].from_columns_storage(batch._storage) + DocVec[ImageDoc].from_columns_storage(batch._storage) def test_validate_from_da(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - da = parse_obj_as(DocArrayStacked[ImageDoc], batch) + da = parse_obj_as(DocVec[ImageDoc], batch) - assert isinstance(da, DocArrayStacked[ImageDoc]) + assert isinstance(da, DocVec[ImageDoc]) def test_validation_column_tensor(batch): @@ -556,22 +552,22 @@ class Inner(BaseDoc): class Doc(BaseDoc): inner: Inner - batch = DocArrayStacked[Doc]([Doc(inner=Inner(hello='hello')) for _ in range(10)]) + batch = DocVec[Doc]([Doc(inner=Inner(hello='hello')) for _ in range(10)]) return batch, Doc, Inner def test_validation_column_doc(batch_nested_doc): batch, Doc, Inner = batch_nested_doc - batch.inner = DocArray[Inner]([Inner(hello='hello') 
for _ in range(10)]) - assert isinstance(batch.inner, DocArrayStacked[Inner]) + batch.inner = DocList[Inner]([Inner(hello='hello') for _ in range(10)]) + assert isinstance(batch.inner, DocVec[Inner]) def test_validation_list_doc(batch_nested_doc): batch, Doc, Inner = batch_nested_doc batch.inner = [Inner(hello='hello') for _ in range(10)] - assert isinstance(batch.inner, DocArrayStacked[Inner]) + assert isinstance(batch.inner, DocVec[Inner]) def test_validation_col_doc_fail(batch_nested_doc): @@ -581,7 +577,7 @@ def test_validation_col_doc_fail(batch_nested_doc): batch.inner = ['hello'] * 10 with pytest.raises(ValueError): - batch.inner = DocArray[Inner]([Inner(hello='hello') for _ in range(11)]) + batch.inner = DocList[Inner]([Inner(hello='hello') for _ in range(11)]) def test_doc_view_update(batch): diff --git a/tests/units/array/stack/test_array_stacked_tf.py b/tests/units/array/stack/test_array_stacked_tf.py index 0ec91268575..c5bd31fea2e 100644 --- a/tests/units/array/stack/test_array_stacked_tf.py +++ b/tests/units/array/stack/test_array_stacked_tf.py @@ -2,8 +2,8 @@ import pytest -from docarray import BaseDoc, DocArray -from docarray.array import DocArrayStacked +from docarray import BaseDoc, DocList +from docarray.array import DocVec from docarray.typing import AnyTensor, NdArray from docarray.utils._internal.misc import is_tf_available @@ -22,7 +22,7 @@ class Image(BaseDoc): import tensorflow as tf - batch = DocArray[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) + batch = DocList[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) return batch.stack() @@ -33,14 +33,14 @@ class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] class MMdoc(BaseDoc): - img: DocArray[Image] + img: DocList[Image] import tensorflow as tf - batch = DocArrayStacked[MMdoc]( + batch = DocVec[MMdoc]( [ MMdoc( - img=DocArray[Image]( + img=DocList[Image]( [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] ) ) @@ -67,7 +67,7 @@ def 
test_getitem(batch): @pytest.mark.tensorflow def test_get_slice(batch): sliced = batch[0:2] - assert isinstance(sliced, DocArrayStacked) + assert isinstance(sliced, DocVec) assert len(sliced) == 2 @@ -82,9 +82,7 @@ def test_set_after_stacking(): class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - batch = DocArrayStacked[Image]( - [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] - ) + batch = DocVec[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) batch.tensor = tf.ones((10, 3, 224, 224)) assert tnp.allclose(batch.tensor.tensor, tf.ones((10, 3, 224, 224))) @@ -109,7 +107,7 @@ class Image(BaseDoc): class MMdoc(BaseDoc): img: Image - batch = DocArray[MMdoc]( + batch = DocList[MMdoc]( [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] ).stack() @@ -150,7 +148,7 @@ class Image(BaseDoc): class MMdoc(BaseDoc): img: Image - batch = DocArrayStacked[MMdoc]( + batch = DocVec[MMdoc]( [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] ) assert isinstance(batch.img._storage.tensor_columns['tensor'], TensorFlowTensor) @@ -164,7 +162,7 @@ class MMdoc(BaseDoc): def test_unstack_nested_DocArray(nested_batch): batch = nested_batch.unstack() for i in range(len(batch)): - assert isinstance(batch[i].img, DocArray) + assert isinstance(batch[i].img, DocList) for doc in batch[i].img: assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) @@ -174,7 +172,7 @@ def test_stack_call(): class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - da = DocArray[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) + da = DocList[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) da = da.stack() @@ -188,12 +186,12 @@ def test_stack_union(): class Image(BaseDoc): tensor: Union[NdArray[3, 224, 224], TensorFlowTensor[3, 224, 224]] - DocArrayStacked[Image]( + DocVec[Image]( [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)], tensor_type=TensorFlowTensor, ) - # union fields aren't 
actually stacked + # union fields aren't actually doc_vec # just checking that there is no error @@ -215,7 +213,7 @@ def test_any_tensor_with_tf(): class Image(BaseDoc): tensor: AnyTensor - da = DocArrayStacked[Image]( + da = DocVec[Image]( [Image(tensor=tensor) for _ in range(10)], tensor_type=TensorFlowTensor, ) @@ -237,7 +235,7 @@ class Image(BaseDoc): class TopDoc(BaseDoc): img: Image - da = DocArrayStacked[TopDoc]( + da = DocVec[TopDoc]( [TopDoc(img=Image(tensor=tensor)) for _ in range(10)], tensor_type=TensorFlowTensor, ) @@ -256,12 +254,12 @@ class Doc(BaseDoc): text: str tensor: TensorFlowTensor - da = DocArrayStacked[Doc]( + da = DocVec[Doc]( [Doc(text=f'hello{i}', tensor=tf.zeros((3, 224, 224))) for i in range(10)] ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocArrayStacked) + assert isinstance(da_sliced, DocVec) tensors = da_sliced.tensor.tensor assert tensors.shape == (5, 3, 224, 224) @@ -272,7 +270,7 @@ def test_stack_none(): class MyDoc(BaseDoc): tensor: Optional[AnyTensor] - da = DocArrayStacked[MyDoc]( + da = DocVec[MyDoc]( [MyDoc(tensor=None) for _ in range(10)], tensor_type=TensorFlowTensor ) assert 'tensor' in da._storage.tensor_columns.keys() @@ -283,7 +281,7 @@ def test_keep_dtype_tf(): class MyDoc(BaseDoc): tensor: TensorFlowTensor - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [MyDoc(tensor=tf.zeros([2, 4], dtype=tf.int32)) for _ in range(3)] ) assert da[0].tensor.tensor.dtype == tf.int32 diff --git a/tests/units/array/stack/test_init.py b/tests/units/array/stack/test_init.py index c4e906e82b1..663eebadf89 100644 --- a/tests/units/array/stack/test_init.py +++ b/tests/units/array/stack/test_init.py @@ -1,7 +1,7 @@ import numpy as np from docarray import BaseDoc -from docarray.array.stacked.array_stacked import DocArrayStacked +from docarray.array.doc_vec.doc_vec import DocVec from docarray.typing import AnyTensor, NdArray @@ -12,7 +12,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros(10), name='hello') for _ in range(4)] - 
da = DocArrayStacked[MyDoc](docs, tensor_type=NdArray) + da = DocVec[MyDoc](docs, tensor_type=NdArray) assert (da._storage.tensor_columns['tensor'] == np.zeros((4, 10))).all() assert da._storage.any_columns['name']._data == ['hello' for _ in range(4)] @@ -25,7 +25,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=i * np.zeros((10, 10)), name=f'hello{i}') for i in range(4)] - da = DocArrayStacked[MyDoc](docs, tensor_type=NdArray) + da = DocVec[MyDoc](docs, tensor_type=NdArray) for i, doc in enumerate(da): assert isinstance(doc, MyDoc) diff --git a/tests/units/array/stack/test_proto.py b/tests/units/array/stack/test_proto.py index 1589c28197b..585bdcf8d05 100644 --- a/tests/units/array/stack/test_proto.py +++ b/tests/units/array/stack/test_proto.py @@ -2,8 +2,8 @@ import pytest import torch -from docarray import BaseDoc, DocArray -from docarray.array import DocArrayStacked +from docarray import BaseDoc, DocList +from docarray.array import DocVec from docarray.typing import NdArray, TorchTensor @@ -12,7 +12,7 @@ def batch(): class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[Image]([Image(tensor=torch.zeros(3, 224, 224)) for _ in range(10)]) + batch = DocList[Image]([Image(tensor=torch.zeros(3, 224, 224)) for _ in range(10)]) return batch.stack() @@ -27,7 +27,7 @@ def test_proto_stacked_mode_numpy(): class MyDoc(BaseDoc): tensor: NdArray[3, 224, 224] - da = DocArray[MyDoc]([MyDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)]) + da = DocList[MyDoc]([MyDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)]) da = da.stack() @@ -39,10 +39,10 @@ def test_stacked_proto(): class CustomDocument(BaseDoc): image: NdArray - da = DocArray[CustomDocument]( + da = DocList[CustomDocument]( [CustomDocument(image=np.zeros((3, 224, 224))) for _ in range(10)] ).stack() - da2 = DocArrayStacked.from_protobuf(da.to_protobuf()) + da2 = DocVec.from_protobuf(da.to_protobuf()) - assert isinstance(da2, DocArrayStacked) + assert isinstance(da2, DocVec) diff --git 
a/tests/units/array/test_array.py b/tests/units/array/test_array.py index d47089176bb..79d50b64e82 100644 --- a/tests/units/array/test_array.py +++ b/tests/units/array/test_array.py @@ -1,9 +1,10 @@ from typing import Optional, TypeVar, Union + import numpy as np import pytest import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import ImageUrl, NdArray, TorchTensor from docarray.utils._internal.misc import is_tf_available @@ -19,7 +20,7 @@ def da(): class Text(BaseDoc): text: str - return DocArray[Text]([Text(text=f'hello {i}') for i in range(10)]) + return DocList[Text]([Text(text=f'hello {i}') for i in range(10)]) def test_iterate(da): @@ -31,7 +32,7 @@ def test_append(): class Text(BaseDoc): text: str - da = DocArray[Text]([]) + da = DocList[Text]([]) da.append(Text(text='hello', id='1')) @@ -43,7 +44,7 @@ def test_extend(): class Text(BaseDoc): text: str - da = DocArray[Text]([Text(text='hello', id=str(i)) for i in range(10)]) + da = DocList[Text]([Text(text='hello', id=str(i)) for i in range(10)]) da.extend([Text(text='hello', id=str(10 + i)) for i in range(10)]) @@ -62,13 +63,13 @@ def test_document_array(): class Text(BaseDoc): text: str - da = DocArray([Text(text='hello') for _ in range(10)]) + da = DocList([Text(text='hello') for _ in range(10)]) assert len(da) == 10 def test_empty_array(): - da = DocArray() + da = DocList() len(da) == 0 @@ -76,7 +77,7 @@ def test_document_array_fixed_type(): class Text(BaseDoc): text: str - da = DocArray[Text]([Text(text='hello') for _ in range(10)]) + da = DocList[Text]([Text(text='hello') for _ in range(10)]) assert len(da) == 10 @@ -113,8 +114,8 @@ def test_documentarray(): class Text(BaseDoc): text: str - da1 = DocArray([Text(text='hello')]) - da2 = DocArray([Text(text='hello')]) + da1 = DocList([Text(text='hello')]) + da2 = DocList([Text(text='hello')]) assert da1 == da2 assert da1 == [Text(text='hello') for _ in range(len(da1))] @@ -156,7 +157,7 @@ class 
Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]( + da = DocList[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -182,7 +183,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) + da = DocList[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) list_docs = [InnerDoc(text=f'hello{i}') for i in range(N)] da._set_data_column('inner', list_docs) @@ -198,7 +199,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]( + da = DocList[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -224,9 +225,9 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) + da = DocList[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) - assert isinstance(da.inner, DocArray) + assert isinstance(da.inner, DocList) def test_get_bulk_attributes_optional_type(): @@ -236,7 +237,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]( + da = DocList[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -260,7 +261,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]( + da = DocList[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -288,7 +289,7 @@ class MyDoc(BaseDoc): Optional[Union[TorchTensor, NdArray, TensorFlowTensor]], TorchTensor ] - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=torch.rand(10), @@ -315,12 +316,12 @@ class Doc(BaseDoc): N = 10 - da = DocArray[Doc]( + da = DocList[Doc]( (Doc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocArray) + assert isinstance(da_sliced, DocList) tensors = da_sliced.tensor assert len(tensors) == 5 @@ -364,13 +365,13 @@ def test_del_item(da): def test_generic_type_var(): T = TypeVar('T', bound=BaseDoc) - def f(a: DocArray[T]) -> DocArray[T]: + def 
f(a: DocList[T]) -> DocList[T]: return a - def g(a: DocArray['BaseDoc']) -> DocArray['BaseDoc']: + def g(a: DocList['BaseDoc']) -> DocList['BaseDoc']: return a - a = DocArray() + a = DocList() f(a) g(a) @@ -381,7 +382,7 @@ class Text(BaseDoc): docs = [Text(text=f'hello {i}') for i in range(10)] - da = DocArray[Text].construct(docs) + da = DocList[Text].construct(docs) assert da._data is docs @@ -392,7 +393,7 @@ class Text(BaseDoc): docs = [Text(text=f'hello {i}') for i in range(10)] - da = DocArray[Text](docs) + da = DocList[Text](docs) da.reverse() assert da[-1].text == 'hello 0' assert da[0].text == 'hello 9' @@ -405,7 +406,7 @@ class Image(BaseDoc): def test_remove(): images = [Image(url=f'http://url.com/foo_{i}.png') for i in range(3)] - da = DocArray[Image](images) + da = DocList[Image](images) da.remove(images[1]) assert len(da) == 2 assert da[0] == images[0] @@ -414,7 +415,7 @@ def test_remove(): def test_pop(): images = [Image(url=f'http://url.com/foo_{i}.png') for i in range(3)] - da = DocArray[Image](images) + da = DocList[Image](images) popped = da.pop(1) assert len(da) == 2 assert popped == images[1] @@ -426,7 +427,7 @@ def test_sort(): images = [ Image(url=f'http://url.com/foo_{i}.png', tensor=NdArray(i)) for i in [2, 0, 1] ] - da = DocArray[Image](images) + da = DocList[Image](images) da.sort(key=lambda img: len(img.tensor)) assert len(da) == 3 assert da[0].url == 'http://url.com/foo_0.png' diff --git a/tests/units/array/test_array_from_to_bytes.py b/tests/units/array/test_array_from_to_bytes.py index 0d269e036a3..7cd9f0dfd8c 100644 --- a/tests/units/array/test_array_from_to_bytes.py +++ b/tests/units/array/test_array_from_to_bytes.py @@ -1,6 +1,6 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -17,7 +17,7 @@ class MyDoc(BaseDoc): @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) 
@pytest.mark.parametrize('show_progress', [False, True]) def test_from_to_bytes(protocol, compress, show_progress): - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -28,7 +28,7 @@ def test_from_to_bytes(protocol, compress, show_progress): bytes_da = da.to_bytes( protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocArray[MyDoc].from_bytes( + da2 = DocList[MyDoc].from_bytes( bytes_da, protocol=protocol, compress=compress, show_progress=show_progress ) assert len(da2) == 2 @@ -47,7 +47,7 @@ def test_from_to_bytes(protocol, compress, show_progress): @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) @pytest.mark.parametrize('show_progress', [False, True]) def test_from_to_base64(protocol, compress, show_progress): - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -58,7 +58,7 @@ def test_from_to_base64(protocol, compress, show_progress): bytes_da = da.to_base64( protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocArray[MyDoc].from_base64( + da2 = DocList[MyDoc].from_base64( bytes_da, protocol=protocol, compress=compress, show_progress=show_progress ) assert len(da2) == 2 diff --git a/tests/units/array/test_array_from_to_csv.py b/tests/units/array/test_array_from_to_csv.py index ecec376d433..09ec98b6432 100644 --- a/tests/units/array/test_array_from_to_csv.py +++ b/tests/units/array/test_array_from_to_csv.py @@ -3,7 +3,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from tests import TOYDATA_DIR @@ -22,7 +22,7 @@ class MyDocNested(MyDoc): def test_to_from_csv(tmpdir, nested_doc_cls): - da = DocArray[nested_doc_cls]( + da = DocList[nested_doc_cls]( [ nested_doc_cls( count=0, @@ -37,13 +37,13 @@ def test_to_from_csv(tmpdir, nested_doc_cls): 
da.to_csv(tmp_file) assert os.path.isfile(tmp_file) - da_from = DocArray[nested_doc_cls].from_csv(tmp_file) + da_from = DocList[nested_doc_cls].from_csv(tmp_file) for doc1, doc2 in zip(da, da_from): assert doc1 == doc2 def test_from_csv_nested(nested_doc_cls): - da = DocArray[nested_doc_cls].from_csv( + da = DocList[nested_doc_cls].from_csv( file_path=str(TOYDATA_DIR / 'docs_nested.csv') ) assert len(da) == 3 @@ -91,9 +91,9 @@ class Outer(BaseDoc): def test_from_csv_without_schema_raise_exception(): with pytest.raises(TypeError, match='no document schema defined'): - DocArray.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv')) + DocList.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv')) def test_from_csv_with_wrong_schema_raise_exception(nested_doc): with pytest.raises(ValueError, match='Column names do not match the schema'): - DocArray[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv')) + DocList[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv')) diff --git a/tests/units/array/test_array_from_to_json.py b/tests/units/array/test_array_from_to_json.py index 52d6b2ec977..c36b8af92a9 100644 --- a/tests/units/array/test_array_from_to_json.py +++ b/tests/units/array/test_array_from_to_json.py @@ -1,4 +1,4 @@ -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -10,7 +10,7 @@ class MyDoc(BaseDoc): def test_from_to_json(): - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -19,7 +19,7 @@ def test_from_to_json(): ] ) json_da = da.to_json() - da2 = DocArray[MyDoc].from_json(json_da) + da2 = DocList[MyDoc].from_json(json_da) assert len(da2) == 2 assert len(da) == len(da2) for d1, d2 in zip(da, da2): diff --git a/tests/units/array/test_array_from_to_pandas.py b/tests/units/array/test_array_from_to_pandas.py index d01cd8a1d68..2f95f4f66aa 
100644 --- a/tests/units/array/test_array_from_to_pandas.py +++ b/tests/units/array/test_array_from_to_pandas.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc @@ -20,7 +20,7 @@ class MyDocNested(MyDoc): def test_to_from_pandas_df(nested_doc_cls): - da = DocArray[nested_doc_cls]( + da = DocList[nested_doc_cls]( [ nested_doc_cls( count=0, @@ -47,7 +47,7 @@ def test_to_from_pandas_df(nested_doc_cls): ] ).all() - da_from_df = DocArray[nested_doc_cls].from_pandas(df) + da_from_df = DocList[nested_doc_cls].from_pandas(df) for doc1, doc2 in zip(da, da_from_df): assert doc1 == doc2 @@ -76,7 +76,7 @@ def test_from_pandas_without_schema_raise_exception(): df = pd.DataFrame( columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] ) - DocArray.from_pandas(df=df) + DocList.from_pandas(df=df) def test_from_pandas_with_wrong_schema_raise_exception(nested_doc): @@ -84,4 +84,4 @@ def test_from_pandas_with_wrong_schema_raise_exception(nested_doc): df = pd.DataFrame( columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] ) - DocArray[nested_doc.__class__].from_pandas(df=df) + DocList[nested_doc.__class__].from_pandas(df=df) diff --git a/tests/units/array/test_array_proto.py b/tests/units/array/test_array_proto.py index ac0265016fc..ebdf0d9a3f9 100644 --- a/tests/units/array/test_array_proto.py +++ b/tests/units/array/test_array_proto.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.typing import NdArray @@ -12,11 +12,11 @@ class CustomDoc(BaseDoc): text: str tensor: NdArray - da = DocArray( + da = DocList( [CustomDoc(text='hello', tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) - new_da = DocArray[CustomDoc].from_protobuf(da.to_protobuf()) + new_da = 
DocList[CustomDoc].from_protobuf(da.to_protobuf()) for doc1, doc2 in zip(da, new_da): assert doc1.text == doc2.text @@ -29,7 +29,7 @@ class CustomDocument(BaseDoc): text: TextDoc image: ImageDoc - da = DocArray[CustomDocument]( + da = DocList[CustomDocument]( [ CustomDocument( text=TextDoc(text='hello'), @@ -39,7 +39,7 @@ class CustomDocument(BaseDoc): ] ) - DocArray[CustomDocument].from_protobuf(da.to_protobuf()) + DocList[CustomDocument].from_protobuf(da.to_protobuf()) @pytest.mark.proto @@ -48,7 +48,7 @@ class CustomDocument(BaseDoc): text: TextDoc image: ImageDoc - da = DocArray[CustomDocument]( + da = DocList[CustomDocument]( [ CustomDocument( text=TextDoc(text='hello'), @@ -58,4 +58,4 @@ class CustomDocument(BaseDoc): ] ) - DocArray.from_protobuf(da.to_protobuf()) + DocList.from_protobuf(da.to_protobuf()) diff --git a/tests/units/array/test_array_save_load.py b/tests/units/array/test_array_save_load.py index 795c437608d..1a632673d15 100644 --- a/tests/units/array/test_array_save_load.py +++ b/tests/units/array/test_array_save_load.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -23,7 +23,7 @@ class MyDoc(BaseDoc): def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): tmp_file = os.path.join(tmp_path, 'test') - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -36,7 +36,7 @@ def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocArray[MyDoc].load_binary( + da2 = DocList[MyDoc].load_binary( tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) @@ -59,7 +59,7 @@ def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): def 
test_array_save_load_binary_streaming(protocol, compress, tmp_path, show_progress): tmp_file = os.path.join(tmp_path, 'test') - da = DocArray[MyDoc]() + da = DocList[MyDoc]() def _extend_da(num_docs=100): for _ in range(num_docs): @@ -79,8 +79,8 @@ def _extend_da(num_docs=100): tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocArray[MyDoc]() - da_generator = DocArray[MyDoc].load_binary( + da2 = DocList[MyDoc]() + da_generator = DocList[MyDoc].load_binary( tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) diff --git a/tests/units/array/test_batching.py b/tests/units/array/test_batching.py index 389d649dbc4..88689c0f644 100644 --- a/tests/units/array/test_batching.py +++ b/tests/units/array/test_batching.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import NdArray @@ -14,7 +14,7 @@ class MyDoc(BaseDoc): tensor: NdArray t_shape = (32, 32) - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( id=i, diff --git a/tests/units/array/test_generic_array.py b/tests/units/array/test_generic_array.py index e0b5386e676..a51789ed81e 100644 --- a/tests/units/array/test_generic_array.py +++ b/tests/units/array/test_generic_array.py @@ -1,4 +1,4 @@ -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.base_doc import AnyDoc @@ -6,14 +6,14 @@ def test_generic_init(): class Text(BaseDoc): text: str - da = DocArray[Text]([]) - da.document_type == Text + da = DocList[Text]([]) + da.doc_type == Text - assert isinstance(da, DocArray) + assert isinstance(da, DocList) def test_normal_access_init(): - da = DocArray([]) - da.document_type == AnyDoc + da = DocList([]) + da.doc_type == AnyDoc - assert isinstance(da, DocArray) + assert isinstance(da, DocList) diff --git a/tests/units/array/test_indexing.py b/tests/units/array/test_indexing.py index 6aa9e363301..eb225d97ec7 100644 --- 
a/tests/units/array/test_indexing.py +++ b/tests/units/array/test_indexing.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import DocArray, DocArrayStacked +from docarray import DocList, DocVec from docarray.documents import TextDoc from docarray.typing import TorchTensor @@ -11,7 +11,7 @@ def da(): texts = [f'hello {i}' for i in range(10)] tensors = [torch.ones((4,)) * i for i in range(10)] - return DocArray[TextDoc]( + return DocList[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], ) @@ -20,7 +20,7 @@ def da(): def da_to_set(): texts = [f'hello {2*i}' for i in range(5)] tensors = [torch.ones((4,)) * i * 2 for i in range(5)] - return DocArray[TextDoc]( + return DocList[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], ) @@ -236,7 +236,7 @@ def test_boolmask_setitem(stack_left, stack_right, da, da_to_set, index): def test_setitem_update_column(): texts = [f'hello {i}' for i in range(10)] tensors = [torch.ones((4,)) * (i + 1) for i in range(10)] - da = DocArrayStacked[TextDoc]( + da = DocVec[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], tensor_type=TorchTensor, ) diff --git a/tests/units/array/test_traverse.py b/tests/units/array/test_traverse.py index b6bd25f0be8..281abe0ce0e 100644 --- a/tests/units/array/test_traverse.py +++ b/tests/units/array/test_traverse.py @@ -3,8 +3,8 @@ import pytest import torch -from docarray import BaseDoc, DocArray -from docarray.array.abstract_array import AnyDocArray +from docarray import BaseDoc, DocList +from docarray.array.any_array import AnyDocArray from docarray.documents import TextDoc from docarray.typing import TorchTensor @@ -21,21 +21,21 @@ class SubSubDoc(BaseDoc): class SubDoc(BaseDoc): sub_text: TextDoc - sub_da: DocArray[SubSubDoc] + sub_da: DocList[SubSubDoc] class MultiModalDoc(BaseDoc): mm_text: TextDoc mm_tensor: Optional[TorchTensor[3, 2, 2]] - mm_da: DocArray[SubDoc] + mm_da: 
DocList[SubDoc] - docs = DocArray[MultiModalDoc]( + docs = DocList[MultiModalDoc]( [ MultiModalDoc( mm_text=TextDoc(text=f'hello{i}'), mm_da=[ SubDoc( sub_text=TextDoc(text=f'sub_{i}_1'), - sub_da=DocArray[SubSubDoc]( + sub_da=DocList[SubSubDoc]( [ SubSubDoc( sub_sub_text=TextDoc(text='subsub'), @@ -81,7 +81,7 @@ def test_traverse_stacked_da(): class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[Image]( + batch = DocList[Image]( [ Image( tensor=torch.zeros(3, 224, 224), @@ -112,7 +112,7 @@ def test_flatten_one_level(input_list, output_list): def test_flatten_one_level_list_of_da(): doc = BaseDoc() - input_list = [DocArray([doc, doc, doc])] + input_list = [DocList([doc, doc, doc])] flattened = AnyDocArray._flatten_one_level(sequence=input_list) assert flattened == [doc, doc, doc] diff --git a/tests/units/document/proto/test_document_proto.py b/tests/units/document/proto/test_document_proto.py index 1642c17631d..cb5442f7700 100644 --- a/tests/units/document/proto/test_document_proto.py +++ b/tests/units/document/proto/test_document_proto.py @@ -4,7 +4,7 @@ import pytest import torch -from docarray import DocArray +from docarray import DocList from docarray.base_doc import BaseDoc from docarray.typing import NdArray, TorchTensor from docarray.utils._internal.misc import is_tf_available @@ -57,11 +57,11 @@ class CustomInnerDoc(BaseDoc): class CustomDoc(BaseDoc): text: str - chunks: DocArray[CustomInnerDoc] + chunks: DocList[CustomInnerDoc] doc = CustomDoc( text='hello', - chunks=DocArray[CustomInnerDoc]( + chunks=DocList[CustomInnerDoc]( [CustomInnerDoc(tensor=np.zeros((3, 224, 224))) for _ in range(5)], ), ) @@ -95,11 +95,11 @@ class CustomInnerDoc(BaseDoc): class CustomDoc(BaseDoc): text: str - chunks: DocArray[CustomInnerDoc] + chunks: DocList[CustomInnerDoc] doc = CustomDoc( text='hello', - chunks=DocArray[CustomInnerDoc]( + chunks=DocList[CustomInnerDoc]( [CustomInnerDoc(tensor=torch.zeros((3, 224, 224))) for _ in range(5)], ), ) diff --git 
a/tests/units/document/proto/test_proto_based_object.py b/tests/units/document/proto/test_proto_based_object.py index ecec88fb6e6..96708dea32b 100644 --- a/tests/units/document/proto/test_proto_based_object.py +++ b/tests/units/document/proto/test_proto_based_object.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray.proto import DocumentProto, NodeProto +from docarray.proto import DocProto, NodeProto from docarray.typing import NdArray @@ -32,4 +32,4 @@ def test_document_proto_set(): data['a'] = nested_item1 data['b'] = nested_item2 - DocumentProto(data=data) + DocProto(data=data) diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py index 690b83649ed..5e76caa0dc2 100644 --- a/tests/units/document/test_update.py +++ b/tests/units/document/test_update.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc @@ -16,8 +16,8 @@ class MMDoc(BaseDoc): price: int = 0 categories: Optional[List[str]] = None image: Optional[ImageDoc] = None - matches: Optional[DocArray] = None - matches_with_same_id: Optional[DocArray] = None + matches: Optional[DocList] = None + matches_with_same_id: Optional[DocList] = None opt_int: Optional[int] = None test_set: Optional[Set] = None inner_doc: Optional[InnerDoc] = None @@ -30,9 +30,9 @@ def doc1(): text='hey here', categories=['a', 'b', 'c'], price=10, - matches=DocArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocArray[MMDoc]( - [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] ), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, inner_list=['c', 'd']), @@ -48,9 +48,9 @@ def doc2(doc1): categories=['d', 'e', 'f'], price=5, opt_int=5, - matches=DocArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocArray[MMDoc]( - [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] + 
matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] ), test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, inner_list=['a', 'b']), diff --git a/tests/units/document/test_view.py b/tests/units/document/test_view.py index a544289f7ec..ad9a56027c3 100644 --- a/tests/units/document/test_view.py +++ b/tests/units/document/test_view.py @@ -1,8 +1,8 @@ import numpy as np from docarray import BaseDoc -from docarray.array import DocArrayStacked -from docarray.array.stacked.column_storage import ColumnStorageView +from docarray.array import DocVec +from docarray.array.doc_vec.column_storage import ColumnStorageView from docarray.typing import AnyTensor @@ -13,7 +13,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=i) for i in range(4)] - storage = DocArrayStacked[MyDoc](docs)._storage + storage = DocVec[MyDoc](docs)._storage doc = MyDoc.from_view(ColumnStorageView(0, storage)) assert doc.is_view() diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py index 652400d2905..bb7e51b25fc 100644 --- a/tests/units/test_helper.py +++ b/tests/units/test_helper.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.helper import ( _access_path_dict_to_nested_dict, @@ -26,12 +26,12 @@ class Middle(BaseDoc): class Outer(BaseDoc): img: Optional[ImageDoc] middle: Optional[Middle] - da: DocArray[Inner] + da: DocList[Inner] doc = Outer( img=ImageDoc(), middle=Middle(img=ImageDoc(), inner=Inner(img=ImageDoc())), - da=DocArray[Inner]([Inner(img=ImageDoc(url='test.png'))]), + da=DocList[Inner]([Inner(img=ImageDoc(url='test.png'))]), ) return doc diff --git a/tests/units/typing/da/test_relations.py b/tests/units/typing/da/test_relations.py index fcdf1177657..b00e965c8e7 100644 --- a/tests/units/typing/da/test_relations.py +++ 
b/tests/units/typing/da/test_relations.py @@ -1,33 +1,33 @@ -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList def test_instance_and_equivalence(): class MyDoc(BaseDoc): text: str - docs = DocArray[MyDoc]([MyDoc(text='hello')]) + docs = DocList[MyDoc]([MyDoc(text='hello')]) - assert issubclass(DocArray[MyDoc], DocArray[MyDoc]) - assert issubclass(docs.__class__, DocArray[MyDoc]) + assert issubclass(DocList[MyDoc], DocList[MyDoc]) + assert issubclass(docs.__class__, DocList[MyDoc]) - assert isinstance(docs, DocArray[MyDoc]) + assert isinstance(docs, DocList[MyDoc]) def test_subclassing(): class MyDoc(BaseDoc): text: str - class MyDocArray(DocArray[MyDoc]): + class MyDocList(DocList[MyDoc]): pass - docs = MyDocArray([MyDoc(text='hello')]) + docs = MyDocList([MyDoc(text='hello')]) - assert issubclass(MyDocArray, DocArray[MyDoc]) - assert issubclass(docs.__class__, DocArray[MyDoc]) + assert issubclass(MyDocList, DocList[MyDoc]) + assert issubclass(docs.__class__, DocList[MyDoc]) - assert isinstance(docs, MyDocArray) - assert isinstance(docs, DocArray[MyDoc]) + assert isinstance(docs, MyDocList) + assert isinstance(docs, DocList[MyDoc]) assert issubclass(MyDoc, BaseDoc) - assert not issubclass(DocArray[MyDoc], DocArray[BaseDoc]) - assert not issubclass(MyDocArray, DocArray[BaseDoc]) + assert not issubclass(DocList[MyDoc], DocList[BaseDoc]) + assert not issubclass(MyDocList, DocList[BaseDoc]) diff --git a/tests/units/util/test_filter.py b/tests/units/util/test_filter.py index 14e43290e9a..21c427a7bbf 100644 --- a/tests/units/util/test_filter.py +++ b/tests/units/util/test_filter.py @@ -3,7 +3,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.utils.filter import filter_docs @@ -45,7 +45,7 @@ def docs(): optional_num=30, dictionary={'a': 0, 'b': 1}, ) - docs = DocArray[MMDoc]([mmdoc1, mmdoc2, mmdoc3]) + docs = DocList[MMDoc]([mmdoc1, 
mmdoc2, mmdoc3]) return docs @@ -173,7 +173,7 @@ def test_array_simple_filters(docs, dict_api): @pytest.mark.parametrize('dict_api', [True, False]) def test_placehold_filter(dict_api): - docs = DocArray[MMDoc]( + docs = DocList[MMDoc]( [ MMDoc(text='A', text_doc=TextDoc(text='A')), MMDoc(text='A', text_doc=TextDoc(text='B')), @@ -251,7 +251,7 @@ class MyDocument(BaseDoc): image: ImageDoc price: int - docs = DocArray[MyDocument]( + docs = DocList[MyDocument]( [ MyDocument( caption='A tiger in the jungle', diff --git a/tests/units/util/test_find.py b/tests/units/util/test_find.py index 9239e6d8dff..90b3c7005d8 100644 --- a/tests/units/util/test_find.py +++ b/tests/units/util/test_find.py @@ -4,7 +4,7 @@ import pytest import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import NdArray, TorchTensor from docarray.utils.find import find, find_batched @@ -24,7 +24,7 @@ def random_torch_query(): @pytest.fixture() def random_torch_batch_query(): - return DocArray[TorchDoc]([TorchDoc(tensor=torch.rand(128)) for _ in range(5)]) + return DocList[TorchDoc]([TorchDoc(tensor=torch.rand(128)) for _ in range(5)]) @pytest.fixture() @@ -34,17 +34,17 @@ def random_nd_query(): @pytest.fixture() def random_nd_batch_query(): - return DocArray[NdDoc]([NdDoc(tensor=np.random.rand(128)) for _ in range(5)]) + return DocList[NdDoc]([NdDoc(tensor=np.random.rand(128)) for _ in range(5)]) @pytest.fixture() def random_torch_index(): - return DocArray[TorchDoc](TorchDoc(tensor=torch.rand(128)) for _ in range(10)) + return DocList[TorchDoc](TorchDoc(tensor=torch.rand(128)) for _ in range(10)) @pytest.fixture() def random_nd_index(): - return DocArray[NdDoc](NdDoc(tensor=np.random.rand(128)) for _ in range(10)) + return DocList[NdDoc](NdDoc(tensor=np.random.rand(128)) for _ in range(10)) @pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) @@ -261,7 +261,7 @@ class MyDoc(BaseDoc): embedding: 
Optional[TorchTensor] query = MyDoc(embedding=torch.rand(10)) - index = DocArray[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) + index = DocList[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) top_k, scores = find( index, @@ -279,7 +279,7 @@ class MyDoc(BaseDoc): embedding: Union[TorchTensor, NdArray] query = MyDoc(embedding=torch.rand(10)) - index = DocArray[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) + index = DocList[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) top_k, scores = find( index, @@ -302,7 +302,7 @@ class MyDoc(BaseDoc): inner: InnerDoc query = MyDoc(inner=InnerDoc(title='query', embedding=torch.rand(2))) - index = DocArray[MyDoc]( + index = DocList[MyDoc]( [ MyDoc(inner=InnerDoc(title=f'doc {i}', embedding=torch.rand(2))) for i in range(10) @@ -335,7 +335,7 @@ class MyDoc(BaseDoc): embedding3=torch.rand(10), embedding4=torch.rand(10), ) - index = DocArray[MyDoc]( + index = DocList[MyDoc]( [ MyDoc( embedding=torch.rand(10), diff --git a/tests/units/util/test_map.py b/tests/units/util/test_map.py index c36ebc2f46e..68efdfbbd7e 100644 --- a/tests/units/util/test_map.py +++ b/tests/units/util/test_map.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import ImageUrl, NdArray from docarray.utils.map import map_docs, map_docs_batched @@ -19,7 +19,7 @@ def load_from_doc(d: ImageDoc) -> ImageDoc: @pytest.fixture() def da(): - da = DocArray[ImageDoc]([ImageDoc(url=IMAGE_PATHS['png']) for _ in range(N_DOCS)]) + da = DocList[ImageDoc]([ImageDoc(url=IMAGE_PATHS['png']) for _ in range(N_DOCS)]) return da @@ -28,7 +28,7 @@ def test_map(da, backend): for tensor in da.tensor: assert tensor is None - docs = list(map_docs(da=da, func=load_from_doc, backend=backend)) + docs = list(map_docs(docs=da, func=load_from_doc, backend=backend)) assert len(docs) == N_DOCS for doc in docs: @@ -37,7 
+37,7 @@ def test_map(da, backend): def test_map_multiprocessing_lambda_func_raise_exception(da): with pytest.raises(ValueError, match='Multiprocessing does not allow'): - list(map_docs(da=da, func=lambda x: x, backend='process')) + list(map_docs(docs=da, func=lambda x: x, backend='process')) def test_map_multiprocessing_local_func_raise_exception(da): @@ -45,21 +45,21 @@ def local_func(x): return x with pytest.raises(ValueError, match='Multiprocessing does not allow'): - list(map_docs(da=da, func=local_func, backend='process')) + list(map_docs(docs=da, func=local_func, backend='process')) @pytest.mark.parametrize('backend', ['thread', 'process']) def test_check_order(backend): - da = DocArray[ImageDoc]([ImageDoc(id=i) for i in range(N_DOCS)]) + da = DocList[ImageDoc]([ImageDoc(id=i) for i in range(N_DOCS)]) - docs = list(map_docs(da=da, func=load_from_doc, backend=backend)) + docs = list(map_docs(docs=da, func=load_from_doc, backend=backend)) assert len(docs) == N_DOCS for i, doc in enumerate(docs): assert doc.id == str(i) -def load_from_da(da: DocArray) -> DocArray: +def load_from_da(da: DocList) -> DocList: for doc in da: doc.tensor = doc.url.load() return da @@ -75,11 +75,11 @@ class MyImage(BaseDoc): @pytest.mark.parametrize('backend', ['thread', 'process']) def test_map_docs_batched(n_docs, batch_size, backend): - da = DocArray[MyImage]([MyImage(url=IMAGE_PATHS['png']) for _ in range(n_docs)]) + da = DocList[MyImage]([MyImage(url=IMAGE_PATHS['png']) for _ in range(n_docs)]) it = map_docs_batched( - da=da, func=load_from_da, batch_size=batch_size, backend=backend + docs=da, func=load_from_da, batch_size=batch_size, backend=backend ) assert isinstance(it, Generator) for batch in it: - assert isinstance(batch, DocArray[MyImage]) + assert isinstance(batch, DocList[MyImage]) diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index e72e8863a46..e07af67b0ec 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py 
@@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.utils.reduce import reduce, reduce_all @@ -17,8 +17,8 @@ class MMDoc(BaseDoc): price: int = 0 categories: Optional[List[str]] = None image: Optional[ImageDoc] = None - matches: Optional[DocArray] = None - matches_with_same_id: Optional[DocArray] = None + matches: Optional[DocList] = None + matches_with_same_id: Optional[DocList] = None opt_int: Optional[int] = None test_set: Optional[Set] = None inner_doc: Optional[InnerDoc] = None @@ -31,9 +31,9 @@ def doc1(): text='hey here', categories=['a', 'b', 'c'], price=10, - matches=DocArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocArray[MMDoc]( - [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] ), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, inner_list=['c', 'd']), @@ -49,9 +49,9 @@ def doc2(doc1): categories=['d', 'e', 'f'], price=5, opt_int=5, - matches=DocArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocArray[MMDoc]( - [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] ), test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, inner_list=['a', 'b']), @@ -60,8 +60,8 @@ def doc2(doc1): def test_reduce_different_ids(): - da1 = DocArray[MMDoc]([MMDoc() for _ in range(10)]) - da2 = DocArray[MMDoc]([MMDoc() for _ in range(10)]) + da1 = DocList[MMDoc]([MMDoc() for _ in range(10)]) + da2 = DocList[MMDoc]([MMDoc() for _ in range(10)]) result = reduce(da1, da2) assert len(result) == 20 # da1 is changed in place (no extra memory) @@ -69,8 +69,8 @@ def test_reduce_different_ids(): def test_reduce(doc1, doc2): - da1 = DocArray[MMDoc]([doc1, MMDoc()]) - da2 = DocArray[MMDoc]([MMDoc(), doc2]) + da1 = 
DocList[MMDoc]([doc1, MMDoc()]) + da2 = DocList[MMDoc]([MMDoc(), doc2]) result = reduce(da1, da2) assert len(result) == 3 # da1 is changed in place (no extra memory) @@ -89,9 +89,9 @@ def test_reduce(doc1, doc2): def test_reduce_all(doc1, doc2): - da1 = DocArray[MMDoc]([doc1, MMDoc()]) - da2 = DocArray[MMDoc]([MMDoc(), doc2]) - da3 = DocArray[MMDoc]([MMDoc(), MMDoc(), doc1]) + da1 = DocList[MMDoc]([doc1, MMDoc()]) + da2 = DocList[MMDoc]([MMDoc(), doc2]) + da3 = DocList[MMDoc]([MMDoc(), MMDoc(), doc1]) result = reduce_all([da1, da2, da3]) assert len(result) == 5 # da1 is changed in place (no extra memory)