From f2c407f835e949c845a6ddbfd6b36400d7651b61 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 15:09:01 +0200 Subject: [PATCH 01/27] refactor: rename DocArray to DocList Signed-off-by: samsja --- README.md | 29 +++++----- docarray/__init__.py | 4 +- docarray/array/__init__.py | 4 +- docarray/array/abstract_array.py | 20 +++---- docarray/array/array/array.py | 8 +-- docarray/array/array/io.py | 14 ++--- docarray/array/array/pushpull.py | 4 +- docarray/array/stacked/array_stacked.py | 28 +++++----- docarray/base_doc/mixins/update.py | 6 +- docarray/data/torch_dataset.py | 12 ++-- docarray/display/document_summary.py | 14 ++--- docarray/documents/legacy/legacy_document.py | 6 +- docarray/helper.py | 8 +-- docarray/index/abstract.py | 36 ++++++------ docarray/index/backends/hnswlib.py | 20 +++---- docarray/store/abstract_doc_store.py | 10 ++-- docarray/store/file.py | 10 ++-- docarray/store/jac.py | 20 +++---- docarray/store/s3.py | 11 ++-- docarray/utils/filter.py | 4 +- docarray/utils/find.py | 16 +++--- docarray/utils/reduce.py | 8 +-- .../how_to/multimodal_training_and_serving.md | 6 +- tests/benchmark_tests/test_map.py | 14 ++--- .../index/base_classes/test_base_doc_store.py | 50 ++++++++--------- tests/index/hnswlib/test_index_get_del.py | 12 ++-- tests/integrations/array/test_torch_train.py | 4 +- .../elastic/v7/test_index_get_del.py | 10 ++-- tests/integrations/document/test_document.py | 8 +-- tests/integrations/document/test_proto.py | 10 ++-- tests/integrations/externals/test_fastapi.py | 8 +-- tests/integrations/store/__init__.py | 4 +- tests/integrations/store/test_file.py | 40 +++++++------- tests/integrations/store/test_jac.py | 38 ++++++------- tests/integrations/store/test_s3.py | 34 ++++++------ .../torch/data/test_torch_dataset.py | 18 +++--- tests/units/array/stack/test_array_stacked.py | 46 ++++++++-------- .../array/stack/test_array_stacked_tf.py | 16 +++--- tests/units/array/stack/test_proto.py | 8 +-- tests/units/array/test_array.py | 
55 ++++++++++--------- tests/units/array/test_array_from_to_bytes.py | 10 ++-- tests/units/array/test_array_from_to_csv.py | 12 ++-- tests/units/array/test_array_from_to_json.py | 6 +- .../units/array/test_array_from_to_pandas.py | 10 ++-- tests/units/array/test_array_proto.py | 14 ++--- tests/units/array/test_array_save_load.py | 12 ++-- tests/units/array/test_batching.py | 4 +- tests/units/array/test_generic_array.py | 10 ++-- tests/units/array/test_indexing.py | 6 +- tests/units/array/test_traverse.py | 14 ++--- .../document/proto/test_document_proto.py | 10 ++-- tests/units/document/test_update.py | 18 +++--- tests/units/test_helper.py | 6 +- tests/units/typing/da/test_relations.py | 26 ++++----- tests/units/util/test_filter.py | 8 +-- tests/units/util/test_find.py | 18 +++--- tests/units/util/test_map.py | 12 ++-- tests/units/util/test_reduce.py | 32 +++++------ 58 files changed, 446 insertions(+), 455 deletions(-) diff --git a/README.md b/README.md index 8d4b45ae264..90b5db0997a 100644 --- a/README.md +++ b/README.md @@ -77,9 +77,10 @@ doc = MultiModalDocument( ) ``` -### Collect multiple `Documents` into a `DocArray`: +### Collect multiple `Documents` into a `DocList`: + ```python -from docarray import DocArray, BaseDoc +from docarray import DocList, BaseDoc from docarray.typing import AnyTensor, ImageUrl import numpy as np @@ -90,9 +91,9 @@ class Image(BaseDoc): ``` ```python -from docarray import DocArray +from docarray import DocList -da = DocArray[Image]( +da = DocList[Image]( [ Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", @@ -150,16 +151,16 @@ Image.from_protobuf(doc.to_protobuf()) ```python # NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocArray +from docarray import DocList from docarray.documents import ImageDoc from docarray.stores import DocumentStore import numpy as np -da = DocArray([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) +da = 
DocList([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) store = DocumentStore[ImageDoc]( storage='qdrant' ) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocArray into the DocumentStore +store.insert(da) # insert the DocList into the DocumentStore # find the 10 most similar images based on the 'embedding' field match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10) ``` @@ -233,7 +234,7 @@ Not very easy on the eyes if you ask us. And even worse, if you need to add one So, now let's see what the same code looks like with DocArray: ```python -from docarray import DocArray, BaseDoc +from docarray import DocList, BaseDoc from docarray.documents import ImageDoc, TextDoc, AudioDoc from docarray.typing import TorchTensor @@ -258,14 +259,14 @@ class MyPodcastModel(nn.Module): self.image_encoder = ImageEncoder() self.text_encoder = TextEncoder() - def forward_podcast(self, da: DocArray[Podcast]) -> DocArray[Podcast]: + def forward_podcast(self, da: DocList[Podcast]) -> DocList[Podcast]: da.audio.embedding = self.audio_encoder(da.audio.tensor) da.text.embedding = self.text_encoder(da.text.tensor) da.image.embedding = self.image_encoder(da.image.tensor) return da - def forward(self, da: DocArray[PairPodcast]) -> DocArray[PairPodcast]: + def forward(self, da: DocList[PairPodcast]) -> DocList[PairPodcast]: da.left = self.forward_podcast(da.left) da.right = self.forward_podcast(da.right) @@ -297,7 +298,7 @@ This would look like the following: ```python from typing import Optional -from docarray import DocArray, BaseDoc +from docarray import DocList, BaseDoc import tensorflow as tf @@ -312,7 +313,7 @@ class MyPodcastModel(tf.keras.Model): super().__init__() self.audio_encoder = AudioEncoder() - def call(self, inputs: DocArray[Podcast]) -> DocArray[Podcast]: + def call(self, inputs: DocList[Podcast]) -> DocList[Podcast]: inputs.audio_tensor.embedding = self.audio_encoder( inputs.audio_tensor.tensor ) # 
access audio_tensor's .tensor attribute @@ -407,7 +408,7 @@ store it there, and thus make it searchable: ```python # NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocArray, BaseDoc +from docarray import DocList, BaseDoc from docarray.stores import DocumentStore from docarray.documents import ImageDoc, TextDoc import numpy as np @@ -427,7 +428,7 @@ def _random_my_doc(): ) -da = DocArray([_random_my_doc() for _ in range(1000)]) # create some data +da = DocList([_random_my_doc() for _ in range(1000)]) # create some data store = DocumentStore[MyDoc]( storage='qdrant' ) # create a DocumentStore with Qdrant as backend diff --git a/docarray/__init__.py b/docarray/__init__.py index d8b6bae90a5..0189cab7250 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -2,10 +2,10 @@ import logging -from docarray.array import DocArray, DocArrayStacked +from docarray.array import DocArrayStacked, DocList from docarray.base_doc.doc import BaseDoc -__all__ = ['BaseDoc', 'DocArray', 'DocArrayStacked'] +__all__ = ['BaseDoc', 'DocList', 'DocArrayStacked'] logger = logging.getLogger('docarray') diff --git a/docarray/array/__init__.py b/docarray/array/__init__.py index 9c0176426e2..fd0544f0c91 100644 --- a/docarray/array/__init__.py +++ b/docarray/array/__init__.py @@ -1,4 +1,4 @@ -from docarray.array.array.array import DocArray +from docarray.array.array.array import DocList from docarray.array.stacked.array_stacked import DocArrayStacked -__all__ = ['DocArray', 'DocArrayStacked'] +__all__ = ['DocList', 'DocArrayStacked'] diff --git a/docarray/array/abstract_array.py b/docarray/array/abstract_array.py index f3a94b5b6fa..8b33d95f3c3 100644 --- a/docarray/array/abstract_array.py +++ b/docarray/array/abstract_array.py @@ -121,10 +121,10 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this DocArray using the passed values + """Set all Documents in this DocList using the passed values :param 
field: name of the fields to extract - :values: the values to set at the DocArray level + :values: the values to set at the DocList level """ ... @@ -136,12 +136,12 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: @abstractmethod def to_protobuf(self) -> 'DocumentArrayProto': - """Convert DocArray into a Protobuf message""" + """Convert DocList into a Protobuf message""" ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a DocArray into a NodeProto protobuf message. - This function should be called when a DocArray + """Convert a DocList into a NodeProto protobuf message. + This function should be called when a DocList is nested into another Document that need to be converted into a protobuf :return: the nested item protobuf message @@ -157,7 +157,7 @@ def traverse_flat( ) -> Union[List[Any], 'AbstractTensor']: """ Return a List of the accessed objects when applying the `access_path`. If this - results in a nested list or list of DocArrays, the list will be flattened + results in a nested list or list of DocLists, the list will be flattened on the first level. The access path is a string that consists of attribute names, concatenated and "__"-separated. It describes the path from the first level to an arbitrary one, e.g. 'content__image__url'. 
@@ -243,9 +243,9 @@ def _traverse(node: Any, access_path: str): if access_path: curr_attr, _, path_attrs = access_path.partition('__') - from docarray.array import DocArray + from docarray.array import DocList - if isinstance(node, (DocArray, list)): + if isinstance(node, (DocList, list)): for n in node: x = getattr(n, curr_attr) yield from AnyDocArray._traverse(x, path_attrs) @@ -257,9 +257,9 @@ def _traverse(node: Any, access_path: str): @staticmethod def _flatten_one_level(sequence: List[Any]) -> List[Any]: - from docarray import DocArray + from docarray import DocList - if len(sequence) == 0 or not isinstance(sequence[0], (list, DocArray)): + if len(sequence) == 0 or not isinstance(sequence[0], (list, DocList)): return sequence else: return [item for sublist in sequence for item in sublist] diff --git a/docarray/array/array/array.py b/docarray/array/array/array.py index e3f56e74fda..ba1acfab013 100644 --- a/docarray/array/array/array.py +++ b/docarray/array/array/array.py @@ -36,7 +36,7 @@ from docarray.typing import TorchTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor -T = TypeVar('T', bound='DocArray') +T = TypeVar('T', bound='DocList') T_doc = TypeVar('T_doc', bound=BaseDoc) @@ -57,7 +57,7 @@ def _delegate_meth(self, *args, **kwargs): return _delegate_meth -class DocArray( +class DocList( IndexingSequenceMixin[T_doc], PushPullMixin, IOMixinArray, AnyDocArray[T_doc] ): """ @@ -229,7 +229,7 @@ def _get_data_column( # calling __class_getitem__ ourselves is a hack otherwise mypy complain # most likely a bug in mypy though # bug reported here https://github.com/python/mypy/issues/14111 - return DocArray.__class_getitem__(field_type)( + return DocList.__class_getitem__(field_type)( (getattr(doc, field) for doc in self), ) else: @@ -284,7 +284,7 @@ def validate( raise TypeError(f'Expecting an Iterable of {cls.document_type}') def traverse_flat( - self: 'DocArray', + self: 'DocList', access_path: str, ) -> List[Any]: nodes = 
list(AnyDocArray._traverse(node=self, access_path=access_path)) diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index 02b250fad4e..91bb169b3dc 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -39,7 +39,7 @@ if TYPE_CHECKING: import pandas as pd - from docarray import DocArray + from docarray import DocList from docarray.proto import DocumentArrayProto T = TypeVar('T', bound='IOMixinArray') @@ -343,7 +343,7 @@ def from_csv( file_path: str, encoding: str = 'utf-8', dialect: Union[str, csv.Dialect] = 'excel', - ) -> 'DocArray': + ) -> 'DocList': """ Load a DocArray from a csv file following the schema defined in the :attr:`~docarray.DocArray.document_type` attribute. @@ -363,7 +363,7 @@ def from_csv( 'unix' (for csv file generated on UNIX systems). :return: DocArray """ - from docarray import DocArray + from docarray import DocList if cls.document_type == AnyDoc: raise TypeError( @@ -372,7 +372,7 @@ def from_csv( ) doc_type = cls.document_type - da = DocArray.__class_getitem__(doc_type)() + da = DocList.__class_getitem__(doc_type)() with open(file_path, 'r', encoding=encoding) as fp: rows = csv.DictReader(fp, dialect=dialect) @@ -428,7 +428,7 @@ def to_csv( writer.writerow(doc_dict) @classmethod - def from_pandas(cls, df: 'pd.DataFrame') -> 'DocArray': + def from_pandas(cls, df: 'pd.DataFrame') -> 'DocList': """ Load a DocArray from a `pandas.DataFrame` following the schema defined in the :attr:`~docarray.DocArray.document_type` attribute. @@ -468,7 +468,7 @@ class Person(BaseDoc): :return: DocArray where each Document contains the information of one corresponding row of the `pandas.DataFrame`. 
""" - from docarray import DocArray + from docarray import DocList if cls.document_type == AnyDoc: raise TypeError( @@ -477,7 +477,7 @@ class Person(BaseDoc): ) doc_type = cls.document_type - da = DocArray.__class_getitem__(doc_type)() + da = DocList.__class_getitem__(doc_type)() field_names = df.columns.tolist() if field_names is None or len(field_names) == 0: diff --git a/docarray/array/array/pushpull.py b/docarray/array/array/pushpull.py index ee306620f4d..2d85af8e711 100644 --- a/docarray/array/array/pushpull.py +++ b/docarray/array/array/pushpull.py @@ -19,7 +19,7 @@ SUPPORTED_PUSH_PULL_PROTOCOLS = get_args(PUSH_PULL_PROTOCOL) if TYPE_CHECKING: # pragma: no cover - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList from docarray.store.abstract_doc_store import AbstractDocStore @@ -129,7 +129,7 @@ def pull( url: str, show_progress: bool = False, local_cache: bool = True, - ) -> 'DocArray': + ) -> 'DocList': """Pull a :class:`DocArray` from the specified url. :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. 
``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` diff --git a/docarray/array/stacked/array_stacked.py b/docarray/array/stacked/array_stacked.py index bc6ff8965a5..5e634f36b00 100644 --- a/docarray/array/stacked/array_stacked.py +++ b/docarray/array/stacked/array_stacked.py @@ -19,7 +19,7 @@ from pydantic import BaseConfig, parse_obj_as from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocArray +from docarray.array.array.array import DocList from docarray.array.stacked.column_storage import ColumnStorage, ColumnStorageView from docarray.array.stacked.list_advance_indexing import ListAdvancedIndexing from docarray.base_doc import BaseDoc @@ -105,8 +105,8 @@ def __init__( raise ValueError(f'docs {docs}: should not be empty') docs = ( docs - if isinstance(docs, DocArray) - else DocArray.__class_getitem__(self.document_type)(docs) + if isinstance(docs, DocList) + else DocList.__class_getitem__(self.document_type)(docs) ) for field_name, field in self.document_type.__fields__.items(): @@ -167,7 +167,7 @@ def __init__( docs_list = list() for doc in docs: da = getattr(doc, field_name) - if isinstance(da, DocArray): + if isinstance(da, DocList): da = da.stack(tensor_type=self.tensor_type) docs_list.append(da) da_columns[field_name] = ListAdvancedIndexing(docs_list) @@ -209,7 +209,7 @@ def validate( ) -> T: if isinstance(value, cls): return value - elif isinstance(value, DocArray.__class_getitem__(cls.document_type)): + elif isinstance(value, DocList.__class_getitem__(cls.document_type)): return cast(T, value.stack()) elif isinstance(value, Sequence): return cls(value) @@ -305,7 +305,7 @@ def __setitem__(self: T, key, value): def _set_data_and_columns( self: T, index_item: Union[Tuple, Iterable, slice], - value: Union[T, DocArray[T_doc]], + value: Union[T, DocList[T_doc]], ) -> None: """Delegates the setting to the data and the columns. 
@@ -318,7 +318,7 @@ def _set_data_and_columns( # set data and prepare columns processed_value: T - if isinstance(value, DocArray): + if isinstance(value, DocList): if not issubclass(value.document_type, self.document_type): raise TypeError( f'{value} schema : {value.document_type} is not compatible with ' @@ -345,10 +345,10 @@ def _set_data_column( self: T, field: str, values: Union[ - Sequence[DocArray[T_doc]], + Sequence[DocList[T_doc]], Sequence[Any], T, - DocArray, + DocList, AbstractTensor, ], ) -> None: @@ -384,7 +384,7 @@ def _set_data_column( self._storage.doc_columns[field] = values_ elif field in self._storage.da_columns.keys(): - values_ = cast(Sequence[DocArray[T_doc]], values) + values_ = cast(Sequence[DocList[T_doc]], values) # TODO here we should actually check if this is correct self._storage.da_columns[field] = values_ elif field in self._storage.any_columns.keys(): @@ -474,14 +474,14 @@ def to_protobuf(self) -> 'DocArrayStackedProto': any_columns=any_columns_proto, ) - def unstack(self: T) -> DocArray[T_doc]: + def unstack(self: T) -> DocList[T_doc]: """Convert DocArrayStacked into a DocArray. 
Note this destroys the arguments and returns a new DocArray """ - unstacked_doc_column: Dict[str, DocArray] = dict() - unstacked_da_column: Dict[str, List[DocArray]] = dict() + unstacked_doc_column: Dict[str, DocList] = dict() + unstacked_da_column: Dict[str, List[DocList]] = dict() unstacked_tensor_column: Dict[str, List[AbstractTensor]] = dict() unstacked_any_column = self._storage.any_columns @@ -515,7 +515,7 @@ def unstack(self: T) -> DocArray[T_doc]: del self._storage - return DocArray.__class_getitem__(self.document_type).construct(docs) + return DocList.__class_getitem__(self.document_type).construct(docs) def traverse_flat( self, diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index 1fe37015c90..99fdbc2bf8e 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -74,7 +74,7 @@ class MyDocument(BaseDoc): ) from collections import namedtuple - from docarray import DocArray + from docarray import DocList from docarray.utils.reduce import reduce # Declaring namedtuple() @@ -104,9 +104,7 @@ def _group_fields(doc: 'UpdateMixin') -> _FieldGroups: if field_name not in FORBIDDEN_FIELDS_TO_UPDATE: field_type = doc._get_field_type(field_name) - if isinstance(field_type, type) and issubclass( - field_type, DocArray - ): + if isinstance(field_type, type) and issubclass(field_type, DocList): nested_docarray_fields.append(field_name) else: origin = get_origin(field_type) diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index dd58035cd33..59f4843b899 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -2,7 +2,7 @@ from torch.utils.data import Dataset -from docarray import BaseDoc, DocArray, DocArrayStacked +from docarray import BaseDoc, DocArrayStacked, DocList from docarray.typing import TorchTensor from docarray.utils._internal._typing import change_cls_name @@ -14,7 +14,7 @@ class MultiModalDataset(Dataset, Generic[T_doc]): A dataset that 
can be used inside a PyTorch DataLoader. In other words, it implements the PyTorch Dataset interface. - :param da: the DocArray to be used as the dataset + :param da: the DocList to be used as the dataset :param preprocessing: a dictionary of field names and preprocessing functions The preprocessing dictionary passed to the constructor consists of keys that are @@ -24,7 +24,7 @@ class MultiModalDataset(Dataset, Generic[T_doc]): EXAMPLE USAGE .. code-block:: python from torch.utils.data import DataLoader - from docarray import DocArray + from docarray import DocList from docarray.data import MultiModalDataset from docarray.documents import Text @@ -33,7 +33,7 @@ def prepend_number(text: str): return f"Number {text}" - da = DocArray[Text](Text(text=str(i)) for i in range(16)) + da = DocList[Text](Text(text=str(i)) for i in range(16)) ds = MultiModalDataset[Text](da, preprocessing={'text': prepend_number}) loader = DataLoader(ds, batch_size=4, collate_fn=MultiModalDataset[Text].collate_fn) for batch in loader: @@ -51,7 +51,7 @@ def prepend_number(text: str): .. 
code-block:: python import torch from torch.utils.data import DataLoader - from docarray import DocArray, BaseDoc + from docarray import DocList, BaseDoc from docarray.data import MultiModalDataset from docarray.documents import Text @@ -96,7 +96,7 @@ def add_nonsense(student: Student): __typed_ds__: Dict[Type[BaseDoc], Type['MultiModalDataset']] = {} def __init__( - self, da: 'DocArray[T_doc]', preprocessing: Dict[str, Callable] + self, da: 'DocList[T_doc]', preprocessing: Dict[str, Callable] ) -> None: self.da = da self._preprocessing = preprocessing diff --git a/docarray/display/document_summary.py b/docarray/display/document_summary.py index a7fe5009e9b..23482903763 100644 --- a/docarray/display/document_summary.py +++ b/docarray/display/document_summary.py @@ -55,7 +55,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: from rich.tree import Tree - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList root = cls.__name__ if doc_name is None else f'{doc_name}: {cls.__name__}' tree = Tree(root, highlight=True) @@ -76,7 +76,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: for arg in field_type.__args__: if issubclass(arg, BaseDoc): sub_tree.add(DocumentSummary._get_schema(cls=arg)) - elif issubclass(arg, DocArray): + elif issubclass(arg, DocList): sub_tree.add( DocumentSummary._get_schema(cls=arg.document_type) ) @@ -87,7 +87,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: DocumentSummary._get_schema(cls=field_type, doc_name=field_name) ) - elif issubclass(field_type, DocArray): + elif issubclass(field_type, DocList): sub_tree = Tree(node_name, highlight=True) sub_tree.add( DocumentSummary._get_schema(cls=field_type.document_type) @@ -112,7 +112,7 @@ def __rich_console__( from rich import box, text from rich.table import Table - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList table = Table( 'Attribute', @@ 
-125,7 +125,7 @@ def __rich_console__( for field_name, value in self.doc.__dict__.items(): col_1 = f'{field_name}: {value.__class__.__name__}' if ( - isinstance(value, (ID, DocArray, BaseDoc)) + isinstance(value, (ID, DocList, BaseDoc)) or field_name.startswith('_') or value is None ): @@ -177,7 +177,7 @@ def _plot_recursion( :return: Tree with all children. """ - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList tree = Tree(node) if tree is None else tree.add(node) # type: ignore @@ -185,7 +185,7 @@ def _plot_recursion( nested_attrs = [ k for k, v in node.doc.__dict__.items() - if isinstance(v, (DocArray, BaseDoc)) + if isinstance(v, (DocList, BaseDoc)) ] for attr in nested_attrs: value = getattr(node.doc, attr) diff --git a/docarray/documents/legacy/legacy_document.py b/docarray/documents/legacy/legacy_document.py index e550a97c800..0c16d512d7a 100644 --- a/docarray/documents/legacy/legacy_document.py +++ b/docarray/documents/legacy/legacy_document.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import AnyEmbedding, AnyTensor @@ -34,8 +34,8 @@ class LegacyDocument(BaseDoc): """ tensor: Optional[AnyTensor] - chunks: Optional[DocArray[LegacyDocument]] - matches: Optional[DocArray[LegacyDocument]] + chunks: Optional[DocList[LegacyDocument]] + matches: Optional[DocList[LegacyDocument]] blob: Optional[bytes] text: Optional[str] url: Optional[str] diff --git a/docarray/helper.py b/docarray/helper.py index 3cf74379e8d..d9c1d779809 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -135,7 +135,7 @@ def _get_field_type_by_access_path( :param access_path: "__"-separated access path :return: field type of accessed attribute. If access path is invalid, return None. 
""" - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList field, _, remaining = access_path.partition('__') field_valid = field in doc_type.__fields__.keys() @@ -145,7 +145,7 @@ def _get_field_type_by_access_path( return doc_type._get_field_type(field) else: d = doc_type._get_field_type(field) - if issubclass(d, DocArray): + if issubclass(d, DocList): return _get_field_type_by_access_path(d.document_type, remaining) elif issubclass(d, BaseDoc): return _get_field_type_by_access_path(d, remaining) @@ -180,7 +180,7 @@ def get_paths( .. code-block:: python from typing import Optional - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList from docarray.helper import get_paths from docarray.typing import TextUrl, ImageUrl @@ -191,7 +191,7 @@ class Banner(BaseDoc): # you can call it in the constructor - da = DocArray[Banner]([Banner(text_url=url) for url in get_paths(patterns='*.txt')]) + da = DocList[Banner]([Banner(text_url=url) for url in get_paths(patterns='*.txt')]) # and call it after construction to set the urls da.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test')) diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 11c130086b4..1a4e9571ce5 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -24,7 +24,7 @@ from pydantic.error_wrappers import ValidationError from typing_inspect import get_args, is_optional_type, is_union_type -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.array.abstract_array import AnyDocArray from docarray.typing import AnyTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -48,12 +48,12 @@ class FindResultBatched(NamedTuple): - documents: List[DocArray] + documents: List[DocList] scores: np.ndarray class _FindResultBatched(NamedTuple): - documents: Union[List[DocArray], List[List[Dict[str, Any]]]] + documents: Union[List[DocList], List[List[Dict[str, Any]]]] 
scores: np.ndarray @@ -254,7 +254,7 @@ def _filter( self, filter_query: Any, limit: int, - ) -> Union[DocArray, List[Dict]]: + ) -> Union[DocList, List[Dict]]: """Find documents in the index based on a filter query :param filter_query: the DB specific filter query to execute @@ -268,7 +268,7 @@ def _filter_batched( self, filter_queries: Any, limit: int, - ) -> Union[List[DocArray], List[List[Dict]]]: + ) -> Union[List[DocList], List[List[Dict]]]: """Find documents in the index based on multiple filter queries. Each query is considered individually, and results are returned per query. @@ -322,7 +322,7 @@ def _text_search_batched( def __getitem__( self, key: Union[str, Sequence[str]] - ) -> Union[TSchema, DocArray[TSchema]]: + ) -> Union[TSchema, DocList[TSchema]]: """Get one or multiple Documents into the index, by `id`. If no document is found, a KeyError is raised. @@ -341,12 +341,12 @@ def __getitem__( raise KeyError(f'No document with id {key} found') # cast output - if isinstance(doc_sequence, DocArray): - out_da: DocArray[TSchema] = doc_sequence + if isinstance(doc_sequence, DocList): + out_da: DocList[TSchema] = doc_sequence elif isinstance(doc_sequence[0], Dict): out_da = self._dict_list_to_docarray(doc_sequence) # type: ignore else: - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) out_da = da_cls(doc_sequence) return out_da[0] if return_singleton else out_da @@ -385,7 +385,7 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): :param docs: Documents to index. 
""" - if not isinstance(docs, (BaseDoc, DocArray)): + if not isinstance(docs, (BaseDoc, DocList)): self._logger.warning( 'Passing a sequence of Documents that is not a DocArray comes at ' 'a performance penalty, since compatibility with the schema of Index ' @@ -431,7 +431,7 @@ def find( def find_batched( self, - queries: Union[AnyTensor, DocArray], + queries: Union[AnyTensor, DocList], search_field: str = 'embedding', limit: int = 10, **kwargs, @@ -471,7 +471,7 @@ def filter( filter_query: Any, limit: int = 10, **kwargs, - ) -> DocArray: + ) -> DocList: """Find documents in the index based on a filter query :param filter_query: the DB specific filter query to execute @@ -491,7 +491,7 @@ def filter_batched( filter_queries: Any, limit: int = 10, **kwargs, - ) -> List[DocArray]: + ) -> List[DocList]: """Find documents in the index based on multiple filter queries. :param filter_queries: the DB specific filter query to execute @@ -764,7 +764,7 @@ def _create_single_column(self, field: 'ModelField', type_: Type) -> _ColumnInfo def _validate_docs( self, docs: Union[BaseDoc, Sequence[BaseDoc]] - ) -> DocArray[BaseDoc]: + ) -> DocList[BaseDoc]: """Validates Document against the schema of the Document Index. For validation to pass, the schema of `docs` and the schema of the Document Index need to evaluate to the same flattened columns. @@ -778,7 +778,7 @@ def _validate_docs( """ if isinstance(docs, BaseDoc): docs = [docs] - if isinstance(docs, DocArray): + if isinstance(docs, DocList): # validation shortcut for DocArray; only look at the schema reference_schema_flat = self._flatten_schema( cast(Type[BaseDoc], self._schema) @@ -814,7 +814,7 @@ def _validate_docs( ' and that the types of your data match the types of the Document Index schema.' 
) - return DocArray[BaseDoc].construct(out_docs) + return DocList[BaseDoc].construct(out_docs) def _to_numpy(self, val: Any, allow_passthrough=False) -> Any: """ @@ -871,9 +871,9 @@ def _convert_dict_to_doc( schema_cls = cast(Type[BaseDoc], schema) return schema_cls(**doc_dict) - def _dict_list_to_docarray(self, dict_list: Sequence[Dict[str, Any]]) -> DocArray: + def _dict_list_to_docarray(self, dict_list: Sequence[Dict[str, Any]]) -> DocList: """Convert a list of docs in dict type to a DocArray of the schema type.""" doc_list = [self._convert_dict_to_doc(doc_dict, self._schema) for doc_dict in dict_list] # type: ignore - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(doc_list) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index b7555012db8..6399f76b8e9 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -20,7 +20,7 @@ import numpy as np -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.index.abstract import ( BaseDocIndex, _ColumnInfo, @@ -214,7 +214,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: f'args and kwargs not supported for `execute_query` on {type(self)}' ) - ann_docs = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) + ann_docs = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) filter_conditions = [] doc_to_score: Dict[BaseDoc, Any] = {} for op, op_kwargs in query: @@ -228,7 +228,7 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: self._logger.debug(f'Executing query {query}') docs_filtered = ann_docs for cond in filter_conditions: - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) docs_filtered = da_cls(filter_docs(docs_filtered, 
cond)) self._logger.debug(f'{len(docs_filtered)} results found') @@ -268,7 +268,7 @@ def _filter( self, filter_query: Any, limit: int, - ) -> DocArray: + ) -> DocList: raise NotImplementedError( f'{type(self)} does not support filter-only queries.' f' To perform post-filtering on a query, use' @@ -279,7 +279,7 @@ def _filter_batched( self, filter_queries: Any, limit: int, - ) -> List[DocArray]: + ) -> List[DocList]: raise NotImplementedError( f'{type(self)} does not support filter-only queries.' f' To perform post-filtering on a query, use' @@ -387,22 +387,22 @@ def _get_docs_sqlite_unsorted(self, univ_ids: Sequence[int]): 'SELECT data FROM docs WHERE doc_id IN %s' % sql_id_list, ) rows = self._sqlite_cursor.fetchall() - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls([self._doc_from_bytes(row[0]) for row in rows]) - def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocArray[TSchema]: + def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocList[TSchema]: hashed_ids = tuple(self._to_hashed_id(id_) for id_ in doc_ids) docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(sorted(docs_unsorted, key=lambda doc: doc_ids.index(doc.id))) - def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocArray: + def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocList: docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) def _in_position(doc): return hashed_ids.index(self._to_hashed_id(doc.id)) - da_cls = DocArray.__class_getitem__(cast(Type[BaseDoc], self._schema)) + da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) return da_cls(sorted(docs_unsorted, key=_in_position)) def _delete_docs_from_sqlite(self, doc_ids: Sequence[Union[str, 
int]]): diff --git a/docarray/store/abstract_doc_store.py b/docarray/store/abstract_doc_store.py index c5c152499c2..1b926e47703 100644 --- a/docarray/store/abstract_doc_store.py +++ b/docarray/store/abstract_doc_store.py @@ -4,7 +4,7 @@ from typing_extensions import TYPE_CHECKING if TYPE_CHECKING: - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList class AbstractDocStore(ABC): @@ -33,7 +33,7 @@ def delete(name: str, missing_ok: bool) -> bool: @staticmethod @abstractmethod def push( - da: 'DocArray', + da: 'DocList', name: str, public: bool, show_progress: bool, @@ -71,11 +71,11 @@ def push_stream( @staticmethod @abstractmethod def pull( - da_cls: Type['DocArray'], + da_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, - ) -> 'DocArray': + ) -> 'DocList': """Pull a DocArray from the specified name. :param da_cls: The DocArray class to instantiate @@ -89,7 +89,7 @@ def pull( @staticmethod @abstractmethod def pull_stream( - da_cls: Type['DocArray'], + da_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, diff --git a/docarray/store/file.py b/docarray/store/file.py index bb79162109b..f3164704c00 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -10,7 +10,7 @@ from docarray.utils._internal.cache import _get_cache_path if TYPE_CHECKING: - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList SelfFileDocStore = TypeVar('SelfFileDocStore', bound='FileDocStore') @@ -92,7 +92,7 @@ def delete( @classmethod def push( cls: Type[SelfFileDocStore], - da: 'DocArray', + da: 'DocList', name: str, public: bool, show_progress: bool, @@ -145,11 +145,11 @@ def push_stream( @classmethod def pull( cls: Type[SelfFileDocStore], - da_cls: Type['DocArray'], + da_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, - ) -> 'DocArray': + ) -> 'DocList': """Pull a :class:`DocArray` from the specified url. :param name: The file path to pull from. 
@@ -167,7 +167,7 @@ def pull( @classmethod def pull_stream( cls: Type[SelfFileDocStore], - da_cls: Type['DocArray'], + da_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, diff --git a/docarray/store/jac.py b/docarray/store/jac.py index b2b2564a91e..f590fc7aa52 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: # pragma: no cover import io - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList if TYPE_CHECKING: import hubble @@ -46,7 +46,7 @@ def _get_length_from_summary(summary: List[Dict]) -> Optional[int]: raise ValueError('Length not found in summary') -def _get_raw_summary(self: 'DocArray') -> List[Dict[str, Any]]: +def _get_raw_summary(self: 'DocList') -> List[Dict[str, Any]]: items: List[Dict[str, Any]] = [ dict( name='Type', @@ -152,7 +152,7 @@ def delete(name: str, missing_ok: bool = True) -> bool: @staticmethod @hubble.login_required def push( - da: 'DocArray', + da: 'DocList', name: str, public: bool = True, show_progress: bool = False, @@ -258,14 +258,14 @@ def push_stream( :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. 
{"icon": "emoji", "background": "#fff"} """ - from docarray import DocArray + from docarray import DocList # This is a temporary solution to push a stream of documents # The memory footprint is not ideal # But it must be done this way for now because Hubble expects to know the length of the DocArray # before it starts receiving the documents first_doc = next(docs) - da = DocArray[first_doc.__class__]([first_doc]) # type: ignore + da = DocList[first_doc.__class__]([first_doc]) # type: ignore for doc in docs: da.append(doc) return cls.push(da, name, public, show_progress, branding) @@ -273,11 +273,11 @@ def push_stream( @staticmethod @hubble.login_required def pull( - cls: Type['DocArray'], + cls: Type['DocList'], name: str, show_progress: bool = False, local_cache: bool = True, - ) -> 'DocArray': + ) -> 'DocList': """Pull a :class:`DocArray` from Jina AI Cloud to local. :param name: the upload name set during :meth:`.push` @@ -285,16 +285,16 @@ def pull( :param local_cache: store the downloaded DocArray to local folder :return: a :class:`DocArray` object """ - from docarray import DocArray + from docarray import DocList - return DocArray[cls.document_type]( # type: ignore + return DocList[cls.document_type]( # type: ignore JACDocStore.pull_stream(cls, name, show_progress, local_cache) ) @staticmethod @hubble.login_required def pull_stream( - cls: Type['DocArray'], + cls: Type['DocList'], name: str, show_progress: bool = False, local_cache: bool = False, diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 23534d556fd..f940a77a626 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -9,10 +9,11 @@ from docarray.utils._internal.misc import import_library if TYPE_CHECKING: # pragma: no cover - from docarray import BaseDoc, DocArray import boto3 import botocore from smart_open import open + + from docarray import BaseDoc, DocList else: open = import_library('smart_open', raise_error=True).open boto3 = import_library('boto3', raise_error=True) @@ 
-118,7 +119,7 @@ def delete(name: str, missing_ok: bool = True) -> bool: @classmethod def push( cls: Type[SelfS3DocStore], - da: 'DocArray', + da: 'DocList', name: str, public: bool = False, show_progress: bool = False, @@ -176,11 +177,11 @@ def push_stream( @classmethod def pull( cls: Type[SelfS3DocStore], - da_cls: Type['DocArray'], + da_cls: Type['DocList'], name: str, show_progress: bool = False, local_cache: bool = False, - ) -> 'DocArray': + ) -> 'DocList': """Pull a :class:`DocArray` from the specified bucket and key. :param name: The bucket and key to pull from. e.g. my_bucket/my_key @@ -198,7 +199,7 @@ def pull( @classmethod def pull_stream( cls: Type[SelfS3DocStore], - da_cls: Type['DocArray'], + da_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index 773cbbe815d..f17fc8fd9ff 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -4,7 +4,7 @@ from typing import Dict, List, Union from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocArray +from docarray.array.array.array import DocList def filter_docs( @@ -75,7 +75,7 @@ class MyDocument(BaseDoc): if query: query = query if not isinstance(query, str) else json.loads(query) parser = QueryParser(query) - return DocArray.__class_getitem__(docs.document_type)( + return DocList.__class_getitem__(docs.document_type)( d for d in docs if parser.evaluate(d) ) else: diff --git a/docarray/utils/find.py b/docarray/utils/find.py index a626134d1b6..bdf5ead3fa6 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -5,7 +5,7 @@ from typing_inspect import is_union_type from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocArray +from docarray.array.array.array import DocList from docarray.array.stacked.array_stacked import DocArrayStacked from docarray.base_doc import BaseDoc from docarray.helper import 
_get_field_type_by_access_path @@ -14,12 +14,12 @@ class FindResult(NamedTuple): - documents: DocArray + documents: DocList scores: AnyTensor class _FindResult(NamedTuple): - documents: Union[DocArray, List[Dict[str, Any]]] + documents: Union[DocList, List[Dict[str, Any]]] scores: AnyTensor @@ -110,7 +110,7 @@ class MyDocument(BaseDoc): def find_batched( index: AnyDocArray, - query: Union[AnyTensor, DocArray], + query: Union[AnyTensor, DocList], embedding_field: str = 'embedding', metric: str = 'cosine_sim', limit: int = 10, @@ -203,16 +203,16 @@ class MyDocument(BaseDoc): results = [] for indices_per_query, scores_per_query in zip(top_indices, top_scores): - docs_per_query: DocArray = DocArray([]) + docs_per_query: DocList = DocList([]) for idx in indices_per_query: # workaround until #930 is fixed docs_per_query.append(index[idx]) - docs_per_query = DocArray(docs_per_query) + docs_per_query = DocList(docs_per_query) results.append(FindResult(scores=scores_per_query, documents=docs_per_query)) return results def _extract_embedding_single( - data: Union[DocArray, BaseDoc, AnyTensor], + data: Union[DocList, BaseDoc, AnyTensor], embedding_field: str, ) -> AnyTensor: """Extract the embeddings from a single query, @@ -247,7 +247,7 @@ def _extract_embeddings( :return: the embeddings """ emb: AnyTensor - if isinstance(data, DocArray): + if isinstance(data, DocList): emb_list = list(AnyDocArray._traverse(data, embedding_field)) emb = embedding_type._docarray_stack(emb_list) elif isinstance(data, (DocArrayStacked, BaseDoc)): diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index abf677b7cc9..41761a241f1 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -2,12 +2,12 @@ from typing import Dict, List, Optional -from docarray import DocArray +from docarray import DocList def reduce( - left: DocArray, right: DocArray, left_id_map: Optional[Dict] = None -) -> 'DocArray': + left: DocList, right: DocList, left_id_map: Optional[Dict] = None +) 
-> 'DocList': """ Reduces left and right DocArray into one DocArray in-place. Changes are applied to the left DocArray. @@ -36,7 +36,7 @@ def reduce( return left -def reduce_all(docarrays: List[DocArray]) -> DocArray: +def reduce_all(docarrays: List[DocList]) -> DocList: """ Reduces a list of DocArrays into one DocArray. Changes are applied to the first DocArray in-place. diff --git a/docs/how_to/multimodal_training_and_serving.md b/docs/how_to/multimodal_training_and_serving.md index fd4421beb0f..81cbe2917d6 100644 --- a/docs/how_to/multimodal_training_and_serving.md +++ b/docs/how_to/multimodal_training_and_serving.md @@ -83,7 +83,7 @@ The `BaseDoc` class allows users to define their own (nested, multi-modal) Document Let's start by defining a few Documents to handle the different modalities that we will use during our training: ```python -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import TorchTensor, ImageUrl ``` @@ -184,7 +184,7 @@ import pandas as pd def get_flickr8k_da(file: str = "captions.txt", N: Optional[int] = None): df = pd.read_csv(file, nrows=N) - da = DocArray[PairTextImage]( + da = DocList[PairTextImage]( PairTextImage(text=Text(text=i.caption), image=Image(url=f"Images/{i.image}")) for i in df.itertuples() ) @@ -289,7 +289,7 @@ def cosine_sim(x_mat: TorchTensor, y_mat: TorchTensor) -> TorchTensor: ``` ```python -def clip_loss(image: DocArray[Image], text: DocArray[Text]) -> TorchTensor: +def clip_loss(image: DocList[Image], text: DocList[Text]) -> TorchTensor: sims = cosine_sim(image.embedding, text.embedding) return torch.norm(sims - torch.eye(sims.shape[0], device=DEVICE)) ``` diff --git a/tests/benchmark_tests/test_map.py b/tests/benchmark_tests/test_map.py index d6018b9fdb0..ad1067c5824 100644 --- a/tests/benchmark_tests/test_map.py +++ b/tests/benchmark_tests/test_map.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc,
DocList from docarray.documents import ImageDoc from docarray.typing import NdArray from docarray.utils.map import map_docs, map_docs_batched @@ -32,7 +32,7 @@ def time_multiprocessing(num_workers: int) -> float: n_docs = 5 rng = np.random.RandomState(0) matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)] - da = DocArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) + da = DocList[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) start_time = time() list( map_docs( @@ -47,7 +47,7 @@ def time_multiprocessing(num_workers: int) -> float: assert time_2_cpu < time_1_cpu -def cpu_intensive_batch(da: DocArray[MyMatrix]) -> DocArray[MyMatrix]: +def cpu_intensive_batch(da: DocList[MyMatrix]) -> DocList[MyMatrix]: # some cpu intensive function for doc in da: for i in range(3000): @@ -63,7 +63,7 @@ def time_multiprocessing(num_workers: int) -> float: n_docs = 16 rng = np.random.RandomState(0) matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)] - da = DocArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) + da = DocList[MyMatrix]([MyMatrix(matrix=m) for m in matrices]) start_time = time() list( map_docs_batched( @@ -91,7 +91,7 @@ def io_intensive(img: ImageDoc) -> ImageDoc: def test_map_docs_multithreading(): def time_multithreading(num_workers: int) -> float: n_docs = 100 - da = DocArray[ImageDoc]( + da = DocList[ImageDoc]( [ImageDoc(url=IMAGE_PATHS['png']) for _ in range(n_docs)] ) start_time = time() @@ -106,7 +106,7 @@ def time_multithreading(num_workers: int) -> float: assert time_2_thread < time_1_thread -def io_intensive_batch(da: DocArray[ImageDoc]) -> DocArray[ImageDoc]: +def io_intensive_batch(da: DocList[ImageDoc]) -> DocList[ImageDoc]: # some io intensive function: load and set image url for doc in da: doc.tensor = doc.url.load() @@ -116,7 +116,7 @@ def io_intensive_batch(da: DocArray[ImageDoc]) -> DocArray[ImageDoc]: def test_map_docs_batched_multithreading(): def time_multithreading_batch(num_workers: int) -> float: n_docs = 
100 - da = DocArray[ImageDoc]( + da = DocList[ImageDoc]( [ImageDoc(url=IMAGE_PATHS['png']) for _ in range(n_docs)] ) start_time = time() diff --git a/tests/index/base_classes/test_base_doc_store.py b/tests/index/base_classes/test_base_doc_store.py index b5774020524..8e4764f5a88 100644 --- a/tests/index/base_classes/test_base_doc_store.py +++ b/tests/index/base_classes/test_base_doc_store.py @@ -5,7 +5,7 @@ import pytest from pydantic import Field -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.index.abstract import BaseDocIndex, _raise_not_composable from docarray.typing import ID, ImageBytes, ImageUrl, NdArray @@ -262,12 +262,12 @@ class OtherNestedDoc(NestedDoc): # SIMPLE store = DummyDocIndex[SimpleDoc]() in_list = [SimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[SimpleDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[SimpleDoc](in_list) assert store._validate_docs(in_da) == in_da in_other_list = [OtherSimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) - in_other_da = DocArray[OtherSimpleDoc](in_other_list) + assert isinstance(store._validate_docs(in_other_list), DocList[BaseDoc]) + in_other_da = DocList[OtherSimpleDoc](in_other_list) assert store._validate_docs(in_other_da) == in_other_da with pytest.raises(ValueError): @@ -280,7 +280,7 @@ class OtherNestedDoc(NestedDoc): ) with pytest.raises(ValueError): store._validate_docs( - DocArray[FlatDoc]( + DocList[FlatDoc]( [ FlatDoc( tens_one=np.random.random((10,)), @@ -295,16 +295,16 @@ class OtherNestedDoc(NestedDoc): in_list = [ FlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,))) ] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[FlatDoc]( + assert 
isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[FlatDoc]( [FlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,)))] ) assert store._validate_docs(in_da) == in_da in_other_list = [ OtherFlatDoc(tens_one=np.random.random((10,)), tens_two=np.random.random((50,))) ] - assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) - in_other_da = DocArray[OtherFlatDoc]( + assert isinstance(store._validate_docs(in_other_list), DocList[BaseDoc]) + in_other_da = DocList[OtherFlatDoc]( [ OtherFlatDoc( tens_one=np.random.random((10,)), tens_two=np.random.random((50,)) @@ -316,18 +316,18 @@ class OtherNestedDoc(NestedDoc): store._validate_docs([SimpleDoc(tens=np.random.random((10,)))]) with pytest.raises(ValueError): assert not store._validate_docs( - DocArray[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) + DocList[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) ) # NESTED store = DummyDocIndex[NestedDoc]() in_list = [NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[NestedDoc]([NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))]) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[NestedDoc]([NestedDoc(d=SimpleDoc(tens=np.random.random((10,))))]) assert store._validate_docs(in_da) == in_da in_other_list = [OtherNestedDoc(d=OtherSimpleDoc(tens=np.random.random((10,))))] - assert isinstance(store._validate_docs(in_other_list), DocArray[BaseDoc]) - in_other_da = DocArray[OtherNestedDoc]( + assert isinstance(store._validate_docs(in_other_list), DocList[BaseDoc]) + in_other_da = DocList[OtherNestedDoc]( [OtherNestedDoc(d=OtherSimpleDoc(tens=np.random.random((10,))))] ) @@ -336,7 +336,7 @@ class OtherNestedDoc(NestedDoc): store._validate_docs([SimpleDoc(tens=np.random.random((10,)))]) with pytest.raises(ValueError): store._validate_docs( - 
DocArray[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) + DocList[SimpleDoc]([SimpleDoc(tens=np.random.random((10,)))]) ) @@ -353,8 +353,8 @@ class TensorUnionDoc(BaseDoc): # OPTIONAL store = DummyDocIndex[SimpleDoc]() in_list = [OptionalDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[OptionalDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[OptionalDoc](in_list) assert store._validate_docs(in_da) == in_da with pytest.raises(ValueError): @@ -363,9 +363,9 @@ class TensorUnionDoc(BaseDoc): # MIXED UNION store = DummyDocIndex[SimpleDoc]() in_list = [MixedUnionDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[MixedUnionDoc](in_list) - assert isinstance(store._validate_docs(in_da), DocArray[BaseDoc]) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[MixedUnionDoc](in_list) + assert isinstance(store._validate_docs(in_da), DocList[BaseDoc]) with pytest.raises(ValueError): store._validate_docs([MixedUnionDoc(tens='hello')]) @@ -373,14 +373,14 @@ class TensorUnionDoc(BaseDoc): # TENSOR UNION store = DummyDocIndex[TensorUnionDoc]() in_list = [SimpleDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[SimpleDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[SimpleDoc](in_list) assert store._validate_docs(in_da) == in_da store = DummyDocIndex[SimpleDoc]() in_list = [TensorUnionDoc(tens=np.random.random((10,)))] - assert isinstance(store._validate_docs(in_list), DocArray[BaseDoc]) - in_da = DocArray[TensorUnionDoc](in_list) + assert isinstance(store._validate_docs(in_list), DocList[BaseDoc]) + in_da = DocList[TensorUnionDoc](in_list) assert store._validate_docs(in_da) == in_da diff --git 
a/tests/index/hnswlib/test_index_get_del.py b/tests/index/hnswlib/test_index_get_del.py index d8336e0ed6d..d9437878698 100644 --- a/tests/index/hnswlib/test_index_get_del.py +++ b/tests/index/hnswlib/test_index_get_del.py @@ -6,7 +6,7 @@ import torch from pydantic import Field -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.index import HnswDocumentIndex from docarray.typing import NdArray, NdArrayEmbedding, TorchTensor @@ -57,7 +57,7 @@ def ten_nested_docs(): def test_index_simple_schema(ten_simple_docs, tmp_path, use_docarray): store = HnswDocumentIndex[SimpleDoc](work_dir=str(tmp_path)) if use_docarray: - ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) + ten_simple_docs = DocList[SimpleDoc](ten_simple_docs) store.index(ten_simple_docs) assert store.num_docs() == 10 @@ -77,7 +77,7 @@ class MyDoc(BaseDoc): def test_index_flat_schema(ten_flat_docs, tmp_path, use_docarray): store = HnswDocumentIndex[FlatDoc](work_dir=str(tmp_path)) if use_docarray: - ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) + ten_flat_docs = DocList[FlatDoc](ten_flat_docs) store.index(ten_flat_docs) assert store.num_docs() == 10 @@ -89,7 +89,7 @@ def test_index_flat_schema(ten_flat_docs, tmp_path, use_docarray): def test_index_nested_schema(ten_nested_docs, tmp_path, use_docarray): store = HnswDocumentIndex[NestedDoc](work_dir=str(tmp_path)) if use_docarray: - ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) + ten_nested_docs = DocList[NestedDoc](ten_nested_docs) store.index(ten_nested_docs) assert store.num_docs() == 10 @@ -137,7 +137,7 @@ class TextSchema(TextDoc): store = HnswDocumentIndex[TextSchema](work_dir=str(tmp_path)) store.index( - DocArray[TextDoc]( + DocList[TextDoc]( [TextDoc(embedding=np.random.randn(10), text=f'{i}') for i in range(10)] ) ) @@ -154,7 +154,7 @@ class ImageSchema(ImageDoc): ) store.index( - DocArray[ImageDoc]( + DocList[ImageDoc]( [ ImageDoc( 
embedding=np.random.randn(10), tensor=np.random.randn(3, 224, 224) diff --git a/tests/integrations/array/test_torch_train.py b/tests/integrations/array/test_torch_train.py index 930f237b0a1..e269659462a 100644 --- a/tests/integrations/array/test_torch_train.py +++ b/tests/integrations/array/test_torch_train.py @@ -2,7 +2,7 @@ import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import TorchTensor @@ -13,7 +13,7 @@ class Mmdoc(BaseDoc): N = 10 - batch = DocArray[Mmdoc](Mmdoc(text=f'hello{i}') for i in range(N)) + batch = DocList[Mmdoc](Mmdoc(text=f'hello{i}') for i in range(N)) batch.tensor = torch.zeros(N, 3, 224, 224) batch = batch.stack() diff --git a/tests/integrations/doc_index/elastic/v7/test_index_get_del.py b/tests/integrations/doc_index/elastic/v7/test_index_get_del.py index d5394a7925b..40779116c4e 100644 --- a/tests/integrations/doc_index/elastic/v7/test_index_get_del.py +++ b/tests/integrations/doc_index/elastic/v7/test_index_get_del.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.index import ElasticV7DocIndex from docarray.typing import NdArray @@ -48,7 +48,7 @@ def ten_deep_nested_docs(): def test_index_simple_schema(ten_simple_docs, use_docarray): store = ElasticV7DocIndex[SimpleDoc]() if use_docarray: - ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) + ten_simple_docs = DocList[SimpleDoc](ten_simple_docs) store.index(ten_simple_docs) assert store.num_docs() == 10 @@ -58,7 +58,7 @@ def test_index_simple_schema(ten_simple_docs, use_docarray): def test_index_flat_schema(ten_flat_docs, use_docarray): store = ElasticV7DocIndex[FlatDoc]() if use_docarray: - ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) + ten_flat_docs = DocList[FlatDoc](ten_flat_docs) store.index(ten_flat_docs) assert store.num_docs() == 10 @@ -68,7 +68,7 @@ def 
test_index_flat_schema(ten_flat_docs, use_docarray): def test_index_nested_schema(ten_nested_docs, use_docarray): store = ElasticV7DocIndex[NestedDoc]() if use_docarray: - ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) + ten_nested_docs = DocList[NestedDoc](ten_nested_docs) store.index(ten_nested_docs) assert store.num_docs() == 10 @@ -78,7 +78,7 @@ def test_index_nested_schema(ten_nested_docs, use_docarray): def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): store = ElasticV7DocIndex[DeepNestedDoc]() if use_docarray: - ten_deep_nested_docs = DocArray[DeepNestedDoc](ten_deep_nested_docs) + ten_deep_nested_docs = DocList[DeepNestedDoc](ten_deep_nested_docs) store.index(ten_deep_nested_docs) assert store.num_docs() == 10 diff --git a/tests/integrations/document/test_document.py b/tests/integrations/document/test_document.py index 9d8b85f260d..6d3d44fd270 100644 --- a/tests/integrations/document/test_document.py +++ b/tests/integrations/document/test_document.py @@ -5,7 +5,7 @@ from pydantic import BaseModel, ValidationError from typing_extensions import TypedDict -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import AudioDoc, ImageDoc, TextDoc from docarray.documents.helper import ( create_doc, @@ -35,14 +35,14 @@ class MyMultiModalDoc(BaseDoc): def test_nested_chunks_document(): class ChunksDocument(BaseDoc): text: str - images: DocArray[ImageDoc] + images: DocList[ImageDoc] doc = ChunksDocument( text='hello', - images=DocArray[ImageDoc]([ImageDoc() for _ in range(10)]), + images=DocList[ImageDoc]([ImageDoc() for _ in range(10)]), ) - assert isinstance(doc.images, DocArray) + assert isinstance(doc.images, DocList) def test_create_doc(): diff --git a/tests/integrations/document/test_proto.py b/tests/integrations/document/test_proto.py index 2f656e6b4b4..add031f066e 100644 --- a/tests/integrations/document/test_proto.py +++ b/tests/integrations/document/test_proto.py @@ -2,7 +2,7 @@ 
import pytest import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.typing import ( AnyEmbedding, @@ -61,7 +61,7 @@ class MyDoc(BaseDoc): embedding: AnyEmbedding torch_embedding: TorchEmbedding[128] np_embedding: NdArrayEmbedding[128] - nested_docs: DocArray[NestedDoc] + nested_docs: DocList[NestedDoc] bytes_: bytes img_bytes: ImageBytes @@ -80,7 +80,7 @@ class MyDoc(BaseDoc): embedding=np.zeros((3, 224, 224)), torch_embedding=torch.zeros((128,)), np_embedding=np.zeros((128,)), - nested_docs=DocArray[NestedDoc]([NestedDoc(tensor=np.zeros((128,)))]), + nested_docs=DocList[NestedDoc]([NestedDoc(tensor=np.zeros((128,)))]), bytes_=b'hello', img_bytes=b'img', ) @@ -136,7 +136,7 @@ class MyDoc(BaseDoc): generic_tf_tensor: AnyTensor embedding: AnyEmbedding tf_embedding: TensorFlowEmbedding[128] - nested_docs: DocArray[NestedDoc] + nested_docs: DocList[NestedDoc] doc = MyDoc( tf_tensor=tf.zeros((3, 224, 224)), @@ -144,7 +144,7 @@ class MyDoc(BaseDoc): generic_tf_tensor=tf.zeros((3, 224, 224)), embedding=tf.zeros((3, 224, 224)), tf_embedding=tf.zeros((128,)), - nested_docs=DocArray[NestedDoc]([NestedDoc(tensor=tf.zeros((128,)))]), + nested_docs=DocList[NestedDoc]([NestedDoc(tensor=tf.zeros((128,)))]), ) doc = doc.to_protobuf() doc = MyDoc.from_protobuf(doc) diff --git a/tests/integrations/externals/test_fastapi.py b/tests/integrations/externals/test_fastapi.py index 438d2a86402..02967a07cd0 100644 --- a/tests/integrations/externals/test_fastapi.py +++ b/tests/integrations/externals/test_fastapi.py @@ -5,7 +5,7 @@ from fastapi import FastAPI from httpx import AsyncClient -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.base_doc import DocArrayResponse from docarray.documents import ImageDoc, TextDoc from docarray.typing import NdArray @@ -114,13 +114,13 @@ async def create_item(doc: InputDoc) -> OutputDoc: @pytest.mark.asyncio async 
def test_docarray(): doc = ImageDoc(tensor=np.zeros((3, 224, 224))) - docs = DocArray[ImageDoc]([doc, doc]) + docs = DocList[ImageDoc]([doc, doc]) app = FastAPI() @app.post("/doc/", response_class=DocArrayResponse) async def func(fastapi_docs: List[ImageDoc]) -> List[ImageDoc]: - docarray_docs = DocArray[ImageDoc].construct(fastapi_docs) + docarray_docs = DocList[ImageDoc].construct(fastapi_docs) return list(docarray_docs) async with AsyncClient(app=app, base_url="http://test") as ac: @@ -132,6 +132,6 @@ async def func(fastapi_docs: List[ImageDoc]) -> List[ImageDoc]: assert resp_doc.status_code == 200 assert resp_redoc.status_code == 200 - docs = DocArray[ImageDoc].from_json(response.content.decode()) + docs = DocList[ImageDoc].from_json(response.content.decode()) assert len(docs) == 2 assert docs[0].tensor.shape == (3, 224, 224) diff --git a/tests/integrations/store/__init__.py b/tests/integrations/store/__init__.py index 1191c403140..6dc05e16a11 100644 --- a/tests/integrations/store/__init__.py +++ b/tests/integrations/store/__init__.py @@ -1,12 +1,12 @@ import tracemalloc from functools import wraps -from docarray import DocArray +from docarray import DocList from docarray.documents import TextDoc def get_test_da(n: int): - return DocArray[TextDoc](gen_text_docs(n)) + return DocList[TextDoc](gen_text_docs(n)) def gen_text_docs(n: int): diff --git a/tests/integrations/store/test_file.py b/tests/integrations/store/test_file.py index 4b6a72c5b62..c57e90d529d 100644 --- a/tests/integrations/store/test_file.py +++ b/tests/integrations/store/test_file.py @@ -3,7 +3,7 @@ import pytest -from docarray import DocArray +from docarray import DocList from docarray.documents import TextDoc from docarray.store.file import ConcurrentPushException, FileDocStore from docarray.utils._internal.cache import _get_cache_path @@ -28,7 +28,7 @@ def test_pushpull_correct(capsys, tmp_path: Path): # Verbose da1.push(f'file://{namespace_dir}/meow', show_progress=True) - da2 = 
DocArray[TextDoc].pull(f'file://{namespace_dir}/meow', show_progress=True) + da2 = DocList[TextDoc].pull(f'file://{namespace_dir}/meow', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -39,7 +39,7 @@ def test_pushpull_correct(capsys, tmp_path: Path): # Quiet da2.push(f'file://{namespace_dir}/meow') - da1 = DocArray[TextDoc].pull(f'file://{namespace_dir}/meow') + da1 = DocList[TextDoc].pull(f'file://{namespace_dir}/meow') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -55,10 +55,10 @@ def test_pushpull_stream_correct(capsys, tmp_path: Path): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( iter(da1), f'file://{namespace_dir}/meow', show_progress=True ) - doc_stream2 = DocArray[TextDoc].pull_stream( + doc_stream2 = DocList[TextDoc].pull_stream( f'file://{namespace_dir}/meow', show_progress=True ) @@ -71,10 +71,10 @@ def test_pushpull_stream_correct(capsys, tmp_path: Path): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocArray[TextDoc].pull_stream( + doc_stream = DocList[TextDoc].pull_stream( f'file://{namespace_dir}/meow', show_progress=False ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( doc_stream, f'file://{namespace_dir}/meow2', show_progress=False ) @@ -87,12 +87,12 @@ def test_pushpull_stream_correct(capsys, tmp_path: Path): def test_pull_stream_vs_pull_full(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f'file://{namespace_dir}/meow-short', show_progress=False, ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f'file://{namespace_dir}/meow-long', show_progress=False, @@ 
-101,14 +101,12 @@ def test_pull_stream_vs_pull_full(tmp_path: Path): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) for d in DocArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocList[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): - return sum( - len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) - ) + return sum(len(d.text) for d in DocList[TextDoc].pull(url, show_progress=False)) # A warmup is needed to get accurate memory usage comparison _ = get_total_stream(f'file://{namespace_dir}/meow-short') @@ -149,12 +147,12 @@ def test_list_and_delete(tmp_path: Path): da_names = FileDocStore.list(namespace_dir, show_table=False) assert len(da_names) == 0 - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/meow', show_progress=False ) da_names = FileDocStore.list(namespace_dir, show_table=False) assert set(da_names) == {'meow'} - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/woof', show_progress=False ) da_names = FileDocStore.list(namespace_dir, show_table=False) @@ -181,7 +179,7 @@ def test_concurrent_push_pull(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, @@ -191,14 +189,14 @@ def test_concurrent_push_pull(tmp_path: Path): def _task(choice: str): if choice == 'push': - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 1 for _ in DocArray[TextDoc].pull_stream(f'file://{namespace_dir}/da0') + 1 for _ in DocList[TextDoc].pull_stream(f'file://{namespace_dir}/da0') ) assert pull_len == DA_LEN else: @@ -216,7 +214,7 @@ def 
test_concurrent_push(tmp_path: Path): tmp_path.mkdir(parents=True, exist_ok=True) namespace_dir = tmp_path - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, @@ -232,7 +230,7 @@ def _slowdown_iterator(iterator): def _push(choice: str): if choice == 'slow': - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( _slowdown_iterator(gen_text_docs(DA_LEN)), f'file://{namespace_dir}/da0', show_progress=False, @@ -241,7 +239,7 @@ def _push(choice: str): elif choice == 'cold_start': try: time.sleep(0.1) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'file://{namespace_dir}/da0', show_progress=False, diff --git a/tests/integrations/store/test_jac.py b/tests/integrations/store/test_jac.py index 3e070b6de2b..63dcdc33b15 100644 --- a/tests/integrations/store/test_jac.py +++ b/tests/integrations/store/test_jac.py @@ -4,7 +4,7 @@ import hubble import pytest -from docarray import DocArray +from docarray import DocList from docarray.documents import TextDoc from docarray.store import JACDocStore from tests.integrations.store import gen_text_docs, get_test_da, profile_memory @@ -45,7 +45,7 @@ def test_pushpull_correct(capsys): # Verbose da1.push(f'jac://{DA_NAME}', show_progress=True) - da2 = DocArray[TextDoc].pull(f'jac://{DA_NAME}', show_progress=True) + da2 = DocList[TextDoc].pull(f'jac://{DA_NAME}', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -56,7 +56,7 @@ def test_pushpull_correct(capsys): # Quiet da2.push(f'jac://{DA_NAME}') - da1 = DocArray[TextDoc].pull(f'jac://{DA_NAME}') + da1 = DocList[TextDoc].pull(f'jac://{DA_NAME}') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -77,10 +77,8 @@ def 
test_pushpull_stream_correct(capsys): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocArray[TextDoc].push_stream(iter(da1), f'jac://{DA_NAME_1}', show_progress=True) - doc_stream2 = DocArray[TextDoc].pull_stream( - f'jac://{DA_NAME_1}', show_progress=True - ) + DocList[TextDoc].push_stream(iter(da1), f'jac://{DA_NAME_1}', show_progress=True) + doc_stream2 = DocList[TextDoc].pull_stream(f'jac://{DA_NAME_1}', show_progress=True) assert all(d1.id == d2.id for d1, d2 in zip(da1, doc_stream2)) with pytest.raises(StopIteration): @@ -91,10 +89,8 @@ def test_pushpull_stream_correct(capsys): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocArray[TextDoc].pull_stream( - f'jac://{DA_NAME_1}', show_progress=False - ) - DocArray[TextDoc].push_stream(doc_stream, f'jac://{DA_NAME_2}', show_progress=False) + doc_stream = DocList[TextDoc].pull_stream(f'jac://{DA_NAME_1}', show_progress=False) + DocList[TextDoc].push_stream(doc_stream, f'jac://{DA_NAME_2}', show_progress=False) captured = capsys.readouterr() assert ( @@ -112,12 +108,12 @@ def test_pull_stream_vs_pull_full(): DA_NAME_SHORT: str = f'test{RANDOM}-pull-stream-vs-pull-full-short' DA_NAME_LONG: str = f'test{RANDOM}-pull-stream-vs-pull-full-long' - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f'jac://{DA_NAME_SHORT}', show_progress=False, ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f'jac://{DA_NAME_LONG}', show_progress=False, @@ -126,14 +122,12 @@ def test_pull_stream_vs_pull_full(): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) for d in DocArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocList[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): - return sum( - len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) - ) + return sum(len(d.text) for d in DocList[TextDoc].pull(url, 
show_progress=False)) # A warmup is needed to get accurate memory usage comparison _ = get_total_stream(f'jac://{DA_NAME_SHORT}') @@ -176,7 +170,7 @@ def test_list_and_delete(): ) assert len(da_names) == 0 - DocArray[TextDoc].push( + DocList[TextDoc].push( get_test_da(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False ) da_names = list( @@ -186,7 +180,7 @@ def test_list_and_delete(): ) ) assert set(da_names) == {DA_NAME_0} - DocArray[TextDoc].push( + DocList[TextDoc].push( get_test_da(DA_LEN), f'jac://{DA_NAME_1}', show_progress=False ) da_names = list( @@ -224,7 +218,7 @@ def test_concurrent_push_pull(): # Push to DA that is being pulled should not mess up the pull DA_NAME_0 = f'test{RANDOM}-concurrent-push-pull-da0' - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False, @@ -234,14 +228,14 @@ def test_concurrent_push_pull(): def _task(choice: str): if choice == 'push': - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f'jac://{DA_NAME_0}', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 1 for _ in DocArray[TextDoc].pull_stream(f'jac://{DA_NAME_0}') + 1 for _ in DocList[TextDoc].pull_stream(f'jac://{DA_NAME_0}') ) assert pull_len == DA_LEN else: diff --git a/tests/integrations/store/test_s3.py b/tests/integrations/store/test_s3.py index ebe51b8c223..373a4d89663 100644 --- a/tests/integrations/store/test_s3.py +++ b/tests/integrations/store/test_s3.py @@ -5,7 +5,7 @@ import pytest -from docarray import DocArray +from docarray import DocList from docarray.documents import TextDoc from docarray.store import S3DocStore from tests.integrations.store import gen_text_docs, get_test_da, profile_memory @@ -72,7 +72,7 @@ def test_pushpull_correct(capsys): # Verbose da1.push(f's3://{namespace_dir}/meow', show_progress=True) - da2 = DocArray[TextDoc].pull(f's3://{namespace_dir}/meow', show_progress=True) + da2 = 
DocList[TextDoc].pull(f's3://{namespace_dir}/meow', show_progress=True) assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -83,7 +83,7 @@ def test_pushpull_correct(capsys): # Quiet da2.push(f's3://{namespace_dir}/meow') - da1 = DocArray[TextDoc].pull(f's3://{namespace_dir}/meow') + da1 = DocList[TextDoc].pull(f's3://{namespace_dir}/meow') assert len(da1) == len(da2) assert all(d1.id == d2.id for d1, d2 in zip(da1, da2)) assert all(d1.text == d2.text for d1, d2 in zip(da1, da2)) @@ -99,10 +99,10 @@ def test_pushpull_stream_correct(capsys): da1 = get_test_da(DA_LEN) # Verbosity and correctness - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( iter(da1), f's3://{namespace_dir}/meow', show_progress=True ) - doc_stream2 = DocArray[TextDoc].pull_stream( + doc_stream2 = DocList[TextDoc].pull_stream( f's3://{namespace_dir}/meow', show_progress=True ) @@ -115,10 +115,10 @@ def test_pushpull_stream_correct(capsys): assert len(captured.err) == 0 # Quiet and chained - doc_stream = DocArray[TextDoc].pull_stream( + doc_stream = DocList[TextDoc].pull_stream( f's3://{namespace_dir}/meow', show_progress=False ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( doc_stream, f's3://{namespace_dir}/meow2', show_progress=False ) @@ -130,12 +130,12 @@ def test_pushpull_stream_correct(capsys): @pytest.mark.slow def test_pull_stream_vs_pull_full(): namespace_dir = f'{BUCKET}/test{RANDOM}/pull-stream-vs-pull-full' - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 1), f's3://{namespace_dir}/meow-short', show_progress=False, ) - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN * 4), f's3://{namespace_dir}/meow-long', show_progress=False, @@ -144,14 +144,12 @@ def test_pull_stream_vs_pull_full(): @profile_memory def get_total_stream(url: str): return sum( - len(d.text) for d in 
DocArray[TextDoc].pull_stream(url, show_progress=False) + len(d.text) for d in DocList[TextDoc].pull_stream(url, show_progress=False) ) @profile_memory def get_total_full(url: str): - return sum( - len(d.text) for d in DocArray[TextDoc].pull(url, show_progress=False) - ) + return sum(len(d.text) for d in DocList[TextDoc].pull(url, show_progress=False)) # A warmup is needed to get accurate memory usage comparison _ = get_total_stream(f's3://{namespace_dir}/meow-short') @@ -192,12 +190,12 @@ def test_list_and_delete(): da_names = S3DocStore.list(namespace_dir, show_table=False) assert len(da_names) == 0 - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/meow', show_progress=False ) da_names = S3DocStore.list(f'{namespace_dir}', show_table=False) assert set(da_names) == {'meow'} - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/woof', show_progress=False ) da_names = S3DocStore.list(f'{namespace_dir}', show_table=False) @@ -224,7 +222,7 @@ def test_concurrent_push_pull(): # Push to DA that is being pulled should not mess up the pull namespace_dir = f'{BUCKET}/test{RANDOM}/concurrent-push-pull' - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/da0', show_progress=False, @@ -234,14 +232,14 @@ def test_concurrent_push_pull(): def _task(choice: str): if choice == 'push': - DocArray[TextDoc].push_stream( + DocList[TextDoc].push_stream( gen_text_docs(DA_LEN), f's3://{namespace_dir}/da0', show_progress=False, ) elif choice == 'pull': pull_len = sum( - 1 for _ in DocArray[TextDoc].pull_stream(f's3://{namespace_dir}/da0') + 1 for _ in DocList[TextDoc].pull_stream(f's3://{namespace_dir}/da0') ) assert pull_len == DA_LEN else: diff --git a/tests/integrations/torch/data/test_torch_dataset.py b/tests/integrations/torch/data/test_torch_dataset.py index 238e05e8ac2..ef6c1e98597 100644 --- 
a/tests/integrations/torch/data/test_torch_dataset.py +++ b/tests/integrations/torch/data/test_torch_dataset.py @@ -2,7 +2,7 @@ import torch from torch.utils.data import DataLoader -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.data import MultiModalDataset from docarray.documents import ImageDoc, TextDoc @@ -34,10 +34,10 @@ def __call__(self, text: str) -> None: @pytest.fixture -def captions_da() -> DocArray[PairTextImage]: +def captions_da() -> DocList[PairTextImage]: with open("tests/toydata/captions.csv", "r") as f: f.readline() - da = DocArray[PairTextImage]( + da = DocList[PairTextImage]( PairTextImage( text=TextDoc(text=i[1]), image=ImageDoc(url=f"tests/toydata/image-data/{i[0]}"), @@ -47,7 +47,7 @@ def captions_da() -> DocArray[PairTextImage]: return da -def test_torch_dataset(captions_da: DocArray[PairTextImage]): +def test_torch_dataset(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} @@ -65,7 +65,7 @@ def test_torch_dataset(captions_da: DocArray[PairTextImage]): assert all(x == BATCH_SIZE for x in batch_lens[:-1]) -def test_primitives(captions_da: DocArray[PairTextImage]): +def test_primitives(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"text": Meowification()} @@ -78,7 +78,7 @@ def test_primitives(captions_da: DocArray[PairTextImage]): assert all(t.endswith(' meow') for t in batch.text) -def test_root_field(captions_da: DocArray[TextDoc]): +def test_root_field(captions_da: DocList[TextDoc]): BATCH_SIZE = 32 preprocessing = {"": TextPreprocess()} @@ -91,7 +91,7 @@ def test_root_field(captions_da: DocArray[TextDoc]): assert batch.embedding.shape[1] == 64 -def test_nested_field(captions_da: DocArray[PairTextImage]): +def test_nested_field(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = { @@ -122,7 +122,7 @@ def test_nested_field(captions_da: DocArray[PairTextImage]): @pytest.mark.slow 
-def test_torch_dl_multiprocessing(captions_da: DocArray[PairTextImage]): +def test_torch_dl_multiprocessing(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} @@ -146,7 +146,7 @@ def test_torch_dl_multiprocessing(captions_da: DocArray[PairTextImage]): @pytest.mark.skip(reason="UNRESOLVED BUG") -def test_torch_dl_pin_memory(captions_da: DocArray[PairTextImage]): +def test_torch_dl_pin_memory(captions_da: DocList[PairTextImage]): BATCH_SIZE = 32 preprocessing = {"image": ImagePreprocess(), "text": TextPreprocess()} diff --git a/tests/units/array/stack/test_array_stacked.py b/tests/units/array/stack/test_array_stacked.py index 95cbf58c150..e867dd76a5c 100644 --- a/tests/units/array/stack/test_array_stacked.py +++ b/tests/units/array/stack/test_array_stacked.py @@ -5,7 +5,7 @@ import torch from pydantic import parse_obj_as -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.array import DocArrayStacked from docarray.documents import ImageDoc from docarray.typing import AnyEmbedding, AnyTensor, NdArray, TorchTensor @@ -29,12 +29,12 @@ class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] class MMdoc(BaseDoc): - img: DocArray[ImageDoc] + img: DocList[ImageDoc] - batch = DocArray[MMdoc]( + batch = DocList[MMdoc]( [ MMdoc( - img=DocArray[ImageDoc]( + img=DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) ) @@ -75,7 +75,7 @@ def test_stack_setter(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -92,7 +92,7 @@ def test_stack_setter_np(): class ImageDoc(BaseDoc): tensor: NdArray[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) @@ -116,7 +116,7 @@ def test_stack_numpy(): class ImageDoc(BaseDoc): tensor: 
NdArray[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) @@ -152,7 +152,7 @@ class ImageDoc(BaseDoc): class MMdoc(BaseDoc): img: ImageDoc - batch = DocArray[MMdoc]( + batch = DocList[MMdoc]( [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] ) @@ -188,7 +188,7 @@ def test_convert_to_da(batch): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -206,7 +206,7 @@ class ImageDoc(BaseDoc): class MMdoc(BaseDoc): img: ImageDoc - batch = DocArray[MMdoc]( + batch = DocList[MMdoc]( [MMdoc(img=ImageDoc(tensor=torch.zeros(3, 224, 224))) for _ in range(10)] ) @@ -221,7 +221,7 @@ class MMdoc(BaseDoc): def test_unstack_nested_DocArray(nested_batch): batch = nested_batch.unstack() for i in range(len(batch)): - assert isinstance(batch[i].img, DocArray) + assert isinstance(batch[i].img, DocList) for doc in batch[i].img: assert (doc.tensor == torch.zeros(3, 224, 224)).all() @@ -230,7 +230,7 @@ def test_stack_call(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - da = DocArray[ImageDoc]( + da = DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -245,7 +245,7 @@ def test_stack_union(): class ImageDoc(BaseDoc): tensor: Union[NdArray[3, 224, 224], TorchTensor[3, 224, 224]] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) batch[3].tensor = np.zeros((3, 224, 224)) @@ -361,7 +361,7 @@ def test_to_device(): def test_to_device_with_nested_da(): class Video(BaseDoc): - images: DocArray[ImageDoc] + images: DocList[ImageDoc] da_image = DocArrayStacked[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor @@ -401,7 +401,7 @@ def test_keep_dtype_torch(): class MyDoc(BaseDoc): tensor: TorchTensor - da = DocArray[MyDoc]( 
+ da = DocList[MyDoc]( [MyDoc(tensor=torch.zeros([2, 4], dtype=torch.int32)) for _ in range(3)] ) assert da[0].tensor.dtype == torch.int32 @@ -415,7 +415,7 @@ def test_keep_dtype_np(): class MyDoc(BaseDoc): tensor: NdArray - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [MyDoc(tensor=np.zeros([2, 4], dtype=np.int32)) for _ in range(3)] ) assert da[0].tensor.dtype == np.int32 @@ -436,7 +436,7 @@ def test_np_scalar(): class MyDoc(BaseDoc): scalar: NdArray - da = DocArray[MyDoc]([MyDoc(scalar=np.array(2.0)) for _ in range(3)]) + da = DocList[MyDoc]([MyDoc(scalar=np.array(2.0)) for _ in range(3)]) assert all(doc.scalar.ndim == 0 for doc in da) assert all(doc.scalar == 2.0 for doc in da) @@ -456,7 +456,7 @@ def test_torch_scalar(): class MyDoc(BaseDoc): scalar: TorchTensor - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [MyDoc(scalar=torch.tensor(2.0)) for _ in range(3)], ) assert all(doc.scalar.ndim == 0 for doc in da) @@ -476,7 +476,7 @@ def test_np_nan(): class MyDoc(BaseDoc): scalar: Optional[NdArray] - da = DocArray[MyDoc]([MyDoc() for _ in range(3)]) + da = DocList[MyDoc]([MyDoc() for _ in range(3)]) assert all(doc.scalar is None for doc in da) assert all(doc.scalar == doc.scalar for doc in da) stacked_da = da.stack() @@ -495,7 +495,7 @@ def test_torch_nan(): class MyDoc(BaseDoc): scalar: Optional[TorchTensor] - da = DocArray[MyDoc]([MyDoc() for _ in range(3)]) + da = DocList[MyDoc]([MyDoc() for _ in range(3)]) assert all(doc.scalar is None for doc in da) assert all(doc.scalar == doc.scalar for doc in da) stacked_da = da.stack(tensor_type=TorchTensor) @@ -526,7 +526,7 @@ def test_validate_from_da(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[ImageDoc]( + batch = DocList[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -563,7 +563,7 @@ class Doc(BaseDoc): def test_validation_column_doc(batch_nested_doc): batch, Doc, Inner = batch_nested_doc - batch.inner = DocArray[Inner]([Inner(hello='hello') for _ in 
range(10)]) + batch.inner = DocList[Inner]([Inner(hello='hello') for _ in range(10)]) assert isinstance(batch.inner, DocArrayStacked[Inner]) @@ -581,7 +581,7 @@ def test_validation_col_doc_fail(batch_nested_doc): batch.inner = ['hello'] * 10 with pytest.raises(ValueError): - batch.inner = DocArray[Inner]([Inner(hello='hello') for _ in range(11)]) + batch.inner = DocList[Inner]([Inner(hello='hello') for _ in range(11)]) def test_doc_view_update(batch): diff --git a/tests/units/array/stack/test_array_stacked_tf.py b/tests/units/array/stack/test_array_stacked_tf.py index 0ec91268575..19f27fad114 100644 --- a/tests/units/array/stack/test_array_stacked_tf.py +++ b/tests/units/array/stack/test_array_stacked_tf.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.array import DocArrayStacked from docarray.typing import AnyTensor, NdArray from docarray.utils._internal.misc import is_tf_available @@ -22,7 +22,7 @@ class Image(BaseDoc): import tensorflow as tf - batch = DocArray[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) + batch = DocList[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) return batch.stack() @@ -33,14 +33,14 @@ class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] class MMdoc(BaseDoc): - img: DocArray[Image] + img: DocList[Image] import tensorflow as tf batch = DocArrayStacked[MMdoc]( [ MMdoc( - img=DocArray[Image]( + img=DocList[Image]( [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] ) ) @@ -109,7 +109,7 @@ class Image(BaseDoc): class MMdoc(BaseDoc): img: Image - batch = DocArray[MMdoc]( + batch = DocList[MMdoc]( [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] ).stack() @@ -164,7 +164,7 @@ class MMdoc(BaseDoc): def test_unstack_nested_DocArray(nested_batch): batch = nested_batch.unstack() for i in range(len(batch)): - assert isinstance(batch[i].img, DocArray) + assert isinstance(batch[i].img, DocList) for 
doc in batch[i].img: assert tnp.allclose(doc.tensor.tensor, tf.zeros((3, 224, 224))) @@ -174,7 +174,7 @@ def test_stack_call(): class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - da = DocArray[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) + da = DocList[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) da = da.stack() @@ -283,7 +283,7 @@ def test_keep_dtype_tf(): class MyDoc(BaseDoc): tensor: TensorFlowTensor - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [MyDoc(tensor=tf.zeros([2, 4], dtype=tf.int32)) for _ in range(3)] ) assert da[0].tensor.tensor.dtype == tf.int32 diff --git a/tests/units/array/stack/test_proto.py b/tests/units/array/stack/test_proto.py index 1589c28197b..4f2db70df48 100644 --- a/tests/units/array/stack/test_proto.py +++ b/tests/units/array/stack/test_proto.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.array import DocArrayStacked from docarray.typing import NdArray, TorchTensor @@ -12,7 +12,7 @@ def batch(): class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[Image]([Image(tensor=torch.zeros(3, 224, 224)) for _ in range(10)]) + batch = DocList[Image]([Image(tensor=torch.zeros(3, 224, 224)) for _ in range(10)]) return batch.stack() @@ -27,7 +27,7 @@ def test_proto_stacked_mode_numpy(): class MyDoc(BaseDoc): tensor: NdArray[3, 224, 224] - da = DocArray[MyDoc]([MyDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)]) + da = DocList[MyDoc]([MyDoc(tensor=np.zeros((3, 224, 224))) for _ in range(10)]) da = da.stack() @@ -39,7 +39,7 @@ def test_stacked_proto(): class CustomDocument(BaseDoc): image: NdArray - da = DocArray[CustomDocument]( + da = DocList[CustomDocument]( [CustomDocument(image=np.zeros((3, 224, 224))) for _ in range(10)] ).stack() diff --git a/tests/units/array/test_array.py b/tests/units/array/test_array.py index d47089176bb..79d50b64e82 100644 --- 
a/tests/units/array/test_array.py +++ b/tests/units/array/test_array.py @@ -1,9 +1,10 @@ from typing import Optional, TypeVar, Union + import numpy as np import pytest import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import ImageUrl, NdArray, TorchTensor from docarray.utils._internal.misc import is_tf_available @@ -19,7 +20,7 @@ def da(): class Text(BaseDoc): text: str - return DocArray[Text]([Text(text=f'hello {i}') for i in range(10)]) + return DocList[Text]([Text(text=f'hello {i}') for i in range(10)]) def test_iterate(da): @@ -31,7 +32,7 @@ def test_append(): class Text(BaseDoc): text: str - da = DocArray[Text]([]) + da = DocList[Text]([]) da.append(Text(text='hello', id='1')) @@ -43,7 +44,7 @@ def test_extend(): class Text(BaseDoc): text: str - da = DocArray[Text]([Text(text='hello', id=str(i)) for i in range(10)]) + da = DocList[Text]([Text(text='hello', id=str(i)) for i in range(10)]) da.extend([Text(text='hello', id=str(10 + i)) for i in range(10)]) @@ -62,13 +63,13 @@ def test_document_array(): class Text(BaseDoc): text: str - da = DocArray([Text(text='hello') for _ in range(10)]) + da = DocList([Text(text='hello') for _ in range(10)]) assert len(da) == 10 def test_empty_array(): - da = DocArray() + da = DocList() len(da) == 0 @@ -76,7 +77,7 @@ def test_document_array_fixed_type(): class Text(BaseDoc): text: str - da = DocArray[Text]([Text(text='hello') for _ in range(10)]) + da = DocList[Text]([Text(text='hello') for _ in range(10)]) assert len(da) == 10 @@ -113,8 +114,8 @@ def test_documentarray(): class Text(BaseDoc): text: str - da1 = DocArray([Text(text='hello')]) - da2 = DocArray([Text(text='hello')]) + da1 = DocList([Text(text='hello')]) + da2 = DocList([Text(text='hello')]) assert da1 == da2 assert da1 == [Text(text='hello') for _ in range(len(da1))] @@ -156,7 +157,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]( + da = DocList[Mmdoc]( (Mmdoc(text=f'hello{i}', 
tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -182,7 +183,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) + da = DocList[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) list_docs = [InnerDoc(text=f'hello{i}') for i in range(N)] da._set_data_column('inner', list_docs) @@ -198,7 +199,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]( + da = DocList[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -224,9 +225,9 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) + da = DocList[Mmdoc]((Mmdoc(inner=InnerDoc(text=f'hello{i}')) for i in range(N))) - assert isinstance(da.inner, DocArray) + assert isinstance(da.inner, DocList) def test_get_bulk_attributes_optional_type(): @@ -236,7 +237,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]( + da = DocList[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -260,7 +261,7 @@ class Mmdoc(BaseDoc): N = 10 - da = DocArray[Mmdoc]( + da = DocList[Mmdoc]( (Mmdoc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) @@ -288,7 +289,7 @@ class MyDoc(BaseDoc): Optional[Union[TorchTensor, NdArray, TensorFlowTensor]], TorchTensor ] - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=torch.rand(10), @@ -315,12 +316,12 @@ class Doc(BaseDoc): N = 10 - da = DocArray[Doc]( + da = DocList[Doc]( (Doc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)) ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocArray) + assert isinstance(da_sliced, DocList) tensors = da_sliced.tensor assert len(tensors) == 5 @@ -364,13 +365,13 @@ def test_del_item(da): def test_generic_type_var(): T = TypeVar('T', bound=BaseDoc) - def f(a: DocArray[T]) -> DocArray[T]: + def f(a: DocList[T]) -> DocList[T]: return a - def g(a: DocArray['BaseDoc']) -> DocArray['BaseDoc']: 
+ def g(a: DocList['BaseDoc']) -> DocList['BaseDoc']: return a - a = DocArray() + a = DocList() f(a) g(a) @@ -381,7 +382,7 @@ class Text(BaseDoc): docs = [Text(text=f'hello {i}') for i in range(10)] - da = DocArray[Text].construct(docs) + da = DocList[Text].construct(docs) assert da._data is docs @@ -392,7 +393,7 @@ class Text(BaseDoc): docs = [Text(text=f'hello {i}') for i in range(10)] - da = DocArray[Text](docs) + da = DocList[Text](docs) da.reverse() assert da[-1].text == 'hello 0' assert da[0].text == 'hello 9' @@ -405,7 +406,7 @@ class Image(BaseDoc): def test_remove(): images = [Image(url=f'http://url.com/foo_{i}.png') for i in range(3)] - da = DocArray[Image](images) + da = DocList[Image](images) da.remove(images[1]) assert len(da) == 2 assert da[0] == images[0] @@ -414,7 +415,7 @@ def test_remove(): def test_pop(): images = [Image(url=f'http://url.com/foo_{i}.png') for i in range(3)] - da = DocArray[Image](images) + da = DocList[Image](images) popped = da.pop(1) assert len(da) == 2 assert popped == images[1] @@ -426,7 +427,7 @@ def test_sort(): images = [ Image(url=f'http://url.com/foo_{i}.png', tensor=NdArray(i)) for i in [2, 0, 1] ] - da = DocArray[Image](images) + da = DocList[Image](images) da.sort(key=lambda img: len(img.tensor)) assert len(da) == 3 assert da[0].url == 'http://url.com/foo_0.png' diff --git a/tests/units/array/test_array_from_to_bytes.py b/tests/units/array/test_array_from_to_bytes.py index 0d269e036a3..7cd9f0dfd8c 100644 --- a/tests/units/array/test_array_from_to_bytes.py +++ b/tests/units/array/test_array_from_to_bytes.py @@ -1,6 +1,6 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -17,7 +17,7 @@ class MyDoc(BaseDoc): @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) @pytest.mark.parametrize('show_progress', [False, True]) def test_from_to_bytes(protocol, compress, 
show_progress): - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -28,7 +28,7 @@ def test_from_to_bytes(protocol, compress, show_progress): bytes_da = da.to_bytes( protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocArray[MyDoc].from_bytes( + da2 = DocList[MyDoc].from_bytes( bytes_da, protocol=protocol, compress=compress, show_progress=show_progress ) assert len(da2) == 2 @@ -47,7 +47,7 @@ def test_from_to_bytes(protocol, compress, show_progress): @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) @pytest.mark.parametrize('show_progress', [False, True]) def test_from_to_base64(protocol, compress, show_progress): - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -58,7 +58,7 @@ def test_from_to_base64(protocol, compress, show_progress): bytes_da = da.to_base64( protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocArray[MyDoc].from_base64( + da2 = DocList[MyDoc].from_base64( bytes_da, protocol=protocol, compress=compress, show_progress=show_progress ) assert len(da2) == 2 diff --git a/tests/units/array/test_array_from_to_csv.py b/tests/units/array/test_array_from_to_csv.py index ecec376d433..09ec98b6432 100644 --- a/tests/units/array/test_array_from_to_csv.py +++ b/tests/units/array/test_array_from_to_csv.py @@ -3,7 +3,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from tests import TOYDATA_DIR @@ -22,7 +22,7 @@ class MyDocNested(MyDoc): def test_to_from_csv(tmpdir, nested_doc_cls): - da = DocArray[nested_doc_cls]( + da = DocList[nested_doc_cls]( [ nested_doc_cls( count=0, @@ -37,13 +37,13 @@ def test_to_from_csv(tmpdir, nested_doc_cls): da.to_csv(tmp_file) assert os.path.isfile(tmp_file) - da_from = 
DocArray[nested_doc_cls].from_csv(tmp_file) + da_from = DocList[nested_doc_cls].from_csv(tmp_file) for doc1, doc2 in zip(da, da_from): assert doc1 == doc2 def test_from_csv_nested(nested_doc_cls): - da = DocArray[nested_doc_cls].from_csv( + da = DocList[nested_doc_cls].from_csv( file_path=str(TOYDATA_DIR / 'docs_nested.csv') ) assert len(da) == 3 @@ -91,9 +91,9 @@ class Outer(BaseDoc): def test_from_csv_without_schema_raise_exception(): with pytest.raises(TypeError, match='no document schema defined'): - DocArray.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv')) + DocList.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv')) def test_from_csv_with_wrong_schema_raise_exception(nested_doc): with pytest.raises(ValueError, match='Column names do not match the schema'): - DocArray[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv')) + DocList[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv')) diff --git a/tests/units/array/test_array_from_to_json.py b/tests/units/array/test_array_from_to_json.py index 52d6b2ec977..c36b8af92a9 100644 --- a/tests/units/array/test_array_from_to_json.py +++ b/tests/units/array/test_array_from_to_json.py @@ -1,4 +1,4 @@ -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -10,7 +10,7 @@ class MyDoc(BaseDoc): def test_from_to_json(): - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -19,7 +19,7 @@ def test_from_to_json(): ] ) json_da = da.to_json() - da2 = DocArray[MyDoc].from_json(json_da) + da2 = DocList[MyDoc].from_json(json_da) assert len(da2) == 2 assert len(da) == len(da2) for d1, d2 in zip(da, da2): diff --git a/tests/units/array/test_array_from_to_pandas.py b/tests/units/array/test_array_from_to_pandas.py index d01cd8a1d68..2f95f4f66aa 100644 --- a/tests/units/array/test_array_from_to_pandas.py +++ 
b/tests/units/array/test_array_from_to_pandas.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc @@ -20,7 +20,7 @@ class MyDocNested(MyDoc): def test_to_from_pandas_df(nested_doc_cls): - da = DocArray[nested_doc_cls]( + da = DocList[nested_doc_cls]( [ nested_doc_cls( count=0, @@ -47,7 +47,7 @@ def test_to_from_pandas_df(nested_doc_cls): ] ).all() - da_from_df = DocArray[nested_doc_cls].from_pandas(df) + da_from_df = DocList[nested_doc_cls].from_pandas(df) for doc1, doc2 in zip(da, da_from_df): assert doc1 == doc2 @@ -76,7 +76,7 @@ def test_from_pandas_without_schema_raise_exception(): df = pd.DataFrame( columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] ) - DocArray.from_pandas(df=df) + DocList.from_pandas(df=df) def test_from_pandas_with_wrong_schema_raise_exception(nested_doc): @@ -84,4 +84,4 @@ def test_from_pandas_with_wrong_schema_raise_exception(nested_doc): df = pd.DataFrame( columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] ) - DocArray[nested_doc.__class__].from_pandas(df=df) + DocList[nested_doc.__class__].from_pandas(df=df) diff --git a/tests/units/array/test_array_proto.py b/tests/units/array/test_array_proto.py index ac0265016fc..ebdf0d9a3f9 100644 --- a/tests/units/array/test_array_proto.py +++ b/tests/units/array/test_array_proto.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.typing import NdArray @@ -12,11 +12,11 @@ class CustomDoc(BaseDoc): text: str tensor: NdArray - da = DocArray( + da = DocList( [CustomDoc(text='hello', tensor=np.zeros((3, 224, 224))) for _ in range(10)] ) - new_da = DocArray[CustomDoc].from_protobuf(da.to_protobuf()) + new_da = DocList[CustomDoc].from_protobuf(da.to_protobuf()) for doc1, doc2 in zip(da, new_da): assert doc1.text == 
doc2.text @@ -29,7 +29,7 @@ class CustomDocument(BaseDoc): text: TextDoc image: ImageDoc - da = DocArray[CustomDocument]( + da = DocList[CustomDocument]( [ CustomDocument( text=TextDoc(text='hello'), @@ -39,7 +39,7 @@ class CustomDocument(BaseDoc): ] ) - DocArray[CustomDocument].from_protobuf(da.to_protobuf()) + DocList[CustomDocument].from_protobuf(da.to_protobuf()) @pytest.mark.proto @@ -48,7 +48,7 @@ class CustomDocument(BaseDoc): text: TextDoc image: ImageDoc - da = DocArray[CustomDocument]( + da = DocList[CustomDocument]( [ CustomDocument( text=TextDoc(text='hello'), @@ -58,4 +58,4 @@ class CustomDocument(BaseDoc): ] ) - DocArray.from_protobuf(da.to_protobuf()) + DocList.from_protobuf(da.to_protobuf()) diff --git a/tests/units/array/test_array_save_load.py b/tests/units/array/test_array_save_load.py index 795c437608d..1a632673d15 100644 --- a/tests/units/array/test_array_save_load.py +++ b/tests/units/array/test_array_save_load.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import NdArray @@ -23,7 +23,7 @@ class MyDoc(BaseDoc): def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): tmp_file = os.path.join(tmp_path, 'test') - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png') @@ -36,7 +36,7 @@ def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocArray[MyDoc].load_binary( + da2 = DocList[MyDoc].load_binary( tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) @@ -59,7 +59,7 @@ def test_array_save_load_binary(protocol, compress, tmp_path, show_progress): def test_array_save_load_binary_streaming(protocol, compress, tmp_path, show_progress): tmp_file = os.path.join(tmp_path, 'test') - da = 
DocArray[MyDoc]() + da = DocList[MyDoc]() def _extend_da(num_docs=100): for _ in range(num_docs): @@ -79,8 +79,8 @@ def _extend_da(num_docs=100): tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) - da2 = DocArray[MyDoc]() - da_generator = DocArray[MyDoc].load_binary( + da2 = DocList[MyDoc]() + da_generator = DocList[MyDoc].load_binary( tmp_file, protocol=protocol, compress=compress, show_progress=show_progress ) diff --git a/tests/units/array/test_batching.py b/tests/units/array/test_batching.py index 389d649dbc4..88689c0f644 100644 --- a/tests/units/array/test_batching.py +++ b/tests/units/array/test_batching.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import NdArray @@ -14,7 +14,7 @@ class MyDoc(BaseDoc): tensor: NdArray t_shape = (32, 32) - da = DocArray[MyDoc]( + da = DocList[MyDoc]( [ MyDoc( id=i, diff --git a/tests/units/array/test_generic_array.py b/tests/units/array/test_generic_array.py index e0b5386e676..a693e810e95 100644 --- a/tests/units/array/test_generic_array.py +++ b/tests/units/array/test_generic_array.py @@ -1,4 +1,4 @@ -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.base_doc import AnyDoc @@ -6,14 +6,14 @@ def test_generic_init(): class Text(BaseDoc): text: str - da = DocArray[Text]([]) + da = DocList[Text]([]) da.document_type == Text - assert isinstance(da, DocArray) + assert isinstance(da, DocList) def test_normal_access_init(): - da = DocArray([]) + da = DocList([]) da.document_type == AnyDoc - assert isinstance(da, DocArray) + assert isinstance(da, DocList) diff --git a/tests/units/array/test_indexing.py b/tests/units/array/test_indexing.py index 6aa9e363301..7377e3aac08 100644 --- a/tests/units/array/test_indexing.py +++ b/tests/units/array/test_indexing.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import DocArray, DocArrayStacked +from docarray 
import DocArrayStacked, DocList from docarray.documents import TextDoc from docarray.typing import TorchTensor @@ -11,7 +11,7 @@ def da(): texts = [f'hello {i}' for i in range(10)] tensors = [torch.ones((4,)) * i for i in range(10)] - return DocArray[TextDoc]( + return DocList[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], ) @@ -20,7 +20,7 @@ def da(): def da_to_set(): texts = [f'hello {2*i}' for i in range(5)] tensors = [torch.ones((4,)) * i * 2 for i in range(5)] - return DocArray[TextDoc]( + return DocList[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], ) diff --git a/tests/units/array/test_traverse.py b/tests/units/array/test_traverse.py index b6bd25f0be8..8f648526faa 100644 --- a/tests/units/array/test_traverse.py +++ b/tests/units/array/test_traverse.py @@ -3,7 +3,7 @@ import pytest import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.array.abstract_array import AnyDocArray from docarray.documents import TextDoc from docarray.typing import TorchTensor @@ -21,21 +21,21 @@ class SubSubDoc(BaseDoc): class SubDoc(BaseDoc): sub_text: TextDoc - sub_da: DocArray[SubSubDoc] + sub_da: DocList[SubSubDoc] class MultiModalDoc(BaseDoc): mm_text: TextDoc mm_tensor: Optional[TorchTensor[3, 2, 2]] - mm_da: DocArray[SubDoc] + mm_da: DocList[SubDoc] - docs = DocArray[MultiModalDoc]( + docs = DocList[MultiModalDoc]( [ MultiModalDoc( mm_text=TextDoc(text=f'hello{i}'), mm_da=[ SubDoc( sub_text=TextDoc(text=f'sub_{i}_1'), - sub_da=DocArray[SubSubDoc]( + sub_da=DocList[SubSubDoc]( [ SubSubDoc( sub_sub_text=TextDoc(text='subsub'), @@ -81,7 +81,7 @@ def test_traverse_stacked_da(): class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[Image]( + batch = DocList[Image]( [ Image( tensor=torch.zeros(3, 224, 224), @@ -112,7 +112,7 @@ def test_flatten_one_level(input_list, output_list): def test_flatten_one_level_list_of_da(): doc = BaseDoc() 
- input_list = [DocArray([doc, doc, doc])] + input_list = [DocList([doc, doc, doc])] flattened = AnyDocArray._flatten_one_level(sequence=input_list) assert flattened == [doc, doc, doc] diff --git a/tests/units/document/proto/test_document_proto.py b/tests/units/document/proto/test_document_proto.py index 1642c17631d..cb5442f7700 100644 --- a/tests/units/document/proto/test_document_proto.py +++ b/tests/units/document/proto/test_document_proto.py @@ -4,7 +4,7 @@ import pytest import torch -from docarray import DocArray +from docarray import DocList from docarray.base_doc import BaseDoc from docarray.typing import NdArray, TorchTensor from docarray.utils._internal.misc import is_tf_available @@ -57,11 +57,11 @@ class CustomInnerDoc(BaseDoc): class CustomDoc(BaseDoc): text: str - chunks: DocArray[CustomInnerDoc] + chunks: DocList[CustomInnerDoc] doc = CustomDoc( text='hello', - chunks=DocArray[CustomInnerDoc]( + chunks=DocList[CustomInnerDoc]( [CustomInnerDoc(tensor=np.zeros((3, 224, 224))) for _ in range(5)], ), ) @@ -95,11 +95,11 @@ class CustomInnerDoc(BaseDoc): class CustomDoc(BaseDoc): text: str - chunks: DocArray[CustomInnerDoc] + chunks: DocList[CustomInnerDoc] doc = CustomDoc( text='hello', - chunks=DocArray[CustomInnerDoc]( + chunks=DocList[CustomInnerDoc]( [CustomInnerDoc(tensor=torch.zeros((3, 224, 224))) for _ in range(5)], ), ) diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py index 690b83649ed..5e76caa0dc2 100644 --- a/tests/units/document/test_update.py +++ b/tests/units/document/test_update.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc @@ -16,8 +16,8 @@ class MMDoc(BaseDoc): price: int = 0 categories: Optional[List[str]] = None image: Optional[ImageDoc] = None - matches: Optional[DocArray] = None - matches_with_same_id: Optional[DocArray] = None + matches: Optional[DocList] = None + matches_with_same_id: 
Optional[DocList] = None opt_int: Optional[int] = None test_set: Optional[Set] = None inner_doc: Optional[InnerDoc] = None @@ -30,9 +30,9 @@ def doc1(): text='hey here', categories=['a', 'b', 'c'], price=10, - matches=DocArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocArray[MMDoc]( - [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] ), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, inner_list=['c', 'd']), @@ -48,9 +48,9 @@ def doc2(doc1): categories=['d', 'e', 'f'], price=5, opt_int=5, - matches=DocArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocArray[MMDoc]( - [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] ), test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, inner_list=['a', 'b']), diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py index 652400d2905..bb7e51b25fc 100644 --- a/tests/units/test_helper.py +++ b/tests/units/test_helper.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.helper import ( _access_path_dict_to_nested_dict, @@ -26,12 +26,12 @@ class Middle(BaseDoc): class Outer(BaseDoc): img: Optional[ImageDoc] middle: Optional[Middle] - da: DocArray[Inner] + da: DocList[Inner] doc = Outer( img=ImageDoc(), middle=Middle(img=ImageDoc(), inner=Inner(img=ImageDoc())), - da=DocArray[Inner]([Inner(img=ImageDoc(url='test.png'))]), + da=DocList[Inner]([Inner(img=ImageDoc(url='test.png'))]), ) return doc diff --git a/tests/units/typing/da/test_relations.py b/tests/units/typing/da/test_relations.py index fcdf1177657..b00e965c8e7 100644 --- a/tests/units/typing/da/test_relations.py +++ b/tests/units/typing/da/test_relations.py @@ -1,33 +1,33 @@ -from docarray import 
BaseDoc, DocArray +from docarray import BaseDoc, DocList def test_instance_and_equivalence(): class MyDoc(BaseDoc): text: str - docs = DocArray[MyDoc]([MyDoc(text='hello')]) + docs = DocList[MyDoc]([MyDoc(text='hello')]) - assert issubclass(DocArray[MyDoc], DocArray[MyDoc]) - assert issubclass(docs.__class__, DocArray[MyDoc]) + assert issubclass(DocList[MyDoc], DocList[MyDoc]) + assert issubclass(docs.__class__, DocList[MyDoc]) - assert isinstance(docs, DocArray[MyDoc]) + assert isinstance(docs, DocList[MyDoc]) def test_subclassing(): class MyDoc(BaseDoc): text: str - class MyDocArray(DocArray[MyDoc]): + class MyDocList(DocList[MyDoc]): pass - docs = MyDocArray([MyDoc(text='hello')]) + docs = MyDocList([MyDoc(text='hello')]) - assert issubclass(MyDocArray, DocArray[MyDoc]) - assert issubclass(docs.__class__, DocArray[MyDoc]) + assert issubclass(MyDocList, DocList[MyDoc]) + assert issubclass(docs.__class__, DocList[MyDoc]) - assert isinstance(docs, MyDocArray) - assert isinstance(docs, DocArray[MyDoc]) + assert isinstance(docs, MyDocList) + assert isinstance(docs, DocList[MyDoc]) assert issubclass(MyDoc, BaseDoc) - assert not issubclass(DocArray[MyDoc], DocArray[BaseDoc]) - assert not issubclass(MyDocArray, DocArray[BaseDoc]) + assert not issubclass(DocList[MyDoc], DocList[BaseDoc]) + assert not issubclass(MyDocList, DocList[BaseDoc]) diff --git a/tests/units/util/test_filter.py b/tests/units/util/test_filter.py index 14e43290e9a..21c427a7bbf 100644 --- a/tests/units/util/test_filter.py +++ b/tests/units/util/test_filter.py @@ -3,7 +3,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc, TextDoc from docarray.utils.filter import filter_docs @@ -45,7 +45,7 @@ def docs(): optional_num=30, dictionary={'a': 0, 'b': 1}, ) - docs = DocArray[MMDoc]([mmdoc1, mmdoc2, mmdoc3]) + docs = DocList[MMDoc]([mmdoc1, mmdoc2, mmdoc3]) return docs @@ -173,7 +173,7 @@ def test_array_simple_filters(docs, 
dict_api): @pytest.mark.parametrize('dict_api', [True, False]) def test_placehold_filter(dict_api): - docs = DocArray[MMDoc]( + docs = DocList[MMDoc]( [ MMDoc(text='A', text_doc=TextDoc(text='A')), MMDoc(text='A', text_doc=TextDoc(text='B')), @@ -251,7 +251,7 @@ class MyDocument(BaseDoc): image: ImageDoc price: int - docs = DocArray[MyDocument]( + docs = DocList[MyDocument]( [ MyDocument( caption='A tiger in the jungle', diff --git a/tests/units/util/test_find.py b/tests/units/util/test_find.py index 9239e6d8dff..90b3c7005d8 100644 --- a/tests/units/util/test_find.py +++ b/tests/units/util/test_find.py @@ -4,7 +4,7 @@ import pytest import torch -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.typing import NdArray, TorchTensor from docarray.utils.find import find, find_batched @@ -24,7 +24,7 @@ def random_torch_query(): @pytest.fixture() def random_torch_batch_query(): - return DocArray[TorchDoc]([TorchDoc(tensor=torch.rand(128)) for _ in range(5)]) + return DocList[TorchDoc]([TorchDoc(tensor=torch.rand(128)) for _ in range(5)]) @pytest.fixture() @@ -34,17 +34,17 @@ def random_nd_query(): @pytest.fixture() def random_nd_batch_query(): - return DocArray[NdDoc]([NdDoc(tensor=np.random.rand(128)) for _ in range(5)]) + return DocList[NdDoc]([NdDoc(tensor=np.random.rand(128)) for _ in range(5)]) @pytest.fixture() def random_torch_index(): - return DocArray[TorchDoc](TorchDoc(tensor=torch.rand(128)) for _ in range(10)) + return DocList[TorchDoc](TorchDoc(tensor=torch.rand(128)) for _ in range(10)) @pytest.fixture() def random_nd_index(): - return DocArray[NdDoc](NdDoc(tensor=np.random.rand(128)) for _ in range(10)) + return DocList[NdDoc](NdDoc(tensor=np.random.rand(128)) for _ in range(10)) @pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) @@ -261,7 +261,7 @@ class MyDoc(BaseDoc): embedding: Optional[TorchTensor] query = MyDoc(embedding=torch.rand(10)) - index = 
DocArray[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) + index = DocList[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) top_k, scores = find( index, @@ -279,7 +279,7 @@ class MyDoc(BaseDoc): embedding: Union[TorchTensor, NdArray] query = MyDoc(embedding=torch.rand(10)) - index = DocArray[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) + index = DocList[MyDoc]([MyDoc(embedding=torch.rand(10)) for _ in range(10)]) top_k, scores = find( index, @@ -302,7 +302,7 @@ class MyDoc(BaseDoc): inner: InnerDoc query = MyDoc(inner=InnerDoc(title='query', embedding=torch.rand(2))) - index = DocArray[MyDoc]( + index = DocList[MyDoc]( [ MyDoc(inner=InnerDoc(title=f'doc {i}', embedding=torch.rand(2))) for i in range(10) @@ -335,7 +335,7 @@ class MyDoc(BaseDoc): embedding3=torch.rand(10), embedding4=torch.rand(10), ) - index = DocArray[MyDoc]( + index = DocList[MyDoc]( [ MyDoc( embedding=torch.rand(10), diff --git a/tests/units/util/test_map.py b/tests/units/util/test_map.py index c36ebc2f46e..f4864c239f5 100644 --- a/tests/units/util/test_map.py +++ b/tests/units/util/test_map.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.typing import ImageUrl, NdArray from docarray.utils.map import map_docs, map_docs_batched @@ -19,7 +19,7 @@ def load_from_doc(d: ImageDoc) -> ImageDoc: @pytest.fixture() def da(): - da = DocArray[ImageDoc]([ImageDoc(url=IMAGE_PATHS['png']) for _ in range(N_DOCS)]) + da = DocList[ImageDoc]([ImageDoc(url=IMAGE_PATHS['png']) for _ in range(N_DOCS)]) return da @@ -50,7 +50,7 @@ def local_func(x): @pytest.mark.parametrize('backend', ['thread', 'process']) def test_check_order(backend): - da = DocArray[ImageDoc]([ImageDoc(id=i) for i in range(N_DOCS)]) + da = DocList[ImageDoc]([ImageDoc(id=i) for i in range(N_DOCS)]) docs = list(map_docs(da=da, func=load_from_doc, backend=backend)) @@ -59,7 +59,7 @@ def 
test_check_order(backend): assert doc.id == str(i) -def load_from_da(da: DocArray) -> DocArray: +def load_from_da(da: DocList) -> DocList: for doc in da: doc.tensor = doc.url.load() return da @@ -75,11 +75,11 @@ class MyImage(BaseDoc): @pytest.mark.parametrize('backend', ['thread', 'process']) def test_map_docs_batched(n_docs, batch_size, backend): - da = DocArray[MyImage]([MyImage(url=IMAGE_PATHS['png']) for _ in range(n_docs)]) + da = DocList[MyImage]([MyImage(url=IMAGE_PATHS['png']) for _ in range(n_docs)]) it = map_docs_batched( da=da, func=load_from_da, batch_size=batch_size, backend=backend ) assert isinstance(it, Generator) for batch in it: - assert isinstance(batch, DocArray[MyImage]) + assert isinstance(batch, DocList[MyImage]) diff --git a/tests/units/util/test_reduce.py b/tests/units/util/test_reduce.py index e72e8863a46..e07af67b0ec 100644 --- a/tests/units/util/test_reduce.py +++ b/tests/units/util/test_reduce.py @@ -2,7 +2,7 @@ import pytest -from docarray import BaseDoc, DocArray +from docarray import BaseDoc, DocList from docarray.documents import ImageDoc from docarray.utils.reduce import reduce, reduce_all @@ -17,8 +17,8 @@ class MMDoc(BaseDoc): price: int = 0 categories: Optional[List[str]] = None image: Optional[ImageDoc] = None - matches: Optional[DocArray] = None - matches_with_same_id: Optional[DocArray] = None + matches: Optional[DocList] = None + matches_with_same_id: Optional[DocList] = None opt_int: Optional[int] = None test_set: Optional[Set] = None inner_doc: Optional[InnerDoc] = None @@ -31,9 +31,9 @@ def doc1(): text='hey here', categories=['a', 'b', 'c'], price=10, - matches=DocArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocArray[MMDoc]( - [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] ), test_set={'a', 'a'}, inner_doc=InnerDoc(integer=2, inner_list=['c', 'd']), @@ -49,9 +49,9 @@ def doc2(doc1): 
categories=['d', 'e', 'f'], price=5, opt_int=5, - matches=DocArray[MMDoc]([MMDoc()]), - matches_with_same_id=DocArray[MMDoc]( - [MMDoc(id='a', matches=DocArray[MMDoc]([MMDoc()]))] + matches=DocList[MMDoc]([MMDoc()]), + matches_with_same_id=DocList[MMDoc]( + [MMDoc(id='a', matches=DocList[MMDoc]([MMDoc()]))] ), test_set={'a', 'b'}, inner_doc=InnerDoc(integer=3, inner_list=['a', 'b']), @@ -60,8 +60,8 @@ def doc2(doc1): def test_reduce_different_ids(): - da1 = DocArray[MMDoc]([MMDoc() for _ in range(10)]) - da2 = DocArray[MMDoc]([MMDoc() for _ in range(10)]) + da1 = DocList[MMDoc]([MMDoc() for _ in range(10)]) + da2 = DocList[MMDoc]([MMDoc() for _ in range(10)]) result = reduce(da1, da2) assert len(result) == 20 # da1 is changed in place (no extra memory) @@ -69,8 +69,8 @@ def test_reduce_different_ids(): def test_reduce(doc1, doc2): - da1 = DocArray[MMDoc]([doc1, MMDoc()]) - da2 = DocArray[MMDoc]([MMDoc(), doc2]) + da1 = DocList[MMDoc]([doc1, MMDoc()]) + da2 = DocList[MMDoc]([MMDoc(), doc2]) result = reduce(da1, da2) assert len(result) == 3 # da1 is changed in place (no extra memory) @@ -89,9 +89,9 @@ def test_reduce(doc1, doc2): def test_reduce_all(doc1, doc2): - da1 = DocArray[MMDoc]([doc1, MMDoc()]) - da2 = DocArray[MMDoc]([MMDoc(), doc2]) - da3 = DocArray[MMDoc]([MMDoc(), MMDoc(), doc1]) + da1 = DocList[MMDoc]([doc1, MMDoc()]) + da2 = DocList[MMDoc]([MMDoc(), doc2]) + da3 = DocList[MMDoc]([MMDoc(), MMDoc(), doc1]) result = reduce_all([da1, da2, da3]) assert len(result) == 5 # da1 is changed in place (no extra memory) From d33e29512747efad00c05b83518b86d00026b986 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 15:10:26 +0200 Subject: [PATCH 02/27] refactor: rename DocArray to DocList Signed-off-by: samsja --- docarray/array/array/array.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docarray/array/array/array.py b/docarray/array/array/array.py index ba1acfab013..9d3f43ece1a 100644 --- 
a/docarray/array/array/array.py +++ b/docarray/array/array/array.py @@ -61,19 +61,19 @@ class DocList( IndexingSequenceMixin[T_doc], PushPullMixin, IOMixinArray, AnyDocArray[T_doc] ): """ - DocArray is a container of Documents. + DocList is a container of Documents. - A DocArray is a list of Documents of any schema. However, many + A DocList is a list of Documents of any schema. However, many DocArray features are only available if these Documents are homogeneous and follow the same schema. To precise this schema you can use the `DocArray[MyDocument]` syntax where MyDocument is a Document class - (i.e. schema). This creates a DocArray that can only contains Documents of + (i.e. schema). This creates a DocList that can only contain Documents of the type 'MyDocument'. --- ```python - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList from docarray.typing import NdArray, ImageUrl from typing import Optional @@ -83,7 +83,7 @@ class Image(BaseDoc): url: ImageUrl - da = DocArray[Image]( + da = DocList[Image]( Image(url='http://url.com/foo.png') for _ in range(10) ) # noqa: E510 ``` @@ -91,7 +91,7 @@ class Image(BaseDoc): --- - If your DocArray is homogeneous (i.e. follows the same schema), you can access + If your DocList is homogeneous (i.e. follows the same schema), you can access fields at the DocArray level (for example `da.tensor` or `da.url`). You can also set fields, with `da.tensor = np.random.random([10, 100])`: @@ -104,7 +104,7 @@ class Image(BaseDoc): # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] - You can index into a DocArray like a numpy array or torch tensor: + You can index into a DocList like a numpy array or torch tensor: da[0] # index by position @@ -112,7 +112,7 @@ class Image(BaseDoc): da[[0, 2, 3]] # index by list of indices da[True, False, True, True, ...]
# index by boolean mask - You can delete items from a DocArray like a Python List + You can delete items from a DocList like a Python List del da[0] # remove first element from DocArray del da[0:5] # remove elements for 0 to 5 from DocArray From 60a3050cab07135307e4ac3c46dc0dba6410bc83 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 15:16:06 +0200 Subject: [PATCH 03/27] fix: fix Ci Signed-off-by: samsja --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 130e72de9dd..d49d1d603c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,7 +61,7 @@ jobs: poetry install --without dev poetry run pip install tensorflow==2.11.0 - name: Test basic import - run: poetry run python -c 'from docarray import DocArray, BaseDoc' + run: poetry run python -c 'from docarray import DocList, BaseDoc' check-mypy: From a8b108704e3676da162934039f5e5b5d70d2c3ad Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 15:40:47 +0200 Subject: [PATCH 04/27] refactor: rename DocArrayStack to DocVec Signed-off-by: samsja --- docarray/__init__.py | 4 +- docarray/array/__init__.py | 4 +- docarray/array/array/array.py | 12 ++--- docarray/array/stacked/array_stacked.py | 28 +++++------ docarray/array/stacked/column_storage.py | 6 +-- docarray/data/torch_dataset.py | 6 +-- docarray/display/document_array_summary.py | 8 +-- docarray/utils/find.py | 4 +- docs/api_references/array/da_stack.md | 4 +- .../torch/data/test_torch_dataset.py | 12 ++--- .../units/array/stack/storage/test_storage.py | 8 +-- tests/units/array/stack/test_array_stacked.py | 50 +++++++++---------- .../array/stack/test_array_stacked_tf.py | 24 ++++----- tests/units/array/stack/test_init.py | 6 +-- tests/units/array/stack/test_proto.py | 6 +-- tests/units/array/test_indexing.py | 4 +- tests/units/document/test_view.py | 4 +- 17 files changed, 92 insertions(+), 98 deletions(-) diff --git 
a/docarray/__init__.py b/docarray/__init__.py index 0189cab7250..2bffdc80803 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -2,10 +2,10 @@ import logging -from docarray.array import DocArrayStacked, DocList +from docarray.array import DocList, DocVec from docarray.base_doc.doc import BaseDoc -__all__ = ['BaseDoc', 'DocList', 'DocArrayStacked'] +__all__ = ['BaseDoc', 'DocList', 'DocVec'] logger = logging.getLogger('docarray') diff --git a/docarray/array/__init__.py b/docarray/array/__init__.py index fd0544f0c91..0f200cc598e 100644 --- a/docarray/array/__init__.py +++ b/docarray/array/__init__.py @@ -1,4 +1,4 @@ from docarray.array.array.array import DocList -from docarray.array.stacked.array_stacked import DocArrayStacked +from docarray.array.stacked.array_stacked import DocVec -__all__ = ['DocList', 'DocArrayStacked'] +__all__ = ['DocList', 'DocVec'] diff --git a/docarray/array/array/array.py b/docarray/array/array/array.py index 9d3f43ece1a..3f881049388 100644 --- a/docarray/array/array/array.py +++ b/docarray/array/array/array.py @@ -31,7 +31,7 @@ from pydantic import BaseConfig from pydantic.fields import ModelField - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.stacked.array_stacked import DocVec from docarray.proto import DocumentArrayProto from docarray.typing import TorchTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -253,7 +253,7 @@ def _set_data_column( def stack( self, tensor_type: Type['AbstractTensor'] = NdArray, - ) -> 'DocArrayStacked': + ) -> 'DocVec': """ Convert the DocArray into a DocArrayStacked. 
`Self` cannot be used afterwards @@ -261,9 +261,9 @@ def stack( if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor :return: A DocArrayStacked of the same document type as self """ - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.stacked.array_stacked import DocVec - return DocArrayStacked.__class_getitem__(self.document_type)( + return DocVec.__class_getitem__(self.document_type)( self, tensor_type=tensor_type ) @@ -274,9 +274,9 @@ def validate( field: 'ModelField', config: 'BaseConfig', ): - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.stacked.array_stacked import DocVec - if isinstance(value, (cls, DocArrayStacked)): + if isinstance(value, (cls, DocVec)): return value elif isinstance(value, Iterable): return cls(value) diff --git a/docarray/array/stacked/array_stacked.py b/docarray/array/stacked/array_stacked.py index 5e634f36b00..aa0f0e1403c 100644 --- a/docarray/array/stacked/array_stacked.py +++ b/docarray/array/stacked/array_stacked.py @@ -49,39 +49,39 @@ TensorFlowTensor = None # type: ignore T_doc = TypeVar('T_doc', bound=BaseDoc) -T = TypeVar('T', bound='DocArrayStacked') +T = TypeVar('T', bound='DocVec') IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] -class DocArrayStacked(AnyDocArray[T_doc]): +class DocVec(AnyDocArray[T_doc]): """ - DocArrayStacked is a container of Documents appropriates to perform + DocVec is a container of Documents appropriates to perform computation that require batches of data (ex: matrix multiplication, distance calculation, deep learning forward pass) - A DocArrayStacked has a similar interface as + A DocVec has a similar interface as {class}`~docarray.array.DocArray` but with an underlying implementation that is column based instead of row based. 
Each field of the schema of the DocArrayStack - (the :attr:`~docarray.array.stacked.DocArrayStacked.document_type` which is a + (the :attr:`~docarray.array.stacked.DocVec.document_type` which is a `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, stacked (torch/np/tf) tensor. If the tensor field is `AnyTensor` or a Union of tensor types, the :attr:`~docarray.array.stacked.DocArrayStacked.tensor_type` will be used to determine the type of the stacked column. - If the field is another `BasedDocument` the column will be another DocArrayStacked that follows the + If the field is another `BaseDoc` the column will be another DocArrayStacked that follows the schema of the nested Document. If the field is a `DocArray` or - `DocArrayStacked` then the column will be a list of `DocArrayStacked`. + `DocVec` then the column will be a list of `DocVec`. For any other type the column is a Python list. - Every `Document` inside a `DocArrayStacked` is a view into the data columns stored at the `DocArrayStacked` level. The `Document` does + Every `Document` inside a `DocArrayStacked` is a view into the data columns stored at the `DocVec` level. The `BaseDoc` does not hold any data itself. The behavior of this Document "view" is similar to the behavior of `view = tensor[i]` in numpy/PyTorch. - :param docs: a DocArray + :param docs: a homogeneous sequence of BaseDoc :param tensor_type: Tensor Class used to wrap the stacked tensors.
This is useful if the BaseDoc of this DocArrayStacked has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor @@ -97,8 +97,8 @@ def __init__( self.tensor_type = tensor_type tensor_columns: Dict[str, AbstractTensor] = dict() - doc_columns: Dict[str, 'DocArrayStacked'] = dict() - da_columns: Dict[str, ListAdvancedIndexing['DocArrayStacked']] = dict() + doc_columns: Dict[str, 'DocVec'] = dict() + da_columns: Dict[str, ListAdvancedIndexing['DocVec']] = dict() any_columns: Dict[str, ListAdvancedIndexing] = dict() if len(docs) == 0: @@ -260,7 +260,7 @@ def __getitem__(self: T, item: Union[int, IndexIterType]) -> Union[T_doc, T]: def _get_data_column( self: T, field: str, - ) -> Union[MutableSequence, 'DocArrayStacked', AbstractTensor]: + ) -> Union[MutableSequence, 'DocVec', AbstractTensor]: """Return one column of the data :param field: name of the fields to extract @@ -328,7 +328,7 @@ def _set_data_and_columns( T, value.stack(tensor_type=self.tensor_type) ) # we need to copy data here - elif isinstance(value, DocArrayStacked): + elif isinstance(value, DocVec): if not issubclass(value.document_type, self.document_type): raise TypeError( f'{value} schema : {value.document_type} is not compatible with ' @@ -376,7 +376,7 @@ def _set_data_column( elif field in self._storage.doc_columns.keys(): values_ = parse_obj_as( - DocArrayStacked.__class_getitem__( + DocVec.__class_getitem__( self._storage.doc_columns[field].document_type ), values, diff --git a/docarray/array/stacked/column_storage.py b/docarray/array/stacked/column_storage.py index 80129cfcdfd..9b0bd991985 100644 --- a/docarray/array/stacked/column_storage.py +++ b/docarray/array/stacked/column_storage.py @@ -15,7 +15,7 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.stacked.array_stacked import DocVec IndexIterType = Union[slice, Iterable[int], Iterable[bool], 
None] @@ -38,8 +38,8 @@ class ColumnStorage: def __init__( self, tensor_columns: Dict[str, AbstractTensor], - doc_columns: Dict[str, 'DocArrayStacked'], - da_columns: Dict[str, ListAdvancedIndexing['DocArrayStacked']], + doc_columns: Dict[str, 'DocVec'], + da_columns: Dict[str, ListAdvancedIndexing['DocVec']], any_columns: Dict[str, ListAdvancedIndexing], tensor_type: Type[AbstractTensor] = NdArray, ): diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index 59f4843b899..09a4f2326dd 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -2,7 +2,7 @@ from torch.utils.data import Dataset -from docarray import BaseDoc, DocArrayStacked, DocList +from docarray import BaseDoc, DocList, DocVec from docarray.typing import TorchTensor from docarray.utils._internal._typing import change_cls_name @@ -123,12 +123,12 @@ def __getitem__(self, item: int): def collate_fn(cls, batch: List[T_doc]): doc_type = cls.document_type if doc_type: - batch_da = DocArrayStacked[doc_type]( # type: ignore + batch_da = DocVec[doc_type]( # type: ignore batch, tensor_type=TorchTensor, ) else: - batch_da = DocArrayStacked(batch, tensor_type=TorchTensor) + batch_da = DocVec(batch, tensor_type=TorchTensor) return batch_da @classmethod diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index 401ee570a95..05a4d5cf2dc 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -3,7 +3,7 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array import DocArrayStacked + from docarray.array import DocVec from docarray.array.abstract_array import AnyDocArray @@ -21,14 +21,14 @@ def summary(self) -> None: from rich.panel import Panel from rich.table import Table - from docarray.array import DocArrayStacked + from docarray.array import DocVec table = Table(box=box.SIMPLE, highlight=True) table.show_header = 
False table.add_row('Type', self.da.__class__.__name__) table.add_row('Length', str(len(self.da)), end_section=True) - if isinstance(self.da, DocArrayStacked): + if isinstance(self.da, DocVec): table.add_row('Stacked columns:') stacked_fields = self._get_stacked_fields(da=self.da) for field_name in stacked_fields: @@ -54,7 +54,7 @@ def summary(self) -> None: self.da.document_type.schema_summary() @staticmethod - def _get_stacked_fields(da: 'DocArrayStacked') -> List[str]: # TODO this might + def _get_stacked_fields(da: 'DocVec') -> List[str]: # TODO this might # broken """ Return a list of the field names of a DocArrayStacked instance that are diff --git a/docarray/utils/find.py b/docarray/utils/find.py index bdf5ead3fa6..e55f39e542c 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -6,7 +6,7 @@ from docarray.array.abstract_array import AnyDocArray from docarray.array.array.array import DocList -from docarray.array.stacked.array_stacked import DocArrayStacked +from docarray.array.stacked.array_stacked import DocVec from docarray.base_doc import BaseDoc from docarray.helper import _get_field_type_by_access_path from docarray.typing import AnyTensor @@ -250,7 +250,7 @@ def _extract_embeddings( if isinstance(data, DocList): emb_list = list(AnyDocArray._traverse(data, embedding_field)) emb = embedding_type._docarray_stack(emb_list) - elif isinstance(data, (DocArrayStacked, BaseDoc)): + elif isinstance(data, (DocVec, BaseDoc)): emb = next(AnyDocArray._traverse(data, embedding_field)) else: # treat data as tensor emb = cast(AnyTensor, data) diff --git a/docs/api_references/array/da_stack.md b/docs/api_references/array/da_stack.md index 7f5f9e51a86..3e003fe60ff 100644 --- a/docs/api_references/array/da_stack.md +++ b/docs/api_references/array/da_stack.md @@ -1,3 +1,3 @@ -# DocArrayStacked +# DocVec -::: docarray.array.array.array.DocArrayStacked +::: docarray.array.stacked.array_stacked.DocVec diff --git a/tests/integrations/torch/data/test_torch_dataset.py
b/tests/integrations/torch/data/test_torch_dataset.py index ef6c1e98597..19355604a8b 100644 --- a/tests/integrations/torch/data/test_torch_dataset.py +++ b/tests/integrations/torch/data/test_torch_dataset.py @@ -56,11 +56,11 @@ def test_torch_dataset(captions_da: DocList[PairTextImage]): dataset, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True ) - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.stacked.array_stacked import DocVec batch_lens = [] for batch in loader: - assert isinstance(batch, DocArrayStacked[PairTextImage]) + assert isinstance(batch, DocVec[PairTextImage]) batch_lens.append(len(batch)) assert all(x == BATCH_SIZE for x in batch_lens[:-1]) @@ -136,11 +136,11 @@ def test_torch_dl_multiprocessing(captions_da: DocList[PairTextImage]): multiprocessing_context='fork', ) - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.stacked.array_stacked import DocVec batch_lens = [] for batch in loader: - assert isinstance(batch, DocArrayStacked[PairTextImage]) + assert isinstance(batch, DocVec[PairTextImage]) batch_lens.append(len(batch)) assert all(x == BATCH_SIZE for x in batch_lens[:-1]) @@ -164,10 +164,10 @@ def test_torch_dl_pin_memory(captions_da: DocList[PairTextImage]): multiprocessing_context='fork', ) - from docarray.array.stacked.array_stacked import DocArrayStacked + from docarray.array.stacked.array_stacked import DocVec batch_lens = [] for batch in loader: - assert isinstance(batch, DocArrayStacked[PairTextImage]) + assert isinstance(batch, DocVec[PairTextImage]) batch_lens.append(len(batch)) assert all(x == BATCH_SIZE for x in batch_lens[:-1]) diff --git a/tests/units/array/stack/storage/test_storage.py b/tests/units/array/stack/storage/test_storage.py index 591c2057d8b..d55f80b7823 100644 --- a/tests/units/array/stack/storage/test_storage.py +++ b/tests/units/array/stack/storage/test_storage.py @@ -1,7 +1,7 @@ import numpy as np from docarray import 
BaseDoc -from docarray.array import DocArrayStacked +from docarray.array import DocVec from docarray.array.stacked.column_storage import ColumnStorageView from docarray.typing import AnyTensor @@ -20,13 +20,13 @@ class MyDoc(BaseDoc): for i in range(4) ] - storage = DocArrayStacked[MyDoc](docs)._storage + storage = DocVec[MyDoc](docs)._storage assert (storage.tensor_columns['tensor'] == np.zeros((4, 10))).all() for name in storage.any_columns['name']: assert name == 'hello' inner_docs = storage.doc_columns['doc'] - assert isinstance(inner_docs, DocArrayStacked[InnerDoc]) + assert isinstance(inner_docs, DocVec[InnerDoc]) for i, doc in enumerate(inner_docs): assert doc.price == i @@ -38,7 +38,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=i) for i in range(4)] - storage = DocArrayStacked[MyDoc](docs)._storage + storage = DocVec[MyDoc](docs)._storage view = ColumnStorageView(0, storage) diff --git a/tests/units/array/stack/test_array_stacked.py b/tests/units/array/stack/test_array_stacked.py index e867dd76a5c..54086f85845 100644 --- a/tests/units/array/stack/test_array_stacked.py +++ b/tests/units/array/stack/test_array_stacked.py @@ -6,7 +6,7 @@ from pydantic import parse_obj_as from docarray import BaseDoc, DocList -from docarray.array import DocArrayStacked +from docarray.array import DocVec from docarray.documents import ImageDoc from docarray.typing import AnyEmbedding, AnyTensor, NdArray, TorchTensor @@ -16,7 +16,7 @@ def batch(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArrayStacked[ImageDoc]( + batch = DocVec[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) @@ -47,7 +47,7 @@ class MMdoc(BaseDoc): def test_create_from_list_docs(): list_ = [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] - da_stacked = DocArrayStacked[ImageDoc](docs=list_, tensor_type=TorchTensor) + da_stacked = DocVec[ImageDoc](docs=list_, tensor_type=TorchTensor) assert len(da_stacked) 
== 10 assert da_stacked.tensor.shape == tuple([10, 3, 224, 224]) @@ -58,7 +58,7 @@ def test_len(batch): def test_create_from_None(): with pytest.raises(ValueError): - DocArrayStacked[ImageDoc]([]) + DocVec[ImageDoc]([]) def test_getitem(batch): @@ -263,7 +263,7 @@ def test_any_tensor_with_torch(tensor_type, tensor): class ImageDoc(BaseDoc): tensor: AnyTensor - da = DocArrayStacked[ImageDoc]( + da = DocVec[ImageDoc]( [ImageDoc(tensor=tensor) for _ in range(10)], tensor_type=tensor_type, ) @@ -284,7 +284,7 @@ class ImageDoc(BaseDoc): class TopDoc(BaseDoc): img: ImageDoc - da = DocArrayStacked[TopDoc]( + da = DocVec[TopDoc]( [TopDoc(img=ImageDoc(tensor=tensor)) for _ in range(10)], tensor_type=TorchTensor, ) @@ -300,7 +300,7 @@ def test_dict_stack(): class MyDoc(BaseDoc): my_dict: Dict[str, int] - da = DocArrayStacked[MyDoc]([MyDoc(my_dict={'a': 1, 'b': 2}) for _ in range(10)]) + da = DocVec[MyDoc]([MyDoc(my_dict={'a': 1, 'b': 2}) for _ in range(10)]) da.my_dict @@ -312,12 +312,12 @@ class Doc(BaseDoc): N = 10 - da = DocArrayStacked[Doc]( + da = DocVec[Doc]( [Doc(text=f'hello{i}', tensor=np.zeros((3, 224, 224))) for i in range(N)] ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocArrayStacked) + assert isinstance(da_sliced, DocVec) tensors = da_sliced.tensor assert tensors.shape == (5, 3, 224, 224) @@ -332,7 +332,7 @@ def test_stack_embedding(): class MyDoc(BaseDoc): embedding: AnyEmbedding - da = DocArrayStacked[MyDoc]([MyDoc(embedding=np.zeros(10)) for _ in range(10)]) + da = DocVec[MyDoc]([MyDoc(embedding=np.zeros(10)) for _ in range(10)]) assert 'embedding' in da._storage.tensor_columns.keys() assert (da.embedding == np.zeros((10, 10))).all() @@ -343,7 +343,7 @@ def test_stack_none(tensor_backend): class MyDoc(BaseDoc): tensor: Optional[AnyTensor] - da = DocArrayStacked[MyDoc]( + da = DocVec[MyDoc]( [MyDoc(tensor=None) for _ in range(10)], tensor_type=tensor_backend ) @@ -351,9 +351,7 @@ class MyDoc(BaseDoc): def test_to_device(): - da = 
DocArrayStacked[ImageDoc]( - [ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor - ) + da = DocVec[ImageDoc]([ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor) assert da.tensor.device == torch.device('cpu') da.to('meta') assert da.tensor.device == torch.device('meta') @@ -363,11 +361,11 @@ def test_to_device_with_nested_da(): class Video(BaseDoc): images: DocList[ImageDoc] - da_image = DocArrayStacked[ImageDoc]( + da_image = DocVec[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 5))], tensor_type=TorchTensor ) - da = DocArrayStacked[Video]([Video(images=da_image)]) + da = DocVec[Video]([Video(images=da_image)]) assert da.images[0].tensor.device == torch.device('cpu') da.to('meta') assert da.images[0].tensor.device == torch.device('meta') @@ -378,7 +376,7 @@ class MyDoc(BaseDoc): tensor: TorchTensor docs: ImageDoc - da = DocArrayStacked[MyDoc]( + da = DocVec[MyDoc]( [MyDoc(tensor=torch.zeros(3, 5), docs=ImageDoc(tensor=torch.zeros(3, 5)))], tensor_type=TorchTensor, ) @@ -390,9 +388,7 @@ class MyDoc(BaseDoc): def test_to_device_numpy(): - da = DocArrayStacked[ImageDoc]( - [ImageDoc(tensor=np.zeros((3, 5)))], tensor_type=NdArray - ) + da = DocVec[ImageDoc]([ImageDoc(tensor=np.zeros((3, 5)))], tensor_type=NdArray) with pytest.raises(NotImplementedError): da.to('meta') @@ -515,11 +511,11 @@ def test_from_storage(): class ImageDoc(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArrayStacked[ImageDoc]( + batch = DocVec[ImageDoc]( [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - DocArrayStacked[ImageDoc].from_columns_storage(batch._storage) + DocVec[ImageDoc].from_columns_storage(batch._storage) def test_validate_from_da(): @@ -530,9 +526,9 @@ class ImageDoc(BaseDoc): [ImageDoc(tensor=torch.zeros(3, 224, 224)) for _ in range(10)] ) - da = parse_obj_as(DocArrayStacked[ImageDoc], batch) + da = parse_obj_as(DocVec[ImageDoc], batch) - assert isinstance(da, DocArrayStacked[ImageDoc]) + assert isinstance(da, DocVec[ImageDoc]) def 
test_validation_column_tensor(batch): @@ -556,7 +552,7 @@ class Inner(BaseDoc): class Doc(BaseDoc): inner: Inner - batch = DocArrayStacked[Doc]([Doc(inner=Inner(hello='hello')) for _ in range(10)]) + batch = DocVec[Doc]([Doc(inner=Inner(hello='hello')) for _ in range(10)]) return batch, Doc, Inner @@ -564,14 +560,14 @@ def test_validation_column_doc(batch_nested_doc): batch, Doc, Inner = batch_nested_doc batch.inner = DocList[Inner]([Inner(hello='hello') for _ in range(10)]) - assert isinstance(batch.inner, DocArrayStacked[Inner]) + assert isinstance(batch.inner, DocVec[Inner]) def test_validation_list_doc(batch_nested_doc): batch, Doc, Inner = batch_nested_doc batch.inner = [Inner(hello='hello') for _ in range(10)] - assert isinstance(batch.inner, DocArrayStacked[Inner]) + assert isinstance(batch.inner, DocVec[Inner]) def test_validation_col_doc_fail(batch_nested_doc): diff --git a/tests/units/array/stack/test_array_stacked_tf.py b/tests/units/array/stack/test_array_stacked_tf.py index 19f27fad114..ab65db69c95 100644 --- a/tests/units/array/stack/test_array_stacked_tf.py +++ b/tests/units/array/stack/test_array_stacked_tf.py @@ -3,7 +3,7 @@ import pytest from docarray import BaseDoc, DocList -from docarray.array import DocArrayStacked +from docarray.array import DocVec from docarray.typing import AnyTensor, NdArray from docarray.utils._internal.misc import is_tf_available @@ -37,7 +37,7 @@ class MMdoc(BaseDoc): import tensorflow as tf - batch = DocArrayStacked[MMdoc]( + batch = DocVec[MMdoc]( [ MMdoc( img=DocList[Image]( @@ -67,7 +67,7 @@ def test_getitem(batch): @pytest.mark.tensorflow def test_get_slice(batch): sliced = batch[0:2] - assert isinstance(sliced, DocArrayStacked) + assert isinstance(sliced, DocVec) assert len(sliced) == 2 @@ -82,9 +82,7 @@ def test_set_after_stacking(): class Image(BaseDoc): tensor: TensorFlowTensor[3, 224, 224] - batch = DocArrayStacked[Image]( - [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)] - ) + batch = 
DocVec[Image]([Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)]) batch.tensor = tf.ones((10, 3, 224, 224)) assert tnp.allclose(batch.tensor.tensor, tf.ones((10, 3, 224, 224))) @@ -150,7 +148,7 @@ class Image(BaseDoc): class MMdoc(BaseDoc): img: Image - batch = DocArrayStacked[MMdoc]( + batch = DocVec[MMdoc]( [MMdoc(img=Image(tensor=tf.zeros((3, 224, 224)))) for _ in range(10)] ) assert isinstance(batch.img._storage.tensor_columns['tensor'], TensorFlowTensor) @@ -188,7 +186,7 @@ def test_stack_union(): class Image(BaseDoc): tensor: Union[NdArray[3, 224, 224], TensorFlowTensor[3, 224, 224]] - DocArrayStacked[Image]( + DocVec[Image]( [Image(tensor=tf.zeros((3, 224, 224))) for _ in range(10)], tensor_type=TensorFlowTensor, ) @@ -215,7 +213,7 @@ def test_any_tensor_with_tf(): class Image(BaseDoc): tensor: AnyTensor - da = DocArrayStacked[Image]( + da = DocVec[Image]( [Image(tensor=tensor) for _ in range(10)], tensor_type=TensorFlowTensor, ) @@ -237,7 +235,7 @@ class Image(BaseDoc): class TopDoc(BaseDoc): img: Image - da = DocArrayStacked[TopDoc]( + da = DocVec[TopDoc]( [TopDoc(img=Image(tensor=tensor)) for _ in range(10)], tensor_type=TensorFlowTensor, ) @@ -256,12 +254,12 @@ class Doc(BaseDoc): text: str tensor: TensorFlowTensor - da = DocArrayStacked[Doc]( + da = DocVec[Doc]( [Doc(text=f'hello{i}', tensor=tf.zeros((3, 224, 224))) for i in range(10)] ) da_sliced = da[0:10:2] - assert isinstance(da_sliced, DocArrayStacked) + assert isinstance(da_sliced, DocVec) tensors = da_sliced.tensor.tensor assert tensors.shape == (5, 3, 224, 224) @@ -272,7 +270,7 @@ def test_stack_none(): class MyDoc(BaseDoc): tensor: Optional[AnyTensor] - da = DocArrayStacked[MyDoc]( + da = DocVec[MyDoc]( [MyDoc(tensor=None) for _ in range(10)], tensor_type=TensorFlowTensor ) assert 'tensor' in da._storage.tensor_columns.keys() diff --git a/tests/units/array/stack/test_init.py b/tests/units/array/stack/test_init.py index c4e906e82b1..12cfedf48aa 100644 --- 
a/tests/units/array/stack/test_init.py +++ b/tests/units/array/stack/test_init.py @@ -1,7 +1,7 @@ import numpy as np from docarray import BaseDoc -from docarray.array.stacked.array_stacked import DocArrayStacked +from docarray.array.stacked.array_stacked import DocVec from docarray.typing import AnyTensor, NdArray @@ -12,7 +12,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros(10), name='hello') for _ in range(4)] - da = DocArrayStacked[MyDoc](docs, tensor_type=NdArray) + da = DocVec[MyDoc](docs, tensor_type=NdArray) assert (da._storage.tensor_columns['tensor'] == np.zeros((4, 10))).all() assert da._storage.any_columns['name']._data == ['hello' for _ in range(4)] @@ -25,7 +25,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=i * np.zeros((10, 10)), name=f'hello{i}') for i in range(4)] - da = DocArrayStacked[MyDoc](docs, tensor_type=NdArray) + da = DocVec[MyDoc](docs, tensor_type=NdArray) for i, doc in enumerate(da): assert isinstance(doc, MyDoc) diff --git a/tests/units/array/stack/test_proto.py b/tests/units/array/stack/test_proto.py index 4f2db70df48..585bdcf8d05 100644 --- a/tests/units/array/stack/test_proto.py +++ b/tests/units/array/stack/test_proto.py @@ -3,7 +3,7 @@ import torch from docarray import BaseDoc, DocList -from docarray.array import DocArrayStacked +from docarray.array import DocVec from docarray.typing import NdArray, TorchTensor @@ -43,6 +43,6 @@ class CustomDocument(BaseDoc): [CustomDocument(image=np.zeros((3, 224, 224))) for _ in range(10)] ).stack() - da2 = DocArrayStacked.from_protobuf(da.to_protobuf()) + da2 = DocVec.from_protobuf(da.to_protobuf()) - assert isinstance(da2, DocArrayStacked) + assert isinstance(da2, DocVec) diff --git a/tests/units/array/test_indexing.py b/tests/units/array/test_indexing.py index 7377e3aac08..eb225d97ec7 100644 --- a/tests/units/array/test_indexing.py +++ b/tests/units/array/test_indexing.py @@ -2,7 +2,7 @@ import pytest import torch -from docarray import DocArrayStacked, DocList +from docarray import 
DocList, DocVec from docarray.documents import TextDoc from docarray.typing import TorchTensor @@ -236,7 +236,7 @@ def test_boolmask_setitem(stack_left, stack_right, da, da_to_set, index): def test_setitem_update_column(): texts = [f'hello {i}' for i in range(10)] tensors = [torch.ones((4,)) * (i + 1) for i in range(10)] - da = DocArrayStacked[TextDoc]( + da = DocVec[TextDoc]( [TextDoc(text=text, embedding=tens) for text, tens in zip(texts, tensors)], tensor_type=TorchTensor, ) diff --git a/tests/units/document/test_view.py b/tests/units/document/test_view.py index a544289f7ec..db7d2fb5024 100644 --- a/tests/units/document/test_view.py +++ b/tests/units/document/test_view.py @@ -1,7 +1,7 @@ import numpy as np from docarray import BaseDoc -from docarray.array import DocArrayStacked +from docarray.array import DocVec from docarray.array.stacked.column_storage import ColumnStorageView from docarray.typing import AnyTensor @@ -13,7 +13,7 @@ class MyDoc(BaseDoc): docs = [MyDoc(tensor=np.zeros((10, 10)), name='hello', id=i) for i in range(4)] - storage = DocArrayStacked[MyDoc](docs)._storage + storage = DocVec[MyDoc](docs)._storage doc = MyDoc.from_view(ColumnStorageView(0, storage)) assert doc.is_view() From f5b471a4a147849249bf5928f88a74ac9ee784e3 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 15:45:01 +0200 Subject: [PATCH 05/27] refactor: rename DocArrayStack to DocVec Signed-off-by: samsja --- docarray/array/__init__.py | 2 +- docarray/array/abstract_array.py | 4 ++-- docarray/array/array/array.py | 8 ++++---- docarray/array/{stacked => doc_vec}/__init__.py | 0 .../array/{stacked => doc_vec}/column_storage.py | 12 ++++++------ .../array_stacked.py => doc_vec/doc_vec.py} | 14 +++++++------- .../{stacked => doc_vec}/list_advance_indexing.py | 0 docarray/base_doc/doc.py | 6 +++--- docarray/display/document_array_summary.py | 2 +- docarray/utils/find.py | 2 +- .../integrations/torch/data/test_torch_dataset.py | 6 +++--- 
tests/units/array/stack/storage/test_storage.py | 2 +- tests/units/array/stack/test_array_stacked.py | 2 +- tests/units/array/stack/test_array_stacked_tf.py | 2 +- tests/units/array/stack/test_init.py | 2 +- tests/units/document/test_view.py | 2 +- 16 files changed, 33 insertions(+), 33 deletions(-) rename docarray/array/{stacked => doc_vec}/__init__.py (100%) rename docarray/array/{stacked => doc_vec}/column_storage.py (90%) rename docarray/array/{stacked/array_stacked.py => doc_vec/doc_vec.py} (97%) rename docarray/array/{stacked => doc_vec}/list_advance_indexing.py (100%) diff --git a/docarray/array/__init__.py b/docarray/array/__init__.py index 0f200cc598e..18c5bd5e19d 100644 --- a/docarray/array/__init__.py +++ b/docarray/array/__init__.py @@ -1,4 +1,4 @@ from docarray.array.array.array import DocList -from docarray.array.stacked.array_stacked import DocVec +from docarray.array.doc_vec.doc_vec import DocVec __all__ = ['DocList', 'DocVec'] diff --git a/docarray/array/abstract_array.py b/docarray/array/abstract_array.py index 8b33d95f3c3..583fc8e29d3 100644 --- a/docarray/array/abstract_array.py +++ b/docarray/array/abstract_array.py @@ -212,8 +212,8 @@ class Book(BaseDoc): chapters = da.traverse_flat(access_path='chapters') # list of 30 strings - If your DocArray is in stacked mode, and you want to access a field of - type AnyTensor, the stacked tensor will be returned instead of a list: + If your DocArray is in doc_vec mode, and you want to access a field of + type AnyTensor, the doc_vec tensor will be returned instead of a list: EXAMPLE USAGE .. 
code-block:: python diff --git a/docarray/array/array/array.py b/docarray/array/array/array.py index 3f881049388..784f9ed7e57 100644 --- a/docarray/array/array/array.py +++ b/docarray/array/array/array.py @@ -31,7 +31,7 @@ from pydantic import BaseConfig from pydantic.fields import ModelField - from docarray.array.stacked.array_stacked import DocVec + from docarray.array.doc_vec.doc_vec import DocVec from docarray.proto import DocumentArrayProto from docarray.typing import TorchTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -257,11 +257,11 @@ def stack( """ Convert the DocArray into a DocArrayStacked. `Self` cannot be used afterwards - :param tensor_type: Tensor Class used to wrap the stacked tensors. This is useful + :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor :return: A DocArrayStacked of the same document type as self """ - from docarray.array.stacked.array_stacked import DocVec + from docarray.array.doc_vec.doc_vec import DocVec return DocVec.__class_getitem__(self.document_type)( self, tensor_type=tensor_type @@ -274,7 +274,7 @@ def validate( field: 'ModelField', config: 'BaseConfig', ): - from docarray.array.stacked.array_stacked import DocVec + from docarray.array.doc_vec.doc_vec import DocVec if isinstance(value, (cls, DocVec)): return value diff --git a/docarray/array/stacked/__init__.py b/docarray/array/doc_vec/__init__.py similarity index 100% rename from docarray/array/stacked/__init__.py rename to docarray/array/doc_vec/__init__.py diff --git a/docarray/array/stacked/column_storage.py b/docarray/array/doc_vec/column_storage.py similarity index 90% rename from docarray/array/stacked/column_storage.py rename to docarray/array/doc_vec/column_storage.py index 9b0bd991985..fa1aca74a8a 100644 --- a/docarray/array/stacked/column_storage.py +++ b/docarray/array/doc_vec/column_storage.py @@ -10,12 
+10,12 @@ Union, ) -from docarray.array.stacked.list_advance_indexing import ListAdvancedIndexing +from docarray.array.doc_vec.list_advance_indexing import ListAdvancedIndexing from docarray.typing import NdArray from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array.stacked.array_stacked import DocVec + from docarray.array.doc_vec.doc_vec import DocVec IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] @@ -26,13 +26,13 @@ class ColumnStorage: """ ColumnStorage is a container to store the columns of the - :class:`~docarray.array.stacked.DocArrayStacked`. + :class:`~docarray.array.doc_vec.DocArrayStacked`. :param tensor_columns: a Dict of AbstractTensor - :param doc_columns: a Dict of :class:`~docarray.array.stacked.DocArrayStacked` - :param da_columns: a Dict of List of :class:`~docarray.array.stacked.DocArrayStacked` + :param doc_columns: a Dict of :class:`~docarray.array.doc_vec.DocArrayStacked` + :param da_columns: a Dict of List of :class:`~docarray.array.doc_vec.DocArrayStacked` :param any_columns: a Dict of List - :param tensor_type: Class used to wrap the stacked tensors + :param tensor_type: Class used to wrap the doc_vec tensors """ def __init__( diff --git a/docarray/array/stacked/array_stacked.py b/docarray/array/doc_vec/doc_vec.py similarity index 97% rename from docarray/array/stacked/array_stacked.py rename to docarray/array/doc_vec/doc_vec.py index aa0f0e1403c..175bbf6da49 100644 --- a/docarray/array/stacked/array_stacked.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -20,8 +20,8 @@ from docarray.array.abstract_array import AnyDocArray from docarray.array.array.array import DocList -from docarray.array.stacked.column_storage import ColumnStorage, ColumnStorageView -from docarray.array.stacked.list_advance_indexing import ListAdvancedIndexing +from docarray.array.doc_vec.column_storage import ColumnStorage, ColumnStorageView +from docarray.array.doc_vec.list_advance_indexing import 
ListAdvancedIndexing from docarray.base_doc import BaseDoc from docarray.base_doc.mixins.io import _type_to_protobuf from docarray.typing import NdArray @@ -63,12 +63,12 @@ class DocVec(AnyDocArray[T_doc]): {class}`~docarray.array.DocArray` but with an underlying implementation that is column based instead of row based. Each field of the schema of the DocArrayStack - (the :attr:`~docarray.array.stacked.DocVec.document_type` which is a - `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, stacked (torch/np/tf) tensor. + (the :attr:`~docarray.array.doc_vec.DocVec.document_type` which is a + `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, doc_vec (torch/np/tf) tensor. If the tensor field is `AnyTensor` or a Union of tensor types, the - :attr:`~docarray.array.stacked.DocArrayStacked.tensor_type` will be used to determine - the type of the stacked column. + :attr:`~docarray.array.doc_vec.DocArrayStacked.tensor_type` will be used to determine + the type of the doc_vec column. If the field is another `BasedDoc` the column will be another DocArrayStacked that follows the schema of the nested Document. @@ -82,7 +82,7 @@ class DocVec(AnyDocArray[T_doc]): numpy/PyTorch. :param docs: a homogeneous sequence of BaseDoc - :param tensor_type: Tensor Class used to wrap the stacked tensors. This is useful + :param tensor_type: Tensor Class used to wrap the doc_vec tensors. 
This is useful if the BaseDoc of this DocArrayStacked has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor """ diff --git a/docarray/array/stacked/list_advance_indexing.py b/docarray/array/doc_vec/list_advance_indexing.py similarity index 100% rename from docarray/array/stacked/list_advance_indexing.py rename to docarray/array/doc_vec/list_advance_indexing.py diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index 48afbe6eddd..a5c42a82ee4 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -1,5 +1,5 @@ import os -from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar, Dict +from typing import TYPE_CHECKING, Any, Dict, Optional, Type, TypeVar import orjson from pydantic import BaseModel, Field @@ -12,7 +12,7 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: - from docarray.array.stacked.column_storage import ColumnStorageView + from docarray.array.doc_vec.column_storage import ColumnStorageView _console: Console = Console() @@ -79,7 +79,7 @@ def _ipython_display_(self) -> None: self.summary() def is_view(self) -> bool: - from docarray.array.stacked.column_storage import ColumnStorageView + from docarray.array.doc_vec.column_storage import ColumnStorageView return isinstance(self.__dict__, ColumnStorageView) diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index 05a4d5cf2dc..e587bf9d9e4 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -58,7 +58,7 @@ def _get_stacked_fields(da: 'DocVec') -> List[str]: # TODO this might # broken """ Return a list of the field names of a DocArrayStacked instance that are - stacked, i.e. all the fields that are of type AbstractTensor. Nested field + doc_vec, i.e. all the fields that are of type AbstractTensor. Nested field paths are separated by dot, such as: 'attr.nested_attr'. 
""" fields = [] diff --git a/docarray/utils/find.py b/docarray/utils/find.py index e55f39e542c..7086b55c675 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -6,7 +6,7 @@ from docarray.array.abstract_array import AnyDocArray from docarray.array.array.array import DocList -from docarray.array.stacked.array_stacked import DocVec +from docarray.array.doc_vec.doc_vec import DocVec from docarray.base_doc import BaseDoc from docarray.helper import _get_field_type_by_access_path from docarray.typing import AnyTensor diff --git a/tests/integrations/torch/data/test_torch_dataset.py b/tests/integrations/torch/data/test_torch_dataset.py index 19355604a8b..f358f1c16b8 100644 --- a/tests/integrations/torch/data/test_torch_dataset.py +++ b/tests/integrations/torch/data/test_torch_dataset.py @@ -56,7 +56,7 @@ def test_torch_dataset(captions_da: DocList[PairTextImage]): dataset, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True ) - from docarray.array.stacked.array_stacked import DocVec + from docarray.array.doc_vec.doc_vec import DocVec batch_lens = [] for batch in loader: @@ -136,7 +136,7 @@ def test_torch_dl_multiprocessing(captions_da: DocList[PairTextImage]): multiprocessing_context='fork', ) - from docarray.array.stacked.array_stacked import DocVec + from docarray.array.doc_vec.doc_vec import DocVec batch_lens = [] for batch in loader: @@ -164,7 +164,7 @@ def test_torch_dl_pin_memory(captions_da: DocList[PairTextImage]): multiprocessing_context='fork', ) - from docarray.array.stacked.array_stacked import DocVec + from docarray.array.doc_vec.doc_vec import DocVec batch_lens = [] for batch in loader: diff --git a/tests/units/array/stack/storage/test_storage.py b/tests/units/array/stack/storage/test_storage.py index d55f80b7823..7fdb8133bef 100644 --- a/tests/units/array/stack/storage/test_storage.py +++ b/tests/units/array/stack/storage/test_storage.py @@ -2,7 +2,7 @@ from docarray import BaseDoc from docarray.array import DocVec -from 
docarray.array.stacked.column_storage import ColumnStorageView +from docarray.array.doc_vec.column_storage import ColumnStorageView from docarray.typing import AnyTensor diff --git a/tests/units/array/stack/test_array_stacked.py b/tests/units/array/stack/test_array_stacked.py index 54086f85845..14f5238873a 100644 --- a/tests/units/array/stack/test_array_stacked.py +++ b/tests/units/array/stack/test_array_stacked.py @@ -250,7 +250,7 @@ class ImageDoc(BaseDoc): ) batch[3].tensor = np.zeros((3, 224, 224)) - # union fields aren't actually stacked + # union fields aren't actually doc_vec # just checking that there is no error batch.stack() diff --git a/tests/units/array/stack/test_array_stacked_tf.py b/tests/units/array/stack/test_array_stacked_tf.py index ab65db69c95..c5bd31fea2e 100644 --- a/tests/units/array/stack/test_array_stacked_tf.py +++ b/tests/units/array/stack/test_array_stacked_tf.py @@ -191,7 +191,7 @@ class Image(BaseDoc): tensor_type=TensorFlowTensor, ) - # union fields aren't actually stacked + # union fields aren't actually doc_vec # just checking that there is no error diff --git a/tests/units/array/stack/test_init.py b/tests/units/array/stack/test_init.py index 12cfedf48aa..663eebadf89 100644 --- a/tests/units/array/stack/test_init.py +++ b/tests/units/array/stack/test_init.py @@ -1,7 +1,7 @@ import numpy as np from docarray import BaseDoc -from docarray.array.stacked.array_stacked import DocVec +from docarray.array.doc_vec.doc_vec import DocVec from docarray.typing import AnyTensor, NdArray diff --git a/tests/units/document/test_view.py b/tests/units/document/test_view.py index db7d2fb5024..ad9a56027c3 100644 --- a/tests/units/document/test_view.py +++ b/tests/units/document/test_view.py @@ -2,7 +2,7 @@ from docarray import BaseDoc from docarray.array import DocVec -from docarray.array.stacked.column_storage import ColumnStorageView +from docarray.array.doc_vec.column_storage import ColumnStorageView from docarray.typing import AnyTensor From 
6dd9265850f1031d50b9c091b3a79faaa9472adc Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 15:55:17 +0200 Subject: [PATCH 06/27] refactor: rename namespace stacked to doc vec Signed-off-by: samsja --- docarray/array/__init__.py | 2 +- docarray/array/{array => doc_list}/__init__.py | 0 docarray/array/{array => doc_list}/array.py | 12 ++++++------ docarray/array/{array => doc_list}/io.py | 8 ++++---- docarray/array/{array => doc_list}/pushpull.py | 0 .../{array => doc_list}/sequence_indexing_mixin.py | 2 +- docarray/array/doc_vec/doc_vec.py | 2 +- docarray/array/doc_vec/list_advance_indexing.py | 2 +- docarray/utils/filter.py | 2 +- docarray/utils/find.py | 2 +- 10 files changed, 16 insertions(+), 16 deletions(-) rename docarray/array/{array => doc_list}/__init__.py (100%) rename docarray/array/{array => doc_list}/array.py (96%) rename docarray/array/{array => doc_list}/io.py (99%) rename docarray/array/{array => doc_list}/pushpull.py (100%) rename docarray/array/{array => doc_list}/sequence_indexing_mixin.py (99%) diff --git a/docarray/array/__init__.py b/docarray/array/__init__.py index 18c5bd5e19d..9726475da9b 100644 --- a/docarray/array/__init__.py +++ b/docarray/array/__init__.py @@ -1,4 +1,4 @@ -from docarray.array.array.array import DocList +from docarray.array.doc_list.array import DocList from docarray.array.doc_vec.doc_vec import DocVec __all__ = ['DocList', 'DocVec'] diff --git a/docarray/array/array/__init__.py b/docarray/array/doc_list/__init__.py similarity index 100% rename from docarray/array/array/__init__.py rename to docarray/array/doc_list/__init__.py diff --git a/docarray/array/array/array.py b/docarray/array/doc_list/array.py similarity index 96% rename from docarray/array/array/array.py rename to docarray/array/doc_list/array.py index 784f9ed7e57..e5a2478f028 100644 --- a/docarray/array/array/array.py +++ b/docarray/array/doc_list/array.py @@ -18,9 +18,9 @@ from typing_inspect import is_union_type from docarray.array.abstract_array 
import AnyDocArray -from docarray.array.array.io import IOMixinArray -from docarray.array.array.pushpull import PushPullMixin -from docarray.array.array.sequence_indexing_mixin import ( +from docarray.array.doc_list.io import IOMixinArray +from docarray.array.doc_list.pushpull import PushPullMixin +from docarray.array.doc_list.sequence_indexing_mixin import ( IndexingSequenceMixin, IndexIterType, ) @@ -104,7 +104,7 @@ class Image(BaseDoc): # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] - You can index into a DocList like a numpy array or torch tensor: + You can index into a DocList like a numpy doc_list or torch tensor: da[0] # index by position @@ -213,11 +213,11 @@ def _get_data_column( self: T, field: str, ) -> Union[MutableSequence, T, 'TorchTensor', 'NdArray']: - """Return all values of the fields from all docs this array contains + """Return all values of the fields from all docs this doc_list contains :param field: name of the fields to extract :return: Returns a list of the field value for each document - in the array like container + in the doc_list like container """ field_type = self.__class__.document_type._get_field_type(field) diff --git a/docarray/array/array/io.py b/docarray/array/doc_list/io.py similarity index 99% rename from docarray/array/array/io.py rename to docarray/array/doc_list/io.py index 91bb169b3dc..273a4f38d61 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/doc_list/io.py @@ -347,7 +347,7 @@ def from_csv( """ Load a DocArray from a csv file following the schema defined in the :attr:`~docarray.DocArray.document_type` attribute. - Every row of the csv file will be mapped to one document in the array. + Every row of the csv file will be mapped to one document in the doc_list. The column names (defined in the first row) have to match the field names of the Document type. 
For nested fields use "__"-separated access paths, such as 'image__url'. @@ -432,7 +432,7 @@ def from_pandas(cls, df: 'pd.DataFrame') -> 'DocList': """ Load a DocArray from a `pandas.DataFrame` following the schema defined in the :attr:`~docarray.DocArray.document_type` attribute. - Every row of the dataframe will be mapped to one Document in the array. + Every row of the dataframe will be mapped to one Document in the doc_list. The column names of the dataframe have to match the field names of the Document type. For nested fields use "__"-separated access paths as column names, @@ -683,7 +683,7 @@ def load_binary( show_progress: bool = False, streaming: bool = False, ) -> Union[T, Generator['T_doc', None, None]]: - """Load array elements from a compressed binary file. + """Load doc_list elements from a compressed binary file. :param file: File or filename or serialized bytes where the data is stored. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' @@ -741,7 +741,7 @@ def save_binary( """Save DocArray into a binary file. It will use the protocol to pick how to save the DocArray. - If used 'picke-array` and `protobuf-array` the DocArray will be stored + If used 'picke-doc_list` and `protobuf-array` the DocArray will be stored and compressed at complete level using `pickle` or `protobuf`. When using `protobuf` or `pickle` as protocol each Document in DocArray will be stored individually and this would make it available for streaming. 
diff --git a/docarray/array/array/pushpull.py b/docarray/array/doc_list/pushpull.py similarity index 100% rename from docarray/array/array/pushpull.py rename to docarray/array/doc_list/pushpull.py diff --git a/docarray/array/array/sequence_indexing_mixin.py b/docarray/array/doc_list/sequence_indexing_mixin.py similarity index 99% rename from docarray/array/array/sequence_indexing_mixin.py rename to docarray/array/doc_list/sequence_indexing_mixin.py index ac07359f3b0..601545c3a9d 100644 --- a/docarray/array/array/sequence_indexing_mixin.py +++ b/docarray/array/doc_list/sequence_indexing_mixin.py @@ -39,7 +39,7 @@ class IndexingSequenceMixin(Iterable[T_item]): This mixin allow sto extend a list into an object that can be indexed a la numpy/pytorch. - You can index into, delete from, and set items in a IndexingSequenceMixin like a numpy array or torch tensor: + You can index into, delete from, and set items in a IndexingSequenceMixin like a numpy doc_list or torch tensor: .. code-block:: python da[0] # index by position diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 175bbf6da49..2bf49576931 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -19,7 +19,7 @@ from pydantic import BaseConfig, parse_obj_as from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocList +from docarray.array.doc_list.array import DocList from docarray.array.doc_vec.column_storage import ColumnStorage, ColumnStorageView from docarray.array.doc_vec.list_advance_indexing import ListAdvancedIndexing from docarray.base_doc import BaseDoc diff --git a/docarray/array/doc_vec/list_advance_indexing.py b/docarray/array/doc_vec/list_advance_indexing.py index 545c634a4aa..1de13dd9f27 100644 --- a/docarray/array/doc_vec/list_advance_indexing.py +++ b/docarray/array/doc_vec/list_advance_indexing.py @@ -1,6 +1,6 @@ from typing import Iterator, MutableSequence, TypeVar -from 
docarray.array.array.sequence_indexing_mixin import IndexingSequenceMixin +from docarray.array.doc_list.sequence_indexing_mixin import IndexingSequenceMixin T_item = TypeVar('T_item') diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index f17fc8fd9ff..9bb41dd1bc4 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -4,7 +4,7 @@ from typing import Dict, List, Union from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocList +from docarray.array.doc_list.array import DocList def filter_docs( diff --git a/docarray/utils/find.py b/docarray/utils/find.py index 7086b55c675..3ed4c19fbb6 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -5,7 +5,7 @@ from typing_inspect import is_union_type from docarray.array.abstract_array import AnyDocArray -from docarray.array.array.array import DocList +from docarray.array.doc_list.array import DocList from docarray.array.doc_vec.doc_vec import DocVec from docarray.base_doc import BaseDoc from docarray.helper import _get_field_type_by_access_path From cccbc978afb0800077a9800b9f73dc08f8632127 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 15:58:49 +0200 Subject: [PATCH 07/27] refactor: rename namespace stacked to doc vec Signed-off-by: samsja --- docarray/array/__init__.py | 2 +- docarray/array/doc_list/{array.py => doc_list.py} | 0 docarray/array/doc_vec/doc_vec.py | 2 +- docarray/utils/filter.py | 2 +- docarray/utils/find.py | 2 +- docs/api_references/array/da.md | 4 ++-- docs/api_references/array/da_stack.md | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) rename docarray/array/doc_list/{array.py => doc_list.py} (100%) diff --git a/docarray/array/__init__.py b/docarray/array/__init__.py index 9726475da9b..3792c3c6755 100644 --- a/docarray/array/__init__.py +++ b/docarray/array/__init__.py @@ -1,4 +1,4 @@ -from docarray.array.doc_list.array import DocList +from docarray.array.doc_list.doc_list import DocList from 
docarray.array.doc_vec.doc_vec import DocVec __all__ = ['DocList', 'DocVec'] diff --git a/docarray/array/doc_list/array.py b/docarray/array/doc_list/doc_list.py similarity index 100% rename from docarray/array/doc_list/array.py rename to docarray/array/doc_list/doc_list.py diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 2bf49576931..7e34f9be97d 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -19,7 +19,7 @@ from pydantic import BaseConfig, parse_obj_as from docarray.array.abstract_array import AnyDocArray -from docarray.array.doc_list.array import DocList +from docarray.array.doc_list.doc_list import DocList from docarray.array.doc_vec.column_storage import ColumnStorage, ColumnStorageView from docarray.array.doc_vec.list_advance_indexing import ListAdvancedIndexing from docarray.base_doc import BaseDoc diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index 9bb41dd1bc4..bafe2e1bfb7 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -4,7 +4,7 @@ from typing import Dict, List, Union from docarray.array.abstract_array import AnyDocArray -from docarray.array.doc_list.array import DocList +from docarray.array.doc_list.doc_list import DocList def filter_docs( diff --git a/docarray/utils/find.py b/docarray/utils/find.py index 3ed4c19fbb6..dab2415c194 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -5,7 +5,7 @@ from typing_inspect import is_union_type from docarray.array.abstract_array import AnyDocArray -from docarray.array.doc_list.array import DocList +from docarray.array.doc_list.doc_list import DocList from docarray.array.doc_vec.doc_vec import DocVec from docarray.base_doc import BaseDoc from docarray.helper import _get_field_type_by_access_path diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index d44a4913864..79c898cafb3 100644 --- a/docs/api_references/array/da.md +++ 
b/docs/api_references/array/da.md @@ -1,3 +1,3 @@ -# DocArray +# DocList -::: docarray.array.array.array.DocArray +::: docarray.array.doc_list.doc_list.DocArray diff --git a/docs/api_references/array/da_stack.md b/docs/api_references/array/da_stack.md index 3e003fe60ff..c0709f2e084 100644 --- a/docs/api_references/array/da_stack.md +++ b/docs/api_references/array/da_stack.md @@ -1,3 +1,3 @@ -# DocVeced +# DocVec -::: docarray.array.array.array.DocVeced +::: docarray.array.doc_vec.doc_vec.DocVec From 836995735a4a09d26e03ffe429e49157b858241d Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 16:20:20 +0200 Subject: [PATCH 08/27] fix: fix ci Signed-off-by: samsja --- docs/api_references/array/da.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index 79c898cafb3..21a206a9537 100644 --- a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -1,3 +1,3 @@ # DocList -::: docarray.array.doc_list.doc_list.DocArray +::: docarray.array.doc_list.doc_list.DocList From 446a8f21f124fad943ab558a72f1fdde65a5c5b8 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 16:23:36 +0200 Subject: [PATCH 09/27] refactor: rename namesapce Signed-off-by: samsja --- docarray/array/{abstract_array.py => any_array.py} | 0 docarray/array/doc_list/doc_list.py | 2 +- docarray/array/doc_vec/doc_vec.py | 2 +- docarray/display/document_array_summary.py | 2 +- docarray/index/abstract.py | 2 +- docarray/utils/filter.py | 2 +- docarray/utils/find.py | 2 +- docarray/utils/map.py | 2 +- tests/units/array/test_traverse.py | 2 +- 9 files changed, 8 insertions(+), 8 deletions(-) rename docarray/array/{abstract_array.py => any_array.py} (100%) diff --git a/docarray/array/abstract_array.py b/docarray/array/any_array.py similarity index 100% rename from docarray/array/abstract_array.py rename to docarray/array/any_array.py diff --git a/docarray/array/doc_list/doc_list.py 
b/docarray/array/doc_list/doc_list.py index e5a2478f028..86d61ac8fb1 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -17,7 +17,7 @@ from typing_inspect import is_union_type -from docarray.array.abstract_array import AnyDocArray +from docarray.array.any_array import AnyDocArray from docarray.array.doc_list.io import IOMixinArray from docarray.array.doc_list.pushpull import PushPullMixin from docarray.array.doc_list.sequence_indexing_mixin import ( diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 7e34f9be97d..12cb03c4f72 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -18,7 +18,7 @@ from pydantic import BaseConfig, parse_obj_as -from docarray.array.abstract_array import AnyDocArray +from docarray.array.any_array import AnyDocArray from docarray.array.doc_list.doc_list import DocList from docarray.array.doc_vec.column_storage import ColumnStorage, ColumnStorageView from docarray.array.doc_vec.list_advance_indexing import ListAdvancedIndexing diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index e587bf9d9e4..eacedcb6dc3 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -4,7 +4,7 @@ if TYPE_CHECKING: from docarray.array import DocVec - from docarray.array.abstract_array import AnyDocArray + from docarray.array.any_array import AnyDocArray class DocArraySummary: diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 1a4e9571ce5..4a046183e29 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -25,7 +25,7 @@ from typing_inspect import get_args, is_optional_type, is_union_type from docarray import BaseDoc, DocList -from docarray.array.abstract_array import AnyDocArray +from docarray.array.any_array import AnyDocArray from docarray.typing import AnyTensor from 
docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils._internal._typing import is_tensor_union diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index bafe2e1bfb7..50108536e93 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -3,7 +3,7 @@ import json from typing import Dict, List, Union -from docarray.array.abstract_array import AnyDocArray +from docarray.array.any_array import AnyDocArray from docarray.array.doc_list.doc_list import DocList diff --git a/docarray/utils/find.py b/docarray/utils/find.py index dab2415c194..65987b0dc64 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -4,7 +4,7 @@ from typing_inspect import is_union_type -from docarray.array.abstract_array import AnyDocArray +from docarray.array.any_array import AnyDocArray from docarray.array.doc_list.doc_list import DocList from docarray.array.doc_vec.doc_vec import DocVec from docarray.base_doc import BaseDoc diff --git a/docarray/utils/map.py b/docarray/utils/map.py index 31e93bc2175..be4a738d3ec 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -7,7 +7,7 @@ from rich.progress import track from docarray import BaseDoc -from docarray.array.abstract_array import AnyDocArray +from docarray.array.any_array import AnyDocArray from docarray.helper import _is_lambda_or_partial_or_local_function T = TypeVar('T', bound=AnyDocArray) diff --git a/tests/units/array/test_traverse.py b/tests/units/array/test_traverse.py index 8f648526faa..281abe0ce0e 100644 --- a/tests/units/array/test_traverse.py +++ b/tests/units/array/test_traverse.py @@ -4,7 +4,7 @@ import torch from docarray import BaseDoc, DocList -from docarray.array.abstract_array import AnyDocArray +from docarray.array.any_array import AnyDocArray from docarray.documents import TextDoc from docarray.typing import TorchTensor From fd159ab67a9fc9a876974bb6dc51c62b575f3cd4 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 16:38:38 +0200 Subject: 
[PATCH 10/27] fix: fix docstring Signed-off-by: samsja --- docarray/utils/filter.py | 8 +-- docarray/utils/find.py | 20 +++--- docarray/utils/map.py | 18 ++--- docarray/utils/reduce.py | 46 ++++++------- .../how_to/multimodal_training_and_serving.md | 68 +++++++++---------- 5 files changed, 78 insertions(+), 82 deletions(-) diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index 50108536e93..d34c50b278b 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -19,7 +19,7 @@ def filter_docs( --- ```python - from docarray import DocArray, BaseDoc + from docarray import DocList, BaseDoc from docarray.documents import TextDoc, ImageDoc from docarray.utils.filter import filter_docs @@ -30,7 +30,7 @@ class MyDocument(BaseDoc): price: int - docs = DocArray[MyDocument]( + docs = DocList[MyDocument]( [ MyDocument( caption='A tiger in the jungle', @@ -65,9 +65,9 @@ class MyDocument(BaseDoc): --- - :param docs: the DocArray where to apply the filter + :param docs: the DocList where to apply the filter :param query: the query to filter by - :return: A DocArray containing the Documents + :return: A DocList containing the Documents in `docs` that fulfill the filter conditions in the `query` """ from docarray.utils._internal.query_language.query_parser import QueryParser diff --git a/docarray/utils/find.py b/docarray/utils/find.py index 65987b0dc64..a2acd3b9e39 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -44,7 +44,7 @@ def find( --- ```python - from docarray import DocArray, BaseDoc + from docarray import DocList, BaseDoc from docarray.typing import TorchTensor from docarray.utils.find import find import torch @@ -54,9 +54,7 @@ class MyDocument(BaseDoc): embedding: TorchTensor - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) + index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)]) # use Document as query query = 
MyDocument(embedding=torch.rand(128)) @@ -92,7 +90,7 @@ class MyDocument(BaseDoc): can be either `cpu` or a `cuda` device. :param descending: sort the results in descending order. Per default, this is chosen based on the `metric` argument. - :return: A named tuple of the form (DocArray, AnyTensor), + :return: A named tuple of the form (DocList, AnyTensor), where the first element contains the closes matches for the query, and the second element contains the corresponding scores. """ @@ -130,7 +128,7 @@ def find_batched( --- ```python - from docarray import DocArray, BaseDoc + from docarray import DocList, BaseDoc from docarray.typing import TorchTensor from docarray.utils.find import find_batched import torch @@ -140,12 +138,10 @@ class MyDocument(BaseDoc): embedding: TorchTensor - index = DocArray[MyDocument]( - [MyDocument(embedding=torch.rand(128)) for _ in range(100)] - ) + index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)]) - # use DocArray as query - query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) + # use DocList as query + query = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)]) results = find_batched( index=index, query=query, @@ -180,7 +176,7 @@ class MyDocument(BaseDoc): can be either `cpu` or a `cuda` device. :param descending: sort the results in descending order. Per default, this is chosen based on the `metric` argument. - :return: a list of named tuples of the form (DocArray, AnyTensor), + :return: a list of named tuples of the form (DocList, AnyTensor), where the first element contains the closes matches for each query, and the second element contains the corresponding scores. 
""" diff --git a/docarray/utils/map.py b/docarray/utils/map.py index be4a738d3ec..3d7f86bc853 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -29,7 +29,7 @@ def map_docs( --- ```python - from docarray import DocArray + from docarray import DocList from docarray.documents import ImageDoc from docarray.utils.map import map_docs @@ -44,8 +44,8 @@ def load_url_to_tensor(img: ImageDoc) -> ImageDoc: 'Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg' ) - da = DocArray[ImageDoc]([ImageDoc(url=url) for _ in range(100)]) - da = DocArray[ImageDoc]( + da = DocList[ImageDoc]([ImageDoc(url=url) for _ in range(100)]) + da = DocList[ImageDoc]( list(map_docs(da, load_url_to_tensor, backend='thread')) ) # threading is usually a good option for IO-bound tasks such as loading an # ImageDoc from url @@ -56,7 +56,7 @@ def load_url_to_tensor(img: ImageDoc) -> ImageDoc: --- - :param da: DocArray to apply function to + :param da: DocList to apply function to :param func: a function that takes a :class:`BaseDoc` as input and outputs a :class:`BaseDoc`. :param backend: `thread` for multithreading and `process` for multiprocessing. 
@@ -121,7 +121,7 @@ def map_docs_batched( --- ```python - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList from docarray.utils.map import map_docs_batched @@ -129,13 +129,13 @@ class MyDoc(BaseDoc): name: str - def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: + def upper_case_name(da: DocList[MyDoc]) -> DocList[MyDoc]: da.name = [n.upper() for n in da.name] return da batch_size = 16 - da = DocArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) + da = DocList[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) it = map_docs_batched(da, upper_case_name, batch_size=batch_size) for i, d in enumerate(it): da[i * batch_size : (i + 1) * batch_size] = d @@ -152,7 +152,7 @@ def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: --- - :param da: DocArray to apply function to + :param da: DocList to apply function to :param batch_size: Size of each generated batch (except the last one, which might be smaller). :param shuffle: If set, shuffle the Documents before dividing into minibatches. @@ -180,7 +180,7 @@ def upper_case_name(da: DocArray[MyDoc]) -> DocArray[MyDoc]: :param pool: use an existing/external pool. If given, `backend` is ignored and you will be responsible for closing the pool. - :return: yield DocArrays returned from `func` + :return: yield DocLists returned from `func` """ if backend == 'process' and _is_lambda_or_partial_or_local_function(func): raise ValueError( diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index 41761a241f1..71374f088d9 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -9,21 +9,21 @@ def reduce( left: DocList, right: DocList, left_id_map: Optional[Dict] = None ) -> 'DocList': """ - Reduces left and right DocArray into one DocArray in-place. - Changes are applied to the left DocArray. - Reducing 2 DocArrays consists in adding Documents in the second DocArray - to the first DocArray if they do not exist. 
- If a Document exists in both DocArrays (identified by ID), + Reduces left and right DocList into one DocList in-place. + Changes are applied to the left DocList. + Reducing 2 DocLists consists in adding Documents in the second DocList + to the first DocList if they do not exist. + If a Document exists in both DocLists (identified by ID), the data properties are merged with priority to the left Document. - Nested DocArrays are also reduced in the same way. - :param left: First DocArray to be reduced. Changes will be applied to it + Nested DocLists are also reduced in the same way. + :param left: First DocList to be reduced. Changes will be applied to it in-place - :param right: Second DocArray to be reduced + :param right: Second DocList to be reduced :param left_id_map: Optional parameter to be passed in repeated calls for optimizations, keeping a map of the Document ID to its offset - in the DocArray - :return: Reduced DocArray + in the DocList + :return: Reduced DocList """ left_id_map = left_id_map or {doc.id: i for i, doc in enumerate(left)} @@ -38,29 +38,29 @@ def reduce( def reduce_all(docarrays: List[DocList]) -> DocList: """ - Reduces a list of DocArrays into one DocArray. - Changes are applied to the first DocArray in-place. + Reduces a list of DocLists into one DocList. + Changes are applied to the first DocList in-place. - The resulting DocArray contains Documents of all DocArrays. - If a Document exists (identified by their ID) in many DocArrays, + The resulting DocList contains Documents of all DocLists. + If a Document exists (identified by their ID) in many DocLists, data properties are merged with priority to the left-most - DocArrays (that is, if a data attribute is set in a Document - belonging to many DocArrays, the attribute value of the left-most - DocArray is kept). 
- Nested DocArrays belonging to many DocArrays + DocLists (that is, if a data attribute is set in a Document + belonging to many DocLists, the attribute value of the left-most + DocList is kept). + Nested DocLists belonging to many DocLists are also reduced in the same way. .. note:: - - Nested DocArrays order does not follow any specific rule. + - Nested DocLists order does not follow any specific rule. You might want to re-sort them in a later step. - - The final result depends on the order of DocArrays + - The final result depends on the order of DocLists when applying reduction. - :param docarrays: List of DocArrays to be reduced - :return: the resulting DocArray + :param docarrays: List of DocLists to be reduced + :return: the resulting DocList """ if len(docarrays) <= 1: raise Exception( - 'In order to reduce DocArrays' ' we should have more than one DocArray' + 'In order to reduce DocLists' ' we should have more than one DocList' ) left = docarrays[0] others = docarrays[1:] diff --git a/docs/how_to/multimodal_training_and_serving.md b/docs/how_to/multimodal_training_and_serving.md index 81cbe2917d6..9c30cbeffba 100644 --- a/docs/how_to/multimodal_training_and_serving.md +++ b/docs/how_to/multimodal_training_and_serving.md @@ -12,9 +12,9 @@ jupyter: name: python3 --- -# Multi-Modal Deep learning with DocArray +# Multi-Modal Deep learning with DocList -DocArray is a library for representing, sending, and storing multi-modal data that can be used for a variety of different +DocList is a library for representing, sending, and storing multi-modal data that can be used for a variety of different use cases. Here we will focus on a workflow familiar to many ML Engineers: Building and training a model, and then serving it to @@ -22,10 +22,10 @@ users. This notebook contains two parts: -1. **Representing**: We will use DocArray to represent multi-modal data while **building and training a PyTorch model**. 
-We will see how DocArray can help to organize and group your modalities and tensors and make clear what methods expect as inputs and return as outputs. +1. **Representing**: We will use DocList to represent multi-modal data while **building and training a PyTorch model**. +We will see how DocList can help to organize and group your modalities and tensors and make clear what methods expect as inputs and return as outputs. 2. **Sending**: We will take the model that we built and trained in part 1, and **serve it using FastAPI**. -We will see how DocArray narrows the gap between model development and model deployment, and how the same data models can be +We will see how DocList narrows the gap between model development and model deployment, and how the same data models can be reused in both contexts. That part will be very short, but that's the point! So without further ado, let's dive into it! @@ -39,11 +39,11 @@ We train the CLIP-like model on the [flickr8k](https://www.kaggle.com/datasets/a To run this notebook you need to download and unzip the data into the same folder as the notebook. Note that in this notebook by no means we aim at reproduce any CLIP results (our dataset is way too small anyways), -but we rather want to show how DocArray datastructures help researchers and practitioners to write beautiful and +but we rather want to show how DocList datastructures help researchers and practitioners to write beautiful and pythonic multi-modal PyTorch code. 
```python tags=[] -#!pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarray[torch,image]" +#!pip install "git+https://github.com/DocList/DocList@feat-rewrite-v2#egg=DocList[torch,image]" #!pip install torchvision #!pip install transformers #!pip install fastapi @@ -56,7 +56,7 @@ from typing import Callable, Dict, List, Optional ``` ```python -import docarray +import DocList import torch ``` @@ -74,23 +74,23 @@ DEVICE = "cuda:0" # change to your favourite device ## Create the Documents for handling the Muti-Modal data -The first thing we are trying to achieve when using DocArray is to clearly model our data so that we never get confused +The first thing we are trying to achieve when using DocList is to clearly model our data so that we never get confused about which tensors are supposed to represent what. -To do that we are using a concept that is at the core of DocArray. The `Document`, a collection of multi-modal data. +To do that we are using a concept that is at the core of DocList. The `Document`, a collection of multi-modal data. The `BaseDoc` class allows users to define their own (nested, multi-modal) Document schema to represent any kind of complex data. Let's start by defining a few Documents to handle the different modalities that we will use during our training: ```python -from docarray import BaseDoc, DocList -from docarray.typing import TorchTensor, ImageUrl +from DocList import BaseDoc, DocList +from DocList.typing import TorchTensor, ImageUrl ``` Let's first create a Document for our Text modality. It will contain a number of `Tokens`, which we also define: ```python -from docarray.documents import TextDoc as BaseText +from DocList.documents import TextDoc as BaseText class Tokens(BaseDoc): @@ -106,10 +106,10 @@ Notice the `TorchTensor` type. It is a thin wrapper around `torch.Tensor` that c but also enables additional features. 
One such feature is shape parametrization (`TorchTensor[48]`), which lets you hint and even enforce the desired shape of any tensor! -To represent our image data, we use the `Image` Document that is included in DocArray: +To represent our image data, we use the `Image` Document that is included in DocList: ```python -from docarray.documents import ImageDoc +from DocList.documents import ImageDoc ``` Under the hood, an `Image` looks something like this (with the only main difference that it can take tensors from any @@ -136,9 +136,9 @@ class PairTextImage(BaseDoc): ## Create the Dataset -In this section we will create a multi-modal pytorch dataset around the Flick8k dataset using DocArray. +In this section we will create a multi-modal pytorch dataset around the Flick8k dataset using DocList. -We will use DocArray data loading functionality to load the data and use Torchvision and Transformers to preprocess the data before feeding it to our deep learning model: +We will use DocList data loading functionality to load the data and use Torchvision and Transformers to preprocess the data before feeding it to our deep learning model: ```python from torch.utils.data import DataLoader, Dataset @@ -191,7 +191,7 @@ def get_flickr8k_da(file: str = "captions.txt", N: Optional[int] = None): return da ``` -In the `get_flickr8k_da` method we process the Flickr8k dataset into a `DocArray`. +In the `get_flickr8k_da` method we process the Flickr8k dataset into a `DocList`. Now let's instantiate this dataset using the `MultiModalDataset` class. 
The constructor takes in the `da` and a dictionary of preprocessing transformations: @@ -201,7 +201,7 @@ preprocessing = {"image": VisionPreprocess(), "text": TextPreprocess()} ``` ```python -from docarray.data import MultiModalDataset +from DocList.data import MultiModalDataset dataset = MultiModalDataset[PairTextImage](da=da, preprocessing=preprocessing) loader = DataLoader( @@ -214,11 +214,11 @@ loader = DataLoader( ) ``` -## Create the Pytorch model that works on DocArray +## Create the Pytorch model that works on DocList In this section we create two encoders, one per modality (Text and Image). These encoders are normal PyTorch `nn.Module`s. -The only difference is that they operate on DocArray rather that on torch.Tensor: +The only difference is that they operate on DocList rather that on torch.Tensor: ```python class TextEncoder(nn.Module): @@ -226,7 +226,7 @@ class TextEncoder(nn.Module): super().__init__() self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased") - def forward(self, texts: DocArray[Text]) -> TorchTensor: + def forward(self, texts: DocList[Text]) -> TorchTensor: last_hidden_state = self.bert( input_ids=texts.tokens.input_ids, attention_mask=texts.tokens.attention_mask ).last_hidden_state @@ -240,8 +240,8 @@ class TextEncoder(nn.Module): return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True) ``` -The `TextEncoder` takes a `DocArray` of `Text`s as input, and returns an embedding `TorchTensor` as output. -`DocArray` can be seen as a list of `Text` documents, and the encoder will treat it as one batch. +The `TextEncoder` takes a `DocList` of `Text`s as input, and returns an embedding `TorchTensor` as output. +`DocList` can be seen as a list of `Text` documents, and the encoder will treat it as one batch. 
```python @@ -251,12 +251,12 @@ class VisionEncoder(nn.Module): self.backbone = torchvision.models.resnet18(pretrained=True) self.linear = nn.LazyLinear(out_features=768) - def forward(self, images: DocArray[Image]) -> TorchTensor: + def forward(self, images: DocList[Image]) -> TorchTensor: x = self.backbone(images.tensor) return self.linear(x) ``` -Similarly, the `VisionEncoder` also takes a `DocArray` of `Image`s as input, and returns an embedding `TorchTensor` as output. +Similarly, the `VisionEncoder` also takes a `DocList` of `Image`s as input, and returns an embedding `TorchTensor` as output. However, it operates on the `image` attribute of each Document. Now we can instantiate our encoders: @@ -266,7 +266,7 @@ vision_encoder = VisionEncoder().to(DEVICE) text_encoder = TextEncoder().to(DEVICE) ``` -As you can see, DocArray helps us to clearly convey what data is expected as input and output for each method, all through Python type hints. +As you can see, DocList helps us to clearly convey what data is expected as input and output for each method, all through Python type hints. 
## Train the model in a contrastive way between Text and Image (CLIP)
@@ -289,7 +289,7 @@ def cosine_sim(x_mat: TorchTensor, y_mat: TorchTensor) -> TorchTensor:
```
```python
-def clip_loss(image: DocList[Image], text: DocArray[Text]) -> TorchTensor:
+def clip_loss(image: DocList[Image], text: DocList[Text]) -> TorchTensor:
sims = cosine_sim(image.embedding, text.embedding)
return torch.norm(sims - torch.eye(sims.shape[0], device=DEVICE))
```
@@ -301,7 +301,7 @@ In the type hints of `cosine_sim` and `clip_loss` you can again notice that we c
num_epoch = 1 # here you should do more epochs to really learn something
```
-One things to notice here is that our dataloader does not return a `torch.Tensor` but a `DocArray[PairTextImage]`,
+One thing to notice here is that our dataloader does not return a `torch.Tensor` but a `DocList[PairTextImage]`,
which is exactly what our model can operate on.
So let's write a training loop and train our encoders:
@@ -312,7 +312,7 @@ from tqdm import tqdm
with torch.autocast(device_type="cuda", dtype=torch.float16):
for epoch in range(num_epoch):
for i, batch in tqdm(enumerate(loader), total=len(loader), desc=f"Epoch {epoch}"):
- batch.to(DEVICE) # DocArray can be moved to device
+ batch.to(DEVICE) # DocList can be moved to device
optim.zero_grad()
# FORWARD PASS:
@@ -337,12 +337,12 @@ Let's use our beloved [FastAPI](https://fastapi.tiangolo.com/) for that!
FastAPI is powerful because it allows you to define your Rest API data schema in pure Python.
-And DocArray is fully compatible with FastAPI and Pydantic, which means that as long as you have a function that takes a Document as input,
+And DocList is fully compatible with FastAPI and Pydantic, which means that as long as you have a function that takes a Document as input,
FastAPI will be able to automatically translate it into a fully fledged API with documentation, openAPI specification and more:
```python
from fastapi import FastAPI
-from docarray.base_doc import DocumentResponse
+from docarray.base_doc import DocumentResponse
```
```python
@@ -366,7 +366,7 @@ async def embed_text(doc: Text) -> Text:
with torch.autocast(device_type="cuda", dtype=torch.float16):
with torch.inference_mode():
text_preprocess(doc)
- da = DocArray[Text]([doc], tensor_type=TorchTensor).stack()
+ da = DocList[Text]([doc], tensor_type=TorchTensor).stack()
da.to(DEVICE)
doc.embedding = text_encoder(da)[0].to('cpu')
return doc
@@ -400,4 +400,4 @@ doc_resp = Text.parse_raw(response.content.decode())
doc_resp.embedding.shape
```
-And we're done! You have trained and served a mulit-modal ML model, with zero headache and a lot of DocArray!
+And we're done! You have trained and served a multi-modal ML model, with zero headache and a lot of DocList!
From d344b302d583ba650fd8e047fc0690d7896e7989 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 16:46:28 +0200 Subject: [PATCH 11/27] refactor: rename proto Signed-off-by: samsja --- docarray/array/any_array.py | 6 +- docarray/array/doc_list/doc_list.py | 4 +- docarray/array/doc_list/io.py | 14 +- docarray/array/doc_vec/doc_vec.py | 16 +- docarray/base_doc/mixins/io.py | 14 +- docarray/index/backends/hnswlib.py | 4 +- docarray/proto/__init__.py | 20 +-- docarray/proto/docarray.proto | 16 +- docarray/proto/pb/docarray_pb2.py | 73 ++++----- docarray/proto/pb2/docarray_pb2.py | 155 +++++++++--------- .../document/proto/test_proto_based_object.py | 4 +- 11 files changed, 160 insertions(+), 166 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index 583fc8e29d3..ffccbdc71cc 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -25,7 +25,7 @@ from docarray.utils._internal._typing import change_cls_name if TYPE_CHECKING: - from docarray.proto import DocumentArrayProto, NodeProto + from docarray.proto import DocListProto, NodeProto from docarray.typing.tensor.abstract_tensor import AbstractTensor T = TypeVar('T', bound='AnyDocArray') @@ -130,12 +130,12 @@ def _set_data_column( @classmethod @abstractmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message""" ... @abstractmethod - def to_protobuf(self) -> 'DocumentArrayProto': + def to_protobuf(self) -> 'DocListProto': """Convert DocList into a Protobuf message""" ... 
diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 86d61ac8fb1..e486baaba8a 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -32,7 +32,7 @@ from pydantic.fields import ModelField from docarray.array.doc_vec.doc_vec import DocVec - from docarray.proto import DocumentArrayProto + from docarray.proto import DocListProto from docarray.typing import TorchTensor from docarray.typing.tensor.abstract_tensor import AbstractTensor @@ -293,7 +293,7 @@ def traverse_flat( return flattened @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message :param pb_msg: The protobuf message from where to construct the DocArray """ diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index 273a4f38d61..aad803a78e8 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -40,7 +40,7 @@ import pandas as pd from docarray import DocList - from docarray.proto import DocumentArrayProto + from docarray.proto import DocListProto T = TypeVar('T', bound='IOMixinArray') T_doc = TypeVar('T_doc', bound=BaseDoc) @@ -112,7 +112,7 @@ def __init__( ... 
@classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message :param pb_msg: The protobuf message from where to construct the DocArray """ @@ -120,11 +120,11 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T: cls.document_type.from_protobuf(doc_proto) for doc_proto in pb_msg.docs ) - def to_protobuf(self) -> 'DocumentArrayProto': + def to_protobuf(self) -> 'DocListProto': """Convert DocArray into a Protobuf message""" - from docarray.proto import DocumentArrayProto + from docarray.proto import DocListProto - da_proto = DocumentArrayProto() + da_proto = DocListProto() for doc in self: da_proto.docs.append(doc.to_protobuf()) @@ -565,9 +565,9 @@ def _load_binary_all( compress = None if protocol is not None and protocol == 'protobuf-array': - from docarray.proto import DocumentArrayProto + from docarray.proto import DocListProto - dap = DocumentArrayProto() + dap = DocListProto() dap.ParseFromString(d) return cls.from_protobuf(dap) diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 12cb03c4f72..8031cd8afeb 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -32,7 +32,7 @@ if TYPE_CHECKING: from pydantic.fields import ModelField - from docarray.proto import DocArrayStackedProto + from docarray.proto import DocVecProto torch_available = is_torch_available() if torch_available: @@ -422,7 +422,7 @@ def __len__(self): #################### @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocArrayStackedProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 'DocVecProto') -> T: """create a Document from a protobuf message""" storage = ColumnStorage( pb_msg.tensor_columns, @@ -433,21 +433,21 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocArrayStackedProto') -> T: return cls.from_columns_storage(storage) - def to_protobuf(self) -> 
'DocArrayStackedProto': + def to_protobuf(self) -> 'DocVecProto': """Convert DocArray into a Protobuf message""" from docarray.proto import ( - DocArrayStackedProto, - DocumentArrayProto, + DocListProto, + DocVecProto, ListOfAnyProto, ListOfDocArrayProto, NdArrayProto, ) - da_proto = DocumentArrayProto() + da_proto = DocListProto() for doc in self: da_proto.docs.append(doc.to_protobuf()) - doc_columns_proto: Dict[str, DocArrayStackedProto] = dict() + doc_columns_proto: Dict[str, DocVecProto] = dict() tensor_columns_proto: Dict[str, NdArrayProto] = dict() da_columns_proto: Dict[str, ListOfDocArrayProto] = dict() any_columns_proto: Dict[str, ListOfAnyProto] = dict() @@ -467,7 +467,7 @@ def to_protobuf(self) -> 'DocArrayStackedProto': list_proto.data.append(_type_to_protobuf(data)) any_columns_proto[field] = list_proto - return DocArrayStackedProto( + return DocVecProto( doc_columns=doc_columns_proto, tensor_columns=tensor_columns_proto, da_columns=da_columns_proto, diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index 9654ae03d41..a80ac6fc3cf 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -28,7 +28,7 @@ import torch from pydantic.fields import ModelField - from docarray.proto import DocumentProto, NodeProto + from docarray.proto import DocProto, NodeProto from docarray.typing import TensorFlowTensor, TorchTensor else: tf = import_library('tensorflow', raise_error=False) @@ -171,9 +171,9 @@ def from_bytes( if protocol == 'pickle': return pickle.loads(bstr) elif protocol == 'protobuf': - from docarray.proto import DocumentProto + from docarray.proto import DocProto - pb_msg = DocumentProto() + pb_msg = DocProto() pb_msg.ParseFromString(bstr) return cls.from_protobuf(pb_msg) else: @@ -209,7 +209,7 @@ def from_base64( return cls.from_bytes(base64.b64decode(data), protocol, compress) @classmethod - def from_protobuf(cls: Type[T], pb_msg: 'DocumentProto') -> T: + def from_protobuf(cls: Type[T], pb_msg: 
'DocProto') -> T: """create a Document from a protobuf message :param pb_msg: the proto message of the Document @@ -299,12 +299,12 @@ def _get_content_from_node_proto( return return_field - def to_protobuf(self: T) -> 'DocumentProto': + def to_protobuf(self: T) -> 'DocProto': """Convert Document into a Protobuf message. :return: the protobuf message """ - from docarray.proto import DocumentProto + from docarray.proto import DocProto data = {} for field, value in self: @@ -324,7 +324,7 @@ def to_protobuf(self: T) -> 'DocumentProto': ex.args = (f'Field `{field}` is problematic',) + ex.args raise ex - return DocumentProto(data=data) + return DocProto(data=data) def _to_node_protobuf(self) -> 'NodeProto': from docarray.proto import NodeProto diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 6399f76b8e9..5d3d6bd2e5c 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -28,7 +28,7 @@ _raise_not_composable, _raise_not_supported, ) -from docarray.proto import DocumentProto +from docarray.proto import DocProto from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils._internal.misc import import_library, is_np_int from docarray.utils.filter import filter_docs @@ -424,4 +424,4 @@ def _doc_to_bytes(self, doc: BaseDoc) -> bytes: def _doc_from_bytes(self, data: bytes) -> BaseDoc: schema_cls = cast(Type[BaseDoc], self._schema) - return schema_cls.from_protobuf(DocumentProto.FromString(data)) + return schema_cls.from_protobuf(DocProto.FromString(data)) diff --git a/docarray/proto/__init__.py b/docarray/proto/__init__.py index 1b04df23fe6..b1a201b6e2f 100644 --- a/docarray/proto/__init__.py +++ b/docarray/proto/__init__.py @@ -12,9 +12,9 @@ if __pb__version__.startswith('4'): from docarray.proto.pb.docarray_pb2 import ( DictOfAnyProto, - DocArrayStackedProto, - DocumentArrayProto, - DocumentProto, + DocListProto, + DocProto, + DocVecProto, ListOfAnyProto, 
ListOfDocArrayProto, NdArrayProto, @@ -23,9 +23,9 @@ else: from docarray.proto.pb2.docarray_pb2 import ( DictOfAnyProto, - DocArrayStackedProto, - DocumentArrayProto, - DocumentProto, + DocListProto, + DocProto, + DocVecProto, ListOfAnyProto, ListOfDocArrayProto, NdArrayProto, @@ -33,12 +33,12 @@ ) __all__ = [ - 'DocumentArrayProto', - 'DocumentProto', + 'DocListProto', + 'DocProto', 'NdArrayProto', 'NodeProto', - 'DocArrayStackedProto', - 'DocumentArrayProto', + 'DocVecProto', + 'DocListProto', 'ListOfDocArrayProto', 'ListOfAnyProto', 'DictOfAnyProto', diff --git a/docarray/proto/docarray.proto b/docarray/proto/docarray.proto index 2b1d557da52..4bbe631cbf7 100644 --- a/docarray/proto/docarray.proto +++ b/docarray/proto/docarray.proto @@ -53,9 +53,9 @@ message NodeProto { // the ndarray of the image/audio/video document NdArrayProto ndarray = 6; // a sub Document - DocumentProto document = 7; + DocProto document = 7; // a sub DocArray - DocumentArrayProto document_array = 8; + DocListProto document_array = 8; //any list ListOfAnyProto list = 9; //any set @@ -75,7 +75,7 @@ message NodeProto { /** * Represents a Document */ -message DocumentProto { +message DocProto { map data = 1; @@ -91,18 +91,18 @@ message ListOfAnyProto { repeated NodeProto data = 1; } -message DocumentArrayProto { - repeated DocumentProto docs = 1; // a list of Documents +message DocListProto { + repeated DocProto docs = 1; // a list of Documents } message ListOfDocArrayProto { - repeated DocumentArrayProto data = 1; + repeated DocListProto data = 1; } -message DocArrayStackedProto{ +message DocVecProto{ map tensor_columns = 1; // a dict of document columns - map doc_columns = 2; // a dict of tensor columns + map doc_columns = 2; // a dict of tensor columns map da_columns = 3; // a dict of document array columns map any_columns = 4; // a dict of any columns. 
Used for the rest of the data } \ No newline at end of file diff --git a/docarray/proto/pb/docarray_pb2.py b/docarray/proto/pb/docarray_pb2.py index a830f17ddc4..c499fab35e2 100644 --- a/docarray/proto/pb/docarray_pb2.py +++ b/docarray/proto/pb/docarray_pb2.py @@ -6,7 +6,6 @@ from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -15,25 +14,25 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xc6\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b 
\x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"A\n\x13ListOfDocArrayProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocumentArrayProto\"\xed\x04\n\x14\x44ocArrayStackedProto\x12I\n\x0etensor_columns\x18\x01 \x03(\x0b\x32\x31.docarray.DocArrayStackedProto.TensorColumnsEntry\x12\x43\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32..docarray.DocArrayStackedProto.DocColumnsEntry\x12\x41\n\nda_columns\x18\x03 \x03(\x0b\x32-.docarray.DocArrayStackedProto.DaColumnsEntry\x12\x43\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32..docarray.DocArrayStackedProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aQ\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12-\n\x05value\x18\x02 \x01(\x0b\x32\x1e.docarray.DocArrayStackedProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') +DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xbb\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12&\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12\x30\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 
\x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xb7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x38\n\nda_columns\x18\x03 \x03(\x0b\x32$.docarray.DocVecProto.DaColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docarray_pb2', globals()) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None - _DOCUMENTPROTO_DATAENTRY._options = None - _DOCUMENTPROTO_DATAENTRY._serialized_options = b'8\001' + _DOCPROTO_DATAENTRY._options = None + _DOCPROTO_DATAENTRY._serialized_options = b'8\001' _DICTOFANYPROTO_DATAENTRY._options = None _DICTOFANYPROTO_DATAENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None - 
_DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_TENSORCOLUMNSENTRY._options = None + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DOCCOLUMNSENTRY._options = None + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DACOLUMNSENTRY._options = None + _DOCVECPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_ANYCOLUMNSENTRY._options = None + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start=58 _DENSENDARRAYPROTO._serialized_end=123 _NDARRAYPROTO._serialized_start=125 @@ -43,29 +42,29 @@ _GENERICDICTVALUE._serialized_start=322 _GENERICDICTVALUE._serialized_end=381 _NODEPROTO._serialized_start=384 - _NODEPROTO._serialized_end=838 - _DOCUMENTPROTO._serialized_start=841 - _DOCUMENTPROTO._serialized_end=971 - _DOCUMENTPROTO_DATAENTRY._serialized_start=907 - _DOCUMENTPROTO_DATAENTRY._serialized_end=971 - _DICTOFANYPROTO._serialized_start=974 - _DICTOFANYPROTO._serialized_end=1106 - _DICTOFANYPROTO_DATAENTRY._serialized_start=907 - _DICTOFANYPROTO_DATAENTRY._serialized_end=971 - _LISTOFANYPROTO._serialized_start=1108 - _LISTOFANYPROTO._serialized_end=1159 - _DOCUMENTARRAYPROTO._serialized_start=1161 - _DOCUMENTARRAYPROTO._serialized_end=1220 - _LISTOFDOCARRAYPROTO._serialized_start=1222 - _LISTOFDOCARRAYPROTO._serialized_end=1287 - _DOCARRAYSTACKEDPROTO._serialized_start=1290 - _DOCARRAYSTACKEDPROTO._serialized_end=1911 - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start=1594 - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end=1670 - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start=1672 - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end=1753 - 
_DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_start=1755 - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_end=1834 - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_start=1836 - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_end=1911 + _NODEPROTO._serialized_end=827 + _DOCPROTO._serialized_start=829 + _DOCPROTO._serialized_end=949 + _DOCPROTO_DATAENTRY._serialized_start=885 + _DOCPROTO_DATAENTRY._serialized_end=949 + _DICTOFANYPROTO._serialized_start=952 + _DICTOFANYPROTO._serialized_end=1084 + _DICTOFANYPROTO_DATAENTRY._serialized_start=885 + _DICTOFANYPROTO_DATAENTRY._serialized_end=949 + _LISTOFANYPROTO._serialized_start=1086 + _LISTOFANYPROTO._serialized_end=1137 + _DOCLISTPROTO._serialized_start=1139 + _DOCLISTPROTO._serialized_end=1187 + _LISTOFDOCARRAYPROTO._serialized_start=1189 + _LISTOFDOCARRAYPROTO._serialized_end=1248 + _DOCVECPROTO._serialized_start=1251 + _DOCVECPROTO._serialized_end=1818 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start=1510 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end=1586 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start=1588 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end=1660 + _DOCVECPROTO_DACOLUMNSENTRY._serialized_start=1662 + _DOCVECPROTO_DACOLUMNSENTRY._serialized_end=1741 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start=1743 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end=1818 # @@protoc_insertion_point(module_scope) diff --git a/docarray/proto/pb2/docarray_pb2.py b/docarray/proto/pb2/docarray_pb2.py index 0ea41987658..cbdfb576dbb 100644 --- a/docarray/proto/pb2/docarray_pb2.py +++ b/docarray/proto/pb2/docarray_pb2.py @@ -12,10 +12,11 @@ _sym_db = _symbol_database.Default() + from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 
\x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xc6\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 
\x03(\x0b\x32\x17.docarray.DocumentProto\"A\n\x13ListOfDocArrayProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocumentArrayProto\"\xed\x04\n\x14\x44ocArrayStackedProto\x12I\n\x0etensor_columns\x18\x01 \x03(\x0b\x32\x31.docarray.DocArrayStackedProto.TensorColumnsEntry\x12\x43\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32..docarray.DocArrayStackedProto.DocColumnsEntry\x12\x41\n\nda_columns\x18\x03 \x03(\x0b\x32-.docarray.DocArrayStackedProto.DaColumnsEntry\x12\x43\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32..docarray.DocArrayStackedProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aQ\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12-\n\x05value\x18\x02 \x01(\x0b\x32\x1e.docarray.DocArrayStackedProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' + b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xbb\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 
\x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12&\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12\x30\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xb7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x38\n\nda_columns\x18\x03 \x03(\x0b\x32$.docarray.DocVecProto.DaColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 
\x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' ) @@ -24,26 +25,20 @@ _KEYVALUEPAIR = DESCRIPTOR.message_types_by_name['KeyValuePair'] _GENERICDICTVALUE = DESCRIPTOR.message_types_by_name['GenericDictValue'] _NODEPROTO = DESCRIPTOR.message_types_by_name['NodeProto'] -_DOCUMENTPROTO = DESCRIPTOR.message_types_by_name['DocumentProto'] -_DOCUMENTPROTO_DATAENTRY = _DOCUMENTPROTO.nested_types_by_name['DataEntry'] +_DOCPROTO = DESCRIPTOR.message_types_by_name['DocProto'] +_DOCPROTO_DATAENTRY = _DOCPROTO.nested_types_by_name['DataEntry'] _DICTOFANYPROTO = DESCRIPTOR.message_types_by_name['DictOfAnyProto'] _DICTOFANYPROTO_DATAENTRY = _DICTOFANYPROTO.nested_types_by_name['DataEntry'] _LISTOFANYPROTO = DESCRIPTOR.message_types_by_name['ListOfAnyProto'] -_DOCUMENTARRAYPROTO = DESCRIPTOR.message_types_by_name['DocumentArrayProto'] +_DOCLISTPROTO = DESCRIPTOR.message_types_by_name['DocListProto'] _LISTOFDOCARRAYPROTO = DESCRIPTOR.message_types_by_name['ListOfDocArrayProto'] -_DOCARRAYSTACKEDPROTO = DESCRIPTOR.message_types_by_name['DocArrayStackedProto'] -_DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ +_DOCVECPROTO = DESCRIPTOR.message_types_by_name['DocVecProto'] +_DOCVECPROTO_TENSORCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name[ 'TensorColumnsEntry' ] -_DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ - 'DocColumnsEntry' -] -_DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ - 'DaColumnsEntry' -] -_DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY = _DOCARRAYSTACKEDPROTO.nested_types_by_name[ - 'AnyColumnsEntry' -] +_DOCVECPROTO_DOCCOLUMNSENTRY = 
_DOCVECPROTO.nested_types_by_name['DocColumnsEntry'] +_DOCVECPROTO_DACOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name['DaColumnsEntry'] +_DOCVECPROTO_ANYCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name['AnyColumnsEntry'] DenseNdArrayProto = _reflection.GeneratedProtocolMessageType( 'DenseNdArrayProto', (_message.Message,), @@ -99,26 +94,26 @@ ) _sym_db.RegisterMessage(NodeProto) -DocumentProto = _reflection.GeneratedProtocolMessageType( - 'DocumentProto', +DocProto = _reflection.GeneratedProtocolMessageType( + 'DocProto', (_message.Message,), { 'DataEntry': _reflection.GeneratedProtocolMessageType( 'DataEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCUMENTPROTO_DATAENTRY, + 'DESCRIPTOR': _DOCPROTO_DATAENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentProto.DataEntry) + # @@protoc_insertion_point(class_scope:docarray.DocProto.DataEntry) }, ), - 'DESCRIPTOR': _DOCUMENTPROTO, + 'DESCRIPTOR': _DOCPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentProto) + # @@protoc_insertion_point(class_scope:docarray.DocProto) }, ) -_sym_db.RegisterMessage(DocumentProto) -_sym_db.RegisterMessage(DocumentProto.DataEntry) +_sym_db.RegisterMessage(DocProto) +_sym_db.RegisterMessage(DocProto.DataEntry) DictOfAnyProto = _reflection.GeneratedProtocolMessageType( 'DictOfAnyProto', @@ -152,16 +147,16 @@ ) _sym_db.RegisterMessage(ListOfAnyProto) -DocumentArrayProto = _reflection.GeneratedProtocolMessageType( - 'DocumentArrayProto', +DocListProto = _reflection.GeneratedProtocolMessageType( + 'DocListProto', (_message.Message,), { - 'DESCRIPTOR': _DOCUMENTARRAYPROTO, + 'DESCRIPTOR': _DOCLISTPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayProto) + # @@protoc_insertion_point(class_scope:docarray.DocListProto) }, ) -_sym_db.RegisterMessage(DocumentArrayProto) +_sym_db.RegisterMessage(DocListProto) ListOfDocArrayProto = 
_reflection.GeneratedProtocolMessageType( 'ListOfDocArrayProto', @@ -174,72 +169,72 @@ ) _sym_db.RegisterMessage(ListOfDocArrayProto) -DocArrayStackedProto = _reflection.GeneratedProtocolMessageType( - 'DocArrayStackedProto', +DocVecProto = _reflection.GeneratedProtocolMessageType( + 'DocVecProto', (_message.Message,), { 'TensorColumnsEntry': _reflection.GeneratedProtocolMessageType( 'TensorColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_TENSORCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.TensorColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.TensorColumnsEntry) }, ), 'DocColumnsEntry': _reflection.GeneratedProtocolMessageType( 'DocColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_DOCCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.DocColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.DocColumnsEntry) }, ), 'DaColumnsEntry': _reflection.GeneratedProtocolMessageType( 'DaColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_DACOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.DaColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.DaColumnsEntry) }, ), 'AnyColumnsEntry': _reflection.GeneratedProtocolMessageType( 'AnyColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_ANYCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto.AnyColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.AnyColumnsEntry) }, ), - 'DESCRIPTOR': _DOCARRAYSTACKEDPROTO, + 
'DESCRIPTOR': _DOCVECPROTO, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocArrayStackedProto) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto) }, ) -_sym_db.RegisterMessage(DocArrayStackedProto) -_sym_db.RegisterMessage(DocArrayStackedProto.TensorColumnsEntry) -_sym_db.RegisterMessage(DocArrayStackedProto.DocColumnsEntry) -_sym_db.RegisterMessage(DocArrayStackedProto.DaColumnsEntry) -_sym_db.RegisterMessage(DocArrayStackedProto.AnyColumnsEntry) +_sym_db.RegisterMessage(DocVecProto) +_sym_db.RegisterMessage(DocVecProto.TensorColumnsEntry) +_sym_db.RegisterMessage(DocVecProto.DocColumnsEntry) +_sym_db.RegisterMessage(DocVecProto.DaColumnsEntry) +_sym_db.RegisterMessage(DocVecProto.AnyColumnsEntry) if _descriptor._USE_C_DESCRIPTORS == False: DESCRIPTOR._options = None - _DOCUMENTPROTO_DATAENTRY._options = None - _DOCUMENTPROTO_DATAENTRY._serialized_options = b'8\001' + _DOCPROTO_DATAENTRY._options = None + _DOCPROTO_DATAENTRY._serialized_options = b'8\001' _DICTOFANYPROTO_DATAENTRY._options = None _DICTOFANYPROTO_DATAENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._options = None - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_TENSORCOLUMNSENTRY._options = None + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DOCCOLUMNSENTRY._options = None + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DACOLUMNSENTRY._options = None + _DOCVECPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_ANYCOLUMNSENTRY._options 
= None + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start = 58 _DENSENDARRAYPROTO._serialized_end = 123 _NDARRAYPROTO._serialized_start = 125 @@ -249,29 +244,29 @@ _GENERICDICTVALUE._serialized_start = 322 _GENERICDICTVALUE._serialized_end = 381 _NODEPROTO._serialized_start = 384 - _NODEPROTO._serialized_end = 838 - _DOCUMENTPROTO._serialized_start = 841 - _DOCUMENTPROTO._serialized_end = 971 - _DOCUMENTPROTO_DATAENTRY._serialized_start = 907 - _DOCUMENTPROTO_DATAENTRY._serialized_end = 971 - _DICTOFANYPROTO._serialized_start = 974 - _DICTOFANYPROTO._serialized_end = 1106 - _DICTOFANYPROTO_DATAENTRY._serialized_start = 907 - _DICTOFANYPROTO_DATAENTRY._serialized_end = 971 - _LISTOFANYPROTO._serialized_start = 1108 - _LISTOFANYPROTO._serialized_end = 1159 - _DOCUMENTARRAYPROTO._serialized_start = 1161 - _DOCUMENTARRAYPROTO._serialized_end = 1220 - _LISTOFDOCARRAYPROTO._serialized_start = 1222 - _LISTOFDOCARRAYPROTO._serialized_end = 1287 - _DOCARRAYSTACKEDPROTO._serialized_start = 1290 - _DOCARRAYSTACKEDPROTO._serialized_end = 1911 - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start = 1594 - _DOCARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end = 1670 - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start = 1672 - _DOCARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end = 1753 - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_start = 1755 - _DOCARRAYSTACKEDPROTO_DACOLUMNSENTRY._serialized_end = 1834 - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_start = 1836 - _DOCARRAYSTACKEDPROTO_ANYCOLUMNSENTRY._serialized_end = 1911 + _NODEPROTO._serialized_end = 827 + _DOCPROTO._serialized_start = 829 + _DOCPROTO._serialized_end = 949 + _DOCPROTO_DATAENTRY._serialized_start = 885 + _DOCPROTO_DATAENTRY._serialized_end = 949 + _DICTOFANYPROTO._serialized_start = 952 + _DICTOFANYPROTO._serialized_end = 1084 + _DICTOFANYPROTO_DATAENTRY._serialized_start = 885 + _DICTOFANYPROTO_DATAENTRY._serialized_end = 949 + 
_LISTOFANYPROTO._serialized_start = 1086 + _LISTOFANYPROTO._serialized_end = 1137 + _DOCLISTPROTO._serialized_start = 1139 + _DOCLISTPROTO._serialized_end = 1187 + _LISTOFDOCARRAYPROTO._serialized_start = 1189 + _LISTOFDOCARRAYPROTO._serialized_end = 1248 + _DOCVECPROTO._serialized_start = 1251 + _DOCVECPROTO._serialized_end = 1818 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start = 1510 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end = 1586 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start = 1588 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end = 1660 + _DOCVECPROTO_DACOLUMNSENTRY._serialized_start = 1662 + _DOCVECPROTO_DACOLUMNSENTRY._serialized_end = 1741 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start = 1743 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end = 1818 # @@protoc_insertion_point(module_scope) diff --git a/tests/units/document/proto/test_proto_based_object.py b/tests/units/document/proto/test_proto_based_object.py index ecec88fb6e6..96708dea32b 100644 --- a/tests/units/document/proto/test_proto_based_object.py +++ b/tests/units/document/proto/test_proto_based_object.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from docarray.proto import DocumentProto, NodeProto +from docarray.proto import DocProto, NodeProto from docarray.typing import NdArray @@ -32,4 +32,4 @@ def test_document_proto_set(): data['a'] = nested_item1 data['b'] = nested_item2 - DocumentProto(data=data) + DocProto(data=data) From 4a4fd1ca487e9c5fd0d12d389a1afec776ba7867 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 16:47:23 +0200 Subject: [PATCH 12/27] refactor: document_type to document Signed-off-by: samsja --- docarray/array/any_array.py | 6 ++-- docarray/array/doc_list/doc_list.py | 22 +++++------- docarray/array/doc_list/io.py | 32 ++++++++--------- docarray/array/doc_list/pushpull.py | 6 ++-- docarray/array/doc_vec/doc_vec.py | 40 ++++++++++------------ docarray/data/torch_dataset.py | 6 ++-- docarray/display/document_array_summary.py | 2 +- 
docarray/display/document_summary.py | 8 ++--- docarray/helper.py | 2 +- docarray/index/abstract.py | 2 +- docarray/store/file.py | 2 +- docarray/store/jac.py | 2 +- docarray/store/s3.py | 2 +- docarray/utils/filter.py | 2 +- docarray/utils/find.py | 2 +- tests/units/array/test_generic_array.py | 4 +-- 16 files changed, 64 insertions(+), 76 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index ffccbdc71cc..c0c0449556b 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -34,7 +34,7 @@ class AnyDocArray(Sequence[T_doc], Generic[T_doc], AbstractType): - document_type: Type[BaseDoc] + doc_type: Type[BaseDoc] __typed_da__: Dict[Type['AnyDocArray'], Dict[Type[BaseDoc], Type]] = {} def __repr__(self): @@ -58,9 +58,9 @@ def __class_getitem__(cls, item: Union[Type[BaseDoc], TypeVar, str]): global _DocArrayTyped class _DocArrayTyped(cls): # type: ignore - document_type: Type[BaseDoc] = cast(Type[BaseDoc], item) + doc_type: Type[BaseDoc] = cast(Type[BaseDoc], item) - for field in _DocArrayTyped.document_type.__fields__.keys(): + for field in _DocArrayTyped.doc_type.__fields__.keys(): def _property_generator(val: str): def _getter(self): diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index e486baaba8a..9ebea25007e 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -121,7 +121,7 @@ class Image(BaseDoc): """ - document_type: Type[BaseDoc] = AnyDoc + doc_type: Type[BaseDoc] = AnyDoc def __init__( self, @@ -161,10 +161,8 @@ def _validate_docs(self, docs: Iterable[T_doc]) -> Iterable[T_doc]: def _validate_one_doc(self, doc: T_doc) -> T_doc: """Validate if a Document is compatible with this DocArray""" - if not issubclass(self.document_type, AnyDoc) and not isinstance( - doc, self.document_type - ): - raise ValueError(f'{doc} is not a {self.document_type}') + if not issubclass(self.doc_type, AnyDoc) and not isinstance(doc, self.doc_type): + 
raise ValueError(f'{doc} is not a {self.doc_type}') return doc def __len__(self): @@ -181,7 +179,7 @@ def __bytes__(self) -> bytes: def append(self, doc: T_doc): """ Append a Document to the DocArray. The Document must be from the same class - as the document_type of this DocArray otherwise it will fail. + as the doc_type of this DocArray otherwise it will fail. :param doc: A Document """ self._data.append(self._validate_one_doc(doc)) @@ -189,7 +187,7 @@ def append(self, doc: T_doc): def extend(self, docs: Iterable[T_doc]): """ Extend a DocArray with an Iterable of Document. The Documents must be from - the same class as the document_type of this DocArray otherwise it will + the same class as the doc_type of this DocArray otherwise it will fail. :param docs: Iterable of Documents """ @@ -198,7 +196,7 @@ def extend(self, docs: Iterable[T_doc]): def insert(self, i: int, doc: T_doc): """ Insert a Document to the DocArray. The Document must be from the same - class as the document_type of this DocArray otherwise it will fail. + class as the doc_type of this DocArray otherwise it will fail. 
:param i: index to insert :param doc: A Document """ @@ -219,7 +217,7 @@ def _get_data_column( :return: Returns a list of the field value for each document in the doc_list like container """ - field_type = self.__class__.document_type._get_field_type(field) + field_type = self.__class__.doc_type._get_field_type(field) if ( not is_union_type(field_type) @@ -263,9 +261,7 @@ def stack( """ from docarray.array.doc_vec.doc_vec import DocVec - return DocVec.__class_getitem__(self.document_type)( - self, tensor_type=tensor_type - ) + return DocVec.__class_getitem__(self.doc_type)(self, tensor_type=tensor_type) @classmethod def validate( @@ -281,7 +277,7 @@ def validate( elif isinstance(value, Iterable): return cls(value) else: - raise TypeError(f'Expecting an Iterable of {cls.document_type}') + raise TypeError(f'Expecting an Iterable of {cls.doc_type}') def traverse_flat( self: 'DocList', diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index aad803a78e8..72d3181984b 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -97,7 +97,7 @@ def __getitem__(self, item: slice): class IOMixinArray(Iterable[T_doc]): - document_type: Type[T_doc] + doc_type: Type[T_doc] _data: List[T_doc] @abstractmethod @@ -116,9 +116,7 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message :param pb_msg: The protobuf message from where to construct the DocArray """ - return cls( - cls.document_type.from_protobuf(doc_proto) for doc_proto in pb_msg.docs - ) + return cls(cls.doc_type.from_protobuf(doc_proto) for doc_proto in pb_msg.docs) def to_protobuf(self) -> 'DocListProto': """Convert DocArray into a Protobuf message""" @@ -322,7 +320,7 @@ def from_json( :return: the deserialized DocArray """ json_docs = orjson.loads(file) - return cls([cls.document_type(**v) for v in json_docs]) + return cls([cls.doc_type(**v) for v in json_docs]) def to_json(self) -> bytes: """Convert the object into JSON 
bytes. Can be loaded via :meth:`.from_json`. @@ -346,7 +344,7 @@ def from_csv( ) -> 'DocList': """ Load a DocArray from a csv file following the schema defined in the - :attr:`~docarray.DocArray.document_type` attribute. + :attr:`~docarray.DocArray.doc_type` attribute. Every row of the csv file will be mapped to one document in the doc_list. The column names (defined in the first row) have to match the field names of the Document type. @@ -365,13 +363,13 @@ def from_csv( """ from docarray import DocList - if cls.document_type == AnyDoc: + if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' ) - doc_type = cls.document_type + doc_type = cls.doc_type da = DocList.__class_getitem__(doc_type)() with open(file_path, 'r', encoding=encoding) as fp: @@ -388,7 +386,7 @@ def from_csv( if not all(valid_paths): raise ValueError( f'Column names do not match the schema of the DocArray\'s ' - f'document type ({cls.document_type.__name__}): ' + f'document type ({cls.doc_type.__name__}): ' f'{list(compress(field_names, [not v for v in valid_paths]))}' ) @@ -417,7 +415,7 @@ def to_csv( 'excel-tab' (for tab separated values), 'unix' (for csv file generated on UNIX systems). """ - fields = self.document_type._get_access_paths() + fields = self.doc_type._get_access_paths() with open(file_path, 'w') as csv_file: writer = csv.DictWriter(csv_file, fieldnames=fields, dialect=dialect) @@ -431,7 +429,7 @@ def to_csv( def from_pandas(cls, df: 'pd.DataFrame') -> 'DocList': """ Load a DocArray from a `pandas.DataFrame` following the schema - defined in the :attr:`~docarray.DocArray.document_type` attribute. + defined in the :attr:`~docarray.DocArray.doc_type` attribute. Every row of the dataframe will be mapped to one Document in the doc_list. The column names of the dataframe have to match the field names of the Document type. 
@@ -470,13 +468,13 @@ class Person(BaseDoc): """ from docarray import DocList - if cls.document_type == AnyDoc: + if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' ) - doc_type = cls.document_type + doc_type = cls.doc_type da = DocList.__class_getitem__(doc_type)() field_names = df.columns.tolist() @@ -489,7 +487,7 @@ class Person(BaseDoc): if not all(valid_paths): raise ValueError( f'Column names do not match the schema of the DocArray\'s ' - f'document type ({cls.document_type.__name__}): ' + f'document type ({cls.doc_type.__name__}): ' f'{list(compress(field_names, [not v for v in valid_paths]))}' ) @@ -516,7 +514,7 @@ def to_pandas(self) -> 'pd.DataFrame': else: pd = import_library('pandas', raise_error=True) - fields = self.document_type._get_access_paths() + fields = self.doc_type._get_access_paths() df = pd.DataFrame(columns=fields) for doc in self: @@ -606,7 +604,7 @@ def _load_binary_all( # variable length bytes doc load_protocol: str = protocol or 'protobuf' - doc = cls.document_type.from_bytes( + doc = cls.doc_type.from_bytes( d[start_doc_pos:end_doc_pos], protocol=load_protocol, compress=compress, @@ -663,7 +661,7 @@ def _load_binary_stream( f.read(4), 'big', signed=False ) load_protocol: str = protocol - yield cls.document_type.from_bytes( + yield cls.doc_type.from_bytes( f.read(len_current_doc_in_bytes), protocol=load_protocol, compress=compress, diff --git a/docarray/array/doc_list/pushpull.py b/docarray/array/doc_list/pushpull.py index 2d85af8e711..cd666c38cba 100644 --- a/docarray/array/doc_list/pushpull.py +++ b/docarray/array/doc_list/pushpull.py @@ -30,7 +30,7 @@ class PushPullMixin(Iterable['BaseDoc']): """Mixin class for push/pull functionality.""" __backends__: Dict[str, Type['AbstractDocStore']] = {} - document_type: Type['BaseDoc'] + doc_type: Type['BaseDoc'] @abstractmethod def __len__(self) -> int: @@ -139,7 +139,7 @@ def pull( """ 
from docarray.base_doc import AnyDoc - if cls.document_type == AnyDoc: + if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' @@ -167,7 +167,7 @@ def pull_stream( """ from docarray.base_doc import AnyDoc - if cls.document_type == AnyDoc: + if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 8031cd8afeb..c562dfea0e6 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -63,7 +63,7 @@ class DocVec(AnyDocArray[T_doc]): {class}`~docarray.array.DocArray` but with an underlying implementation that is column based instead of row based. Each field of the schema of the DocArrayStack - (the :attr:`~docarray.array.doc_vec.DocVec.document_type` which is a + (the :attr:`~docarray.array.doc_vec.DocVec.doc_type` which is a `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, doc_vec (torch/np/tf) tensor. 
If the tensor field is `AnyTensor` or a Union of tensor types, the @@ -87,7 +87,7 @@ class DocVec(AnyDocArray[T_doc]): AnyTensor or Union of NdArray and TorchTensor """ - document_type: Type[T_doc] + doc_type: Type[T_doc] def __init__( self: T, @@ -106,13 +106,13 @@ def __init__( docs = ( docs if isinstance(docs, DocList) - else DocList.__class_getitem__(self.document_type)(docs) + else DocList.__class_getitem__(self.doc_type)(docs) ) - for field_name, field in self.document_type.__fields__.items(): + for field_name, field in self.doc_type.__fields__.items(): # here we iterate over the field of the da schema, and we collect the data # from each document and put them in the corresponding column - field_type = self.document_type._get_field_type(field_name) + field_type = self.doc_type._get_field_type(field_name) if is_tensor_union(field_type): field_type = tensor_type @@ -209,14 +209,14 @@ def validate( ) -> T: if isinstance(value, cls): return value - elif isinstance(value, DocList.__class_getitem__(cls.document_type)): + elif isinstance(value, DocList.__class_getitem__(cls.doc_type)): return cast(T, value.stack()) elif isinstance(value, Sequence): return cls(value) elif isinstance(value, Iterable): return cls(list(value)) else: - raise TypeError(f'Expecting an Iterable of {cls.document_type}') + raise TypeError(f'Expecting an Iterable of {cls.doc_type}') def to(self: T, device: str) -> T: """Move all tensors of this DocArrayStacked to the given device @@ -255,7 +255,7 @@ def __getitem__(self: T, item: Union[int, IndexIterType]) -> Union[T_doc, T]: if isinstance(item, (slice, Iterable)): return self.__class__.from_columns_storage(self._storage[item]) # single doc case - return self.document_type.from_view(ColumnStorageView(item, self._storage)) + return self.doc_type.from_view(ColumnStorageView(item, self._storage)) def _get_data_column( self: T, @@ -292,8 +292,8 @@ def __setitem__(self: T, key: IndexIterType, value: T): def __setitem__(self: T, key, value): # 
single doc case if not isinstance(key, (slice, Iterable)): - if not isinstance(value, self.document_type): - raise ValueError(f'{value} is not a {self.document_type}') + if not isinstance(value, self.doc_type): + raise ValueError(f'{value} is not a {self.doc_type}') for field, value in value.dict().items(): self._storage.columns[field][key] = value # todo we might want to @@ -319,20 +319,20 @@ def _set_data_and_columns( # set data and prepare columns processed_value: T if isinstance(value, DocList): - if not issubclass(value.document_type, self.document_type): + if not issubclass(value.doc_type, self.doc_type): raise TypeError( - f'{value} schema : {value.document_type} is not compatible with ' - f'this DocArrayStacked schema : {self.document_type}' + f'{value} schema : {value.doc_type} is not compatible with ' + f'this DocArrayStacked schema : {self.doc_type}' ) processed_value = cast( T, value.stack(tensor_type=self.tensor_type) ) # we need to copy data here elif isinstance(value, DocVec): - if not issubclass(value.document_type, self.document_type): + if not issubclass(value.doc_type, self.doc_type): raise TypeError( - f'{value} schema : {value.document_type} is not compatible with ' - f'this DocArrayStacked schema : {self.document_type}' + f'{value} schema : {value.doc_type} is not compatible with ' + f'this DocArrayStacked schema : {self.doc_type}' ) processed_value = value else: @@ -376,9 +376,7 @@ def _set_data_column( elif field in self._storage.doc_columns.keys(): values_ = parse_obj_as( - DocVec.__class_getitem__( - self._storage.doc_columns[field].document_type - ), + DocVec.__class_getitem__(self._storage.doc_columns[field].doc_type), values, ) self._storage.doc_columns[field] = values_ @@ -511,11 +509,11 @@ def unstack(self: T) -> DocList[T_doc]: for i in range(len(self)): data = {field: col[i] for field, col in unstacked_column.items()} - docs.append(self.document_type.construct(**data)) + docs.append(self.doc_type.construct(**data)) del self._storage 
- return DocList.__class_getitem__(self.document_type).construct(docs) + return DocList.__class_getitem__(self.doc_type).construct(docs) def traverse_flat( self, diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index 09a4f2326dd..414e3cbe146 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -92,7 +92,7 @@ def add_nonsense(student: Student): print(batch.thesis.title.embedding) """ - document_type: Optional[Type[BaseDoc]] = None + doc_type: Optional[Type[BaseDoc]] = None __typed_ds__: Dict[Type[BaseDoc], Type['MultiModalDataset']] = {} def __init__( @@ -121,7 +121,7 @@ def __getitem__(self, item: int): @classmethod def collate_fn(cls, batch: List[T_doc]): - doc_type = cls.document_type + doc_type = cls.doc_type if doc_type: batch_da = DocVec[doc_type]( # type: ignore batch, @@ -142,7 +142,7 @@ def __class_getitem__(cls, item: Type[BaseDoc]) -> Type['MultiModalDataset']: global _TypedDataset class _TypedDataset(cls): # type: ignore - document_type = item + doc_type = item change_cls_name( _TypedDataset, f'{cls.__name__}[{item.__name__}]', globals() diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index eacedcb6dc3..62fcc7b385b 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -51,7 +51,7 @@ def summary(self) -> None: table.add_row(f' • {field_name}:', col_2) Console().print(Panel(table, title='DocArray Summary', expand=False)) - self.da.document_type.schema_summary() + self.da.doc_type.schema_summary() @staticmethod def _get_stacked_fields(da: 'DocVec') -> List[str]: # TODO this might diff --git a/docarray/display/document_summary.py b/docarray/display/document_summary.py index 23482903763..349829b6e0e 100644 --- a/docarray/display/document_summary.py +++ b/docarray/display/document_summary.py @@ -77,9 +77,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: if 
issubclass(arg, BaseDoc): sub_tree.add(DocumentSummary._get_schema(cls=arg)) elif issubclass(arg, DocList): - sub_tree.add( - DocumentSummary._get_schema(cls=arg.document_type) - ) + sub_tree.add(DocumentSummary._get_schema(cls=arg.doc_type)) tree.add(sub_tree) elif issubclass(field_type, BaseDoc): @@ -89,9 +87,7 @@ def _get_schema(cls: Type['BaseDoc'], doc_name: Optional[str] = None) -> Tree: elif issubclass(field_type, DocList): sub_tree = Tree(node_name, highlight=True) - sub_tree.add( - DocumentSummary._get_schema(cls=field_type.document_type) - ) + sub_tree.add(DocumentSummary._get_schema(cls=field_type.doc_type)) tree.add(sub_tree) else: diff --git a/docarray/helper.py b/docarray/helper.py index d9c1d779809..7cedb443d56 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -146,7 +146,7 @@ def _get_field_type_by_access_path( else: d = doc_type._get_field_type(field) if issubclass(d, DocList): - return _get_field_type_by_access_path(d.document_type, remaining) + return _get_field_type_by_access_path(d.doc_type, remaining) elif issubclass(d, BaseDoc): return _get_field_type_by_access_path(d, remaining) else: diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 4a046183e29..4672f4348a9 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -786,7 +786,7 @@ def _validate_docs( reference_names = [name for (name, _, _) in reference_schema_flat] reference_types = [t_ for (_, t_, _) in reference_schema_flat] try: - input_schema_flat = self._flatten_schema(docs.document_type) + input_schema_flat = self._flatten_schema(docs.doc_type) except ValueError: pass else: diff --git a/docarray/store/file.py b/docarray/store/file.py index f3164704c00..b4ac938739c 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -186,7 +186,7 @@ def pull_stream( path = cls._abs_filepath(name).with_suffix('.da') source = open(path, 'rb') return _from_binary_stream( - da_cls.document_type, + da_cls.doc_type, source, 
protocol='protobuf', compress='gzip', diff --git a/docarray/store/jac.py b/docarray/store/jac.py index f590fc7aa52..ba5d8f275c9 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -287,7 +287,7 @@ def pull( """ from docarray import DocList - return DocList[cls.document_type]( # type: ignore + return DocList[cls.doc_type]( # type: ignore JACDocStore.pull_stream(cls, name, show_progress, local_cache) ) diff --git a/docarray/store/s3.py b/docarray/store/s3.py index f940a77a626..07aa418137c 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -235,7 +235,7 @@ def pull_stream( source = open(cache_path, 'rb') return _from_binary_stream( - da_cls.document_type, + da_cls.doc_type, source, protocol='pickle', compress=None, diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py index d34c50b278b..5b7daa1e6f2 100644 --- a/docarray/utils/filter.py +++ b/docarray/utils/filter.py @@ -75,7 +75,7 @@ class MyDocument(BaseDoc): if query: query = query if not isinstance(query, str) else json.loads(query) parser = QueryParser(query) - return DocList.__class_getitem__(docs.document_type)( + return DocList.__class_getitem__(docs.doc_type)( d for d in docs if parser.evaluate(d) ) else: diff --git a/docarray/utils/find.py b/docarray/utils/find.py index a2acd3b9e39..b3418126589 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -265,7 +265,7 @@ def _da_attr_type(da: AnyDocArray, access_path: str) -> Type[AnyTensor]: :return: the type of the attribute """ field_type: Optional[Type] = _get_field_type_by_access_path( - da.document_type, access_path + da.doc_type, access_path ) if field_type is None: raise ValueError(f"Access path is not valid: {access_path}") diff --git a/tests/units/array/test_generic_array.py b/tests/units/array/test_generic_array.py index a693e810e95..a51789ed81e 100644 --- a/tests/units/array/test_generic_array.py +++ b/tests/units/array/test_generic_array.py @@ -7,13 +7,13 @@ class Text(BaseDoc): text: str da = 
DocList[Text]([]) - da.document_type == Text + da.doc_type == Text assert isinstance(da, DocList) def test_normal_access_init(): da = DocList([]) - da.document_type == AnyDoc + da.doc_type == AnyDoc assert isinstance(da, DocList) From cd1172290154a4e7d25c8edf790c5f0ac2b939c7 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 16:50:21 +0200 Subject: [PATCH 13/27] refactor: rename document and document_array key from proto to doc and doc_arra Signed-off-by: samsja --- docarray/array/any_array.py | 2 +- docarray/base_doc/mixins/io.py | 4 +-- docarray/proto/docarray.proto | 4 +-- docarray/proto/pb/docarray_pb2.py | 52 +++++++++++++++--------------- docarray/proto/pb2/docarray_pb2.py | 52 +++++++++++++++--------------- 5 files changed, 57 insertions(+), 57 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index c0c0449556b..33b2c17ef84 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -148,7 +148,7 @@ def _to_node_protobuf(self) -> 'NodeProto': """ from docarray.proto import NodeProto - return NodeProto(document_array=self.to_protobuf()) + return NodeProto(doc_array=self.to_protobuf()) @abstractmethod def traverse_flat( diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index a80ac6fc3cf..de379cc23fe 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -254,7 +254,7 @@ def _get_content_from_node_proto( return_field = content_type_dict[docarray_type].from_protobuf( getattr(value, content_key) ) - elif content_key in ['document', 'document_array']: + elif content_key in ['doc', 'doc_array']: if field_name is None: raise ValueError( 'field_name cannot be None when trying to deseriliaze a Document or a DocArray' @@ -335,7 +335,7 @@ def _to_node_protobuf(self) -> 'NodeProto': :return: the nested item protobuf message """ - return NodeProto(document=self.to_protobuf()) + return NodeProto(doc=self.to_protobuf()) @classmethod def _get_access_paths(cls) 
-> List[str]: diff --git a/docarray/proto/docarray.proto b/docarray/proto/docarray.proto index 4bbe631cbf7..f7302ad5867 100644 --- a/docarray/proto/docarray.proto +++ b/docarray/proto/docarray.proto @@ -53,9 +53,9 @@ message NodeProto { // the ndarray of the image/audio/video document NdArrayProto ndarray = 6; // a sub Document - DocProto document = 7; + DocProto doc = 7; // a sub DocArray - DocListProto document_array = 8; + DocListProto doc_array = 8; //any list ListOfAnyProto list = 9; //any set diff --git a/docarray/proto/pb/docarray_pb2.py b/docarray/proto/pb/docarray_pb2.py index c499fab35e2..0ad5c4dcfed 100644 --- a/docarray/proto/pb/docarray_pb2.py +++ b/docarray/proto/pb/docarray_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xbb\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12&\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12\x30\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t 
\x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xb7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x38\n\nda_columns\x18\x03 \x03(\x0b\x32$.docarray.DocVecProto.DaColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 
\x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xb1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12!\n\x03\x64oc\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12+\n\tdoc_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 
\x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xb7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x38\n\nda_columns\x18\x03 \x03(\x0b\x32$.docarray.DocVecProto.DaColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docarray_pb2', globals()) @@ -42,29 +42,29 @@ _GENERICDICTVALUE._serialized_start=322 _GENERICDICTVALUE._serialized_end=381 _NODEPROTO._serialized_start=384 - _NODEPROTO._serialized_end=827 - _DOCPROTO._serialized_start=829 - _DOCPROTO._serialized_end=949 - _DOCPROTO_DATAENTRY._serialized_start=885 - _DOCPROTO_DATAENTRY._serialized_end=949 - _DICTOFANYPROTO._serialized_start=952 - _DICTOFANYPROTO._serialized_end=1084 - _DICTOFANYPROTO_DATAENTRY._serialized_start=885 - _DICTOFANYPROTO_DATAENTRY._serialized_end=949 - _LISTOFANYPROTO._serialized_start=1086 - _LISTOFANYPROTO._serialized_end=1137 - 
_DOCLISTPROTO._serialized_start=1139 - _DOCLISTPROTO._serialized_end=1187 - _LISTOFDOCARRAYPROTO._serialized_start=1189 - _LISTOFDOCARRAYPROTO._serialized_end=1248 - _DOCVECPROTO._serialized_start=1251 - _DOCVECPROTO._serialized_end=1818 - _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start=1510 - _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end=1586 - _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start=1588 - _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end=1660 - _DOCVECPROTO_DACOLUMNSENTRY._serialized_start=1662 - _DOCVECPROTO_DACOLUMNSENTRY._serialized_end=1741 - _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start=1743 - _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end=1818 + _NODEPROTO._serialized_end=817 + _DOCPROTO._serialized_start=819 + _DOCPROTO._serialized_end=939 + _DOCPROTO_DATAENTRY._serialized_start=875 + _DOCPROTO_DATAENTRY._serialized_end=939 + _DICTOFANYPROTO._serialized_start=942 + _DICTOFANYPROTO._serialized_end=1074 + _DICTOFANYPROTO_DATAENTRY._serialized_start=875 + _DICTOFANYPROTO_DATAENTRY._serialized_end=939 + _LISTOFANYPROTO._serialized_start=1076 + _LISTOFANYPROTO._serialized_end=1127 + _DOCLISTPROTO._serialized_start=1129 + _DOCLISTPROTO._serialized_end=1177 + _LISTOFDOCARRAYPROTO._serialized_start=1179 + _LISTOFDOCARRAYPROTO._serialized_end=1238 + _DOCVECPROTO._serialized_start=1241 + _DOCVECPROTO._serialized_end=1808 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start=1500 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end=1576 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start=1578 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end=1650 + _DOCVECPROTO_DACOLUMNSENTRY._serialized_start=1652 + _DOCVECPROTO_DACOLUMNSENTRY._serialized_end=1731 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start=1733 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end=1808 # @@protoc_insertion_point(module_scope) diff --git a/docarray/proto/pb2/docarray_pb2.py b/docarray/proto/pb2/docarray_pb2.py index cbdfb576dbb..795f618bb22 100644 --- a/docarray/proto/pb2/docarray_pb2.py +++ 
b/docarray/proto/pb2/docarray_pb2.py @@ -16,7 +16,7 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xbb\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12&\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12\x30\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 
\x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xb7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x38\n\nda_columns\x18\x03 \x03(\x0b\x32$.docarray.DocVecProto.DaColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' + b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 
\x03(\x0b\x32\x16.docarray.KeyValuePair\"\xb1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12!\n\x03\x64oc\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12+\n\tdoc_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xb7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x38\n\nda_columns\x18\x03 \x03(\x0b\x32$.docarray.DocVecProto.DaColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 
\x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' ) @@ -244,29 +244,29 @@ _GENERICDICTVALUE._serialized_start = 322 _GENERICDICTVALUE._serialized_end = 381 _NODEPROTO._serialized_start = 384 - _NODEPROTO._serialized_end = 827 - _DOCPROTO._serialized_start = 829 - _DOCPROTO._serialized_end = 949 - _DOCPROTO_DATAENTRY._serialized_start = 885 - _DOCPROTO_DATAENTRY._serialized_end = 949 - _DICTOFANYPROTO._serialized_start = 952 - _DICTOFANYPROTO._serialized_end = 1084 - _DICTOFANYPROTO_DATAENTRY._serialized_start = 885 - _DICTOFANYPROTO_DATAENTRY._serialized_end = 949 - _LISTOFANYPROTO._serialized_start = 1086 - _LISTOFANYPROTO._serialized_end = 1137 - _DOCLISTPROTO._serialized_start = 1139 - _DOCLISTPROTO._serialized_end = 1187 - _LISTOFDOCARRAYPROTO._serialized_start = 1189 - _LISTOFDOCARRAYPROTO._serialized_end = 1248 - _DOCVECPROTO._serialized_start = 1251 - _DOCVECPROTO._serialized_end = 1818 - _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start = 1510 - _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end = 1586 - _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start = 1588 - _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end = 1660 - _DOCVECPROTO_DACOLUMNSENTRY._serialized_start = 1662 - _DOCVECPROTO_DACOLUMNSENTRY._serialized_end = 1741 - _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start = 1743 - _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end = 1818 + _NODEPROTO._serialized_end = 817 + 
_DOCPROTO._serialized_start = 819 + _DOCPROTO._serialized_end = 939 + _DOCPROTO_DATAENTRY._serialized_start = 875 + _DOCPROTO_DATAENTRY._serialized_end = 939 + _DICTOFANYPROTO._serialized_start = 942 + _DICTOFANYPROTO._serialized_end = 1074 + _DICTOFANYPROTO_DATAENTRY._serialized_start = 875 + _DICTOFANYPROTO_DATAENTRY._serialized_end = 939 + _LISTOFANYPROTO._serialized_start = 1076 + _LISTOFANYPROTO._serialized_end = 1127 + _DOCLISTPROTO._serialized_start = 1129 + _DOCLISTPROTO._serialized_end = 1177 + _LISTOFDOCARRAYPROTO._serialized_start = 1179 + _LISTOFDOCARRAYPROTO._serialized_end = 1238 + _DOCVECPROTO._serialized_start = 1241 + _DOCVECPROTO._serialized_end = 1808 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start = 1500 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end = 1576 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start = 1578 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end = 1650 + _DOCVECPROTO_DACOLUMNSENTRY._serialized_start = 1652 + _DOCVECPROTO_DACOLUMNSENTRY._serialized_end = 1731 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start = 1733 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end = 1808 # @@protoc_insertion_point(module_scope) From 2af12d746a58b4096bc129a1908fe53399ea6f44 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 3 Apr 2023 17:30:14 +0200 Subject: [PATCH 14/27] fix: add docv vec to init Signed-off-by: samsja --- docarray/array/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/array/__init__.py b/docarray/array/__init__.py index 3792c3c6755..16e1274c1e3 100644 --- a/docarray/array/__init__.py +++ b/docarray/array/__init__.py @@ -1,4 +1,5 @@ +from docarray.array.any_array import AnyDocArray from docarray.array.doc_list.doc_list import DocList from docarray.array.doc_vec.doc_vec import DocVec -__all__ = ['DocList', 'DocVec'] +__all__ = ['DocList', 'DocVec', 'AnyDocArray'] From 61a793fc7ec4fb90b8d121b08a668ee1b044f552 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 10:29:46 +0200 Subject: 
[PATCH 15/27] refactor: rename da to docs Signed-off-by: samsja --- docarray/utils/map.py | 8 ++++---- tests/benchmark_tests/test_map.py | 9 +++++++-- tests/units/util/test_map.py | 8 ++++---- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/docarray/utils/map.py b/docarray/utils/map.py index 3d7f86bc853..fba7db54e39 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -15,7 +15,7 @@ def map_docs( - da: T, + docs: T, func: Callable[[T_doc], T_doc], backend: str = 'thread', num_worker: Optional[int] = None, @@ -56,7 +56,7 @@ def load_url_to_tensor(img: ImageDoc) -> ImageDoc: --- - :param da: DocList to apply function to + :param docs: DocList to apply function to :param func: a function that takes a :class:`BaseDoc` as input and outputs a :class:`BaseDoc`. :param backend: `thread` for multithreading and `process` for multiprocessing. @@ -98,8 +98,8 @@ def load_url_to_tensor(img: ImageDoc) -> ImageDoc: context_pool = p with context_pool: - imap = p.imap(func, da) - for x in track(imap, total=len(da), disable=not show_progress): + imap = p.imap(func, docs) + for x in track(imap, total=len(docs), disable=not show_progress): yield x diff --git a/tests/benchmark_tests/test_map.py b/tests/benchmark_tests/test_map.py index ad1067c5824..d5e146d7dba 100644 --- a/tests/benchmark_tests/test_map.py +++ b/tests/benchmark_tests/test_map.py @@ -36,7 +36,10 @@ def time_multiprocessing(num_workers: int) -> float: start_time = time() list( map_docs( - da=da, func=cpu_intensive, backend='process', num_worker=num_workers + docs=da, + func=cpu_intensive, + backend='process', + num_worker=num_workers, ) ) return time() - start_time @@ -96,7 +99,9 @@ def time_multithreading(num_workers: int) -> float: ) start_time = time() list( - map_docs(da=da, func=io_intensive, backend='thread', num_worker=num_workers) + map_docs( + docs=da, func=io_intensive, backend='thread', num_worker=num_workers + ) ) return time() - start_time diff --git a/tests/units/util/test_map.py 
b/tests/units/util/test_map.py index f4864c239f5..0c6fb460b76 100644 --- a/tests/units/util/test_map.py +++ b/tests/units/util/test_map.py @@ -28,7 +28,7 @@ def test_map(da, backend): for tensor in da.tensor: assert tensor is None - docs = list(map_docs(da=da, func=load_from_doc, backend=backend)) + docs = list(map_docs(docs=da, func=load_from_doc, backend=backend)) assert len(docs) == N_DOCS for doc in docs: @@ -37,7 +37,7 @@ def test_map(da, backend): def test_map_multiprocessing_lambda_func_raise_exception(da): with pytest.raises(ValueError, match='Multiprocessing does not allow'): - list(map_docs(da=da, func=lambda x: x, backend='process')) + list(map_docs(docs=da, func=lambda x: x, backend='process')) def test_map_multiprocessing_local_func_raise_exception(da): @@ -45,14 +45,14 @@ def local_func(x): return x with pytest.raises(ValueError, match='Multiprocessing does not allow'): - list(map_docs(da=da, func=local_func, backend='process')) + list(map_docs(docs=da, func=local_func, backend='process')) @pytest.mark.parametrize('backend', ['thread', 'process']) def test_check_order(backend): da = DocList[ImageDoc]([ImageDoc(id=i) for i in range(N_DOCS)]) - docs = list(map_docs(da=da, func=load_from_doc, backend=backend)) + docs = list(map_docs(docs=da, func=load_from_doc, backend=backend)) assert len(docs) == N_DOCS for i, doc in enumerate(docs): From 99d9f5b1b2fffc49ee7b3d005e164bcab4f014d6 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 10:45:09 +0200 Subject: [PATCH 16/27] fix: fix jac Signed-off-by: samsja --- docarray/store/jac.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docarray/store/jac.py b/docarray/store/jac.py index ba5d8f275c9..251d1dfb576 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -102,7 +102,7 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: from rich.table import Table resp = HubbleClient(jsonify=True).list_artifacts( - filter={'type': 'DocArray'}, 
sort={'createdAt': 1} + filter={'type': 'DocumentArray'}, sort={'createdAt': 1} ) table = Table( @@ -181,11 +181,11 @@ def push( data, ctype = urllib3.filepost.encode_multipart_formdata( { 'file': ( - 'DocArray', + 'DocumentArray', delimiter, ), 'name': name, - 'type': 'DocArray', + 'type': 'DocumentArray', 'public': public, 'metaData': json.dumps( { From 3f983ef23e755be6feac3284bbaa11c73da93b83 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 13:13:39 +0200 Subject: [PATCH 17/27] refactor: rename docstring Signed-off-by: samsja --- docarray/array/doc_list/doc_list.py | 4 ++-- docarray/array/doc_vec/column_storage.py | 6 +++--- docarray/array/doc_vec/doc_vec.py | 20 ++++++++++---------- docarray/display/document_array_summary.py | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 9ebea25007e..b8e9377977d 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -253,11 +253,11 @@ def stack( tensor_type: Type['AbstractTensor'] = NdArray, ) -> 'DocVec': """ - Convert the DocArray into a DocArrayStacked. `Self` cannot be used + Convert the DocArray into a DocVec. `Self` cannot be used afterwards :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor - :return: A DocArrayStacked of the same document type as self + :return: A DocVec of the same document type as self """ from docarray.array.doc_vec.doc_vec import DocVec diff --git a/docarray/array/doc_vec/column_storage.py b/docarray/array/doc_vec/column_storage.py index fa1aca74a8a..e44ab1158f9 100644 --- a/docarray/array/doc_vec/column_storage.py +++ b/docarray/array/doc_vec/column_storage.py @@ -26,11 +26,11 @@ class ColumnStorage: """ ColumnStorage is a container to store the columns of the - :class:`~docarray.array.doc_vec.DocArrayStacked`. 
+ :class:`~docarray.array.doc_vec.DocVec`. :param tensor_columns: a Dict of AbstractTensor - :param doc_columns: a Dict of :class:`~docarray.array.doc_vec.DocArrayStacked` - :param da_columns: a Dict of List of :class:`~docarray.array.doc_vec.DocArrayStacked` + :param doc_columns: a Dict of :class:`~docarray.array.doc_vec.DocVec` + :param da_columns: a Dict of List of :class:`~docarray.array.doc_vec.DocVec` :param any_columns: a Dict of List :param tensor_type: Class used to wrap the doc_vec tensors """ diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index c562dfea0e6..5a50a3353f3 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -67,23 +67,23 @@ class DocVec(AnyDocArray[T_doc]): `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, doc_vec (torch/np/tf) tensor. If the tensor field is `AnyTensor` or a Union of tensor types, the - :attr:`~docarray.array.doc_vec.DocArrayStacked.tensor_type` will be used to determine + :attr:`~docarray.array.doc_vec.DocVec.tensor_type` will be used to determine the type of the doc_vec column. - If the field is another `BasedDoc` the column will be another DocArrayStacked that follows the + If the field is another `BasedDoc` the column will be another DocVec that follows the schema of the nested Document. If the field is a `DocArray` or `DocVec` then the column will be a list of `DocVec`. For any other type the column is a Python list. - Every `Document` inside a `DocArrayStacked` is a view into the data columns stored at the `DocVec` level. The `BaseDoc` does + Every `Document` inside a `DocVec` is a view into the data columns stored at the `DocVec` level. The `BaseDoc` does not hold any data itself. The behavior of this Document "view" is similar to the behavior of `view = tensor[i]` in numpy/PyTorch. 
:param docs: a homogeneous sequence of BaseDoc :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful - if the BaseDoc of this DocArrayStacked has some undefined tensor type like + if the BaseDoc of this DocVec has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor """ @@ -191,7 +191,7 @@ def __init__( @classmethod def from_columns_storage(cls: Type[T], storage: ColumnStorage) -> T: """ - Create a DocArrayStacked directly from a storage object + Create a DocVec directly from a storage object :param storage: the underlying storage. :return: a DocArrayStack """ @@ -219,7 +219,7 @@ def validate( raise TypeError(f'Expecting an Iterable of {cls.doc_type}') def to(self: T, device: str) -> T: - """Move all tensors of this DocArrayStacked to the given device + """Move all tensors of this DocVec to the given device :param device: the device to move the data to """ @@ -322,7 +322,7 @@ def _set_data_and_columns( if not issubclass(value.doc_type, self.doc_type): raise TypeError( f'{value} schema : {value.doc_type} is not compatible with ' - f'this DocArrayStacked schema : {self.doc_type}' + f'this DocVec schema : {self.doc_type}' ) processed_value = cast( T, value.stack(tensor_type=self.tensor_type) @@ -332,11 +332,11 @@ def _set_data_and_columns( if not issubclass(value.doc_type, self.doc_type): raise TypeError( f'{value} schema : {value.doc_type} is not compatible with ' - f'this DocArrayStacked schema : {self.doc_type}' + f'this DocVec schema : {self.doc_type}' ) processed_value = value else: - raise TypeError(f'Can not set a DocArrayStacked with {type(value)}') + raise TypeError(f'Can not set a DocVec with {type(value)}') for field, col in self._storage.columns.items(): col[index_item] = processed_value._storage.columns[field] @@ -473,7 +473,7 @@ def to_protobuf(self) -> 'DocVecProto': ) def unstack(self: T) -> DocList[T_doc]: - """Convert DocArrayStacked into a DocArray. + """Convert DocVec into a DocArray. 
Note this destroys the arguments and returns a new DocArray """ diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index 62fcc7b385b..4708d167220 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -57,7 +57,7 @@ def summary(self) -> None: def _get_stacked_fields(da: 'DocVec') -> List[str]: # TODO this might # broken """ - Return a list of the field names of a DocArrayStacked instance that are + Return a list of the field names of a DocVec instance that are doc_vec, i.e. all the fields that are of type AbstractTensor. Nested field paths are separated by dot, such as: 'attr.nested_attr'. """ From 3a156f161668341d7e96a6b85156c8df43939b9a Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 13:15:41 +0200 Subject: [PATCH 18/27] refactor: rename docstring Signed-off-by: samsja --- docarray/array/any_array.py | 24 +++---- docarray/array/doc_list/doc_list.py | 38 +++++----- docarray/array/doc_list/io.py | 70 +++++++++---------- docarray/array/doc_list/pushpull.py | 26 +++---- docarray/array/doc_vec/doc_vec.py | 18 ++--- docarray/base_doc/mixins/io.py | 2 +- docarray/computation/abstract_comp_backend.py | 2 +- docarray/data/torch_dataset.py | 2 +- docarray/display/document_array_summary.py | 4 +- docarray/documents/legacy/legacy_document.py | 8 +-- docarray/index/abstract.py | 27 ++++--- docarray/store/abstract_doc_store.py | 34 ++++----- docarray/store/file.py | 12 ++-- docarray/store/jac.py | 40 +++++------ docarray/store/s3.py | 18 ++--- docarray/typing/tensor/abstract_tensor.py | 6 +- docarray/utils/find.py | 4 +- 17 files changed, 166 insertions(+), 169 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index 33b2c17ef84..1b2487d0a01 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -167,7 +167,7 @@ def traverse_flat( EXAMPLE USAGE .. 
code-block:: python - from docarray import BaseDoc, DocArray, Text + from docarray import BaseDoc, DocList, Text class Author(BaseDoc): @@ -179,7 +179,7 @@ class Book(BaseDoc): content: Text - da = DocArray[Book]( + da = DocList[Book]( Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}')) for i in range(10) # noqa: E501 ) @@ -192,7 +192,7 @@ class Book(BaseDoc): EXAMPLE USAGE .. code-block:: python - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList class Chapter(BaseDoc): @@ -200,19 +200,17 @@ class Chapter(BaseDoc): class Book(BaseDoc): - chapters: DocArray[Chapter] + chapters: DocList[Chapter] - da = DocArray[Book]( - Book( - chapters=DocArray[Chapter]([Chapter(content='some_content') for _ in range(3)]) - ) + da = DocList[Book]( + Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)])) for _ in range(10) ) chapters = da.traverse_flat(access_path='chapters') # list of 30 strings - If your DocArray is in doc_vec mode, and you want to access a field of + If your DocList is in doc_vec mode, and you want to access a field of type AnyTensor, the doc_vec tensor will be returned instead of a list: EXAMPLE USAGE @@ -221,7 +219,7 @@ class Image(BaseDoc): tensor: TorchTensor[3, 224, 224] - batch = DocArray[Image]( + batch = DocList[Image]( [ Image( tensor=torch.zeros(3, 224, 224), @@ -266,7 +264,7 @@ def _flatten_one_level(sequence: List[Any]) -> List[Any]: def summary(self): """ - Print a summary of this DocArray object and a summary of the schema of its + Print a summary of this DocList object and a summary of the schema of its Document type. """ DocArraySummary(self).summary() @@ -278,13 +276,13 @@ def _batch( show_progress: bool = False, ) -> Generator[T, None, None]: """ - Creates a `Generator` that yields `DocArray` of size `batch_size`. + Creates a `Generator` that yields `DocList` of size `batch_size`. Note, that the last batch might be smaller than `batch_size`. 
:param batch_size: Size of each generated batch. :param shuffle: If set, shuffle the Documents before dividing into minibatches. :param show_progress: if set, show a progress bar when batching documents. - :yield: a Generator of `DocArray`, each in the length of `batch_size` + :yield: a Generator of `DocList`, each in the length of `batch_size` """ from rich.progress import track diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index b8e9377977d..97276c4bd49 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -43,7 +43,7 @@ def _delegate_meth_to_data(meth_name: str) -> Callable: """ create a function that mimic a function call to the data attribute of the - DocArray + DocList :param meth_name: name of the method :return: a method that mimic the meth_name @@ -64,9 +64,9 @@ class DocList( DocList is a container of Documents. A DocList is a list of Documents of any schema. However, many - DocArray features are only available if these Documents are + DocList features are only available if these Documents are homogeneous and follow the same schema. To precise this schema you can use - the `DocArray[MyDocument]` syntax where MyDocument is a Document class + the `DocList[MyDocument]` syntax where MyDocument is a Document class (i.e. schema). This creates a DocList that can only contains Documents of the type 'MyDocument'. @@ -92,7 +92,7 @@ class Image(BaseDoc): If your DocList is homogeneous (i.e. follows the same schema), you can access - fields at the DocArray level (for example `da.tensor` or `da.url`). + fields at the DocList level (for example `da.tensor` or `da.url`). 
You can also set fields, with `da.tensor = np.random.random([10, 100])`: print(da.url) @@ -114,8 +114,8 @@ class Image(BaseDoc): You can delete items from a DocList like a Python List - del da[0] # remove first element from DocArray - del da[0:5] # remove elements for 0 to 5 from DocArray + del da[0] # remove first element from DocList + del da[0:5] # remove elements for 0 to 5 from DocList :param docs: iterable of Document @@ -135,7 +135,7 @@ def construct( docs: Sequence[T_doc], ) -> T: """ - Create a DocArray without validation any data. The data must come from a + Create a DocList without validation any data. The data must come from a trusted source :param docs: a Sequence (list) of Document with the same schema :return: @@ -154,13 +154,13 @@ def __eq__(self, other: Any) -> bool: def _validate_docs(self, docs: Iterable[T_doc]) -> Iterable[T_doc]: """ - Validate if an Iterable of Document are compatible with this DocArray + Validate if an Iterable of Document are compatible with this DocList """ for doc in docs: yield self._validate_one_doc(doc) def _validate_one_doc(self, doc: T_doc) -> T_doc: - """Validate if a Document is compatible with this DocArray""" + """Validate if a Document is compatible with this DocList""" if not issubclass(self.doc_type, AnyDoc) and not isinstance(doc, self.doc_type): raise ValueError(f'{doc} is not a {self.doc_type}') return doc @@ -178,16 +178,16 @@ def __bytes__(self) -> bytes: def append(self, doc: T_doc): """ - Append a Document to the DocArray. The Document must be from the same class - as the doc_type of this DocArray otherwise it will fail. + Append a Document to the DocList. The Document must be from the same class + as the doc_type of this DocList otherwise it will fail. :param doc: A Document """ self._data.append(self._validate_one_doc(doc)) def extend(self, docs: Iterable[T_doc]): """ - Extend a DocArray with an Iterable of Document. 
The Documents must be from - the same class as the doc_type of this DocArray otherwise it will + Extend a DocList with an Iterable of Document. The Documents must be from + the same class as the doc_type of this DocList otherwise it will fail. :param docs: Iterable of Documents """ @@ -195,8 +195,8 @@ def extend(self, docs: Iterable[T_doc]): def insert(self, i: int, doc: T_doc): """ - Insert a Document to the DocArray. The Document must be from the same - class as the doc_type of this DocArray otherwise it will fail. + Insert a Document to the DocList. The Document must be from the same + class as the doc_type of this DocList otherwise it will fail. :param i: index to insert :param doc: A Document """ @@ -238,10 +238,10 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this DocArray using the passed values + """Set all Documents in this DocList using the passed values :param field: name of the fields to set - :values: the values to set at the DocArray level + :values: the values to set at the DocList level """ ... @@ -253,7 +253,7 @@ def stack( tensor_type: Type['AbstractTensor'] = NdArray, ) -> 'DocVec': """ - Convert the DocArray into a DocVec. `Self` cannot be used + Convert the DocList into a DocVec. `Self` cannot be used afterwards :param tensor_type: Tensor Class used to wrap the doc_vec tensors. 
This is useful if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor @@ -291,7 +291,7 @@ def traverse_flat( @classmethod def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message - :param pb_msg: The protobuf message from where to construct the DocArray + :param pb_msg: The protobuf message from where to construct the DocList """ return super().from_protobuf(pb_msg) diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index 72d3181984b..e7a13c76543 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -114,12 +114,12 @@ def __init__( @classmethod def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T: """create a Document from a protobuf message - :param pb_msg: The protobuf message from where to construct the DocArray + :param pb_msg: The protobuf message from where to construct the DocList """ return cls(cls.doc_type.from_protobuf(doc_proto) for doc_proto in pb_msg.docs) def to_protobuf(self) -> 'DocListProto': - """Convert DocArray into a Protobuf message""" + """Convert DocList into a Protobuf message""" from docarray.proto import DocListProto da_proto = DocListProto() @@ -136,13 +136,13 @@ def from_bytes( compress: Optional[str] = None, show_progress: bool = False, ) -> T: - """Deserialize bytes into a DocArray. + """Deserialize bytes into a DocList. :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize :param compress: compress algorithm that was used to serialize :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocArray + :return: the deserialized DocList """ return cls._load_binary_all( file_ctx=nullcontext(data), @@ -272,13 +272,13 @@ def from_base64( compress: Optional[str] = None, show_progress: bool = False, ) -> T: - """Deserialize base64 strings into a DocArray. 
+ """Deserialize base64 strings into a DocList. :param data: Base64 string to deserialize :param protocol: protocol that was used to serialize :param compress: compress algorithm that was used to serialize :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the deserialized DocArray + :return: the deserialized DocList """ return cls._load_binary_all( file_ctx=nullcontext(base64.b64decode(data)), @@ -314,17 +314,17 @@ def from_json( cls: Type[T], file: Union[str, bytes, bytearray], ) -> T: - """Deserialize JSON strings or bytes into a DocArray. + """Deserialize JSON strings or bytes into a DocList. - :param file: JSON object from where to deserialize a DocArray - :return: the deserialized DocArray + :param file: JSON object from where to deserialize a DocList + :return: the deserialized DocList """ json_docs = orjson.loads(file) return cls([cls.doc_type(**v) for v in json_docs]) def to_json(self) -> bytes: """Convert the object into JSON bytes. Can be loaded via :meth:`.from_json`. - :return: JSON serialization of DocArray + :return: JSON serialization of DocList """ return orjson_dumps(self._data) @@ -343,30 +343,30 @@ def from_csv( dialect: Union[str, csv.Dialect] = 'excel', ) -> 'DocList': """ - Load a DocArray from a csv file following the schema defined in the - :attr:`~docarray.DocArray.doc_type` attribute. + Load a DocList from a csv file following the schema defined in the + :attr:`~docarray.DocList.doc_type` attribute. Every row of the csv file will be mapped to one document in the doc_list. The column names (defined in the first row) have to match the field names of the Document type. For nested fields use "__"-separated access paths, such as 'image__url'. - List-like fields (including field of type DocArray) are not supported. + List-like fields (including field of type DocList) are not supported. - :param file_path: path to csv file to load DocArray from. 
+ :param file_path: path to csv file to load DocList from. :param encoding: encoding used to read the csv file. Defaults to 'utf-8'. :param dialect: defines separator and how to handle whitespaces etc. Can be a csv.Dialect instance or one string of: 'excel' (for comma seperated values), 'excel-tab' (for tab separated values), 'unix' (for csv file generated on UNIX systems). - :return: DocArray + :return: DocList """ from docarray import DocList if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' + 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' ) doc_type = cls.doc_type @@ -385,7 +385,7 @@ def from_csv( ) if not all(valid_paths): raise ValueError( - f'Column names do not match the schema of the DocArray\'s ' + f'Column names do not match the schema of the DocList\'s ' f'document type ({cls.doc_type.__name__}): ' f'{list(compress(field_names, [not v for v in valid_paths]))}' ) @@ -402,7 +402,7 @@ def to_csv( self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel' ) -> None: """ - Save a DocArray to a csv file. + Save a DocList to a csv file. The field names will be stored in the first row. Each row corresponds to the information of one Document. Columns for nested fields will be named after the "__"-seperated access paths, @@ -428,15 +428,15 @@ def to_csv( @classmethod def from_pandas(cls, df: 'pd.DataFrame') -> 'DocList': """ - Load a DocArray from a `pandas.DataFrame` following the schema - defined in the :attr:`~docarray.DocArray.doc_type` attribute. + Load a DocList from a `pandas.DataFrame` following the schema + defined in the :attr:`~docarray.DocList.doc_type` attribute. Every row of the dataframe will be mapped to one Document in the doc_list. The column names of the dataframe have to match the field names of the Document type. For nested fields use "__"-separated access paths as column names, such as 'image__url'. 
- List-like fields (including field of type DocArray) are not supported. + List-like fields (including field of type DocList) are not supported. EXAMPLE USAGE: @@ -444,7 +444,7 @@ def from_pandas(cls, df: 'pd.DataFrame') -> 'DocList': import pandas as pd - from docarray import BaseDoc, DocArray + from docarray import BaseDoc, DocList class Person(BaseDoc): @@ -456,14 +456,14 @@ class Person(BaseDoc): data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] ) - da = DocArray[Person].from_pandas(df) + da = DocList[Person].from_pandas(df) assert da.name == ['Maria', 'Jake'] assert da.follower == [12345, 54321] :param df: pandas.DataFrame to extract Document's information from - :return: DocArray where each Document contains the information of one + :return: DocList where each Document contains the information of one corresponding row of the `pandas.DataFrame`. """ from docarray import DocList @@ -471,7 +471,7 @@ class Person(BaseDoc): if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' + 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' ) doc_type = cls.doc_type @@ -486,7 +486,7 @@ class Person(BaseDoc): ) if not all(valid_paths): raise ValueError( - f'Column names do not match the schema of the DocArray\'s ' + f'Column names do not match the schema of the DocList\'s ' f'document type ({cls.doc_type.__name__}): ' f'{list(compress(field_names, [not v for v in valid_paths]))}' ) @@ -501,7 +501,7 @@ class Person(BaseDoc): def to_pandas(self) -> 'pd.DataFrame': """ - Save a DocArray to a `pandas.DataFrame`. + Save a DocList to a `pandas.DataFrame`. The field names will be stored as column names. Each row of the dataframe corresponds to the information of one Document. 
Columns for nested fields will be named after the "__"-seperated access paths, @@ -528,7 +528,7 @@ def to_pandas(self) -> 'pd.DataFrame': def _stream_header(self) -> bytes: # Binary format for streaming case - # V1 DocArray streaming serialization format + # V1 DocList streaming serialization format # | 1 byte | 8 bytes | 4 bytes | variable | 4 bytes | variable ... # 1 byte (uint8) @@ -545,11 +545,11 @@ def _load_binary_all( compress: Optional[str], show_progress: bool, ): - """Read a `DocArray` object from a binary file + """Read a `DocList` object from a binary file :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' :param compress: compress algorithm to use :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: a `DocArray` + :return: a `DocList` """ with file_ctx as fp: if isinstance(fp, bytes): @@ -689,7 +689,7 @@ def load_binary( :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :param streaming: if `True` returns a generator over `Document` objects. In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a DocArray object + :return: a DocList object .. note:: If `file` is `str` it can specify `protocol` and `compress` as file extensions. @@ -736,12 +736,12 @@ def save_binary( compress: Optional[str] = None, show_progress: bool = False, ) -> None: - """Save DocArray into a binary file. + """Save DocList into a binary file. - It will use the protocol to pick how to save the DocArray. - If used 'picke-doc_list` and `protobuf-array` the DocArray will be stored + It will use the protocol to pick how to save the DocList. + If used 'picke-doc_list` and `protobuf-array` the DocList will be stored and compressed at complete level using `pickle` or `protobuf`. 
- When using `protobuf` or `pickle` as protocol each Document in DocArray + When using `protobuf` or `pickle` as protocol each Document in DocList will be stored individually and this would make it available for streaming. :param file: File or filename to which the data is saved. diff --git a/docarray/array/doc_list/pushpull.py b/docarray/array/doc_list/pushpull.py index cd666c38cba..baa9c0439da 100644 --- a/docarray/array/doc_list/pushpull.py +++ b/docarray/array/doc_list/pushpull.py @@ -86,10 +86,10 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocArray object to the specified url. + """Push this DocList object to the specified url. - :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocArray if they know its name. + :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocList if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. {"icon": "emoji", "background": "#fff"} @@ -112,8 +112,8 @@ def push_stream( """Push a stream of documents to the specified url. :param docs: a stream of documents - :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. 
If true, anyone can pull a DocArray if they know its name. + :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocList if they know its name. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. {"icon": "emoji", "background": "#fff"} """ @@ -130,19 +130,19 @@ def pull( show_progress: bool = False, local_cache: bool = True, ) -> 'DocList': - """Pull a :class:`DocArray` from the specified url. + """Pull a :class:`DocList` from the specified url. - :param url: url specifying the protocol and save name of the DocArray. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local folder - :return: a :class:`DocArray` object + :param local_cache: store the downloaded DocList to local folder + :return: a :class:`DocList` object """ from docarray.base_doc import AnyDoc if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' + 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' ) logging.info(f'Pulling {url}') @@ -160,9 +160,9 @@ def pull_stream( ) -> Iterator['BaseDoc']: """Pull a stream of Documents from the specified url. - :param url: url specifying the protocol and save name of the DocArray. 
Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local folder + :param local_cache: store the downloaded DocList to local folder :return: Iterator of Documents """ from docarray.base_doc import AnyDoc @@ -170,7 +170,7 @@ def pull_stream( if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocArray\'s Document type using `DocArray[MyDoc]`.' + 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' ) logging.info(f'Pulling Document stream from {url}') diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 5a50a3353f3..a77c436f408 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -60,7 +60,7 @@ class DocVec(AnyDocArray[T_doc]): calculation, deep learning forward pass) A DocVec has a similar interface as - {class}`~docarray.array.DocArray` but with an underlying implementation that is + {class}`~docarray.array.DocList` but with an underlying implementation that is column based instead of row based. Each field of the schema of the DocArrayStack (the :attr:`~docarray.array.doc_vec.DocVec.doc_type` which is a @@ -72,7 +72,7 @@ class DocVec(AnyDocArray[T_doc]): If the field is another `BasedDoc` the column will be another DocVec that follows the schema of the nested Document. - If the field is a `DocArray` or + If the field is a `DocList` or `DocVec` then the column will be a list of `DocVec`. For any other type the column is a Python list. @@ -310,7 +310,7 @@ def _set_data_and_columns( """Delegates the setting to the data and the columns. 
:param index_item: the key used as index. Needs to be a valid index for both - DocArray (data) and column types (torch/tensorflow/numpy tensors) + DocVec (data) and column types (torch/tensorflow/numpy tensors) :value: the value to set at the `key` location """ if isinstance(index_item, tuple): @@ -352,10 +352,10 @@ def _set_data_column( AbstractTensor, ], ) -> None: - """Set all Documents in this DocArray using the passed values + """Set all Documents in this DocVec using the passed values :param field: name of the fields to set - :values: the values to set at the DocArray level + :values: the values to set at the DocVec level """ if len(values) != len(self._storage): @@ -390,7 +390,7 @@ def _set_data_column( values_ = cast(Sequence, values) self._storage.any_columns[field] = values_ else: - raise KeyError(f'{field} is not a valid field for this DocArray') + raise KeyError(f'{field} is not a valid field for this DocVec') #################### # Deleting data # @@ -432,7 +432,7 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocVecProto') -> T: return cls.from_columns_storage(storage) def to_protobuf(self) -> 'DocVecProto': - """Convert DocArray into a Protobuf message""" + """Convert DocVec into a Protobuf message""" from docarray.proto import ( DocListProto, DocVecProto, @@ -473,9 +473,9 @@ def to_protobuf(self) -> 'DocVecProto': ) def unstack(self: T) -> DocList[T_doc]: - """Convert DocVec into a DocArray. + """Convert DocVec into a DocList. 
- Note this destroys the arguments and returns a new DocArray + Note this destroys the arguments and returns a new DocList """ unstacked_doc_column: Dict[str, DocList] = dict() diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index de379cc23fe..b2a64e8082b 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -257,7 +257,7 @@ def _get_content_from_node_proto( elif content_key in ['doc', 'doc_array']: if field_name is None: raise ValueError( - 'field_name cannot be None when trying to deseriliaze a Document or a DocArray' + 'field_name cannot be None when trying to deserialize a Document or a DocList' ) return_field = cls._get_field_type(field_name).from_protobuf( getattr(value, content_key) diff --git a/docarray/computation/abstract_comp_backend.py b/docarray/computation/abstract_comp_backend.py index cfe525cc932..da80ad9f841 100644 --- a/docarray/computation/abstract_comp_backend.py +++ b/docarray/computation/abstract_comp_backend.py @@ -16,7 +16,7 @@ class AbstractComputationalBackend(ABC, typing.Generic[TTensor]): Abstract base class for computational backends. Every supported tensor/ML framework (numpy, torch etc.) should define its own computational backend exposing common functionality expressed in that framework. - That way, DocArray can leverage native implementations from all frameworks. + That way, DocArray can leverage native implementations from all frameworks. 
""" @classmethod diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index 414e3cbe146..a9541711af0 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -78,7 +78,7 @@ def add_nonsense(student: Student): ) - da = DocArray[Student](Student(thesis=Thesis(title=str(i))) for i in range(16)) + da = DocList[Student](Student(thesis=Thesis(title=str(i))) for i in range(16)) ds = MultiModalDataset[Student]( da, preprocessing={ diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index 4708d167220..2654e3d07d1 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -13,7 +13,7 @@ def __init__(self, da: 'AnyDocArray'): def summary(self) -> None: """ - Print a summary of this DocArray object and a summary of the schema of its + Print a summary of this DocList object and a summary of the schema of its Document type. """ from rich import box @@ -50,7 +50,7 @@ def summary(self) -> None: table.add_row(f' • {field_name}:', col_2) - Console().print(Panel(table, title='DocArray Summary', expand=False)) + Console().print(Panel(table, title='DocList Summary', expand=False)) self.da.doc_type.schema_summary() @staticmethod diff --git a/docarray/documents/legacy/legacy_document.py b/docarray/documents/legacy/legacy_document.py index 0c16d512d7a..96e2ee1e758 100644 --- a/docarray/documents/legacy/legacy_document.py +++ b/docarray/documents/legacy/legacy_document.py @@ -8,7 +8,7 @@ class LegacyDocument(BaseDoc): """ - This Document is the LegacyDocument. It follows the same schema as in DocArray v1. + This Document is the LegacyDocument. It follows the same schema as in DocList v1. It can be useful to start migrating a codebase from v1 to v2. Nevertheless, the API is not totally compatible with DocAray v1 `Document`. @@ -16,7 +16,7 @@ class LegacyDocument(BaseDoc): of the data is similar. .. 
code-block:: python - from docarray import DocArray + from docarray import DocList from docarray.documents.legacy import LegacyDocument import numpy as np @@ -27,9 +27,9 @@ class LegacyDocument(BaseDoc): doc.tags['price'] = 10 - doc.chunks = DocArray[Document]([Document() for _ in range(10)]) + doc.chunks = DocList[Document]([Document() for _ in range(10)]) - doc.chunks = DocArray[Document]([Document() for _ in range(10)]) + doc.chunks = DocList[Document]([Document() for _ in range(10)]) """ diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 4672f4348a9..ece1a1b9764 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -259,7 +259,7 @@ def _filter( :param filter_query: the DB specific filter query to execute :param limit: maximum number of documents to return - :return: a DocArray containing the documents that match the filter query + :return: a DocList containing the documents that match the filter query """ ... @@ -387,7 +387,7 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): """ if not isinstance(docs, (BaseDoc, DocList)): self._logger.warning( - 'Passing a sequence of Documents that is not a DocArray comes at ' + 'Passing a sequence of Documents that is not a DocList comes at ' 'a performance penalty, since compatibility with the schema of Index ' 'needs to be checked for every Document individually.' ) @@ -440,7 +440,7 @@ def find_batched( :param queries: query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, - or a DocArray. + or a DocList. If a tensor-like is passed, it should have shape (batch_size, vector_dim) :param search_field: name of the field to search on. 
Documents in the index are retrieved based on this similarity @@ -476,7 +476,7 @@ def filter( :param filter_query: the DB specific filter query to execute :param limit: maximum number of documents to return - :return: a DocArray containing the documents that match the filter query + :return: a DocList containing the documents that match the filter query """ self._logger.debug(f'Executing `filter` for the query {filter_query}') docs = self._filter(filter_query, limit=limit, **kwargs) @@ -496,7 +496,7 @@ def filter_batched( :param filter_queries: the DB specific filter query to execute :param limit: maximum number of documents to return - :return: a DocArray containing the documents that match the filter query + :return: a DocList containing the documents that match the filter query """ self._logger.debug( f'Executing `filter_batched` for the queries {filter_queries}' @@ -577,7 +577,7 @@ def text_search_batched( def _get_values_by_column(docs: Sequence[BaseDoc], col_name: str) -> List[Any]: """Get the value of a column of a document. - :param docs: The DocArray to get the values from + :param docs: The DocList to get the values from :param col_name: The name of the column, e.g. 'text' or 'image__tensor' :return: The value of the column of `doc` """ @@ -600,7 +600,7 @@ def _transpose_col_value_dict( """'Transpose' the output of `_get_col_value_dict()`: Yield rows of columns, where each row represent one Document. Since a generator is returned, this process comes at negligible cost. - :param docs: The DocArray to get the values from + :param docs: The DocList to get the values from :return: The `docs` flattened out as rows. 
Each row is a dictionary mapping from column name to value """ return (dict(zip(col_value_dict, row)) for row in zip(*col_value_dict.values())) @@ -726,8 +726,7 @@ def _create_column_infos(self, schema: Type[BaseDoc]) -> Dict[str, _ColumnInfo]: # Union types are handle in _flatten_schema if issubclass(type_, AnyDocArray): raise ValueError( - 'Indexing field of DocArray type (=subindex)' - 'is not yet supported.' + 'Indexing field of DocList type (=subindex)' 'is not yet supported.' ) else: column_infos[field_name] = self._create_single_column(field_, type_) @@ -770,16 +769,16 @@ def _validate_docs( Index need to evaluate to the same flattened columns. If Validation fails, a ValueError is raised. - :param docs: Document to evaluate. If this is a DocArray, validation is + :param docs: Document to evaluate. If this is a DocList, validation is performed using its `doc_type` (parametrization), without having to check ever Document in `docs`. If this check fails, or if `docs` is not a - DocArray, evaluation is performed for every Document in `docs`. - :return: A DocArray containing the Documents in `docs` + DocList, evaluation is performed for every Document in `docs`. 
+ :return: A DocList containing the Documents in `docs` """ if isinstance(docs, BaseDoc): docs = [docs] if isinstance(docs, DocList): - # validation shortcut for DocArray; only look at the schema + # validation shortcut for DocList; only look at the schema reference_schema_flat = self._flatten_schema( cast(Type[BaseDoc], self._schema) ) @@ -872,7 +871,7 @@ def _convert_dict_to_doc( return schema_cls(**doc_dict) def _dict_list_to_docarray(self, dict_list: Sequence[Dict[str, Any]]) -> DocList: - """Convert a list of docs in dict type to a DocArray of the schema type.""" + """Convert a list of docs in dict type to a DocList of the schema type.""" doc_list = [self._convert_dict_to_doc(doc_dict, self._schema) for doc_dict in dict_list] # type: ignore da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) diff --git a/docarray/store/abstract_doc_store.py b/docarray/store/abstract_doc_store.py index 1b926e47703..42ce9083440 100644 --- a/docarray/store/abstract_doc_store.py +++ b/docarray/store/abstract_doc_store.py @@ -15,18 +15,18 @@ def list(namespace: str, show_table: bool) -> List[str]: :param namespace: The namespace to list :param show_table: If true, a table is printed to the console - :return: A list of DocArray names + :return: A list of DocList names """ ... @staticmethod @abstractmethod def delete(name: str, missing_ok: bool) -> bool: - """Delete the DocArray object at the specified name + """Delete the DocList object at the specified name - :param name: The name of the DocArray to delete - :param missing_ok: If true, no error will be raised if the DocArray does not exist. - :return: True if the DocArray was deleted, False if it did not exist. + :param name: The name of the DocList to delete + :param missing_ok: If true, no error will be raised if the DocList does not exist. + :return: True if the DocList was deleted, False if it did not exist. """ ... 
@@ -39,13 +39,13 @@ def push( show_progress: bool, branding: Optional[Dict], ) -> Dict: - """Push this DocArray to the specified name. + """Push this DocList to the specified name. - :param da: The DocArray to push + :param da: The DocList to push :param name: The name to push to - :param public: Whether the DocArray should be publicly accessible + :param public: Whether the DocList should be publicly accessible :param show_progress: If true, a progress bar will be displayed. - :param branding: Branding information to be stored with the DocArray + :param branding: Branding information to be stored with the DocList """ ... @@ -62,9 +62,9 @@ def push_stream( :param docs: a stream of documents :param url: The name to push to - :param public: Whether the DocArray should be publicly accessible + :param public: Whether the DocList should be publicly accessible :param show_progress: If true, a progress bar will be displayed. - :param branding: Branding information to be stored with the DocArray + :param branding: Branding information to be stored with the DocList """ ... @@ -76,13 +76,13 @@ def pull( show_progress: bool, local_cache: bool, ) -> 'DocList': - """Pull a DocArray from the specified name. + """Pull a DocList from the specified name. - :param da_cls: The DocArray class to instantiate + :param da_cls: The DocList class to instantiate :param name: The name to pull from :param show_progress: If true, a progress bar will be displayed. - :param local_cache: If true, the DocArray will be cached locally - :return: A DocArray + :param local_cache: If true, the DocList will be cached locally + :return: A DocList """ ... @@ -96,9 +96,9 @@ def pull_stream( ) -> Iterator['BaseDoc']: """Pull a stream of documents from the specified name. - :param da_cls: The DocArray class to instantiate + :param da_cls: The DocList class to instantiate :param name: The name to pull from :param show_progress: If true, a progress bar will be displayed. 
- :param local_cache: If true, the DocArray will be cached locally + :param local_cache: If true, the DocList will be cached locally :return: An iterator of documents""" ... diff --git a/docarray/store/file.py b/docarray/store/file.py index b4ac938739c..88b1c1b3de8 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -74,9 +74,9 @@ def list( def delete( cls: Type[SelfFileDocStore], name: str, missing_ok: bool = False ) -> bool: - """Delete a DocArray from the local filesystem. + """Delete a DocList from the local filesystem. - :param name: The name of the DocArray to delete. + :param name: The name of the DocList to delete. :param missing_ok: If True, do not raise an exception if the file does not exist. Defaults to False. :return: True if the file was deleted, False if it did not exist. """ @@ -98,7 +98,7 @@ def push( show_progress: bool, branding: Optional[Dict], ) -> Dict: - """Push this DocArray object to the specified file path. + """Push this DocList object to the specified file path. :param name: The file path to push to. :param public: Not used by the ``file`` protocol. @@ -150,12 +150,12 @@ def pull( show_progress: bool, local_cache: bool, ) -> 'DocList': - """Pull a :class:`DocArray` from the specified url. + """Pull a :class:`DocList` from the specified url. :param name: The file path to pull from. :param show_progress: if true, display a progress bar. 
- :param local_cache: store the downloaded DocArray to local folder - :return: a :class:`DocArray` object + :param local_cache: store the downloaded DocList to local folder + :return: a :class:`DocList` object """ return da_cls( diff --git a/docarray/store/jac.py b/docarray/store/jac.py index 251d1dfb576..c50a3136cc6 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -51,12 +51,12 @@ def _get_raw_summary(self: 'DocList') -> List[Dict[str, Any]]: dict( name='Type', value=self.__class__.__name__, - description='The type of the DocArray', + description='The type of the DocList', ), dict( name='Length', value=len(self), - description='The length of the DocArray', + description='The length of the DocList', ), dict( name='Homogenous Documents', @@ -82,7 +82,7 @@ def _get_raw_summary(self: 'DocList') -> List[Dict[str, Any]]: class JACDocStore(AbstractDocStore): - """Class to push and pull DocArray to and from Jina AI Cloud.""" + """Class to push and pull DocList to and from Jina AI Cloud.""" @staticmethod @hubble.login_required @@ -91,7 +91,7 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: :param namespace: Not supported for Jina AI Cloud. :param show_table: if true, show the table of the arrays. - :returns: List of available DocArray's names. + :returns: List of available DocList's names. """ if len(namespace) > 0: logging.warning('Namespace is not supported for Jina AI Cloud.') @@ -106,7 +106,7 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: ) table = Table( - title=f'You have {resp["meta"]["total"]} DocArray on the cloud', + title=f'You have {resp["meta"]["total"]} DocList on the cloud', box=box.SIMPLE, highlight=True, ) @@ -135,10 +135,10 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: @hubble.login_required def delete(name: str, missing_ok: bool = True) -> bool: """ - Delete a DocArray from the cloud. - :param name: the name of the DocArray to delete. 
- :param missing_ok: if true, do not raise an error if the DocArray does not exist. - :return: True if the DocArray was deleted, False if it did not exist. + Delete a DocList from the cloud. + :param name: the name of the DocList to delete. + :param missing_ok: if true, do not raise an error if the DocList does not exist. + :return: True if the DocList was deleted, False if it did not exist. """ try: HubbleClient(jsonify=True).delete_artifact(name=name) @@ -158,7 +158,7 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocArray object to Jina AI Cloud + """Push this DocList object to Jina AI Cloud .. note:: - Push with the same ``name`` will override the existing content. @@ -167,8 +167,8 @@ def push( - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocArray`. - :param public: By default, anyone can pull a DocArray if they know its name. + :param name: A name that can later be used to retrieve this :class:`DocList`. + :param public: By default, anyone can pull a DocList if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. {"icon": "emoji", "background": "#fff"} @@ -252,8 +252,8 @@ def push_stream( - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocArray`. - :param public: By default, anyone can pull a DocArray if they know its name. + :param name: A name that can later be used to retrieve this :class:`DocList`. 
+ :param public: By default, anyone can pull a DocList if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. {"icon": "emoji", "background": "#fff"} @@ -262,7 +262,7 @@ def push_stream( # This is a temporary solution to push a stream of documents # The memory footprint is not ideal - # But it must be done this way for now because Hubble expects to know the length of the DocArray + # But it must be done this way for now because Hubble expects to know the length of the DocList # before it starts receiving the documents first_doc = next(docs) da = DocList[first_doc.__class__]([first_doc]) # type: ignore @@ -278,12 +278,12 @@ def pull( show_progress: bool = False, local_cache: bool = True, ) -> 'DocList': - """Pull a :class:`DocArray` from Jina AI Cloud to local. + """Pull a :class:`DocList` from Jina AI Cloud to local. :param name: the upload name set during :meth:`.push` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local folder - :return: a :class:`DocArray` object + :param local_cache: store the downloaded DocList to local folder + :return: a :class:`DocList` object """ from docarray import DocList @@ -299,11 +299,11 @@ def pull_stream( show_progress: bool = False, local_cache: bool = False, ) -> Iterator['BaseDoc']: - """Pull a :class:`DocArray` from Jina AI Cloud to local. + """Pull a :class:`DocList` from Jina AI Cloud to local. :param name: the upload name set during :meth:`.push` :param show_progress: if true, display a progress bar. 
- :param local_cache: store the downloaded DocArray to local folder + :param local_cache: store the downloaded DocList to local folder :return: An iterator of Documents """ import requests diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 07aa418137c..85defe7323a 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -48,7 +48,7 @@ def close(self): class S3DocStore(AbstractDocStore): - """Class to push and pull DocArray to and from S3.""" + """Class to push and pull DocList to and from S3.""" @staticmethod def list(namespace: str, show_table: bool = False) -> List[str]: @@ -56,7 +56,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: :param namespace: The bucket and namespace to list. e.g. my_bucket/my_namespace :param show_table: If true, a rich table will be printed to the console. - :return: A list of DocArray names. + :return: A list of DocList names. """ bucket, namespace = namespace.split('/', 1) s3 = boto3.resource('s3') @@ -94,7 +94,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: @staticmethod def delete(name: str, missing_ok: bool = True) -> bool: - """Delete the DocArray object at the specified bucket and key. + """Delete the DocList object at the specified bucket and key. :param name: The bucket and key to delete. e.g. my_bucket/my_key :param missing_ok: If true, no error will be raised if the object does not exist. @@ -125,9 +125,9 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocArray object to the specified bucket and key. + """Push this DocList object to the specified bucket and key. - :param da: The DocArray to push. + :param da: The DocList to push. :param name: The bucket and key to push to. e.g. my_bucket/my_key :param public: Not used by the ``s3`` protocol. :param show_progress: If true, a progress bar will be displayed. 
@@ -182,12 +182,12 @@ def pull( show_progress: bool = False, local_cache: bool = False, ) -> 'DocList': - """Pull a :class:`DocArray` from the specified bucket and key. + """Pull a :class:`DocList` from the specified bucket and key. :param name: The bucket and key to pull from. e.g. my_bucket/my_key :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local cache - :return: a :class:`DocArray` object + :param local_cache: store the downloaded DocList to local cache + :return: a :class:`DocList` object """ da = da_cls( # type: ignore cls.pull_stream( @@ -209,7 +209,7 @@ def pull_stream( :param name: The bucket and key to pull from. e.g. my_bucket/my_key :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocArray to local cache + :param local_cache: store the downloaded DocList to local cache :return: An iterator of Documents """ diff --git a/docarray/typing/tensor/abstract_tensor.py b/docarray/typing/tensor/abstract_tensor.py index f9814b429e4..b74cc06697f 100644 --- a/docarray/typing/tensor/abstract_tensor.py +++ b/docarray/typing/tensor/abstract_tensor.py @@ -265,7 +265,7 @@ def _docarray_stack(cls: Type[T], seq: Union[List[T], Tuple[T]]) -> T: @abc.abstractmethod def _docarray_from_native(cls: Type[T], value: Any) -> T: """ - Create a DocArray tensor from a tensor that is native to the given framework, + Create a DocList tensor from a tensor that is native to the given framework, e.g. from numpy.ndarray or torch.Tensor. """ ... @@ -293,11 +293,11 @@ def __iter__(self): @abc.abstractmethod def to_protobuf(self) -> 'NdArrayProto': - """Convert DocArray into a Protobuf message""" + """Convert DocList into a Protobuf message""" ... 
def unwrap(self): - """Return the native tensor object that this DocArray tensor wraps.""" + """Return the native tensor object that this DocList tensor wraps.""" @abc.abstractmethod def _docarray_to_json_compatible(self): diff --git a/docarray/utils/find.py b/docarray/utils/find.py index b3418126589..99bf506b077 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -258,9 +258,9 @@ def _extract_embeddings( def _da_attr_type(da: AnyDocArray, access_path: str) -> Type[AnyTensor]: """Get the type of the attribute according to the Document type - (schema) of the DocArray. + (schema) of the DocList. - :param da: the DocArray + :param da: the DocList :param access_path: the "__"-separated access path :return: the type of the attribute """ From 9659e3857a833d5b7adb9ad3c9e0382088e82335 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 13:31:48 +0200 Subject: [PATCH 19/27] refactor: rename da to docs Signed-off-by: samsja --- docarray/array/any_array.py | 10 +++--- docarray/array/doc_list/doc_list.py | 30 ++++++++-------- docarray/array/doc_list/io.py | 18 +++++----- .../array/doc_list/sequence_indexing_mixin.py | 8 ++--- docarray/array/doc_vec/doc_vec.py | 28 +++++++-------- .../array/doc_vec/list_advance_indexing.py | 8 ++--- docarray/data/torch_dataset.py | 18 +++++----- docarray/display/document_array_summary.py | 24 ++++++------- docarray/helper.py | 6 ++-- docarray/store/abstract_doc_store.py | 4 +-- docarray/store/file.py | 12 +++---- docarray/store/jac.py | 30 ++++++++-------- docarray/store/s3.py | 24 ++++++------- docarray/utils/find.py | 8 ++--- docarray/utils/map.py | 34 +++++++++---------- docarray/utils/reduce.py | 4 +-- 16 files changed, 133 insertions(+), 133 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index 1b2487d0a01..d156da9ea8c 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -179,14 +179,14 @@ class Book(BaseDoc): content: Text - da = DocList[Book]( + docs = 
DocList[Book]( Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}')) for i in range(10) # noqa: E501 ) - books = da.traverse_flat(access_path='content') # list of 10 Text objs + books = docs.traverse_flat(access_path='content') # list of 10 Text objs - authors = da.traverse_flat(access_path='author__name') # list of 10 strings + authors = docs.traverse_flat(access_path='author__name') # list of 10 strings If the resulting list is a nested list, it will be flattened: @@ -203,12 +203,12 @@ class Book(BaseDoc): chapters: DocList[Chapter] - da = DocList[Book]( + docs = DocList[Book]( Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)])) for _ in range(10) ) - chapters = da.traverse_flat(access_path='chapters') # list of 30 strings + chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings If your DocList is in doc_vec mode, and you want to access a field of type AnyTensor, the doc_vec tensor will be returned instead of a list: diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index 97276c4bd49..89364ff4842 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -83,7 +83,7 @@ class Image(BaseDoc): url: ImageUrl - da = DocList[Image]( + docs = DocList[Image]( Image(url='http://url.com/foo.png') for _ in range(10) ) # noqa: E510 ``` @@ -92,30 +92,30 @@ class Image(BaseDoc): If your DocList is homogeneous (i.e. follows the same schema), you can access - fields at the DocList level (for example `da.tensor` or `da.url`). - You can also set fields, with `da.tensor = np.random.random([10, 100])`: + fields at the DocList level (for example `docs.tensor` or `docs.url`). + You can also set fields, with `docs.tensor = np.random.random([10, 100])`: - print(da.url) + print(docs.url) # [ImageUrl('http://url.com/foo.png', host_type='domain'), ...] 
import numpy as np - da.tensor = np.random.random([10, 100]) - print(da.tensor) + docs.tensor = np.random.random([10, 100]) + print(docs.tensor) # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] You can index into a DocList like a numpy doc_list or torch tensor: - da[0] # index by position - da[0:5:2] # index by slice - da[[0, 2, 3]] # index by list of indices - da[True, False, True, True, ...] # index by boolean mask + docs[0] # index by position + docs[0:5:2] # index by slice + docs[[0, 2, 3]] # index by list of indices + docs[True, False, True, True, ...] # index by boolean mask You can delete items from a DocList like a Python List - del da[0] # remove first element from DocList - del da[0:5] # remove elements for 0 to 5 from DocList + del docs[0] # remove first element from DocList + del docs[0:5] # remove elements for 0 to 5 from DocList :param docs: iterable of Document @@ -140,9 +140,9 @@ def construct( :param docs: a Sequence (list) of Document with the same schema :return: """ - da = cls.__new__(cls) - da._data = docs if isinstance(docs, list) else list(docs) - return da + new_docs = cls.__new__(cls) + new_docs._data = docs if isinstance(docs, list) else list(docs) + return new_docs def __eq__(self, other: Any) -> bool: if self.__len__() != other.__len__(): diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index e7a13c76543..40d3486699f 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -370,7 +370,7 @@ def from_csv( ) doc_type = cls.doc_type - da = DocList.__class_getitem__(doc_type)() + docs = DocList.__class_getitem__(doc_type)() with open(file_path, 'r', encoding=encoding) as fp: rows = csv.DictReader(fp, dialect=dialect) @@ -394,9 +394,9 @@ def from_csv( doc_dict: Dict[Any, Any] = _access_path_dict_to_nested_dict( access_path2val ) - da.append(doc_type.parse_obj(doc_dict)) + 
docs.append(doc_type.parse_obj(doc_dict)) - return da + return docs def to_csv( self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel' @@ -456,10 +456,10 @@ class Person(BaseDoc): data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower'] ) - da = DocList[Person].from_pandas(df) + docs = DocList[Person].from_pandas(df) - assert da.name == ['Maria', 'Jake'] - assert da.follower == [12345, 54321] + assert docs.name == ['Maria', 'Jake'] + assert docs.follower == [12345, 54321] :param df: pandas.DataFrame to extract Document's information from @@ -475,7 +475,7 @@ class Person(BaseDoc): ) doc_type = cls.doc_type - da = DocList.__class_getitem__(doc_type)() + docs = DocList.__class_getitem__(doc_type)() field_names = df.columns.tolist() if field_names is None or len(field_names) == 0: @@ -495,9 +495,9 @@ class Person(BaseDoc): access_path2val = row._asdict() access_path2val.pop('index', None) doc_dict = _access_path_dict_to_nested_dict(access_path2val) - da.append(doc_type.parse_obj(doc_dict)) + docs.append(doc_type.parse_obj(doc_dict)) - return da + return docs def to_pandas(self) -> 'pd.DataFrame': """ diff --git a/docarray/array/doc_list/sequence_indexing_mixin.py b/docarray/array/doc_list/sequence_indexing_mixin.py index 601545c3a9d..85bad64429f 100644 --- a/docarray/array/doc_list/sequence_indexing_mixin.py +++ b/docarray/array/doc_list/sequence_indexing_mixin.py @@ -42,10 +42,10 @@ class IndexingSequenceMixin(Iterable[T_item]): You can index into, delete from, and set items in a IndexingSequenceMixin like a numpy doc_list or torch tensor: .. code-block:: python - da[0] # index by position - da[0:5:2] # index by slice - da[[0, 2, 3]] # index by list of indices - da[True, False, True, True, ...] # index by boolean mask + docs[0] # index by position + docs[0:5:2] # index by slice + docs[[0, 2, 3]] # index by list of indices + docs[True, False, True, True, ...] 
# index by boolean mask """ diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index a77c436f408..6b36f521439 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -110,7 +110,7 @@ def __init__( ) for field_name, field in self.doc_type.__fields__.items(): - # here we iterate over the field of the da schema, and we collect the data + # here we iterate over the field of the docs schema, and we collect the data # from each document and put them in the corresponding column field_type = self.doc_type._get_field_type(field_name) @@ -166,10 +166,10 @@ def __init__( elif issubclass(field_type, AnyDocArray): docs_list = list() for doc in docs: - da = getattr(doc, field_name) - if isinstance(da, DocList): - da = da.stack(tensor_type=self.tensor_type) - docs_list.append(da) + docs = getattr(doc, field_name) + if isinstance(docs, DocList): + docs = docs.stack(tensor_type=self.tensor_type) + docs_list.append(docs) da_columns[field_name] = ListAdvancedIndexing(docs_list) else: any_columns[field_name] = ListAdvancedIndexing( @@ -195,10 +195,10 @@ def from_columns_storage(cls: Type[T], storage: ColumnStorage) -> T: :param storage: the underlying storage. 
:return: a DocArrayStack """ - da = cls.__new__(cls) - da.tensor_type = storage.tensor_type - da._storage = storage - return da + docs = cls.__new__(cls) + docs.tensor_type = storage.tensor_type + docs._storage = storage + return docs @classmethod def validate( @@ -231,8 +231,8 @@ def to(self: T, device: str) -> T: for field, col_doc in self._storage.doc_columns.items(): self._storage.doc_columns[field] = col_doc.to(device) for _, col_da in self._storage.da_columns.items(): - for da in col_da: - da.to(device) + for docs in col_da: + docs.to(device) return self @@ -456,8 +456,8 @@ def to_protobuf(self) -> 'DocVecProto': tensor_columns_proto[field] = col_tens.to_protobuf() for field, col_da in self._storage.da_columns.items(): list_proto = ListOfDocArrayProto() - for da in col_da: - list_proto.data.append(da.to_protobuf()) + for docs in col_da: + list_proto.data.append(docs.to_protobuf()) da_columns_proto[field] = list_proto for field, col_any in self._storage.any_columns.items(): list_proto = ListOfAnyProto() @@ -487,7 +487,7 @@ def unstack(self: T) -> DocList[T_doc]: unstacked_doc_column[field] = doc_col.unstack() for field, da_col in self._storage.da_columns.items(): - unstacked_da_column[field] = [da.unstack() for da in da_col] + unstacked_da_column[field] = [docs.unstack() for docs in da_col] for field, tensor_col in list(self._storage.tensor_columns.items()): # list is needed here otherwise we cannot delete the column diff --git a/docarray/array/doc_vec/list_advance_indexing.py b/docarray/array/doc_vec/list_advance_indexing.py index 1de13dd9f27..e0eaf2e970c 100644 --- a/docarray/array/doc_vec/list_advance_indexing.py +++ b/docarray/array/doc_vec/list_advance_indexing.py @@ -12,10 +12,10 @@ class ListAdvancedIndexing(IndexingSequenceMixin[T_item]): You can index into a ListAdvanceIndex like a numpy array or torch tensor: .. 
code-block:: python - da[0] # index by position - da[0:5:2] # index by slice - da[[0, 2, 3]] # index by list of indices - da[True, False, True, True, ...] # index by boolean mask + docs[0] # index by position + docs[0:5:2] # index by slice + docs[[0, 2, 3]] # index by list of indices + docs[True, False, True, True, ...] # index by boolean mask """ diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index a9541711af0..25fbb9a9a6a 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -14,7 +14,7 @@ class MultiModalDataset(Dataset, Generic[T_doc]): A dataset that can be used inside a PyTorch DataLoader. In other words, it implements the PyTorch Dataset interface. - :param da: the DocList to be used as the dataset + :param docs: the DocList to be used as the dataset :param preprocessing: a dictionary of field names and preprocessing functions The preprocessing dictionary passed to the constructor consists of keys that are @@ -33,8 +33,8 @@ def prepend_number(text: str): return f"Number {text}" - da = DocList[Text](Text(text=str(i)) for i in range(16)) - ds = MultiModalDataset[Text](da, preprocessing={'text': prepend_number}) + docs = DocList[Text](Text(text=str(i)) for i in range(16)) + ds = MultiModalDataset[Text](docs, preprocessing={'text': prepend_number}) loader = DataLoader(ds, batch_size=4, collate_fn=MultiModalDataset[Text].collate_fn) for batch in loader: print(batch.text) @@ -78,9 +78,9 @@ def add_nonsense(student: Student): ) - da = DocList[Student](Student(thesis=Thesis(title=str(i))) for i in range(16)) + docs = DocList[Student](Student(thesis=Thesis(title=str(i))) for i in range(16)) ds = MultiModalDataset[Student]( - da, + docs, preprocessing={ "thesis.title": embed_title, "thesis": normalize_embedding, @@ -96,16 +96,16 @@ def add_nonsense(student: Student): __typed_ds__: Dict[Type[BaseDoc], Type['MultiModalDataset']] = {} def __init__( - self, da: 'DocList[T_doc]', preprocessing: Dict[str, Callable] 
+ self, docs: 'DocList[T_doc]', preprocessing: Dict[str, Callable] ) -> None: - self.da = da + self.docs = docs self._preprocessing = preprocessing def __len__(self): - return len(self.da) + return len(self.docs) def __getitem__(self, item: int): - doc = self.da[item].copy(deep=True) + doc = self.docs[item].copy(deep=True) for field, preprocess in self._preprocessing.items(): if len(field) == 0: doc = preprocess(doc) or doc diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index 2654e3d07d1..60d2bc7445d 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py @@ -8,8 +8,8 @@ class DocArraySummary: - def __init__(self, da: 'AnyDocArray'): - self.da = da + def __init__(self, docs: 'AnyDocArray'): + self.docs = docs def summary(self) -> None: """ @@ -25,14 +25,14 @@ def summary(self) -> None: table = Table(box=box.SIMPLE, highlight=True) table.show_header = False - table.add_row('Type', self.da.__class__.__name__) - table.add_row('Length', str(len(self.da)), end_section=True) + table.add_row('Type', self.docs.__class__.__name__) + table.add_row('Length', str(len(self.docs)), end_section=True) - if isinstance(self.da, DocVec): + if isinstance(self.docs, DocVec): table.add_row('Stacked columns:') - stacked_fields = self._get_stacked_fields(da=self.da) + stacked_fields = self._get_stacked_fields(docs=self.docs) for field_name in stacked_fields: - val = self.da + val = self.docs for attr in field_name.split('.'): val = getattr(val, attr) @@ -51,10 +51,10 @@ def summary(self) -> None: table.add_row(f' • {field_name}:', col_2) Console().print(Panel(table, title='DocList Summary', expand=False)) - self.da.doc_type.schema_summary() + self.docs.doc_type.schema_summary() @staticmethod - def _get_stacked_fields(da: 'DocVec') -> List[str]: # TODO this might + def _get_stacked_fields(docs: 'DocVec') -> List[str]: # TODO this might # broken """ Return a list of the field names of a 
DocVec instance that are @@ -62,13 +62,13 @@ def _get_stacked_fields(da: 'DocVec') -> List[str]: # TODO this might paths are separated by dot, such as: 'attr.nested_attr'. """ fields = [] - for field_name, value_tens in da._storage.tensor_columns.items(): + for field_name, value_tens in docs._storage.tensor_columns.items(): fields.append(field_name) - for field_name, value_doc in da._storage.doc_columns.items(): + for field_name, value_doc in docs._storage.doc_columns.items(): fields.extend( [ f'{field_name}.{x}' - for x in DocArraySummary._get_stacked_fields(da=value_doc) + for x in DocArraySummary._get_stacked_fields(docs=value_doc) ] ) diff --git a/docarray/helper.py b/docarray/helper.py index 7cedb443d56..7c8972b4735 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -191,12 +191,12 @@ class Banner(BaseDoc): # you can call it in the constructor - da = DocList[Banner]([Banner(text_url=url) for url in get_paths(patterns='*.txt')]) + docs = DocList[Banner]([Banner(text_url=url) for url in get_paths(patterns='*.txt')]) # and call it after construction to set the urls - da.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test')) + docs.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test')) - for doc in da: + for doc in docs: assert doc.image_url.endswith('.txt') assert doc.text_url.endswith('.jpg') diff --git a/docarray/store/abstract_doc_store.py b/docarray/store/abstract_doc_store.py index 42ce9083440..e9c961faa7f 100644 --- a/docarray/store/abstract_doc_store.py +++ b/docarray/store/abstract_doc_store.py @@ -33,7 +33,7 @@ def delete(name: str, missing_ok: bool) -> bool: @staticmethod @abstractmethod def push( - da: 'DocList', + docs: 'DocList', name: str, public: bool, show_progress: bool, @@ -41,7 +41,7 @@ def push( ) -> Dict: """Push this DocList to the specified name. 
- :param da: The DocList to push + :param docs: The DocList to push :param name: The name to push to :param public: Whether the DocList should be publicly accessible :param show_progress: If true, a progress bar will be displayed. diff --git a/docarray/store/file.py b/docarray/store/file.py index 88b1c1b3de8..69a52ec4ab9 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -41,7 +41,7 @@ def list( namespace_dir = cls._abs_filepath(namespace) if not namespace_dir.exists(): raise FileNotFoundError(f'Directory {namespace} does not exist') - da_files = [dafile for dafile in namespace_dir.glob('*.da')] + da_files = [dafile for dafile in namespace_dir.glob('*.docs')] if show_table: from datetime import datetime @@ -82,7 +82,7 @@ def delete( """ path = cls._abs_filepath(name) try: - path.with_suffix('.da').unlink() + path.with_suffix('.docs').unlink() return True except FileNotFoundError: if not missing_ok: @@ -92,7 +92,7 @@ def delete( @classmethod def push( cls: Type[SelfFileDocStore], - da: 'DocList', + docs: 'DocList', name: str, public: bool, show_progress: bool, @@ -105,7 +105,7 @@ def push( :param show_progress: If true, a progress bar will be displayed. :param branding: Not used by the ``file`` protocol. 
""" - return cls.push_stream(iter(da), name, public, show_progress, branding) + return cls.push_stream(iter(docs), name, public, show_progress, branding) @classmethod def push_stream( @@ -130,7 +130,7 @@ def push_stream( source = _to_binary_stream( docs, protocol='protobuf', compress='gzip', show_progress=show_progress ) - path = cls._abs_filepath(name).with_suffix('.da.tmp') + path = cls._abs_filepath(name).with_suffix('.docs.tmp') if path.exists(): raise ConcurrentPushException(f'File {path} already exists.') with open(path, 'wb') as f: @@ -183,7 +183,7 @@ def pull_stream( if local_cache: logging.warning('local_cache is not supported for "file" protocol') - path = cls._abs_filepath(name).with_suffix('.da') + path = cls._abs_filepath(name).with_suffix('.docs') source = open(path, 'rb') return _from_binary_stream( da_cls.doc_type, diff --git a/docarray/store/jac.py b/docarray/store/jac.py index c50a3136cc6..7838e3c26c8 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -116,15 +116,15 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: table.add_column('Created at', justify='center') table.add_column('Updated at', justify='center') - for da in resp['data']: - result.append(da['name']) + for docs in resp['data']: + result.append(docs['name']) table.add_row( - da['name'], - str(_get_length_from_summary(da['metaData'].get('summary', []))), - da['visibility'], - da['createdAt'], - da['updatedAt'], + docs['name'], + str(_get_length_from_summary(docs['metaData'].get('summary', []))), + docs['visibility'], + docs['createdAt'], + docs['updatedAt'], ) if show_table: @@ -152,7 +152,7 @@ def delete(name: str, missing_ok: bool = True) -> bool: @staticmethod @hubble.login_required def push( - da: 'DocList', + docs: 'DocList', name: str, public: bool = True, show_progress: bool = False, @@ -189,7 +189,7 @@ def push( 'public': public, 'metaData': json.dumps( { - 'summary': _get_raw_summary(da), + 'summary': _get_raw_summary(docs), 'branding': 
branding, 'version': get_version_info(), }, @@ -210,7 +210,7 @@ def push( def gen(): yield _head - binary_stream = da.to_binary_stream( + binary_stream = docs.to_binary_stream( protocol='protobuf', compress='gzip', show_progress=show_progress ) while True: @@ -265,10 +265,10 @@ def push_stream( # But it must be done this way for now because Hubble expects to know the length of the DocList # before it starts receiving the documents first_doc = next(docs) - da = DocList[first_doc.__class__]([first_doc]) # type: ignore + docs = DocList[first_doc.__class__]([first_doc]) # type: ignore for doc in docs: - da.append(doc) - return cls.push(da, name, public, show_progress, branding) + docs.append(doc) + return cls.push(docs, name, public, show_progress, branding) @staticmethod @hubble.login_required @@ -332,12 +332,12 @@ def pull_stream( r.raise_for_status() save_name = name.replace('/', '_') - tmp_cache_file = Path(f'/tmp/{save_name}.da') + tmp_cache_file = Path(f'/tmp/{save_name}.docs') _source: Union[ _BufferedCachingRequestReader, io.BufferedReader ] = _BufferedCachingRequestReader(r, tmp_cache_file) - cache_file = _get_cache_path() / f'{save_name}.da' + cache_file = _get_cache_path() / f'{save_name}.docs' if local_cache and cache_file.exists(): _cache_len = cache_file.stat().st_size if _cache_len == int(r.headers['Content-length']): diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 85defe7323a..02b87d79b51 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -43,7 +43,7 @@ def read(self, size: Optional[int] = -1) -> bytes: def close(self): if not self.closed and self._cache: - self._cache_path.rename(self._cache_path.with_suffix('.da')) + self._cache_path.rename(self._cache_path.with_suffix('.docs')) self._cache.close() @@ -64,7 +64,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: da_files = [ obj for obj in s3_bucket.objects.all() - if obj.key.startswith(namespace) and obj.key.endswith('.da') + if 
obj.key.startswith(namespace) and obj.key.endswith('.docs') ] da_names = [f.key.split('/')[-1].split('.')[0] for f in da_files] @@ -102,7 +102,7 @@ def delete(name: str, missing_ok: bool = True) -> bool: """ bucket, name = name.split('/', 1) s3 = boto3.resource('s3') - object = s3.Object(bucket, name + '.da') + object = s3.Object(bucket, name + '.docs') try: object.load() except botocore.exceptions.ClientError as e: @@ -119,7 +119,7 @@ def delete(name: str, missing_ok: bool = True) -> bool: @classmethod def push( cls: Type[SelfS3DocStore], - da: 'DocList', + docs: 'DocList', name: str, public: bool = False, show_progress: bool = False, @@ -127,13 +127,13 @@ def push( ) -> Dict: """Push this DocList object to the specified bucket and key. - :param da: The DocList to push. + :param docs: The DocList to push. :param name: The bucket and key to push to. e.g. my_bucket/my_key :param public: Not used by the ``s3`` protocol. :param show_progress: If true, a progress bar will be displayed. :param branding: Not used by the ``s3`` protocol. 
""" - return cls.push_stream(iter(da), name, public, show_progress, branding) + return cls.push_stream(iter(docs), name, public, show_progress, branding) @staticmethod def push_stream( @@ -161,7 +161,7 @@ def push_stream( # Upload to S3 with open( - f"s3://{bucket}/{name}.da", + f"s3://{bucket}/{name}.docs", 'wb', compression='.gz', transport_params={'multipart_upload': False}, @@ -189,12 +189,12 @@ def pull( :param local_cache: store the downloaded DocList to local cache :return: a :class:`DocList` object """ - da = da_cls( # type: ignore + docs = da_cls( # type: ignore cls.pull_stream( da_cls, name, show_progress=show_progress, local_cache=local_cache ) ) - return da + return docs @classmethod def pull_stream( @@ -216,17 +216,17 @@ def pull_stream( bucket, name = name.split('/', 1) save_name = name.replace('/', '_') - cache_path = _get_cache_path() / f'{save_name}.da' + cache_path = _get_cache_path() / f'{save_name}.docs' source = _BufferedCachingReader( - open(f"s3://{bucket}/{name}.da", 'rb', compression='.gz'), + open(f"s3://{bucket}/{name}.docs", 'rb', compression='.gz'), cache_path=cache_path if local_cache else None, ) if local_cache: if cache_path.exists(): object_header = boto3.client('s3').head_object( - Bucket=bucket, Key=name + '.da' + Bucket=bucket, Key=name + '.docs' ) if cache_path.stat().st_size == object_header['ContentLength']: logging.info( diff --git a/docarray/utils/find.py b/docarray/utils/find.py index 99bf506b077..405f3e75f15 100644 --- a/docarray/utils/find.py +++ b/docarray/utils/find.py @@ -256,23 +256,23 @@ def _extract_embeddings( return emb -def _da_attr_type(da: AnyDocArray, access_path: str) -> Type[AnyTensor]: +def _da_attr_type(docs: AnyDocArray, access_path: str) -> Type[AnyTensor]: """Get the type of the attribute according to the Document type (schema) of the DocList. 
- :param da: the DocList + :param docs: the DocList :param access_path: the "__"-separated access path :return: the type of the attribute """ field_type: Optional[Type] = _get_field_type_by_access_path( - da.doc_type, access_path + docs.doc_type, access_path ) if field_type is None: raise ValueError(f"Access path is not valid: {access_path}") if is_union_type(field_type): # determine type based on the fist element - field_type = type(next(AnyDocArray._traverse(da[0], access_path))) + field_type = type(next(AnyDocArray._traverse(docs[0], access_path))) if not issubclass(field_type, AbstractTensor): raise ValueError( diff --git a/docarray/utils/map.py b/docarray/utils/map.py index fba7db54e39..09c7b1ae2ef 100644 --- a/docarray/utils/map.py +++ b/docarray/utils/map.py @@ -23,7 +23,7 @@ def map_docs( show_progress: bool = False, ) -> Generator[T_doc, None, None]: """ - Return an iterator that applies `func` to every Document in `da` in parallel, + Return an iterator that applies `func` to every Document in `docs` in parallel, yielding the results. 
--- @@ -44,13 +44,13 @@ def load_url_to_tensor(img: ImageDoc) -> ImageDoc: 'Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg' ) - da = DocList[ImageDoc]([ImageDoc(url=url) for _ in range(100)]) - da = DocList[ImageDoc]( - list(map_docs(da, load_url_to_tensor, backend='thread')) + docs = DocList[ImageDoc]([ImageDoc(url=url) for _ in range(100)]) + docs = DocList[ImageDoc]( + list(map_docs(docs, load_url_to_tensor, backend='thread')) ) # threading is usually a good option for IO-bound tasks such as loading an # ImageDoc from url - for doc in da: + for doc in docs: assert doc.tensor is not None ``` @@ -104,7 +104,7 @@ def load_url_to_tensor(img: ImageDoc) -> ImageDoc: def map_docs_batched( - da: T, + docs: T, func: Callable[[T], Union[T, T_doc]], batch_size: int, backend: str = 'thread', @@ -129,19 +129,19 @@ class MyDoc(BaseDoc): name: str - def upper_case_name(da: DocList[MyDoc]) -> DocList[MyDoc]: - da.name = [n.upper() for n in da.name] - return da + def upper_case_name(docs: DocList[MyDoc]) -> DocList[MyDoc]: + docs.name = [n.upper() for n in docs.name] + return docs batch_size = 16 - da = DocList[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) - it = map_docs_batched(da, upper_case_name, batch_size=batch_size) + docs = DocList[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)]) + it = map_docs_batched(docs, upper_case_name, batch_size=batch_size) for i, d in enumerate(it): - da[i * batch_size : (i + 1) * batch_size] = d + docs[i * batch_size : (i + 1) * batch_size] = d - assert len(da) == 100 - print(da.name[:3]) + assert len(docs) == 100 + print(docs.name[:3]) ``` --- @@ -152,7 +152,7 @@ def upper_case_name(da: DocList[MyDoc]) -> DocList[MyDoc]: --- - :param da: DocList to apply function to + :param docs: DocList to apply function to :param batch_size: Size of each generated batch (except the last one, which might be smaller). :param shuffle: If set, shuffle the Documents before dividing into minibatches. 
@@ -196,9 +196,9 @@ def upper_case_name(da: DocList[MyDoc]) -> DocList[MyDoc]: context_pool = p with context_pool: - imap = p.imap(func, da._batch(batch_size=batch_size, shuffle=shuffle)) + imap = p.imap(func, docs._batch(batch_size=batch_size, shuffle=shuffle)) for x in track( - imap, total=ceil(len(da) / batch_size), disable=not show_progress + imap, total=ceil(len(docs) / batch_size), disable=not show_progress ): yield x diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py index 71374f088d9..f60ad0a1671 100644 --- a/docarray/utils/reduce.py +++ b/docarray/utils/reduce.py @@ -65,6 +65,6 @@ def reduce_all(docarrays: List[DocList]) -> DocList: left = docarrays[0] others = docarrays[1:] left_id_map = {doc.id: i for i, doc in enumerate(left)} - for da in others: - reduce(left, da, left_id_map) + for docs in others: + reduce(left, docs, left_id_map) return left From 4e3a31b8e51614f748343ee615da2e464a8f5e9c Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 13:39:47 +0200 Subject: [PATCH 20/27] refactor: rename da columns to docs_vec_colunn Signed-off-by: samsja --- docarray/array/doc_vec/column_storage.py | 14 +++++---- docarray/array/doc_vec/doc_vec.py | 24 +++++++-------- docarray/proto/docarray.proto | 2 +- docarray/proto/pb/docarray_pb2.py | 24 +++++++-------- docarray/proto/pb2/docarray_pb2.py | 38 +++++++++++++----------- 5 files changed, 53 insertions(+), 49 deletions(-) diff --git a/docarray/array/doc_vec/column_storage.py b/docarray/array/doc_vec/column_storage.py index e44ab1158f9..42c67c96b3b 100644 --- a/docarray/array/doc_vec/column_storage.py +++ b/docarray/array/doc_vec/column_storage.py @@ -30,7 +30,7 @@ class ColumnStorage: :param tensor_columns: a Dict of AbstractTensor :param doc_columns: a Dict of :class:`~docarray.array.doc_vec.DocVec` - :param da_columns: a Dict of List of :class:`~docarray.array.doc_vec.DocVec` + :param docs_vec_columns: a Dict of List of :class:`~docarray.array.doc_vec.DocVec` :param any_columns: a Dict of 
List :param tensor_type: Class used to wrap the doc_vec tensors """ @@ -39,13 +39,13 @@ def __init__( self, tensor_columns: Dict[str, AbstractTensor], doc_columns: Dict[str, 'DocVec'], - da_columns: Dict[str, ListAdvancedIndexing['DocVec']], + docs_vec_columns: Dict[str, ListAdvancedIndexing['DocVec']], any_columns: Dict[str, ListAdvancedIndexing], tensor_type: Type[AbstractTensor] = NdArray, ): self.tensor_columns = tensor_columns self.doc_columns = doc_columns - self.da_columns = da_columns + self.docs_vec_columns = docs_vec_columns self.any_columns = any_columns self.tensor_type = tensor_type @@ -53,7 +53,7 @@ def __init__( self.columns = ChainMap( # type: ignore self.tensor_columns, # type: ignore self.doc_columns, # type: ignore - self.da_columns, # type: ignore + self.docs_vec_columns, # type: ignore self.any_columns, # type: ignore ) # type: ignore @@ -65,13 +65,15 @@ def __getitem__(self: T, item: IndexIterType) -> T: item = list(item) tensor_columns = {key: col[item] for key, col in self.tensor_columns.items()} doc_columns = {key: col[item] for key, col in self.doc_columns.items()} - da_columns = {key: col[item] for key, col in self.da_columns.items()} + docs_vec_columns = { + key: col[item] for key, col in self.docs_vec_columns.items() + } any_columns = {key: col[item] for key, col in self.any_columns.items()} return self.__class__( tensor_columns, doc_columns, - da_columns, + docs_vec_columns, any_columns, self.tensor_type, ) diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 6b36f521439..f1d82eabc40 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -98,7 +98,7 @@ def __init__( tensor_columns: Dict[str, AbstractTensor] = dict() doc_columns: Dict[str, 'DocVec'] = dict() - da_columns: Dict[str, ListAdvancedIndexing['DocVec']] = dict() + docs_vec_columns: Dict[str, ListAdvancedIndexing['DocVec']] = dict() any_columns: Dict[str, ListAdvancedIndexing] = dict() if len(docs) == 0: @@ 
-170,7 +170,7 @@ def __init__( if isinstance(docs, DocList): docs = docs.stack(tensor_type=self.tensor_type) docs_list.append(docs) - da_columns[field_name] = ListAdvancedIndexing(docs_list) + docs_vec_columns[field_name] = ListAdvancedIndexing(docs_list) else: any_columns[field_name] = ListAdvancedIndexing( getattr(docs, field_name) @@ -183,7 +183,7 @@ def __init__( self._storage = ColumnStorage( tensor_columns, doc_columns, - da_columns, + docs_vec_columns, any_columns, tensor_type, ) @@ -230,7 +230,7 @@ def to(self: T, device: str) -> T: for field, col_doc in self._storage.doc_columns.items(): self._storage.doc_columns[field] = col_doc.to(device) - for _, col_da in self._storage.da_columns.items(): + for _, col_da in self._storage.docs_vec_columns.items(): for docs in col_da: docs.to(device) @@ -269,8 +269,8 @@ def _get_data_column( """ if field in self._storage.any_columns.keys(): return self._storage.any_columns[field].data - elif field in self._storage.da_columns.keys(): - return self._storage.da_columns[field].data + elif field in self._storage.docs_vec_columns.keys(): + return self._storage.docs_vec_columns[field].data elif field in self._storage.columns.keys(): return self._storage.columns[field] else: @@ -381,10 +381,10 @@ def _set_data_column( ) self._storage.doc_columns[field] = values_ - elif field in self._storage.da_columns.keys(): + elif field in self._storage.docs_vec_columns.keys(): values_ = cast(Sequence[DocList[T_doc]], values) # TODO here we should actually check if this is correct - self._storage.da_columns[field] = values_ + self._storage.docs_vec_columns[field] = values_ elif field in self._storage.any_columns.keys(): # TODO here we should actually check if this is correct values_ = cast(Sequence, values) @@ -425,7 +425,7 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocVecProto') -> T: storage = ColumnStorage( pb_msg.tensor_columns, pb_msg.doc_columns, - pb_msg.da_columns, + pb_msg.docs_vec_columns, pb_msg.any_columns, ) @@ -454,7 +454,7 @@ 
def to_protobuf(self) -> 'DocVecProto': doc_columns_proto[field] = col_doc.to_protobuf() for field, col_tens in self._storage.tensor_columns.items(): tensor_columns_proto[field] = col_tens.to_protobuf() - for field, col_da in self._storage.da_columns.items(): + for field, col_da in self._storage.docs_vec_columns.items(): list_proto = ListOfDocArrayProto() for docs in col_da: list_proto.data.append(docs.to_protobuf()) @@ -468,7 +468,7 @@ def to_protobuf(self) -> 'DocVecProto': return DocVecProto( doc_columns=doc_columns_proto, tensor_columns=tensor_columns_proto, - da_columns=da_columns_proto, + docs_vec_columns=da_columns_proto, any_columns=any_columns_proto, ) @@ -486,7 +486,7 @@ def unstack(self: T) -> DocList[T_doc]: for field, doc_col in self._storage.doc_columns.items(): unstacked_doc_column[field] = doc_col.unstack() - for field, da_col in self._storage.da_columns.items(): + for field, da_col in self._storage.docs_vec_columns.items(): unstacked_da_column[field] = [docs.unstack() for docs in da_col] for field, tensor_col in list(self._storage.tensor_columns.items()): diff --git a/docarray/proto/docarray.proto b/docarray/proto/docarray.proto index f7302ad5867..19a33ccbc22 100644 --- a/docarray/proto/docarray.proto +++ b/docarray/proto/docarray.proto @@ -103,6 +103,6 @@ message ListOfDocArrayProto { message DocVecProto{ map tensor_columns = 1; // a dict of document columns map doc_columns = 2; // a dict of tensor columns - map da_columns = 3; // a dict of document array columns + map docs_vec_columns = 3; // a dict of document array columns map any_columns = 4; // a dict of any columns. 
Used for the rest of the data } \ No newline at end of file diff --git a/docarray/proto/pb/docarray_pb2.py b/docarray/proto/pb/docarray_pb2.py index 0ad5c4dcfed..8ff91a9f5e8 100644 --- a/docarray/proto/pb/docarray_pb2.py +++ b/docarray/proto/pb/docarray_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xb1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12!\n\x03\x64oc\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12+\n\tdoc_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 
\x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xb7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x38\n\nda_columns\x18\x03 \x03(\x0b\x32$.docarray.DocVecProto.DaColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 
\x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xb1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12!\n\x03\x64oc\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12+\n\tdoc_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xc7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 
\x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x43\n\x10\x64ocs_vec_columns\x18\x03 \x03(\x0b\x32).docarray.DocVecProto.DocsVecColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aT\n\x13\x44ocsVecColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3') _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'docarray_pb2', globals()) @@ -29,8 +29,8 @@ _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' _DOCVECPROTO_DOCCOLUMNSENTRY._options = None _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' - _DOCVECPROTO_DACOLUMNSENTRY._options = None - _DOCVECPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._options = None + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_options = b'8\001' _DOCVECPROTO_ANYCOLUMNSENTRY._options = None _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start=58 @@ -58,13 +58,13 @@ _LISTOFDOCARRAYPROTO._serialized_start=1179 _LISTOFDOCARRAYPROTO._serialized_end=1238 _DOCVECPROTO._serialized_start=1241 - _DOCVECPROTO._serialized_end=1808 - _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start=1500 - _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end=1576 - _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start=1578 - _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end=1650 - _DOCVECPROTO_DACOLUMNSENTRY._serialized_start=1652 - 
_DOCVECPROTO_DACOLUMNSENTRY._serialized_end=1731 - _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start=1733 - _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end=1808 + _DOCVECPROTO._serialized_end=1824 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start=1511 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end=1587 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start=1589 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end=1661 + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_start=1663 + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_end=1747 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start=1749 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end=1824 # @@protoc_insertion_point(module_scope) diff --git a/docarray/proto/pb2/docarray_pb2.py b/docarray/proto/pb2/docarray_pb2.py index 795f618bb22..9fbbbadf342 100644 --- a/docarray/proto/pb2/docarray_pb2.py +++ b/docarray/proto/pb2/docarray_pb2.py @@ -16,7 +16,7 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xb1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12!\n\x03\x64oc\x18\x07 
\x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12+\n\tdoc_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xb7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x38\n\nda_columns\x18\x03 \x03(\x0b\x32$.docarray.DocVecProto.DaColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aO\n\x0e\x44\x61\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 
\x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' + b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xb1\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12!\n\x03\x64oc\x18\x07 \x01(\x0b\x32\x12.docarray.DocProtoH\x00\x12+\n\tdoc_array\x18\x08 \x01(\x0b\x32\x16.docarray.DocListProtoH\x00\x12(\n\x04list\x18\t \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12\'\n\x03set\x18\n \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12)\n\x05tuple\x18\x0b \x01(\x0b\x32\x18.docarray.ListOfAnyProtoH\x00\x12(\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x18.docarray.DictOfAnyProtoH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"x\n\x08\x44ocProto\x12*\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x1c.docarray.DocProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"\x84\x01\n\x0e\x44ictOfAnyProto\x12\x30\n\x04\x64\x61ta\x18\x01 
\x03(\x0b\x32\".docarray.DictOfAnyProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\"3\n\x0eListOfAnyProto\x12!\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x13.docarray.NodeProto\"0\n\x0c\x44ocListProto\x12 \n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x12.docarray.DocProto\";\n\x13ListOfDocArrayProto\x12$\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32\x16.docarray.DocListProto\"\xc7\x04\n\x0b\x44ocVecProto\x12@\n\x0etensor_columns\x18\x01 \x03(\x0b\x32(.docarray.DocVecProto.TensorColumnsEntry\x12:\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32%.docarray.DocVecProto.DocColumnsEntry\x12\x43\n\x10\x64ocs_vec_columns\x18\x03 \x03(\x0b\x32).docarray.DocVecProto.DocsVecColumnsEntry\x12:\n\x0b\x61ny_columns\x18\x04 \x03(\x0b\x32%.docarray.DocVecProto.AnyColumnsEntry\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x1aH\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12$\n\x05value\x18\x02 \x01(\x0b\x32\x15.docarray.DocVecProto:\x02\x38\x01\x1aT\n\x13\x44ocsVecColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12,\n\x05value\x18\x02 \x01(\x0b\x32\x1d.docarray.ListOfDocArrayProto:\x02\x38\x01\x1aK\n\x0f\x41nyColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.docarray.ListOfAnyProto:\x02\x38\x01\x62\x06proto3' ) @@ -37,7 +37,9 @@ 'TensorColumnsEntry' ] _DOCVECPROTO_DOCCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name['DocColumnsEntry'] -_DOCVECPROTO_DACOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name['DaColumnsEntry'] +_DOCVECPROTO_DOCSVECCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name[ + 'DocsVecColumnsEntry' +] _DOCVECPROTO_ANYCOLUMNSENTRY = _DOCVECPROTO.nested_types_by_name['AnyColumnsEntry'] DenseNdArrayProto = _reflection.GeneratedProtocolMessageType( 'DenseNdArrayProto', @@ -191,13 +193,13 @@ # @@protoc_insertion_point(class_scope:docarray.DocVecProto.DocColumnsEntry) }, ), - 
'DaColumnsEntry': _reflection.GeneratedProtocolMessageType( - 'DaColumnsEntry', + 'DocsVecColumnsEntry': _reflection.GeneratedProtocolMessageType( + 'DocsVecColumnsEntry', (_message.Message,), { - 'DESCRIPTOR': _DOCVECPROTO_DACOLUMNSENTRY, + 'DESCRIPTOR': _DOCVECPROTO_DOCSVECCOLUMNSENTRY, '__module__': 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocVecProto.DaColumnsEntry) + # @@protoc_insertion_point(class_scope:docarray.DocVecProto.DocsVecColumnsEntry) }, ), 'AnyColumnsEntry': _reflection.GeneratedProtocolMessageType( @@ -217,7 +219,7 @@ _sym_db.RegisterMessage(DocVecProto) _sym_db.RegisterMessage(DocVecProto.TensorColumnsEntry) _sym_db.RegisterMessage(DocVecProto.DocColumnsEntry) -_sym_db.RegisterMessage(DocVecProto.DaColumnsEntry) +_sym_db.RegisterMessage(DocVecProto.DocsVecColumnsEntry) _sym_db.RegisterMessage(DocVecProto.AnyColumnsEntry) if _descriptor._USE_C_DESCRIPTORS == False: @@ -231,8 +233,8 @@ _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' _DOCVECPROTO_DOCCOLUMNSENTRY._options = None _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' - _DOCVECPROTO_DACOLUMNSENTRY._options = None - _DOCVECPROTO_DACOLUMNSENTRY._serialized_options = b'8\001' + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._options = None + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_options = b'8\001' _DOCVECPROTO_ANYCOLUMNSENTRY._options = None _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start = 58 @@ -260,13 +262,13 @@ _LISTOFDOCARRAYPROTO._serialized_start = 1179 _LISTOFDOCARRAYPROTO._serialized_end = 1238 _DOCVECPROTO._serialized_start = 1241 - _DOCVECPROTO._serialized_end = 1808 - _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start = 1500 - _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end = 1576 - _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start = 1578 - _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end = 1650 - _DOCVECPROTO_DACOLUMNSENTRY._serialized_start = 1652 - _DOCVECPROTO_DACOLUMNSENTRY._serialized_end = 
1731 - _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start = 1733 - _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end = 1808 + _DOCVECPROTO._serialized_end = 1824 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_start = 1511 + _DOCVECPROTO_TENSORCOLUMNSENTRY._serialized_end = 1587 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_start = 1589 + _DOCVECPROTO_DOCCOLUMNSENTRY._serialized_end = 1661 + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_start = 1663 + _DOCVECPROTO_DOCSVECCOLUMNSENTRY._serialized_end = 1747 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_start = 1749 + _DOCVECPROTO_ANYCOLUMNSENTRY._serialized_end = 1824 # @@protoc_insertion_point(module_scope) From 9b1a9a7b0bfe95e898478c5a2936749b87261244 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 13:45:04 +0200 Subject: [PATCH 21/27] fix: fix readme Signed-off-by: samsja --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 90b5db0997a..af497cb5b3e 100644 --- a/README.md +++ b/README.md @@ -259,18 +259,18 @@ class MyPodcastModel(nn.Module): self.image_encoder = ImageEncoder() self.text_encoder = TextEncoder() - def forward_podcast(self, da: DocList[Podcast]) -> DocList[Podcast]: - da.audio.embedding = self.audio_encoder(da.audio.tensor) - da.text.embedding = self.text_encoder(da.text.tensor) - da.image.embedding = self.image_encoder(da.image.tensor) + def forward_podcast(self, docs: DocList[Podcast]) -> DocList[Podcast]: + docs.audio.embedding = self.audio_encoder(docs.audio.tensor) + docs.text.embedding = self.text_encoder(docs.text.tensor) + docs.image.embedding = self.image_encoder(docs.image.tensor) - return da + return docs - def forward(self, da: DocList[PairPodcast]) -> DocList[PairPodcast]: - da.left = self.forward_podcast(da.left) - da.right = self.forward_podcast(da.right) + def forward(self, docs: DocList[PairPodcast]) -> DocList[PairPodcast]: + docs.left = self.forward_podcast(docs.left) + docs.right = self.forward_podcast(docs.right) 
- return da + return docs ``` Looks much better, doesn't it? From a4cda60f5a9482d00ce8b5cea03246476d0cf6f9 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 13:46:12 +0200 Subject: [PATCH 22/27] fix: rename last da Signed-off-by: samsja --- docarray/index/abstract.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index ece1a1b9764..450b4755181 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -342,14 +342,14 @@ def __getitem__( # cast output if isinstance(doc_sequence, DocList): - out_da: DocList[TSchema] = doc_sequence + out_docs: DocList[TSchema] = doc_sequence elif isinstance(doc_sequence[0], Dict): - out_da = self._dict_list_to_docarray(doc_sequence) # type: ignore + out_docs = self._dict_list_to_docarray(doc_sequence) # type: ignore else: da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - out_da = da_cls(doc_sequence) + out_docs = da_cls(doc_sequence) - return out_da[0] if return_singleton else out_da + return out_docs[0] if return_singleton else out_docs def __delitem__(self, key: Union[str, Sequence[str]]): """Delete one or multiple Documents from the index, by `id`. 
From 6b8f30dab9d6c07d7615d3d705f22420bb7a7d71 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 13:46:44 +0200 Subject: [PATCH 23/27] fix: rename last da Signed-off-by: samsja --- docarray/index/abstract.py | 8 ++++---- docarray/index/backends/hnswlib.py | 16 ++++++++-------- docarray/store/abstract_doc_store.py | 8 ++++---- docarray/store/file.py | 10 +++++----- docarray/store/s3.py | 10 +++++----- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 450b4755181..03ab7361f62 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -346,8 +346,8 @@ def __getitem__( elif isinstance(doc_sequence[0], Dict): out_docs = self._dict_list_to_docarray(doc_sequence) # type: ignore else: - da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - out_docs = da_cls(doc_sequence) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + out_docs = docs_cls(doc_sequence) return out_docs[0] if return_singleton else out_docs @@ -874,5 +874,5 @@ def _dict_list_to_docarray(self, dict_list: Sequence[Dict[str, Any]]) -> DocList """Convert a list of docs in dict type to a DocList of the schema type.""" doc_list = [self._convert_dict_to_doc(doc_dict, self._schema) for doc_dict in dict_list] # type: ignore - da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - return da_cls(doc_list) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + return docs_cls(doc_list) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 5d3d6bd2e5c..d0e11e7e959 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -228,8 +228,8 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: self._logger.debug(f'Executing query {query}') docs_filtered = ann_docs for cond in filter_conditions: - da_cls = 
DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - docs_filtered = da_cls(filter_docs(docs_filtered, cond)) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + docs_filtered = docs_cls(filter_docs(docs_filtered, cond)) self._logger.debug(f'{len(docs_filtered)} results found') docs_and_scores = zip( @@ -387,14 +387,14 @@ def _get_docs_sqlite_unsorted(self, univ_ids: Sequence[int]): 'SELECT data FROM docs WHERE doc_id IN %s' % sql_id_list, ) rows = self._sqlite_cursor.fetchall() - da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - return da_cls([self._doc_from_bytes(row[0]) for row in rows]) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + return docs_cls([self._doc_from_bytes(row[0]) for row in rows]) def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocList[TSchema]: hashed_ids = tuple(self._to_hashed_id(id_) for id_ in doc_ids) docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) - da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - return da_cls(sorted(docs_unsorted, key=lambda doc: doc_ids.index(doc.id))) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + return docs_cls(sorted(docs_unsorted, key=lambda doc: doc_ids.index(doc.id))) def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocList: docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids) @@ -402,8 +402,8 @@ def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocList: def _in_position(doc): return hashed_ids.index(self._to_hashed_id(doc.id)) - da_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - return da_cls(sorted(docs_unsorted, key=_in_position)) + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + return docs_cls(sorted(docs_unsorted, key=_in_position)) def _delete_docs_from_sqlite(self, doc_ids: Sequence[Union[str, int]]): ids = tuple( diff --git 
a/docarray/store/abstract_doc_store.py b/docarray/store/abstract_doc_store.py index e9c961faa7f..16c17227a64 100644 --- a/docarray/store/abstract_doc_store.py +++ b/docarray/store/abstract_doc_store.py @@ -71,14 +71,14 @@ def push_stream( @staticmethod @abstractmethod def pull( - da_cls: Type['DocList'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, ) -> 'DocList': """Pull a DocList from the specified name. - :param da_cls: The DocList class to instantiate + :param docs_cls: The DocList class to instantiate :param name: The name to pull from :param show_progress: If true, a progress bar will be displayed. :param local_cache: If true, the DocList will be cached locally @@ -89,14 +89,14 @@ def pull( @staticmethod @abstractmethod def pull_stream( - da_cls: Type['DocList'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, ) -> Iterator['BaseDoc']: """Pull a stream of documents from the specified name. - :param da_cls: The DocList class to instantiate + :param docs_cls: The DocList class to instantiate :param name: The name to pull from :param show_progress: If true, a progress bar will be displayed. 
:param local_cache: If true, the DocList will be cached locally diff --git a/docarray/store/file.py b/docarray/store/file.py index 69a52ec4ab9..b649864478a 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -145,7 +145,7 @@ def push_stream( @classmethod def pull( cls: Type[SelfFileDocStore], - da_cls: Type['DocList'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, @@ -158,16 +158,16 @@ def pull( :return: a :class:`DocList` object """ - return da_cls( + return docs_cls( cls.pull_stream( - da_cls, name, show_progress=show_progress, local_cache=local_cache + docs_cls, name, show_progress=show_progress, local_cache=local_cache ) ) @classmethod def pull_stream( cls: Type[SelfFileDocStore], - da_cls: Type['DocList'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: bool, @@ -186,7 +186,7 @@ def pull_stream( path = cls._abs_filepath(name).with_suffix('.docs') source = open(path, 'rb') return _from_binary_stream( - da_cls.doc_type, + docs_cls.doc_type, source, protocol='protobuf', compress='gzip', diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 02b87d79b51..936a261396f 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -177,7 +177,7 @@ def push_stream( @classmethod def pull( cls: Type[SelfS3DocStore], - da_cls: Type['DocList'], + docs_cls: Type['DocList'], name: str, show_progress: bool = False, local_cache: bool = False, @@ -189,9 +189,9 @@ def pull( :param local_cache: store the downloaded DocList to local cache :return: a :class:`DocList` object """ - docs = da_cls( # type: ignore + docs = docs_cls( # type: ignore cls.pull_stream( - da_cls, name, show_progress=show_progress, local_cache=local_cache + docs_cls, name, show_progress=show_progress, local_cache=local_cache ) ) return docs @@ -199,7 +199,7 @@ def pull( @classmethod def pull_stream( cls: Type[SelfS3DocStore], - da_cls: Type['DocList'], + docs_cls: Type['DocList'], name: str, show_progress: bool, local_cache: 
bool, @@ -235,7 +235,7 @@ def pull_stream( source = open(cache_path, 'rb') return _from_binary_stream( - da_cls.doc_type, + docs_cls.doc_type, source, protocol='pickle', compress=None, From 4c2d286bdf83a625c975cf6be292fa952121cf74 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 13:54:00 +0200 Subject: [PATCH 24/27] fix: fic docs nested da stack Signed-off-by: samsja --- docarray/array/doc_vec/doc_vec.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index f1d82eabc40..899ccef9da5 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -166,10 +166,12 @@ def __init__( elif issubclass(field_type, AnyDocArray): docs_list = list() for doc in docs: - docs = getattr(doc, field_name) - if isinstance(docs, DocList): - docs = docs.stack(tensor_type=self.tensor_type) - docs_list.append(docs) + docs_nested = getattr(doc, field_name) + if isinstance(docs_nested, DocList): + docs_nested = docs_nested.stack( + tensor_type=self.tensor_type + ) + docs_list.append(docs_nested) docs_vec_columns[field_name] = ListAdvancedIndexing(docs_list) else: any_columns[field_name] = ListAdvancedIndexing( From 1852b70a1bd8d38f0eb7362d7053fb771df3e933 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 14:17:57 +0200 Subject: [PATCH 25/27] fix: fix becmark Signed-off-by: samsja --- tests/benchmark_tests/test_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark_tests/test_map.py b/tests/benchmark_tests/test_map.py index d5e146d7dba..c5a1516d03a 100644 --- a/tests/benchmark_tests/test_map.py +++ b/tests/benchmark_tests/test_map.py @@ -70,7 +70,7 @@ def time_multiprocessing(num_workers: int) -> float: start_time = time() list( map_docs_batched( - da=da, + docs=da, func=cpu_intensive_batch, batch_size=8, backend='process', From 408736c83719bd0fa047db675a52b29ebef027bc Mon Sep 17 00:00:00 2001 From: samsja Date: 
Tue, 4 Apr 2023 14:29:05 +0200 Subject: [PATCH 26/27] fix: fix docsm ap Signed-off-by: samsja --- tests/benchmark_tests/test_map.py | 2 +- tests/integrations/doc_index/elastic/fixture.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/benchmark_tests/test_map.py b/tests/benchmark_tests/test_map.py index c5a1516d03a..e5c664a408b 100644 --- a/tests/benchmark_tests/test_map.py +++ b/tests/benchmark_tests/test_map.py @@ -127,7 +127,7 @@ def time_multithreading_batch(num_workers: int) -> float: start_time = time() list( map_docs_batched( - da=da, + docs=da, func=io_intensive_batch, backend='thread', num_worker=num_workers, diff --git a/tests/integrations/doc_index/elastic/fixture.py b/tests/integrations/doc_index/elastic/fixture.py index 1caa31da2a6..82462ea96e9 100644 --- a/tests/integrations/doc_index/elastic/fixture.py +++ b/tests/integrations/doc_index/elastic/fixture.py @@ -7,7 +7,7 @@ from docarray import BaseDoc from docarray.typing import NdArray -pytestmark = [pytest.mark.slow, pytest.mark.doc_index] +pytestmark = [pytest.mark.slow, pytest.mark.index] class SimpleDoc(BaseDoc): From cbc159824e3b775df908e531c9d267cd62cbc207 Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 4 Apr 2023 14:36:38 +0200 Subject: [PATCH 27/27] fix: fix docsm ap Signed-off-by: samsja --- tests/integrations/doc_index/elastic/fixture.py | 2 +- tests/units/util/test_map.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integrations/doc_index/elastic/fixture.py b/tests/integrations/doc_index/elastic/fixture.py index 82462ea96e9..1caa31da2a6 100644 --- a/tests/integrations/doc_index/elastic/fixture.py +++ b/tests/integrations/doc_index/elastic/fixture.py @@ -7,7 +7,7 @@ from docarray import BaseDoc from docarray.typing import NdArray -pytestmark = [pytest.mark.slow, pytest.mark.index] +pytestmark = [pytest.mark.slow, pytest.mark.doc_index] class SimpleDoc(BaseDoc): diff --git a/tests/units/util/test_map.py b/tests/units/util/test_map.py 
index 0c6fb460b76..68efdfbbd7e 100644 --- a/tests/units/util/test_map.py +++ b/tests/units/util/test_map.py @@ -77,7 +77,7 @@ def test_map_docs_batched(n_docs, batch_size, backend): da = DocList[MyImage]([MyImage(url=IMAGE_PATHS['png']) for _ in range(n_docs)]) it = map_docs_batched( - da=da, func=load_from_da, batch_size=batch_size, backend=backend + docs=da, func=load_from_da, batch_size=batch_size, backend=backend ) assert isinstance(it, Generator)