From 9226633c412835b1e8b445287fd2a9ef99343676 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 26 Aug 2022 13:54:53 +0200 Subject: [PATCH 01/88] fix(plot): be robust against non-existing subindices --- docarray/array/mixins/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/mixins/plot.py b/docarray/array/mixins/plot.py index b9fe8909d15..86c62d50498 100644 --- a/docarray/array/mixins/plot.py +++ b/docarray/array/mixins/plot.py @@ -79,7 +79,7 @@ def summary(self): is_multimodal = all(d.is_multimodal for d in self) table.add_row('Multimodal dataclass', str(is_multimodal)) - if getattr(self, '_subindices'): + if getattr(self, '_subindices', None): table.add_row( 'Subindices', rich.markup.escape(str(tuple(self._subindices.keys()))) ) From 61c4a95e52fd6990785af2f50659ef83320ba60c Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 4 Oct 2022 11:26:55 +0200 Subject: [PATCH 02/88] chore: add file structure --- docarray/array/storage/milvus/__init__.py | 0 docarray/array/storage/milvus/backend.py | 0 docarray/array/storage/milvus/find.py | 0 docarray/array/storage/milvus/getsetdel.py | 0 docarray/array/storage/milvus/seqlike.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 docarray/array/storage/milvus/__init__.py create mode 100644 docarray/array/storage/milvus/backend.py create mode 100644 docarray/array/storage/milvus/find.py create mode 100644 docarray/array/storage/milvus/getsetdel.py create mode 100644 docarray/array/storage/milvus/seqlike.py diff --git a/docarray/array/storage/milvus/__init__.py b/docarray/array/storage/milvus/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py new file mode 100644 index 00000000000..e69de29bb2d diff 
--git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py new file mode 100644 index 00000000000..e69de29bb2d From ef8dc0ab3873f39768d90ff4ed0ec33147782118 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 4 Oct 2022 17:05:17 +0200 Subject: [PATCH 03/88] feat: first draft of backend implementation --- docarray/array/storage/milvus/backend.py | 89 ++++++++++++++++++++++ docarray/array/storage/milvus/find.py | 38 +++++++++ docarray/array/storage/milvus/getsetdel.py | 24 ++++++ 3 files changed, 151 insertions(+) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index e69de29bb2d..53d6773db61 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -0,0 +1,89 @@ +import copy +import uuid +from typing import Optional, TYPE_CHECKING, Union, Dict +from dataclasses import dataclass, field +from pymilvus import connections, Collection, FieldSchema, DataType, CollectionSchema + +from docarray.array.storage.base.backend import BaseBackendMixin +from docarray.helper import dataclass_from_dict + +if TYPE_CHECKING: + from docarray.typing import ( + DocumentArraySourceType, + ) + + +@dataclass +class MilvusConfig: + n_dim: int + collection_name: str = None + host: str = 'localhost' + port: Optional[Union[str, int]] = None # 19530 for gRPC, 9091 for HTTP + distance: str = 'IP' # metric_type in milvus + index_type: str = 'HNSW' + index_config: Dict = None # passed to milvus at index creation time + collection_config: Dict = field( + default_factory=dict + ) # passed to milvus at collection creation time + + +class BackendMixin(BaseBackendMixin): + def _init_storage( + self, + _docs: Optional['DocumentArraySourceType'] = None, + config: Optional[Union[MilvusConfig, Dict]] = None, + **kwargs, + ): + 
config = copy.deepcopy(config) + if not config: + raise ValueError('Empty config is not allowed for Elastic storage') + elif isinstance(config, dict): + config = dataclass_from_dict(MilvusConfig, config) + + if config.collection_name is None: + id = uuid.uuid4().hex + config.index_name = 'docarray__' + id + self._config = config + + self._connection_alias = 'docarray_default' + connections.connect( + alias=self._connection_alias, host=config.host, port=config.port + ) + + self._collection = self._create_collection() + + super()._init_storage(_docs, config, **kwargs) + + def _create_collection(self): + document_id = FieldSchema(name='document_id', dtype=DataType.STRING) + order = FieldSchema(name='order', dtype=DataType.STRING, is_primary=True) + embedding = FieldSchema( + name='embedding', dtype=DataType.FLOAT_VECTOR, dim=self._config.n_dim + ) + serialized = FieldSchema( + name='serialized', dtype=DataType.VARCHAR, max_length=65_535 + ) # this is the maximus allowed length in milvus, could be optimized + + schema = CollectionSchema( + fields=[document_id, order, embedding, serialized], + description='docarray collection', + ) + return Collection( + name=self._config.collection_name, + schema=schema, + using=self._connection_alias, + **self._config.collection_config, + ) + + def _ensure_unique_config( + self, + config_root: dict, + config_subindex: dict, + config_joined: dict, + subindex_name: str, + ) -> dict: + if 'collection_name' not in config_subindex: + config_joined['collection_name'] = ( + config_joined['collection_name'] + '_subindex_' + subindex_name + ) + return config_joined diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index e69de29bb2d..e095fb8bc3b 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -0,0 +1,38 @@ +from typing import Optional, TYPE_CHECKING, Union, Dict +from dataclasses import dataclass + +from docarray.array.storage.base.backend import 
BaseBackendMixin + +if TYPE_CHECKING: + from docarray.typing import ( + DocumentArraySourceType, + ) + + +@dataclass +class MilvusConfig: + config1: str + config2: str + config3: Dict + ... + + +class BackendMixin(BaseBackendMixin): + def _init_storage( + self, + _docs: Optional['DocumentArraySourceType'] = None, + config: Optional[Union[MilvusConfig, Dict]] = None, + **kwargs + ): + super()._init_storage(_docs, config, **kwargs) + ... + + def _ensure_unique_config( + self, + config_root: dict, + config_subindex: dict, + config_joined: dict, + subindex_name: str, + ) -> dict: + ... # ensure unique identifiers here + return config_joined diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index e69de29bb2d..9bbc9ee06cc 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -0,0 +1,24 @@ +from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin +from docarray import Document + + +class GetSetDelMixin(BaseGetSetDelMixin): + def _get_doc_by_id(self, _id: str) -> 'Document': + # to be implemented + ... + + def _del_doc_by_id(self, _id: str): + # to be implemented + ... + + def _set_doc_by_id(self, _id: str, value: 'Document'): + # to be implemented + ... + + def _load_offset2ids(self): + # to be implemented + ... + + def _save_offset2ids(self): + # to be implemented + ... 
From 4b1049efc92a8d99f469499ec4ea169562d1aecd Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Oct 2022 15:08:38 +0200 Subject: [PATCH 04/88] feat: docarray can now connect to running milvus database --- docarray/array/document.py | 4 ++ docarray/array/milvus.py | 10 +++ docarray/array/storage/milvus/__init__.py | 11 ++++ docarray/array/storage/milvus/backend.py | 74 +++++++++++++++++++--- docarray/array/storage/milvus/getsetdel.py | 60 ++++++++++++++++-- docarray/array/storage/milvus/seqlike.py | 40 ++++++++++++ 6 files changed, 183 insertions(+), 16 deletions(-) create mode 100644 docarray/array/milvus.py diff --git a/docarray/array/document.py b/docarray/array/document.py index d119d2f2983..e734fa175f5 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -179,6 +179,10 @@ def __new__(cls, *args, storage: str = 'memory', **kwargs): from .redis import DocumentArrayRedis instance = super().__new__(DocumentArrayRedis) + elif storage == 'milvus': + from .milvus import DocumentArrayMilvus + + instance = super().__new__(DocumentArrayMilvus) else: raise ValueError(f'storage=`{storage}` is not supported.') diff --git a/docarray/array/milvus.py b/docarray/array/milvus.py new file mode 100644 index 00000000000..4a6f54c9fed --- /dev/null +++ b/docarray/array/milvus.py @@ -0,0 +1,10 @@ +from .document import DocumentArray + +from .storage.milvus import StorageMixins, MilvusConfig + +__all__ = ['MilvusConfig', 'DocumentArrayMilvus'] + + +class DocumentArrayMilvus(StorageMixins, DocumentArray): + def __new__(cls, *args, **kwargs): + return super().__new__(cls) diff --git a/docarray/array/storage/milvus/__init__.py b/docarray/array/storage/milvus/__init__.py index e69de29bb2d..aa79e529e62 100644 --- a/docarray/array/storage/milvus/__init__.py +++ b/docarray/array/storage/milvus/__init__.py @@ -0,0 +1,11 @@ +from abc import ABC + +from .backend import BackendMixin, MilvusConfig +from .getsetdel import GetSetDelMixin +from .seqlike import 
SequenceLikeMixin + +__all__ = ['StorageMixins', 'MilvusConfig'] + + +class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC): + ... diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 53d6773db61..7d3ea5f5c40 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -13,18 +13,30 @@ ) +def always_true_expr(primary_key: str) -> str: + """ + Returns a Milvus expression that is always true, thus allowing for the retrieval of all entries in a Collection + Assumes that the primary key is of type DataType.VARCHAR + + :param primary_key: the name of the primary key + :return: a Milvus expression that is always true for that primary key + """ + return f'({primary_key} in ["1"]) or ({primary_key} not in ["1"])' + + @dataclass class MilvusConfig: n_dim: int collection_name: str = None host: str = 'localhost' - port: Optional[Union[str, int]] = None # 19530 for gRPC, 9091 for HTTP + port: Optional[Union[str, int]] = 19530 # 19530 for gRPC, 9091 for HTTP distance: str = 'IP' # metric_type in milvus index_type: str = 'HNSW' index_config: Dict = None # passed to milvus at index creation time collection_config: Dict = field( default_factory=dict ) # passed to milvus at collection creation time + serialize_config: Dict = field(default_factory=dict) class BackendMixin(BaseBackendMixin): @@ -42,21 +54,24 @@ def _init_storage( if config.collection_name is None: id = uuid.uuid4().hex - config.index_name = 'docarray__' + id + config.collection_name = 'docarray__' + id self._config = config - self._connection_alias = 'docarray_default' + self._connection_alias = 'docarray_default_connection' connections.connect( alias=self._connection_alias, host=config.host, port=config.port ) - self._collection = self._create_collection() + self._collection = self._create_or_reuse_collection() + self._offset2id_collection = self._create_or_reuse_offset2id_collection() 
super()._init_storage(_docs, config, **kwargs) - def _create_collection(self): - document_id = FieldSchema(name='document_id', dtype=DataType.STRING) - order = FieldSchema(name='order', dtype=DataType.STRING, is_primary=True) + def _create_or_reuse_collection(self): + # TODO(johannes) add logic to re-use collection if already exists + document_id = FieldSchema( + name='document_id', dtype=DataType.VARCHAR, max_length=1024, is_primary=True + ) # TODO(johannes) this max_length is completely arbitrary embedding = FieldSchema( name='embedding', dtype=DataType.FLOAT_VECTOR, dim=self._config.n_dim ) @@ -65,8 +80,8 @@ def _create_collection(self): ) # this is the maximus allowed length in milvus, could be optimized schema = CollectionSchema( - fields=[document_id, order, embedding, serialized], - description='docarray collection', + fields=[document_id, embedding, serialized], + description='DocumentArray collection', ) return Collection( name=self._config.collection_name, @@ -75,6 +90,33 @@ def _create_collection(self): **self._config.collection_config, ) + def _create_or_reuse_offset2id_collection(self): + # TODO(johannes) add logic to re-use collection if already exists + document_id = FieldSchema( + name='document_id', dtype=DataType.VARCHAR, max_length=1024 + ) # TODO(johannes) this max_length is completely arbitrary + offset = FieldSchema( + name='offset', dtype=DataType.VARCHAR, max_length=1024, is_primary=True + ) # TODO(johannes) this max_length is completely arbitrary + # TODO(johannes) + # This is really stupid and hacky, but milvus needs at least one vector field to create a Collection + # We probably need a better way to store offset2id, but this should unblock the implementation in the meantime + dummy_vector = FieldSchema( + name='dummy_vector', dtype=DataType.FLOAT_VECTOR, dim=1 + ) + + schema = CollectionSchema( + fields=[offset, document_id, dummy_vector], + description='offset2id for DocumentArray', + ) + + return Collection( + 
name=self._config.collection_name + '_offset2id', + schema=schema, + using=self._connection_alias, + # **self._config.collection_config, # we probably don't want to apply the same config here + ) + def _ensure_unique_config( self, config_root: dict, @@ -87,3 +129,17 @@ def _ensure_unique_config( config_joined['collection_name'] + '_subindex_' + subindex_name ) return config_joined + + def _doc_to_milvus_payload(self, doc): + return [ + [doc.id], + [doc.embedding], + [doc.to_base64(**self._config.serialize_config)], + ] + + def _docs_to_milvus_payload(self, docs): + return [ + docs[:, 'id'], + list(docs[:, 'embedding']), + [doc.to_base64(**self._config.serialize_config) for doc in docs], + ] diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 9bbc9ee06cc..89c4eadba58 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -1,24 +1,70 @@ from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray import Document +from docarray.array.storage.base.helper import Offset2ID +from docarray.array.storage.milvus.backend import always_true_expr class GetSetDelMixin(BaseGetSetDelMixin): def _get_doc_by_id(self, _id: str) -> 'Document': # to be implemented - ... + self._get_docs_by_ids([_id]) def _del_doc_by_id(self, _id: str): # to be implemented - ... + self._del_docs_by_ids([_id]) def _set_doc_by_id(self, _id: str, value: 'Document'): # to be implemented - ... + self._set_doc_by_id([_id], [value], None) def _load_offset2ids(self): - # to be implemented - ... + collection = self._offset2id_collection # Get an existing collection. + collection.load() + res = collection.query( + expr='(document_id in ["1"]) or (document_id not in ["1"])', # is this the correct expr to say "all"? 
+ # output_fields=["book_id", "book_intro"], + consistency_level="Strong", + ) + collection.release() + sorted_res = sorted(res, key=lambda k: int(k['offset'])) + self._offset2id = Offset2ID(r['document_id'] for r in sorted_res) def _save_offset2ids(self): - # to be implemented - ... + collection = self._offset2id_collection # Get an existing collection. + # delete old entries + collection.delete( + expr=always_true_expr( + 'document_id' + ), # is this the correct expr to say "all"? + consistency_level="Strong", + ) + # insert current entries + ids = self._offset2id.ids + offsets = [str(i) for i in range(len(ids))] + collection.insert([offsets, ids]) + + def _get_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': + id_list_str = '[' + ','.join(ids) + ']' + res = self._collection.query( + expr=f'document_id in {id_list_str}', + # output_fields=["book_id", "book_intro"], + consistency_level='Strong', + ) + # TODO(johannes) handle output an convert to da + + def _del_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': + id_list_str = '[' + ','.join(ids) + ']' + self._collection.delete( + expr=f'document_id in {id_list_str}', consistency_level='Strong' + ) + + def _set_docs_by_ids(self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict'): + # TODO(johannes) check if deletion is necesarry if ids already match + # delete old entries + id_list_str = '[' + ','.join(ids) + ']' + self._collection.delete( + expr=f'document_id in {id_list_str}', consistency_level='Strong' + ) + # insert new entries + payload = self._docs_to_milvus_payload(docs) + self._collection.insert(payload) diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index e69de29bb2d..8a6ffe76120 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -0,0 +1,40 @@ +from typing import Iterable, Iterator, Union, TYPE_CHECKING +from docarray.array.storage.base.seqlike import BaseSequenceLikeMixin + +if 
TYPE_CHECKING: + from docarray import Document + + +class SequenceLikeMixin(BaseSequenceLikeMixin): + def __eq__(self, other): + ... + + def __contains__(self, x: Union[str, 'Document']): + ... + + def __repr__(self): + ... + + def __add__(self, other: Union['Document', Iterable['Document']]): + ... + + def insert(self, index: int, value: 'Document'): + # Optional. By default, this will add a new item and update offset2id + # if you want to customize this, make sure to handle offset2id + ... + + def _append(self, value: 'Document'): + # Optional. Override this if you have a better implementation than inserting at the last position + ... + + def _extend(self, values: Iterable['Document']) -> None: + # Optional. Override this if you have better implementation than appending one by one + ... + + def __len__(self): + # Optional. By default, this will rely on offset2id to get the length + ... + + def __iter__(self) -> Iterator['Document']: + # Optional. By default, this will rely on offset2id to iterate + ... 
From 14d23d1074ba77417350b59a0ff9340794731a1b Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Oct 2022 16:46:07 +0200 Subject: [PATCH 05/88] feat: implement basics of getsetdel and seqlike --- docarray/array/storage/milvus/backend.py | 61 +++++++++++++++--- docarray/array/storage/milvus/getsetdel.py | 35 +++++------ docarray/array/storage/milvus/seqlike.py | 72 ++++++++++++++-------- 3 files changed, 115 insertions(+), 53 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 7d3ea5f5c40..191665af011 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -1,9 +1,19 @@ import copy import uuid -from typing import Optional, TYPE_CHECKING, Union, Dict +from typing import Optional, TYPE_CHECKING, Union, Dict, Iterable from dataclasses import dataclass, field -from pymilvus import connections, Collection, FieldSchema, DataType, CollectionSchema +import numpy as np +from pymilvus import ( + connections, + Collection, + FieldSchema, + DataType, + CollectionSchema, + has_collection, +) + +from docarray import Document, DocumentArray from docarray.array.storage.base.backend import BaseBackendMixin from docarray.helper import dataclass_from_dict @@ -24,6 +34,11 @@ def always_true_expr(primary_key: str) -> str: return f'({primary_key} in ["1"]) or ({primary_key} not in ["1"])' +def ids_to_milvus_expr(ids): + ids = ['"' + _id + '"' for _id in ids] + return '[' + ','.join(ids) + ']' + + @dataclass class MilvusConfig: n_dim: int @@ -48,7 +63,7 @@ def _init_storage( ): config = copy.deepcopy(config) if not config: - raise ValueError('Empty config is not allowed for Elastic storage') + raise ValueError('Empty config is not allowed for Milvus storage') elif isinstance(config, dict): config = dataclass_from_dict(MilvusConfig, config) @@ -57,7 +72,7 @@ def _init_storage( config.collection_name = 'docarray__' + id self._config = config - self._connection_alias = 
'docarray_default_connection' + self._connection_alias = f'docarray_{config.host}_{config.port}' connections.connect( alias=self._connection_alias, host=config.host, port=config.port ) @@ -67,8 +82,25 @@ def _init_storage( super()._init_storage(_docs, config, **kwargs) + # To align with Sqlite behavior; if `docs` is not `None` and table name + # is provided, :class:`DocumentArraySqlite` will clear the existing + # table and load the given `docs` + if _docs is None: + return + elif isinstance(_docs, Iterable): + self.clear() + self.extend(_docs) + else: + self.clear() + if isinstance(_docs, Document): + self.append(_docs) + def _create_or_reuse_collection(self): - # TODO(johannes) add logic to re-use collection if already exists + if has_collection(self._config.collection_name, using=self._connection_alias): + return Collection( + self._config.collection_name, using=self._connection_alias + ) + document_id = FieldSchema( name='document_id', dtype=DataType.VARCHAR, max_length=1024, is_primary=True ) # TODO(johannes) this max_length is completely arbitrary @@ -91,7 +123,14 @@ def _create_or_reuse_collection(self): ) def _create_or_reuse_offset2id_collection(self): - # TODO(johannes) add logic to re-use collection if already exists + if has_collection( + self._config.collection_name + '_offset2id', using=self._connection_alias + ): + return Collection( + self._config.collection_name + '_offset2id', + using=self._connection_alias, + ) + document_id = FieldSchema( name='document_id', dtype=DataType.VARCHAR, max_length=1024 ) # TODO(johannes) this max_length is completely arbitrary @@ -137,9 +176,13 @@ def _doc_to_milvus_payload(self, doc): [doc.to_base64(**self._config.serialize_config)], ] - def _docs_to_milvus_payload(self, docs): + def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): return [ - docs[:, 'id'], - list(docs[:, 'embedding']), + [doc.id for doc in docs], + [doc.embedding or np.zeros(self._config.n_dim) for doc in docs], 
[doc.to_base64(**self._config.serialize_config) for doc in docs], ] + + def _docs_from_milvus_respone(self, response): + # [{'serialized': 'blablalba', 'document_id': '4299acbf3c800fa4f6eed919a3e9fe0c'}] + return DocumentArray([Document.from_base64(d['serialized']) for d in response]) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 89c4eadba58..6c0076fd519 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -1,13 +1,13 @@ from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray import Document from docarray.array.storage.base.helper import Offset2ID -from docarray.array.storage.milvus.backend import always_true_expr +from docarray.array.storage.milvus.backend import always_true_expr, ids_to_milvus_expr class GetSetDelMixin(BaseGetSetDelMixin): def _get_doc_by_id(self, _id: str) -> 'Document': # to be implemented - self._get_docs_by_ids([_id]) + return self._get_docs_by_ids([_id]) def _del_doc_by_id(self, _id: str): # to be implemented @@ -15,55 +15,52 @@ def _del_doc_by_id(self, _id: str): def _set_doc_by_id(self, _id: str, value: 'Document'): # to be implemented - self._set_doc_by_id([_id], [value], None) + self._set_docs_by_ids([_id], [value], None) def _load_offset2ids(self): - collection = self._offset2id_collection # Get an existing collection. + collection = self._offset2id_collection collection.load() res = collection.query( - expr='(document_id in ["1"]) or (document_id not in ["1"])', # is this the correct expr to say "all"? 
+ expr='(document_id in ["1"]) or (document_id not in ["1"])', # output_fields=["book_id", "book_intro"], consistency_level="Strong", ) collection.release() sorted_res = sorted(res, key=lambda k: int(k['offset'])) - self._offset2id = Offset2ID(r['document_id'] for r in sorted_res) + self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) def _save_offset2ids(self): collection = self._offset2id_collection # Get an existing collection. # delete old entries collection.delete( - expr=always_true_expr( - 'document_id' - ), # is this the correct expr to say "all"? - consistency_level="Strong", + expr=always_true_expr('document_id'), + consistency_level='Strong', ) # insert current entries - ids = self._offset2id.ids + ids = self._offset2ids.ids offsets = [str(i) for i in range(len(ids))] collection.insert([offsets, ids]) def _get_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': - id_list_str = '[' + ','.join(ids) + ']' + self._collection.load() res = self._collection.query( - expr=f'document_id in {id_list_str}', - # output_fields=["book_id", "book_intro"], + expr=f'document_id in {ids_to_milvus_expr(ids)}', + output_fields=['serialized'], consistency_level='Strong', ) - # TODO(johannes) handle output an convert to da + self._collection.release() + return self._docs_from_milvus_respone(res) def _del_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': - id_list_str = '[' + ','.join(ids) + ']' self._collection.delete( - expr=f'document_id in {id_list_str}', consistency_level='Strong' + expr=f'document_id in {ids_to_milvus_expr(ids)}', consistency_level='Strong' ) def _set_docs_by_ids(self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict'): # TODO(johannes) check if deletion is necesarry if ids already match # delete old entries - id_list_str = '[' + ','.join(ids) + ']' self._collection.delete( - expr=f'document_id in {id_list_str}', consistency_level='Strong' + expr=f'document_id in {ids_to_milvus_expr(ids)}', consistency_level='Strong' 
) # insert new entries payload = self._docs_to_milvus_payload(docs) diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index 8a6ffe76120..e4087b7b151 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -7,34 +7,56 @@ class SequenceLikeMixin(BaseSequenceLikeMixin): def __eq__(self, other): - ... + """Compare this object to the other, returns True if and only if other + as the same type as self and other have the same Milvus Collections for data and offset2id + + :param other: the other object to check for equality + :return: `True` if other is equal to self + """ + # two DAW are considered as the same if they have the same client meta data + return ( + type(self) is type(other) + and self._collection.name == other._collection.name + and self._offset2id_collection.name == other._offset2id_collection.name + and self._config == other._config + ) def __contains__(self, x: Union[str, 'Document']): - ... + if isinstance(x, Document): + x = x.id + try: + self._get_doc_by_id(x) + return True + except: # TODO(johannes) make exception more specific + return False def __repr__(self): - ... + return f'' def __add__(self, other: Union['Document', Iterable['Document']]): - ... - - def insert(self, index: int, value: 'Document'): - # Optional. By default, this will add a new item and update offset2id - # if you want to customize this, make sure to handle offset2id - ... - - def _append(self, value: 'Document'): - # Optional. Override this if you have a better implementation than inserting at the last position - ... - - def _extend(self, values: Iterable['Document']) -> None: - # Optional. Override this if you have better implementation than appending one by one - ... - - def __len__(self): - # Optional. By default, this will rely on offset2id to get the length - ... - - def __iter__(self) -> Iterator['Document']: - # Optional. By default, this will rely on offset2id to iterate - ... 
+ if isinstance(other, Document): + self.append(other) + else: + self.extend(other) + return self + + # def insert(self, index: int, value: 'Document'): + # # Optional. By default, this will add a new item and update offset2id + # # if you want to customize this, make sure to handle offset2id + # ... + # + # def _append(self, value: 'Document'): + # # Optional. Override this if you have a better implementation than inserting at the last position + # ... + # + # def _extend(self, values: Iterable['Document']) -> None: + # # Optional. Override this if you have better implementation than appending one by one + # ... + # + # def __len__(self): + # # Optional. By default, this will rely on offset2id to get the length + # ... + # + # def __iter__(self) -> Iterator['Document']: + # # Optional. By default, this will rely on offset2id to iterate + # ... From a7e4555c61c0b4eabe823b080be92fcccd16dc07 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Oct 2022 16:51:30 +0200 Subject: [PATCH 06/88] fix: type hint --- docarray/array/storage/milvus/getsetdel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 6c0076fd519..5f64b99c54a 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -1,3 +1,5 @@ +from typing import Iterable + from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray import Document from docarray.array.storage.base.helper import Offset2ID From 5ce0bfd3a28666ae9066cc6b5b458943cc2a0f24 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 7 Oct 2022 15:00:01 +0200 Subject: [PATCH 07/88] fix: saving and loading offset2ids --- docarray/array/storage/milvus/backend.py | 8 +++++-- docarray/array/storage/milvus/getsetdel.py | 25 ++++++++++++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py 
b/docarray/array/storage/milvus/backend.py index 191665af011..6bd3fec870b 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -179,10 +179,14 @@ def _doc_to_milvus_payload(self, doc): def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): return [ [doc.id for doc in docs], - [doc.embedding or np.zeros(self._config.n_dim) for doc in docs], + [ + doc.embedding + if doc.embedding is not None + else np.zeros(self._config.n_dim) + for doc in docs + ], [doc.to_base64(**self._config.serialize_config) for doc in docs], ] def _docs_from_milvus_respone(self, response): - # [{'serialized': 'blablalba', 'document_id': '4299acbf3c800fa4f6eed919a3e9fe0c'}] return DocumentArray([Document.from_base64(d['serialized']) for d in response]) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 5f64b99c54a..5dda0af82ca 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -1,5 +1,7 @@ from typing import Iterable +import numpy as np + from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray import Document from docarray.array.storage.base.helper import Offset2ID @@ -9,7 +11,7 @@ class GetSetDelMixin(BaseGetSetDelMixin): def _get_doc_by_id(self, _id: str) -> 'Document': # to be implemented - return self._get_docs_by_ids([_id]) + return self._get_docs_by_ids([_id])[0] def _del_doc_by_id(self, _id: str): # to be implemented @@ -23,25 +25,30 @@ def _load_offset2ids(self): collection = self._offset2id_collection collection.load() res = collection.query( - expr='(document_id in ["1"]) or (document_id not in ["1"])', - # output_fields=["book_id", "book_intro"], - consistency_level="Strong", + expr=always_true_expr('document_id'), + output_fields=['offset', 'document_id'], + consistency_level='Strong', ) collection.release() sorted_res = sorted(res, key=lambda k: int(k['offset'])) self._offset2ids = 
Offset2ID([r['document_id'] for r in sorted_res]) - def _save_offset2ids(self): - collection = self._offset2id_collection # Get an existing collection. - # delete old entries + def _empty_offset2ids_milvus(self): + collection = self._offset2id_collection collection.delete( - expr=always_true_expr('document_id'), + expr=f'document_id in {ids_to_milvus_expr(self._offset2ids.ids)}', consistency_level='Strong', ) + + def _save_offset2ids(self): + # delete old entries + self._empty_offset2ids_milvus() # insert current entries + collection = self._offset2id_collection ids = self._offset2ids.ids offsets = [str(i) for i in range(len(ids))] - collection.insert([offsets, ids]) + dummy_vectors = [np.zeros(1) for _ in range(len(ids))] + collection.insert([offsets, ids, dummy_vectors]) def _get_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': self._collection.load() From 74db70c1f3b47d104e41597f38d008867cef97df Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 10 Oct 2022 17:33:23 +0200 Subject: [PATCH 08/88] feat: first implementation of vector search --- docarray/array/storage/milvus/__init__.py | 3 +- docarray/array/storage/milvus/backend.py | 28 +++++++++- docarray/array/storage/milvus/find.py | 62 +++++++++++----------- docarray/array/storage/milvus/getsetdel.py | 13 ++++- 4 files changed, 71 insertions(+), 35 deletions(-) diff --git a/docarray/array/storage/milvus/__init__.py b/docarray/array/storage/milvus/__init__.py index aa79e529e62..b3e2894e607 100644 --- a/docarray/array/storage/milvus/__init__.py +++ b/docarray/array/storage/milvus/__init__.py @@ -1,11 +1,12 @@ from abc import ABC from .backend import BackendMixin, MilvusConfig +from .find import FindMixin from .getsetdel import GetSetDelMixin from .seqlike import SequenceLikeMixin __all__ = ['StorageMixins', 'MilvusConfig'] -class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC): +class StorageMixins(FindMixin, BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC): ... 
diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 6bd3fec870b..803211e70f0 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -47,7 +47,12 @@ class MilvusConfig: port: Optional[Union[str, int]] = 19530 # 19530 for gRPC, 9091 for HTTP distance: str = 'IP' # metric_type in milvus index_type: str = 'HNSW' - index_config: Dict = None # passed to milvus at index creation time + index_params: Dict = field( + default_factory=lambda: { + 'M': 4, + 'efConstruction': 200, + } # TODO(johannes) check if these defaults are reasonable + ) # passed to milvus at index creation time. The default assumes 'HNSW' index type collection_config: Dict = field( default_factory=dict ) # passed to milvus at collection creation time @@ -79,6 +84,7 @@ def _init_storage( self._collection = self._create_or_reuse_collection() self._offset2id_collection = self._create_or_reuse_offset2id_collection() + self._build_index() super()._init_storage(_docs, config, **kwargs) @@ -122,6 +128,14 @@ def _create_or_reuse_collection(self): **self._config.collection_config, ) + def _build_index(self): + index_params = { + 'metric_type': self._config.distance, + 'index_type': self._config.index_type, + 'params': self._config.index_params, + } + self._collection.create_index(field_name='embedding', index_params=index_params) + def _create_or_reuse_offset2id_collection(self): if has_collection( self._config.collection_name + '_offset2id', using=self._connection_alias @@ -190,3 +204,15 @@ def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): def _docs_from_milvus_respone(self, response): return DocumentArray([Document.from_base64(d['serialized']) for d in response]) + + def _docs_from_search_response( + self, responses + ) -> 'Union[List[DocumentArray], DocumentArray]': + das = [] + for r in responses: + das.append( + DocumentArray( + [Document.from_base64(hit.entity.get('serialized')) for hit in r] + ) 
+ ) + return das if len(das) > 0 else das[0] diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index e095fb8bc3b..caecc73f7d5 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -1,38 +1,38 @@ -from typing import Optional, TYPE_CHECKING, Union, Dict -from dataclasses import dataclass - -from docarray.array.storage.base.backend import BaseBackendMixin +from typing import TYPE_CHECKING, TypeVar, List, Union, Optional, Dict if TYPE_CHECKING: - from docarray.typing import ( - DocumentArraySourceType, - ) - + import numpy as np -@dataclass -class MilvusConfig: - config1: str - config2: str - config3: Dict - ... + # Define the expected input type that your ANN search supports + MilvusArrayType = TypeVar( + 'MilvusArrayType', np.ndarray, list + ) # TODO(johannes) test torch, tf, etc. -class BackendMixin(BaseBackendMixin): - def _init_storage( +class FindMixin: + def _find( self, - _docs: Optional['DocumentArraySourceType'] = None, - config: Optional[Union[MilvusConfig, Dict]] = None, + query: 'MilvusArrayType', + limit: int = 10, + filter: Optional[Dict] = None, + param=None, **kwargs - ): - super()._init_storage(_docs, config, **kwargs) - ... - - def _ensure_unique_config( - self, - config_root: dict, - config_subindex: dict, - config_joined: dict, - subindex_name: str, - ) -> dict: - ... # ensure unique identifiers here - return config_joined + ) -> Union['DocumentArray', List['DocumentArray']]: + """Returns `limit` approximate nearest neighbors given a batch of input queries. + If the query is a single query, should return a DocumentArray, otherwise a list of DocumentArrays containing + the closest Documents for each query. 
+ """ + if param is None: + param = dict() + self._collection.load() + results = self._collection.search( + data=query, + anns_field='embedding', + limit=limit, + expr=None, + param=param, + output_fields=['serialized'], + **kwargs + ) + self._collection.release() + return self._docs_from_search_response(results) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 5dda0af82ca..8f8df004daf 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -33,7 +33,7 @@ def _load_offset2ids(self): sorted_res = sorted(res, key=lambda k: int(k['offset'])) self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) - def _empty_offset2ids_milvus(self): + def _clear_offset2ids_milvus(self): collection = self._offset2id_collection collection.delete( expr=f'document_id in {ids_to_milvus_expr(self._offset2ids.ids)}', @@ -42,7 +42,7 @@ def _empty_offset2ids_milvus(self): def _save_offset2ids(self): # delete old entries - self._empty_offset2ids_milvus() + self._clear_offset2ids_milvus() # insert current entries collection = self._offset2id_collection ids = self._offset2ids.ids @@ -74,3 +74,12 @@ def _set_docs_by_ids(self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict' # insert new entries payload = self._docs_to_milvus_payload(docs) self._collection.insert(payload) + + def _clear_storage(self): + collection = self._collection + collection.delete( + expr=f'document_id in {ids_to_milvus_expr(self._offset2ids.ids)}', + consistency_level='Strong', + ) + self._clear_offset2ids_milvus() + self._offset2ids = Offset2ID() From b1e3bce5585c1aa5f41ceba255fe956bae922039 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 11 Oct 2022 12:08:47 +0200 Subject: [PATCH 09/88] refactor: declare static methods --- docarray/array/storage/milvus/backend.py | 6 ++++-- docarray/array/storage/milvus/getsetdel.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git 
a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 803211e70f0..6cdc2516cc2 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -202,11 +202,13 @@ def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): [doc.to_base64(**self._config.serialize_config) for doc in docs], ] - def _docs_from_milvus_respone(self, response): + @staticmethod + def _docs_from_milvus_respone(response): return DocumentArray([Document.from_base64(d['serialized']) for d in response]) + @staticmethod def _docs_from_search_response( - self, responses + responses, ) -> 'Union[List[DocumentArray], DocumentArray]': das = [] for r in responses: diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 8f8df004daf..e8a4d1ad307 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -82,4 +82,3 @@ def _clear_storage(self): consistency_level='Strong', ) self._clear_offset2ids_milvus() - self._offset2ids = Offset2ID() From 1a481af271f83cc9450eea1849f1f691942f76e8 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 12 Oct 2022 11:12:08 +0200 Subject: [PATCH 10/88] feat: add consistency level as a configuration parameter --- docarray/array/storage/milvus/backend.py | 1 + docarray/array/storage/milvus/getsetdel.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 6cdc2516cc2..b7d2b42f0f2 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -57,6 +57,7 @@ class MilvusConfig: default_factory=dict ) # passed to milvus at collection creation time serialize_config: Dict = field(default_factory=dict) + consistency_level: str = 'Strong' class BackendMixin(BaseBackendMixin): diff --git a/docarray/array/storage/milvus/getsetdel.py 
b/docarray/array/storage/milvus/getsetdel.py index e8a4d1ad307..b16f221d328 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -27,7 +27,7 @@ def _load_offset2ids(self): res = collection.query( expr=always_true_expr('document_id'), output_fields=['offset', 'document_id'], - consistency_level='Strong', + consistency_level=self._config.consistency_level, ) collection.release() sorted_res = sorted(res, key=lambda k: int(k['offset'])) @@ -37,7 +37,7 @@ def _clear_offset2ids_milvus(self): collection = self._offset2id_collection collection.delete( expr=f'document_id in {ids_to_milvus_expr(self._offset2ids.ids)}', - consistency_level='Strong', + consistency_level=self._config.consistency_level, ) def _save_offset2ids(self): @@ -55,21 +55,23 @@ def _get_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': res = self._collection.query( expr=f'document_id in {ids_to_milvus_expr(ids)}', output_fields=['serialized'], - consistency_level='Strong', + consistency_level=self._config.consistency_level, ) self._collection.release() return self._docs_from_milvus_respone(res) def _del_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': self._collection.delete( - expr=f'document_id in {ids_to_milvus_expr(ids)}', consistency_level='Strong' + expr=f'document_id in {ids_to_milvus_expr(ids)}', + consistency_level=self._config.consistency_level, ) def _set_docs_by_ids(self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict'): # TODO(johannes) check if deletion is necesarry if ids already match # delete old entries self._collection.delete( - expr=f'document_id in {ids_to_milvus_expr(ids)}', consistency_level='Strong' + expr=f'document_id in {ids_to_milvus_expr(ids)}', + consistency_level=self._config.consistency_level, ) # insert new entries payload = self._docs_to_milvus_payload(docs) @@ -79,6 +81,6 @@ def _clear_storage(self): collection = self._collection collection.delete( expr=f'document_id in 
{ids_to_milvus_expr(self._offset2ids.ids)}', - consistency_level='Strong', + consistency_level=self._config.consistency_level, ) self._clear_offset2ids_milvus() From ee50e51479138c0129cf7f8b928a71efe909b888 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 12 Oct 2022 11:54:22 +0200 Subject: [PATCH 11/88] feat: change default consistency to session --- docarray/array/storage/milvus/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index b7d2b42f0f2..f2a9b848029 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -57,7 +57,7 @@ class MilvusConfig: default_factory=dict ) # passed to milvus at collection creation time serialize_config: Dict = field(default_factory=dict) - consistency_level: str = 'Strong' + consistency_level: str = 'Session' class BackendMixin(BaseBackendMixin): From 0ca783b3cbb9a8022e952f85150acb2df04856a6 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 12 Oct 2022 11:59:54 +0200 Subject: [PATCH 12/88] refactor: in clear_storage, drop and re-create collection --- docarray/array/storage/milvus/getsetdel.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index b16f221d328..04e533836bb 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -33,13 +33,6 @@ def _load_offset2ids(self): sorted_res = sorted(res, key=lambda k: int(k['offset'])) self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) - def _clear_offset2ids_milvus(self): - collection = self._offset2id_collection - collection.delete( - expr=f'document_id in {ids_to_milvus_expr(self._offset2ids.ids)}', - consistency_level=self._config.consistency_level, - ) - def _save_offset2ids(self): # delete old entries 
self._clear_offset2ids_milvus() @@ -78,9 +71,10 @@ def _set_docs_by_ids(self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict' self._collection.insert(payload) def _clear_storage(self): - collection = self._collection - collection.delete( - expr=f'document_id in {ids_to_milvus_expr(self._offset2ids.ids)}', - consistency_level=self._config.consistency_level, - ) + self._collection.drop() + self._create_or_reuse_collection() self._clear_offset2ids_milvus() + + def _clear_offset2ids_milvus(self): + self._offset2id_collection.drop() + self._create_or_reuse_offset2id_collection() From f4a2eb0c78e638467fcae841b57c73048830b2d4 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 12 Oct 2022 15:00:13 +0200 Subject: [PATCH 13/88] feat: implement filter and hybrid search --- docarray/array/mixins/find.py | 8 +++++--- docarray/array/storage/milvus/backend.py | 2 +- docarray/array/storage/milvus/find.py | 21 ++++++++++++++++++++- docarray/array/storage/milvus/getsetdel.py | 2 +- docarray/array/storage/milvus/seqlike.py | 20 +------------------- 5 files changed, 28 insertions(+), 25 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index b6f509edf5e..0679cb8a6b3 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -96,7 +96,7 @@ def find( limit: Optional[Union[int, float]] = 20, metric_name: Optional[str] = None, exclude_self: bool = False, - filter: Optional[Dict] = None, + filter: Optional[Union[Dict, str]] = None, only_id: bool = False, index: str = 'text', on: Optional[str] = None, @@ -146,7 +146,9 @@ def find( ) from docarray import Document, DocumentArray - if isinstance(query, dict): + if isinstance( + query, dict + ): # TODO(johannes) since filters in milvus are strings, the can't be passes as `query`, otherwise it will be confused for text matching query if filter is None: return self._filter(query, limit=limit) else: @@ -154,7 +156,7 @@ def find( 'filter and query cannot be both dict type, 
set only one for filtering' ) elif query is None: - if isinstance(filter, dict): + if isinstance(filter, dict) or isinstance(filter, str): return self._filter(filter, limit=limit) else: raise ValueError('filter must be dict when query is None') diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index f2a9b848029..03fbed9f8f9 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -204,7 +204,7 @@ def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): ] @staticmethod - def _docs_from_milvus_respone(response): + def _docs_from_query_respone(response): return DocumentArray([Document.from_base64(d['serialized']) for d in response]) @staticmethod diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index caecc73f7d5..4db09b3d809 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -29,10 +29,29 @@ def _find( data=query, anns_field='embedding', limit=limit, - expr=None, + expr=filter, param=param, output_fields=['serialized'], **kwargs ) self._collection.release() return self._docs_from_search_response(results) + + def _filter(self, filter, limit=10, **kwargs): + # TODO(johannes) apply this consistency level handling everywhere; spin it out into a helper function + kwargs_consistency_level = kwargs.get('consistency_level', None) + consistency_level = ( + kwargs_consistency_level + if kwargs_consistency_level + else self._config.consistency_level + ) + self._collection.load() + results = self._collection.query( + expr=filter, + limit=limit, + output_fields=['serialized'], + consistency_level=consistency_level, + **kwargs + ) + self._collection.release() + return self._docs_from_query_respone(results)[:limit] diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 04e533836bb..93a21619722 100644 --- 
a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -51,7 +51,7 @@ def _get_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': consistency_level=self._config.consistency_level, ) self._collection.release() - return self._docs_from_milvus_respone(res) + return self._docs_from_query_respone(res) def _del_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': self._collection.delete( diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index e4087b7b151..9b21bfe7ae2 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -13,7 +13,6 @@ def __eq__(self, other): :param other: the other object to check for equality :return: `True` if other is equal to self """ - # two DAW are considered as the same if they have the same client meta data return ( type(self) is type(other) and self._collection.name == other._collection.name @@ -40,23 +39,6 @@ def __add__(self, other: Union['Document', Iterable['Document']]): self.extend(other) return self - # def insert(self, index: int, value: 'Document'): - # # Optional. By default, this will add a new item and update offset2id - # # if you want to customize this, make sure to handle offset2id - # ... - # - # def _append(self, value: 'Document'): - # # Optional. Override this if you have a better implementation than inserting at the last position - # ... - # - # def _extend(self, values: Iterable['Document']) -> None: - # # Optional. Override this if you have better implementation than appending one by one - # ... # # def __len__(self): - # # Optional. By default, this will rely on offset2id to get the length - # ... - # - # def __iter__(self) -> Iterator['Document']: - # # Optional. By default, this will rely on offset2id to iterate - # ... + # return self._collection.num_entities # This doesn't work for some reason. Currently in contact with Milvus team to resolve. 
From 7d7d9fb9e240180568c05bfc8cc28a615ead7fc7 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 12 Oct 2022 16:15:24 +0200 Subject: [PATCH 14/88] feat: implement columns feature --- docarray/array/storage/milvus/backend.py | 40 ++++++++++++++++-------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 03fbed9f8f9..b6933f427f6 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -1,6 +1,6 @@ import copy import uuid -from typing import Optional, TYPE_CHECKING, Union, Dict, Iterable +from typing import Optional, TYPE_CHECKING, Union, Dict, Iterable, List, Tuple from dataclasses import dataclass, field import numpy as np @@ -14,8 +14,8 @@ ) from docarray import Document, DocumentArray -from docarray.array.storage.base.backend import BaseBackendMixin -from docarray.helper import dataclass_from_dict +from docarray.array.storage.base.backend import BaseBackendMixin, TypeMap +from docarray.helper import dataclass_from_dict, _safe_cast_int if TYPE_CHECKING: from docarray.typing import ( @@ -58,9 +58,19 @@ class MilvusConfig: ) # passed to milvus at collection creation time serialize_config: Dict = field(default_factory=dict) consistency_level: str = 'Session' + columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None class BackendMixin(BaseBackendMixin): + + TYPE_MAP = { + 'str': TypeMap(type=DataType.STRING, converter=str), + 'float': TypeMap(type=DataType.FLOAT, converter=float), + 'double': TypeMap(type=DataType.DOUBLE, converter=float), + 'int': TypeMap(type=DataType.INT64, converter=_safe_cast_int), + 'bool': TypeMap(type=DataType.BOOL, converter=bool), + } + def _init_storage( self, _docs: Optional['DocumentArraySourceType'] = None, @@ -77,6 +87,7 @@ def _init_storage( id = uuid.uuid4().hex config.collection_name = 'docarray__' + id self._config = config + self._config.columns = 
self._normalize_columns(self._config.columns) self._connection_alias = f'docarray_{config.host}_{config.port}' connections.connect( @@ -116,10 +127,15 @@ def _create_or_reuse_collection(self): ) serialized = FieldSchema( name='serialized', dtype=DataType.VARCHAR, max_length=65_535 - ) # this is the maximus allowed length in milvus, could be optimized + ) # TODO(johannes) this is the maximus allowed length in milvus, could be optimized + + additional_columns = [ + FieldSchema(name=col, dtype=self._map_type(coltype)) + for col, coltype in self._config.columns.items() + ] schema = CollectionSchema( - fields=[document_id, embedding, serialized], + fields=[document_id, embedding, serialized, *additional_columns], description='DocumentArray collection', ) return Collection( @@ -152,9 +168,6 @@ def _create_or_reuse_offset2id_collection(self): offset = FieldSchema( name='offset', dtype=DataType.VARCHAR, max_length=1024, is_primary=True ) # TODO(johannes) this max_length is completely arbitrary - # TODO(johannes) - # This is really stupid and hacky, but milvus needs at least one vector field to create a Collection - # We probably need a better way to store offset2id, but this should unblock the implementation in the meantime dummy_vector = FieldSchema( name='dummy_vector', dtype=DataType.FLOAT_VECTOR, dim=1 ) @@ -185,13 +198,13 @@ def _ensure_unique_config( return config_joined def _doc_to_milvus_payload(self, doc): - return [ - [doc.id], - [doc.embedding], - [doc.to_base64(**self._config.serialize_config)], - ] + return self._docs_to_milvus_payload([doc]) def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): + extra_columns = [ + [self._map_column(doc.tags.get(col), col_type) for doc in docs] + for col, col_type in self._config.columns.items() + ] return [ [doc.id for doc in docs], [ @@ -201,6 +214,7 @@ def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): for doc in docs ], [doc.to_base64(**self._config.serialize_config) for doc in docs], + 
*extra_columns, ] @staticmethod From 69f589bb4b595bf36fd2ef5fc8ee5ee734d5a5ea Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 12 Oct 2022 17:33:37 +0200 Subject: [PATCH 15/88] feat: allow consistency level to be passed to extend, append, insert --- docarray/array/storage/milvus/backend.py | 9 ++++++++ docarray/array/storage/milvus/find.py | 15 +++----------- docarray/array/storage/milvus/getsetdel.py | 24 +++++++++++++--------- docarray/array/storage/milvus/seqlike.py | 8 ++++++++ 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index b6933f427f6..90869dcb22c 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -233,3 +233,12 @@ def _docs_from_search_response( ) ) return das if len(das) > 0 else das[0] + + def _update_consistency_level(self, **kwargs): + kwargs_consistency_level = kwargs.get('consistency_level', None) + kwargs['consistency_level'] = ( + kwargs_consistency_level + if kwargs_consistency_level + else self._config.consistency_level + ) + return kwargs diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index 4db09b3d809..17962eb938a 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -25,6 +25,7 @@ def _find( if param is None: param = dict() self._collection.load() + kwargs = self._update_consistency_level(**kwargs) results = self._collection.search( data=query, anns_field='embedding', @@ -38,20 +39,10 @@ def _find( return self._docs_from_search_response(results) def _filter(self, filter, limit=10, **kwargs): - # TODO(johannes) apply this consistency level handling everywhere; spin it out into a helper function - kwargs_consistency_level = kwargs.get('consistency_level', None) - consistency_level = ( - kwargs_consistency_level - if kwargs_consistency_level - else self._config.consistency_level - ) + kwargs = 
self._update_consistency_level(**kwargs) self._collection.load() results = self._collection.query( - expr=filter, - limit=limit, - output_fields=['serialized'], - consistency_level=consistency_level, - **kwargs + expr=filter, limit=limit, output_fields=['serialized'], **kwargs ) self._collection.release() return self._docs_from_query_respone(results)[:limit] diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 93a21619722..1f3306f433a 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -17,9 +17,9 @@ def _del_doc_by_id(self, _id: str): # to be implemented self._del_docs_by_ids([_id]) - def _set_doc_by_id(self, _id: str, value: 'Document'): + def _set_doc_by_id(self, _id: str, value: 'Document', **kwargs): # to be implemented - self._set_docs_by_ids([_id], [value], None) + self._set_docs_by_ids([_id], [value], None, **kwargs) def _load_offset2ids(self): collection = self._offset2id_collection @@ -43,32 +43,36 @@ def _save_offset2ids(self): dummy_vectors = [np.zeros(1) for _ in range(len(ids))] collection.insert([offsets, ids, dummy_vectors]) - def _get_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': + def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': + kwargs = self._update_consistency_level(**kwargs) self._collection.load() res = self._collection.query( expr=f'document_id in {ids_to_milvus_expr(ids)}', output_fields=['serialized'], - consistency_level=self._config.consistency_level, + **kwargs, ) self._collection.release() return self._docs_from_query_respone(res) - def _del_docs_by_ids(self, ids: 'Iterable[str]') -> 'DocumentArray': + def _del_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': + kwargs = self._update_consistency_level(**kwargs) self._collection.delete( - expr=f'document_id in {ids_to_milvus_expr(ids)}', - consistency_level=self._config.consistency_level, + expr=f'document_id in 
{ids_to_milvus_expr(ids)}', **kwargs ) - def _set_docs_by_ids(self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict'): + def _set_docs_by_ids( + self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict', **kwargs + ): # TODO(johannes) check if deletion is necesarry if ids already match # delete old entries + kwargs = self._update_consistency_level(**kwargs) self._collection.delete( expr=f'document_id in {ids_to_milvus_expr(ids)}', - consistency_level=self._config.consistency_level, + **kwargs, ) # insert new entries payload = self._docs_to_milvus_payload(docs) - self._collection.insert(payload) + self._collection.insert(payload, **kwargs) def _clear_storage(self): self._collection.drop() diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index 9b21bfe7ae2..bae7731128f 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -42,3 +42,11 @@ def __add__(self, other: Union['Document', Iterable['Document']]): # # def __len__(self): # return self._collection.num_entities # This doesn't work for some reason. Currently in contact with Milvus team to resolve. 
+ + def insert(self, index: int, value: 'Document', **kwargs): + self._set_doc_by_id(value.id, value, **kwargs) + self._offset2ids.insert(index, value.id) + + def _append(self, value: 'Document', **kwargs): + self._set_doc_by_id(value.id, value, **kwargs) + self._offset2ids.append(value.id) From 8a75dcea10ec57466971a934bd3828f974f498fe Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 17 Oct 2022 18:17:00 +0200 Subject: [PATCH 16/88] fix: fix columns feature in milvus --- docarray/array/storage/milvus/backend.py | 18 ++++++++++++------ docarray/array/storage/milvus/seqlike.py | 4 ---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 90869dcb22c..ed3745bc038 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -64,8 +64,10 @@ class MilvusConfig: class BackendMixin(BaseBackendMixin): TYPE_MAP = { - 'str': TypeMap(type=DataType.STRING, converter=str), - 'float': TypeMap(type=DataType.FLOAT, converter=float), + 'str': TypeMap(type=DataType.VARCHAR, converter=str), + 'float': TypeMap( + type=DataType.DOUBLE, converter=float + ), # it doesn't like DataType.FLOAT type, perhaps because python floats are double precision? 
'double': TypeMap(type=DataType.DOUBLE, converter=float), 'int': TypeMap(type=DataType.INT64, converter=_safe_cast_int), 'bool': TypeMap(type=DataType.BOOL, converter=bool), @@ -129,10 +131,14 @@ def _create_or_reuse_collection(self): name='serialized', dtype=DataType.VARCHAR, max_length=65_535 ) # TODO(johannes) this is the maximus allowed length in milvus, could be optimized - additional_columns = [ - FieldSchema(name=col, dtype=self._map_type(coltype)) - for col, coltype in self._config.columns.items() - ] + additional_columns = [] + for col, coltype in self._config.columns.items(): + mapped_type = self._map_type(coltype) + if mapped_type == DataType.VARCHAR: + field_ = FieldSchema(name=col, dtype=mapped_type, max_length=1024) + else: + field_ = FieldSchema(name=col, dtype=mapped_type) + additional_columns.append(field_) schema = CollectionSchema( fields=[document_id, embedding, serialized, *additional_columns], diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index bae7731128f..db4aaa715f7 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -39,10 +39,6 @@ def __add__(self, other: Union['Document', Iterable['Document']]): self.extend(other) return self - # - # def __len__(self): - # return self._collection.num_entities # This doesn't work for some reason. Currently in contact with Milvus team to resolve. 
- def insert(self, index: int, value: 'Document', **kwargs): self._set_doc_by_id(value.id, value, **kwargs) self._offset2ids.insert(index, value.id) From 76cc4b143c8b5add23961d97d10612e0639ddbd1 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 18 Oct 2022 09:45:58 +0200 Subject: [PATCH 17/88] test: add test for milvus columns feature --- tests/conftest.py | 10 ++++ tests/unit/array/docker-compose.yml | 47 ++++++++++++++++++- .../unit/array/test_backend_configuration.py | 26 ++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 77686a21570..7571adbc5eb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -43,3 +43,13 @@ def set_env_vars(request): yield os.environ.clear() os.environ.update(_old_environ) + + +@pytest.fixture +def milvus_cleanup(): + yield + from pymilvus import list_collections, drop_collection + + alias = f'docarray_localhost_19530' # assumes default host and port are used + for c in list_collections(using=alias): + drop_collection(c, using=alias) diff --git a/tests/unit/array/docker-compose.yml b/tests/unit/array/docker-compose.yml index 07de4842154..c20e3ad6134 100644 --- a/tests/unit/array/docker-compose.yml +++ b/tests/unit/array/docker-compose.yml @@ -30,7 +30,52 @@ services: image: redislabs/redisearch:2.6.0 ports: - "6379:6379" + + milvus-etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.0 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + + milvus-minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2022-03-17T06-34-49Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + volumes: + - 
${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data + healthcheck: + test: [ "CMD", "curl", "-f", "http://localhost:9000/minio/health/live" ] + interval: 30s + timeout: 20s + retries: 3 + + milvus-standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.1.4 + command: [ "milvus", "run", "standalone" ] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "milvus-etcd" + - "milvus-minio" networks: elastic: - name: elastic \ No newline at end of file + name: elastic + milvus: + name: milvus \ No newline at end of file diff --git a/tests/unit/array/test_backend_configuration.py b/tests/unit/array/test_backend_configuration.py index 8255375e6d2..11351777188 100644 --- a/tests/unit/array/test_backend_configuration.py +++ b/tests/unit/array/test_backend_configuration.py @@ -153,6 +153,32 @@ def test_cast_columns_qdrant(start_storage, type_da, type_column, request): assert len(index) == N +@pytest.mark.parametrize('type_da', [int, float, str, bool]) +@pytest.mark.parametrize('type_column', ['int', 'str', 'float', 'double', 'bool']) +def test_cast_columns_qdrant( + start_storage, type_da, type_column, request, milvus_cleanup +): + test_id = request.node.callspec.id.replace( + '-', '' + ) # remove '-' from the test id for the milvus name + N = 10 + + index = DocumentArray( + storage='milvus', + config={ + 'collection_name': f'test{test_id}', + 'n_dim': 3, + 'columns': {'price': type_column}, + }, + ) + + docs = DocumentArray([Document(tags={'price': type_da(i)}) for i in range(N)]) + + index.extend(docs) + + assert len(index) == N + + def test_random_subindices_config(): database_index = random.randint(0, 100) database_name = "jina" + str(database_index) + ".db" From 6e35dcf382e3bb7e229663ae4aab836631c85aa4 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 
18 Oct 2022 10:09:50 +0200 Subject: [PATCH 18/88] fix: import types for type hints --- docarray/array/storage/milvus/find.py | 1 + docarray/array/storage/milvus/getsetdel.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index 17962eb938a..5abf078646a 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -7,6 +7,7 @@ MilvusArrayType = TypeVar( 'MilvusArrayType', np.ndarray, list ) # TODO(johannes) test torch, tf, etc. + from docarray import Document, DocumentArray class FindMixin: diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 1f3306f433a..a163bd67a1a 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -1,12 +1,14 @@ -from typing import Iterable +from typing import Iterable, Dict, TYPE_CHECKING import numpy as np from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin -from docarray import Document from docarray.array.storage.base.helper import Offset2ID from docarray.array.storage.milvus.backend import always_true_expr, ids_to_milvus_expr +if TYPE_CHECKING: + from docarray import Document, DocumentArray + class GetSetDelMixin(BaseGetSetDelMixin): def _get_doc_by_id(self, _id: str) -> 'Document': From 9abf9f5a3d4010177de6dac8bc02090ec2bc5974 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 18 Oct 2022 10:42:48 +0200 Subject: [PATCH 19/88] fix: add pymilvus requirement --- setup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.py b/setup.py index 3124418193d..c519584e1b4 100644 --- a/setup.py +++ b/setup.py @@ -79,6 +79,9 @@ 'redis': [ 'redis>=4.3.0', ], + 'milvus': [ + 'pymilvus>=2.1.0', + ], 'benchmark': [ 'pandas', 'seaborn', @@ -106,6 +109,7 @@ 'annlite>=0.3.12', 'elasticsearch>=8.2.0', 'redis>=4.3.0', + 'pymilvus>=2.1.0', 'jina', ], }, From 
95529c39d99ded9221246159830685d520c221f3 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 09:54:16 +0200 Subject: [PATCH 20/88] test: fix docker compose for milvus --- tests/conftest.py | 31 ++++++++++++++++--- tests/unit/array/docker-compose.yml | 15 +++++---- .../unit/array/test_backend_configuration.py | 2 +- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 7571adbc5eb..761053acc16 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ from typing import Dict import pytest +from pymilvus import MilvusUnavailableException cur_dir = os.path.dirname(os.path.abspath(__file__)) compose_yml = os.path.abspath( @@ -23,11 +24,8 @@ def start_storage(): f"docker-compose -f {compose_yml} --project-directory . up --build -d " f"--remove-orphans" ) - from elasticsearch import Elasticsearch - - es = Elasticsearch(hosts='http://localhost:9200/') - while not es.ping(): - time.sleep(0.5) + _wait_for_es() + _wait_for_milvus() yield os.system( @@ -36,6 +34,29 @@ def start_storage(): ) +def _wait_for_es(): + from elasticsearch import Elasticsearch + + es = Elasticsearch(hosts='http://localhost:9200/') + while not es.ping(): + time.sleep(0.5) + + +def _wait_for_milvus(): + from pymilvus import connections, has_collection + from pymilvus.exceptions import MilvusUnavailableException + + milvus_conn_alias = f'pytest_localhost_19530' + connections.connect(alias=milvus_conn_alias, host='localhost', port=19530) + milvus_ready = False + while not milvus_ready: + try: + has_collection('ping', using=milvus_conn_alias) + milvus_ready = True + except MilvusUnavailableException as e: + time.sleep(0.5) + + @pytest.fixture(scope='session') def set_env_vars(request): _old_environ = dict(os.environ) diff --git a/tests/unit/array/docker-compose.yml b/tests/unit/array/docker-compose.yml index c20e3ad6134..eef055169ba 100644 --- a/tests/unit/array/docker-compose.yml +++ b/tests/unit/array/docker-compose.yml 
@@ -31,7 +31,7 @@ services: ports: - "6379:6379" - milvus-etcd: + etcd: container_name: milvus-etcd image: quay.io/coreos/etcd:v3.5.0 environment: @@ -43,7 +43,7 @@ services: - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd - milvus-minio: + minio: container_name: milvus-minio image: minio/minio:RELEASE.2022-03-17T06-34-49Z environment: @@ -58,7 +58,7 @@ services: timeout: 20s retries: 3 - milvus-standalone: + standalone: container_name: milvus-standalone image: milvusdb/milvus:v2.1.4 command: [ "milvus", "run", "standalone" ] @@ -71,11 +71,10 @@ services: - "19530:19530" - "9091:9091" depends_on: - - "milvus-etcd" - - "milvus-minio" + - "etcd" + - "minio" + networks: elastic: - name: elastic - milvus: - name: milvus \ No newline at end of file + name: elastic \ No newline at end of file diff --git a/tests/unit/array/test_backend_configuration.py b/tests/unit/array/test_backend_configuration.py index 11351777188..e1871f8e4a8 100644 --- a/tests/unit/array/test_backend_configuration.py +++ b/tests/unit/array/test_backend_configuration.py @@ -155,7 +155,7 @@ def test_cast_columns_qdrant(start_storage, type_da, type_column, request): @pytest.mark.parametrize('type_da', [int, float, str, bool]) @pytest.mark.parametrize('type_column', ['int', 'str', 'float', 'double', 'bool']) -def test_cast_columns_qdrant( +def test_cast_columns_milvus( start_storage, type_da, type_column, request, milvus_cleanup ): test_id = request.node.callspec.id.replace( From fdab6e75570fc10a30d1be2a0db979d26f8bc777 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 11:37:49 +0200 Subject: [PATCH 21/88] test: add milvus to test construct --- docarray/array/storage/milvus/backend.py | 3 +-- tests/unit/array/test_construct.py | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py 
b/docarray/array/storage/milvus/backend.py index ed3745bc038..06b80290318 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -99,8 +99,7 @@ def _init_storage( self._collection = self._create_or_reuse_collection() self._offset2id_collection = self._create_or_reuse_offset2id_collection() self._build_index() - - super()._init_storage(_docs, config, **kwargs) + super()._init_storage() # To align with Sqlite behavior; if `docs` is not `None` and table name # is provided, :class:`DocumentArraySqlite` will clear the existing diff --git a/tests/unit/array/test_construct.py b/tests/unit/array/test_construct.py index 251e8459b16..e0e68d4b834 100644 --- a/tests/unit/array/test_construct.py +++ b/tests/unit/array/test_construct.py @@ -10,6 +10,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate, WeaviateConfig from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig @pytest.mark.parametrize( @@ -22,6 +23,7 @@ (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_construct_docarray(da_cls, config, start_storage): @@ -71,6 +73,7 @@ def test_construct_docarray(da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) @pytest.mark.parametrize('is_copy', [True, False]) @@ -101,6 +104,7 @@ def test_docarray_copy_singleton(da_cls, config, is_copy, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), 
], ) @pytest.mark.parametrize('is_copy', [True, False]) @@ -130,6 +134,7 @@ def test_docarray_copy_da(da_cls, config, is_copy, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=1)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) @pytest.mark.parametrize('is_copy', [True, False]) From c000a90670ae392cdfd24456cd4c839f8daaed48 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 12:02:59 +0200 Subject: [PATCH 22/88] fix: backend naming --- docarray/array/storage/milvus/backend.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 06b80290318..61d646bc6a6 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -2,6 +2,7 @@ import uuid from typing import Optional, TYPE_CHECKING, Union, Dict, Iterable, List, Tuple from dataclasses import dataclass, field +import re import numpy as np from pymilvus import ( @@ -39,6 +40,12 @@ def ids_to_milvus_expr(ids): return '[' + ','.join(ids) + ']' +def _sanitize_collection_name(name): + return ''.join( + re.findall('[a-zA-Z0-9_]', name) + ) # remove everything that is not a letter, number or underscore + + @dataclass class MilvusConfig: n_dim: int @@ -197,7 +204,7 @@ def _ensure_unique_config( subindex_name: str, ) -> dict: if 'collection_name' not in config_subindex: - config_joined['collection_name'] = ( + config_joined['collection_name'] = _sanitize_collection_name( config_joined['collection_name'] + '_subindex_' + subindex_name ) return config_joined From 0c1be5b4da1c5711f8ece43bbfeead2b5a171eb7 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 14:41:01 +0200 Subject: [PATCH 23/88] fix: sort returned docs when accessing by id --- docarray/array/storage/milvus/backend.py | 2 +- docarray/array/storage/milvus/find.py | 2 +- 
docarray/array/storage/milvus/getsetdel.py | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 61d646bc6a6..2989151d255 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -230,7 +230,7 @@ def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): ] @staticmethod - def _docs_from_query_respone(response): + def _docs_from_query_response(response): return DocumentArray([Document.from_base64(d['serialized']) for d in response]) @staticmethod diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index 5abf078646a..6c4f04ebec2 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -46,4 +46,4 @@ def _filter(self, filter, limit=10, **kwargs): expr=filter, limit=limit, output_fields=['serialized'], **kwargs ) self._collection.release() - return self._docs_from_query_respone(results)[:limit] + return self._docs_from_query_response(results)[:limit] diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index a163bd67a1a..619d1ec6e50 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -2,6 +2,7 @@ import numpy as np +from docarray import DocumentArray from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray.array.storage.base.helper import Offset2ID from docarray.array.storage.milvus.backend import always_true_expr, ids_to_milvus_expr @@ -54,7 +55,10 @@ def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': **kwargs, ) self._collection.release() - return self._docs_from_query_respone(res) + docs = self._docs_from_query_response(res) + # sort output docs according to input id sorting + ids_list = list(ids) + return DocumentArray(sorted(docs, key=lambda d: ids_list.index(d.id))) def 
_del_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': kwargs = self._update_consistency_level(**kwargs) From 435e19d0a96ca4536e60a429901efe6eb867c6b9 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 14:50:06 +0200 Subject: [PATCH 24/88] test: add milvus to sequence tests --- tests/unit/array/test_sequence.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/unit/array/test_sequence.py b/tests/unit/array/test_sequence.py index 458e563f7c5..600289d8c04 100644 --- a/tests/unit/array/test_sequence.py +++ b/tests/unit/array/test_sequence.py @@ -15,6 +15,7 @@ from docarray.array.storage.sqlite import SqliteConfig from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig from tests.conftest import tmpfile @@ -27,6 +28,7 @@ (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=1)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=1)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=1)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=128)), ], ) def test_insert(da_cls, config, start_storage): @@ -50,6 +52,7 @@ def test_insert(da_cls, config, start_storage): (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=1)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=1)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=1)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=128)), ], ) def test_append_extend(da_cls, config, start_storage): @@ -84,6 +87,7 @@ def update_config_inplace(config, tmpdir, tmpfile): ('qdrant', {'n_dim': 3, 'collection_name': 'qdrant'}), ('elasticsearch', {'n_dim': 3, 'index_name': 'elasticsearch'}), ('redis', {'n_dim': 3, 'index_name': 'redis'}), + ('milvus', {'n_dim': 3, 'collection_name': 'redis'}), ], ) def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfile): @@ -118,9 +122,10 @@ def 
test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfi ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) -def test_extend_subindex(storage, config): +def test_extend_subindex(storage, config, start_storage): n_dim = 3 subindex_configs = ( @@ -164,9 +169,10 @@ def test_extend_subindex(storage, config): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) -def test_append_subindex(storage, config): +def test_append_subindex(storage, config, start_storage): n_dim = 3 subindex_configs = ( @@ -214,12 +220,13 @@ def embeddings_eq(emb1, emb2): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) @pytest.mark.parametrize( 'index', [1, '1', slice(1, 2), [1], [False, True, False, False, False]] ) -def test_del_and_append(index, storage, config): +def test_del_and_append(index, storage, config, start_storage): da = DocumentArray(storage=storage, config=config) with da: @@ -241,12 +248,13 @@ def test_del_and_append(index, storage, config): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) @pytest.mark.parametrize( 'index', [1, '1', slice(1, 2), [1], [False, True, False, False, False]] ) -def test_set_and_append(index, storage, config): +def test_set_and_append(index, storage, config, start_storage): da = DocumentArray(storage=storage, config=config) with da: From 94e4a65ec1215f98e594a5b24c223d840b8d4bdf Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 16:17:38 +0200 Subject: [PATCH 25/88] fix: raise keyerror on non existing id --- 
docarray/array/storage/milvus/getsetdel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 619d1ec6e50..dfbfe4ce367 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -55,6 +55,8 @@ def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': **kwargs, ) self._collection.release() + if not res: + raise KeyError(f'No documents found for ids {ids}') docs = self._docs_from_query_response(res) # sort output docs according to input id sorting ids_list = list(ids) From 061e01cca9146f1cb388cea2aeddb58e0c0134ed Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 16:43:03 +0200 Subject: [PATCH 26/88] refactor: use context manager to load and release collections --- docarray/array/storage/milvus/backend.py | 14 ++++++++++ docarray/array/storage/milvus/find.py | 32 ++++++++++------------ docarray/array/storage/milvus/getsetdel.py | 26 ++++++++---------- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 2989151d255..82b816eebfe 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -254,3 +254,17 @@ def _update_consistency_level(self, **kwargs): else self._config.consistency_level ) return kwargs + + def loaded_collection(self, collection=None): + class LoadedCollectionMngr: + def __init__(self, coll): + self._collection = coll + + def __enter__(self): + self._collection.load() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._collection.release() + + return LoadedCollectionMngr(collection if collection else self._collection) diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index 6c4f04ebec2..2057b08b5f5 100644 --- a/docarray/array/storage/milvus/find.py +++ 
b/docarray/array/storage/milvus/find.py @@ -25,25 +25,23 @@ def _find( """ if param is None: param = dict() - self._collection.load() - kwargs = self._update_consistency_level(**kwargs) - results = self._collection.search( - data=query, - anns_field='embedding', - limit=limit, - expr=filter, - param=param, - output_fields=['serialized'], - **kwargs - ) - self._collection.release() + with self.loaded_collection(): + kwargs = self._update_consistency_level(**kwargs) + results = self._collection.search( + data=query, + anns_field='embedding', + limit=limit, + expr=filter, + param=param, + output_fields=['serialized'], + **kwargs + ) return self._docs_from_search_response(results) def _filter(self, filter, limit=10, **kwargs): kwargs = self._update_consistency_level(**kwargs) - self._collection.load() - results = self._collection.query( - expr=filter, limit=limit, output_fields=['serialized'], **kwargs - ) - self._collection.release() + with self.loaded_collection(): + results = self._collection.query( + expr=filter, limit=limit, output_fields=['serialized'], **kwargs + ) return self._docs_from_query_response(results)[:limit] diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index dfbfe4ce367..ace63262144 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -26,13 +26,12 @@ def _set_doc_by_id(self, _id: str, value: 'Document', **kwargs): def _load_offset2ids(self): collection = self._offset2id_collection - collection.load() - res = collection.query( - expr=always_true_expr('document_id'), - output_fields=['offset', 'document_id'], - consistency_level=self._config.consistency_level, - ) - collection.release() + with self.loaded_collection(collection): + res = collection.query( + expr=always_true_expr('document_id'), + output_fields=['offset', 'document_id'], + consistency_level=self._config.consistency_level, + ) sorted_res = sorted(res, key=lambda k: 
int(k['offset'])) self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) @@ -48,13 +47,12 @@ def _save_offset2ids(self): def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': kwargs = self._update_consistency_level(**kwargs) - self._collection.load() - res = self._collection.query( - expr=f'document_id in {ids_to_milvus_expr(ids)}', - output_fields=['serialized'], - **kwargs, - ) - self._collection.release() + with self.loaded_collection(): + res = self._collection.query( + expr=f'document_id in {ids_to_milvus_expr(ids)}', + output_fields=['serialized'], + **kwargs, + ) if not res: raise KeyError(f'No documents found for ids {ids}') docs = self._docs_from_query_response(res) From c5e6b7e7f258b8115b1e96be03312da20d4c3b3d Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 17:47:59 +0200 Subject: [PATCH 27/88] fix: map embeddings to np before inserting --- docarray/array/storage/milvus/backend.py | 19 +++++++++++++------ docarray/array/storage/milvus/seqlike.py | 4 +--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 82b816eebfe..d9ffc5cb33d 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -219,12 +219,7 @@ def _docs_to_milvus_payload(self, docs: 'Iterable[Document]'): ] return [ [doc.id for doc in docs], - [ - doc.embedding - if doc.embedding is not None - else np.zeros(self._config.n_dim) - for doc in docs - ], + [self._map_embedding(doc.embedding) for doc in docs], [doc.to_base64(**self._config.serialize_config) for doc in docs], *extra_columns, ] @@ -268,3 +263,15 @@ def __exit__(self, exc_type, exc_val, exc_tb): self._collection.release() return LoadedCollectionMngr(collection if collection else self._collection) + + def _map_embedding(self, embedding): + if embedding is not None: + from docarray.math.ndarray import to_numpy_array + + embedding = 
to_numpy_array(embedding) + + if embedding.ndim > 1: + embedding = np.asarray(embedding).squeeze() + else: + embedding = np.zeros(self._config.n_dim) + return embedding diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index db4aaa715f7..b3a52a3305b 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -1,8 +1,6 @@ from typing import Iterable, Iterator, Union, TYPE_CHECKING from docarray.array.storage.base.seqlike import BaseSequenceLikeMixin - -if TYPE_CHECKING: - from docarray import Document +from docarray import Document class SequenceLikeMixin(BaseSequenceLikeMixin): From f61dfda537ddabea37f7b009954bc2007133a01a Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 18:00:43 +0200 Subject: [PATCH 28/88] test: add milvus to advanced indexing tests --- tests/unit/array/test_advance_indexing.py | 41 ++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 666ae8596b9..c6bb0aa22f9 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -7,6 +7,7 @@ from docarray.array.qdrant import QdrantConfig from docarray.array.elastic import ElasticConfig from docarray.array.redis import RedisConfig +from docarray.array.milvus import MilvusConfig @pytest.fixture @@ -29,6 +30,7 @@ def indices(): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_getter_int_str(docs, storage, config, start_storage): @@ -62,6 +64,7 @@ def test_getter_int_str(docs, storage, config, start_storage): ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_setter_int_str(docs, storage, config, 
start_storage): @@ -92,6 +95,7 @@ def test_setter_int_str(docs, storage, config, start_storage): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_del_int_str(docs, storage, config, start_storage, indices): @@ -127,6 +131,7 @@ def test_del_int_str(docs, storage, config, start_storage, indices): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_slice(docs, storage, config, start_storage): @@ -166,6 +171,7 @@ def test_slice(docs, storage, config, start_storage): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_sequence_bool_index(docs, storage, config, start_storage): @@ -213,6 +219,7 @@ def test_sequence_bool_index(docs, storage, config, start_storage): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_sequence_int(docs, nparray, storage, config, start_storage): @@ -250,6 +257,7 @@ def test_sequence_int(docs, nparray, storage, config, start_storage): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_sequence_str(docs, storage, config, start_storage): @@ -285,6 +293,7 @@ def test_sequence_str(docs, storage, config, start_storage): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_docarray_list_tuple(docs, storage, config, start_storage): @@ -306,6 +315,7 @@ def test_docarray_list_tuple(docs, storage, config, start_storage): ('qdrant', QdrantConfig(n_dim=123)), 
('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_path_syntax_indexing(storage, config, start_storage): @@ -346,6 +356,7 @@ def test_path_syntax_indexing(storage, config, start_storage): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) @pytest.mark.parametrize('use_subindex', [False, True]) @@ -443,6 +454,7 @@ def test_path_syntax_indexing_set(storage, config, use_subindex, start_storage): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_getset_subindex(storage, config, start_storage): @@ -489,6 +501,7 @@ def test_getset_subindex(storage, config, start_storage): ('qdrant', lambda: QdrantConfig(n_dim=123)), ('elasticsearch', lambda: ElasticConfig(n_dim=123)), ('redis', lambda: RedisConfig(n_dim=123)), + ('milvus', lambda: MilvusConfig(n_dim=123)), ], ) def test_attribute_indexing(storage, config_gen, start_storage, size): @@ -520,7 +533,16 @@ def test_attribute_indexing(storage, config_gen, start_storage, size): @pytest.mark.parametrize( 'storage', - ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', 'redis'], + [ + 'memory', + 'sqlite', + 'weaviate', + 'annlite', + 'qdrant', + 'elasticsearch', + 'redis', + 'milvus', + ], ) def test_tensor_attribute_selector(storage, start_storage): import scipy.sparse @@ -529,7 +551,7 @@ def test_tensor_attribute_selector(storage, start_storage): sp_embed[sp_embed > 0.1] = 0 sp_embed = scipy.sparse.coo_matrix(sp_embed) - if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis'): + if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis', 'milvus'): da = DocumentArray(storage=storage, config={'n_dim': 10}) else: da = DocumentArray(storage=storage) @@ -573,10 +595,19 @@ 
def test_advance_selector_mixed(storage): @pytest.mark.parametrize( 'storage', - ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', 'redis'], + [ + 'memory', + 'sqlite', + 'weaviate', + 'annlite', + 'qdrant', + 'elasticsearch', + 'redis', + 'milvus', + ], ) def test_single_boolean_and_padding(storage, start_storage): - if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis'): + if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis', 'milvus'): da = DocumentArray(storage=storage, config={'n_dim': 10}) else: da = DocumentArray(storage=storage) @@ -606,6 +637,7 @@ def test_single_boolean_and_padding(storage, start_storage): ('qdrant', lambda: QdrantConfig(n_dim=123)), ('elasticsearch', lambda: ElasticConfig(n_dim=123)), ('redis', lambda: RedisConfig(n_dim=123)), + ('milvus', lambda: MilvusConfig(n_dim=123)), ], ) def test_edge_case_two_strings(storage, config_gen, start_storage): @@ -684,6 +716,7 @@ def test_edge_case_two_strings(storage, config_gen, start_storage): ('qdrant', QdrantConfig(n_dim=123)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), + ('milvus', MilvusConfig(n_dim=123)), ], ) def test_offset2ids_persistence(storage, config, start_storage): From e9671ee853d931faa2d90f0de27f200182036333 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 19 Oct 2022 18:13:01 +0200 Subject: [PATCH 29/88] test: add milvus to test pull out --- tests/unit/array/test_pull_out.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/array/test_pull_out.py b/tests/unit/array/test_pull_out.py index e487c94214e..c36dde294b0 100644 --- a/tests/unit/array/test_pull_out.py +++ b/tests/unit/array/test_pull_out.py @@ -23,6 +23,7 @@ def docs(): ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), ('redis', {'n_dim': 2}), + ('milvus', {'n_dim': 2}), ], ) def test_update_embedding(docs, storage, config, start_storage): @@ -58,6 +59,7 @@ def test_update_embedding(docs, 
storage, config, start_storage): ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), ('redis', {'n_dim': 2}), + ('milvus', {'n_dim': 2}), ], ) def test_update_doc_embedding(docs, storage, config, start_storage): @@ -93,6 +95,7 @@ def test_update_doc_embedding(docs, storage, config, start_storage): ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), ('redis', {'n_dim': 2}), + ('milvus', {'n_dim': 2}), ], ) def test_batch_update_embedding(docs, storage, config, start_storage): @@ -126,6 +129,7 @@ def test_batch_update_embedding(docs, storage, config, start_storage): ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), ('redis', {'n_dim': 2}), + ('milvus', {'n_dim': 2}), ], ) def test_batch_update_doc_embedding(docs, storage, config, start_storage): @@ -161,6 +165,7 @@ def test_batch_update_doc_embedding(docs, storage, config, start_storage): ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), ('redis', {'n_dim': 2}), + ('milvus', {'n_dim': 2}), ], ) def test_update_id(docs, storage, config, start_storage): @@ -183,6 +188,7 @@ def test_update_id(docs, storage, config, start_storage): ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), ('redis', {'n_dim': 2}), + ('milvus', {'n_dim': 2}), ], ) def test_update_doc_id(docs, storage, config, start_storage): @@ -204,6 +210,7 @@ def test_update_doc_id(docs, storage, config, start_storage): ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), ('redis', {'n_dim': 2}), + ('milvus', {'n_dim': 2}), ], ) def test_batch_update_id(docs, storage, config, start_storage): @@ -228,6 +235,7 @@ def test_batch_update_id(docs, storage, config, start_storage): ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), ('redis', {'n_dim': 2}), + ('milvus', {'n_dim': 2}), ], ) def test_batch_update_doc_id(docs, storage, config, start_storage): From ee8f40b391b49a9de2e181ac1e12223ad5ef435b Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 10:53:00 +0200 Subject: [PATCH 30/88] fix: 
deleting to size zero --- docarray/array/storage/milvus/getsetdel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index ace63262144..9450d0d6506 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -39,10 +39,12 @@ def _save_offset2ids(self): # delete old entries self._clear_offset2ids_milvus() # insert current entries - collection = self._offset2id_collection ids = self._offset2ids.ids + if not ids: + return offsets = [str(i) for i in range(len(ids))] dummy_vectors = [np.zeros(1) for _ in range(len(ids))] + collection = self._offset2id_collection collection.insert([offsets, ids, dummy_vectors]) def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': From b1c6a88c201363ede715549e94a373edf6468f4d Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 11:21:39 +0200 Subject: [PATCH 31/88] fix: when accessing with no ids, return empty da instead of rasing --- docarray/array/storage/milvus/getsetdel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 9450d0d6506..b85480c57d4 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -48,6 +48,8 @@ def _save_offset2ids(self): collection.insert([offsets, ids, dummy_vectors]) def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': + if not ids: + return DocumentArray() kwargs = self._update_consistency_level(**kwargs) with self.loaded_collection(): res = self._collection.query( From 4a4d1c89fffb37b11f1c8f04fd7bc5e6d6366a43 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 11:22:43 +0200 Subject: [PATCH 32/88] test: add milvus to content tests --- tests/unit/array/mixins/test_content.py | 8 ++++++++ 1 file changed, 8 insertions(+) 
diff --git a/tests/unit/array/mixins/test_content.py b/tests/unit/array/mixins/test_content.py index ea4535c9d00..362d0c488e9 100644 --- a/tests/unit/array/mixins/test_content.py +++ b/tests/unit/array/mixins/test_content.py @@ -10,6 +10,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig @pytest.mark.parametrize( @@ -22,6 +23,7 @@ DocumentArrayQdrant, DocumentArrayElastic, DocumentArrayRedis, + DocumentArrayMilvus, ], ) @pytest.mark.parametrize( @@ -34,6 +36,7 @@ def test_content_empty_getter_return_none(cls, content_attr, start_storage): DocumentArrayQdrant, DocumentArrayElastic, DocumentArrayRedis, + DocumentArrayMilvus, ]: da = cls(config={'n_dim': 3}) else: @@ -51,6 +54,7 @@ def test_content_empty_getter_return_none(cls, content_attr, start_storage): DocumentArrayQdrant, DocumentArrayElastic, DocumentArrayRedis, + DocumentArrayMilvus, ], ) @pytest.mark.parametrize( @@ -70,6 +74,7 @@ def test_content_empty_setter(cls, content_attr, start_storage): DocumentArrayQdrant, DocumentArrayElastic, DocumentArrayRedis, + DocumentArrayMilvus, ]: da = cls(config={'n_dim': 3}) else: @@ -88,6 +93,7 @@ def test_content_empty_setter(cls, content_attr, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) @pytest.mark.parametrize( @@ -123,6 +129,7 @@ def test_content_getter_setter(cls, content_attr, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_content_empty(da_len, da_cls, config, start_storage): @@ -161,6 +168,7 @@ def 
test_content_empty(da_len, da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=5)), (DocumentArrayElastic, ElasticConfig(n_dim=5)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=5)), ], ) def test_embeddings_setter(da_len, da_cls, config, start_storage): From 479d0f7f1008e6dde79f288d94f843a6f5cff856 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 11:25:13 +0200 Subject: [PATCH 33/88] test: add milvus to del tests --- tests/unit/array/mixins/test_del.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_del.py b/tests/unit/array/mixins/test_del.py index 610ca99140b..aee552af9b4 100644 --- a/tests/unit/array/mixins/test_del.py +++ b/tests/unit/array/mixins/test_del.py @@ -119,9 +119,10 @@ def test_del_da_attribute(): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) -def test_del_subindex(storage, config): +def test_del_subindex(storage, config, start_storage): n_dim = 3 subindex_configs = ( From a71cab2ad2c47b897726017ecb1db51cceb690c3 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 13:39:14 +0200 Subject: [PATCH 34/88] test: add milvus to embed tests --- tests/unit/array/mixins/test_embed.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/unit/array/mixins/test_embed.py b/tests/unit/array/mixins/test_embed.py index e5c1762e925..17a061b2ea5 100644 --- a/tests/unit/array/mixins/test_embed.py +++ b/tests/unit/array/mixins/test_embed.py @@ -23,6 +23,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic from docarray.array.redis import DocumentArrayRedis +from docarray.array.milvus import DocumentArrayMilvus random_embed_models = { 'keras': lambda: tf.keras.Sequential( @@ -76,6 +77,7 @@ # DocumentArrayWeaviate, TODO: 
enable this DocumentArrayElastic, DocumentArrayRedis, + DocumentArrayMilvus, ], ) @pytest.mark.parametrize('N', [2, 10]) @@ -97,10 +99,15 @@ def test_embedding_on_random_network( DocumentArrayQdrant, DocumentArrayElastic, DocumentArrayRedis, + DocumentArrayMilvus, ]: da = da_cls.empty(N, config={'n_dim': embedding_shape}) else: da = da_cls.empty(N, config=None) + + if da_cls == DocumentArrayMilvus and len(input_shape) == 3: + input_shape = (3, 12, 12) # Milvus can't handle large tensors + da.tensors = np.random.random([N, *input_shape]).astype(np.float32) embed_model = random_embed_models[framework]() da.embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) From d16468ac281f17331e41385697064d99931a3dff Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 13:50:26 +0200 Subject: [PATCH 35/88] test: add milvus to test empty --- tests/unit/array/mixins/test_empty.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/array/mixins/test_empty.py b/tests/unit/array/mixins/test_empty.py index 0ba3da06e93..c92937cf80d 100644 --- a/tests/unit/array/mixins/test_empty.py +++ b/tests/unit/array/mixins/test_empty.py @@ -9,6 +9,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig @pytest.mark.parametrize( @@ -21,6 +22,7 @@ (DocumentArrayQdrant, QdrantConfig(n_dim=5)), (DocumentArrayElastic, ElasticConfig(n_dim=5)), (DocumentArrayRedis, RedisConfig(n_dim=5)), + (DocumentArrayMilvus, MilvusConfig(n_dim=5)), ], ) def test_empty_non_zero(da_cls, config, start_storage): From ec10d41a6653d4e6b1c114044f73fcab97059b1d Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 14:15:08 +0200 Subject: [PATCH 36/88] test: add milvus to test eval class --- tests/unit/array/mixins/test_eval_class.py | 12 ++++++++++++ 1 file changed, 
12 insertions(+) diff --git a/tests/unit/array/mixins/test_eval_class.py b/tests/unit/array/mixins/test_eval_class.py index a1eab692cee..50ee9419fac 100644 --- a/tests/unit/array/mixins/test_eval_class.py +++ b/tests/unit/array/mixins/test_eval_class.py @@ -16,6 +16,7 @@ ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) @pytest.mark.parametrize( @@ -53,6 +54,7 @@ def test_eval_mixin_perfect_match(metric_fn, kwargs, storage, config, start_stor ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) @pytest.mark.parametrize( @@ -94,6 +96,7 @@ def test_eval_mixin_perfect_match_labeled( ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) @pytest.mark.parametrize( @@ -157,6 +160,7 @@ def test_eval_mixin_one_of_n_labeled(metric_fn, metric_score): ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) @pytest.mark.parametrize( @@ -201,6 +205,7 @@ def test_eval_mixin_zero_match(storage, config, metric_fn, start_storage, kwargs ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) def test_diff_len_should_raise(storage, config, start_storage): @@ -223,6 +228,7 @@ def test_diff_len_should_raise(storage, config, start_storage): ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) def test_diff_hash_fun_should_raise(storage, config, start_storage): @@ -245,6 +251,7 @@ def test_diff_hash_fun_should_raise(storage, config, start_storage): ('qdrant', {'n_dim': 3}), ('elasticsearch', {'n_dim': 3}), ('redis', {'n_dim': 3}), + ('milvus', {'n_dim': 3}), ], ) def test_same_hash_same_len_fun_should_work(storage, config, start_storage): @@ -274,6 +281,7 @@ def 
test_same_hash_same_len_fun_should_work(storage, config, start_storage): ('qdrant', {'n_dim': 3}), ('elasticsearch', {'n_dim': 3}), ('redis', {'n_dim': 3}), + ('milvus', {'n_dim': 3}), ], ) def test_adding_noise(storage, config, start_storage): @@ -305,6 +313,7 @@ def test_adding_noise(storage, config, start_storage): ('qdrant', {'n_dim': 128}), ('elasticsearch', {'n_dim': 128}), ('redis', {'n_dim': 128}), + ('milvus', {'n_dim': 128}), ], ) @pytest.mark.parametrize( @@ -346,6 +355,7 @@ def test_diff_match_len_in_gd(storage, config, metric_fn, start_storage, kwargs) ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) def test_empty_da_should_raise(storage, config, start_storage): @@ -364,6 +374,7 @@ def test_empty_da_should_raise(storage, config, start_storage): ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) def test_missing_groundtruth_should_raise(storage, config, start_storage): @@ -382,6 +393,7 @@ def test_missing_groundtruth_should_raise(storage, config, start_storage): ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), ('redis', {'n_dim': 256}), + ('milvus', {'n_dim': 256}), ], ) def test_useless_groundtruth_warning_should_raise(storage, config, start_storage): From f7d0a3bea05fc4bbfada7df69d87ae108c2bf59d Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 15:49:05 +0200 Subject: [PATCH 37/88] test: add milvus to test find --- tests/unit/array/mixins/test_find.py | 31 ++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 49ea3325887..36323cb7a52 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -32,6 +32,7 @@ def inv_cosine(*args): ('qdrant', {'n_dim': 32}), ('elasticsearch', {'n_dim': 32}), ('redis', {'n_dim': 32}), + ('milvus', 
{'n_dim': 32}), ], ) @pytest.mark.parametrize('limit', [1, 5, 10]) @@ -274,6 +275,16 @@ def test_find_by_tag(storage, config, start_storage): } +numeric_operators_milvus = { + '>=': operator.ge, + '>': operator.gt, + '<=': operator.le, + '<': operator.lt, + '==': operator.eq, + '!=': operator.ne, +} + + @pytest.mark.parametrize( 'storage,filter_gen,numeric_operators,operator', [ @@ -391,6 +402,15 @@ def test_find_by_tag(storage, config, start_storage): 'ne', ), ], + *[ + ( + 'milvus', + lambda operator, threshold: f'price {operator} {threshold}', + numeric_operators_milvus, + operator, + ) + for operator in numeric_operators_milvus.keys() + ], ], ) @pytest.mark.parametrize('columns', [[('price', 'int')], {'price': 'int'}]) @@ -518,6 +538,15 @@ def test_search_pre_filtering( 'ne', ), ], + *[ + ( + 'milvus', + lambda operator, threshold: f'price {operator} {threshold}', + numeric_operators_milvus, + operator, + ) + for operator in numeric_operators_milvus.keys() + ], ], ) @pytest.mark.parametrize('columns', [[('price', 'float')], {'price': 'float'}]) @@ -682,6 +711,7 @@ def test_elastic_id_filter(storage, config, limit): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) def test_find_subindex(storage, config): @@ -737,6 +767,7 @@ def test_find_subindex(storage, config): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) def test_find_subindex_multimodal(storage, config): From 7aa68950fa21db34e5ed6c779892d816afbf4e3a Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 17:56:05 +0200 Subject: [PATCH 38/88] test: add milvus to getset tests --- tests/unit/array/mixins/test_getset.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/unit/array/mixins/test_getset.py 
b/tests/unit/array/mixins/test_getset.py index 5cc8ef9cbc5..6a755c5f90a 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -14,6 +14,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig from tests import random_docs rand_array = np.random.random([10, 3]) @@ -44,6 +45,7 @@ def nested_docs(): ('qdrant', {'n_dim': 3}), ('elasticsearch', {'n_dim': 3}), ('redis', {'n_dim': 3}), + ('milvus', {'n_dim': 3}), ], ) @pytest.mark.parametrize( @@ -70,6 +72,7 @@ def test_set_embeddings_multi_kind(array, storage, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_da_get_embeddings(docs, config, da_cls, start_storage): @@ -92,6 +95,7 @@ def test_da_get_embeddings(docs, config, da_cls, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_embeddings_setter_da(docs, config, da_cls, start_storage): @@ -123,6 +127,7 @@ def test_embeddings_setter_da(docs, config, da_cls, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_embeddings_wrong_len(docs, config, da_cls, start_storage): @@ -147,6 +152,7 @@ def test_embeddings_wrong_len(docs, config, da_cls, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + 
(DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_tensors_getter_da(docs, config, da_cls, start_storage): @@ -174,6 +180,7 @@ def test_tensors_getter_da(docs, config, da_cls, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_texts_getter_da(docs, config, da_cls, start_storage): @@ -210,6 +217,7 @@ def test_texts_getter_da(docs, config, da_cls, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_setter_by_sequences_in_selected_docs_da(docs, config, da_cls, start_storage): @@ -248,6 +256,7 @@ def test_setter_by_sequences_in_selected_docs_da(docs, config, da_cls, start_sto (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_texts_wrong_len(docs, config, da_cls, start_storage): @@ -272,6 +281,7 @@ def test_texts_wrong_len(docs, config, da_cls, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_tensors_wrong_len(docs, config, da_cls, start_storage): @@ -296,6 +306,7 @@ def test_tensors_wrong_len(docs, config, da_cls, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_blobs_getter_setter(docs, da_cls, config, start_storage): @@ -329,6 +340,7 @@ def test_blobs_getter_setter(docs, da_cls, config, start_storage): (DocumentArrayQdrant, 
QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_ellipsis_getter(nested_docs, da_cls, config, start_storage): @@ -353,6 +365,7 @@ def test_ellipsis_getter(nested_docs, da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_ellipsis_attribute_setter(nested_docs, da_cls, config, start_storage): @@ -374,6 +387,7 @@ def test_ellipsis_attribute_setter(nested_docs, da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=6)), (DocumentArrayElastic, ElasticConfig(n_dim=6)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_zero_embeddings(da_cls, config, start_storage): @@ -426,6 +440,7 @@ def embeddings_eq(emb1, emb2): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) def test_getset_subindex(storage, config): @@ -509,6 +524,7 @@ def test_getset_subindex(storage, config): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) def test_init_subindex(storage, config): @@ -549,6 +565,7 @@ def test_init_subindex(storage, config): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) def test_set_on_subindex(storage, config): From 85513b8fab3155f8188be4397f243e4c203656dc Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 20 Oct 2022 19:16:14 +0200 Subject: [PATCH 39/88] test: add milvus to all remaining tests --- 
tests/unit/array/mixins/test_io.py | 10 ++++++++ tests/unit/array/mixins/test_magic.py | 5 ++++ tests/unit/array/mixins/test_match.py | 2 ++ tests/unit/array/mixins/test_parallel.py | 6 +++++ tests/unit/array/mixins/test_plot.py | 7 ++++++ tests/unit/array/mixins/test_sample.py | 5 ++++ tests/unit/array/mixins/test_text.py | 6 +++++ tests/unit/array/mixins/test_traverse.py | 29 ++++++++++++++++++++++++ 8 files changed, 70 insertions(+) diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index e51b16f628c..b8403e5e048 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -14,6 +14,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig from docarray.helper import random_identity from tests import random_docs @@ -36,6 +37,7 @@ def docs(): (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=10)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=10)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=10)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=10)), ], ) def test_document_save_load( @@ -70,6 +72,7 @@ def test_document_save_load( (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=10)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=10)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=10)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=10)), ], ) def test_da_csv_write(docs, flatten_tags, tmp_path, da_cls, config, start_storage): @@ -90,6 +93,7 @@ def test_da_csv_write(docs, flatten_tags, tmp_path, da_cls, config, start_storag (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=256)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=256)), ], ) def 
test_from_ndarray(da_cls, config, start_storage): @@ -108,6 +112,7 @@ def test_from_ndarray(da_cls, config, start_storage): (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=256)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=256)), ], ) def test_from_files(da_cls, config, start_storage): @@ -149,6 +154,7 @@ def test_from_files_exclude(): (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=256)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=256)), ], ) def test_from_ndjson(da_cls, config, start_storage): @@ -167,6 +173,7 @@ def test_from_ndjson(da_cls, config, start_storage): (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=3)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=3)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=3)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=3)), ], ) def test_from_to_pd_dataframe(da_cls, config, start_storage): @@ -196,6 +203,7 @@ def test_from_to_pd_dataframe(da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=3)), (DocumentArrayElastic, ElasticConfig(n_dim=3)), (DocumentArrayRedis, RedisConfig(n_dim=3)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=3)), ], ) def test_from_to_bytes(da_cls, config, start_storage): @@ -228,6 +236,7 @@ def test_from_to_bytes(da_cls, config, start_storage): (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=256)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=256)), ], ) def test_push_pull_io(da_cls, config, show_progress, start_storage): @@ -271,6 +280,7 @@ def test_push_pull_io(da_cls, config, show_progress, start_storage): # (DocumentArrayQdrant, QdrantConfig(n_dim=3)), # (DocumentArrayElastic, ElasticConfig(n_dim=3)), # 
Elastic needs config # (DocumentArrayRedis, RedisConfig(n_dim=3)), # Redis needs config + # (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=3)), ], ) def test_from_to_base64(protocol, compress, da_cls, config): diff --git a/tests/unit/array/mixins/test_magic.py b/tests/unit/array/mixins/test_magic.py index 104c3139b27..bd8f813813e 100644 --- a/tests/unit/array/mixins/test_magic.py +++ b/tests/unit/array/mixins/test_magic.py @@ -9,6 +9,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig N = 100 @@ -34,6 +35,7 @@ def docs(): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=1)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_iter_len_bool(da_cls, config, start_storage): @@ -61,6 +63,7 @@ def test_iter_len_bool(da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_repr(da_cls, config, start_storage): @@ -81,6 +84,7 @@ def test_repr(da_cls, config, start_storage): ('qdrant', QdrantConfig(n_dim=128)), ('elasticsearch', ElasticConfig(n_dim=128)), ('redis', RedisConfig(n_dim=128)), + ('milvus', MilvusConfig(n_dim=128)), ], ) def test_repr_str(docs, storage, config, start_storage): @@ -105,6 +109,7 @@ def test_repr_str(docs, storage, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_iadd(da_cls, config, start_storage): diff --git a/tests/unit/array/mixins/test_match.py 
b/tests/unit/array/mixins/test_match.py index fbe863d1106..1d4c7206217 100644 --- a/tests/unit/array/mixins/test_match.py +++ b/tests/unit/array/mixins/test_match.py @@ -76,6 +76,7 @@ def doc_lists_to_doc_arrays(doc_lists, *args, **kwargs): ('qdrant', {'n_dim': 3}), ('weaviate', {'n_dim': 3}), ('redis', {'n_dim': 3}), + ('milvus', {'n_dim': 3}), ], ) @pytest.mark.parametrize('limit', [1, 2, 3]) @@ -749,6 +750,7 @@ def embeddings_eq(emb1, emb2): ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), ('redis', {'n_dim': 3, 'distance': 'L2'}), + ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) def test_match_subindex(storage, config): diff --git a/tests/unit/array/mixins/test_parallel.py b/tests/unit/array/mixins/test_parallel.py index 22ce0a78e3a..f919e0af70e 100644 --- a/tests/unit/array/mixins/test_parallel.py +++ b/tests/unit/array/mixins/test_parallel.py @@ -13,6 +13,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig def foo(d: Document): @@ -54,6 +55,7 @@ def test_parallel_map_apply_external_pool(pytestconfig, pool): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) @pytest.mark.parametrize('backend', ['process', 'thread']) @@ -111,6 +113,7 @@ def test_parallel_map( (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) @pytest.mark.parametrize('backend', ['thread']) @@ -183,6 +186,7 @@ def test_parallel_map_batch( (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, 
RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_map_lambda(pytestconfig, da_cls, config, start_storage): @@ -212,6 +216,7 @@ def test_map_lambda(pytestconfig, da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), (DocumentArrayRedis, RedisConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=10)), ], ) def test_apply_partial(pytestconfig, da_cls, config, start_storage): @@ -242,6 +247,7 @@ def test_apply_partial(pytestconfig, da_cls, config, start_storage): ('qdrant', QdrantConfig(n_dim=256)), ('elasticsearch', ElasticConfig(n_dim=256)), ('redis', RedisConfig(n_dim=256)), + ('milvus', MilvusConfig(n_dim=256)), ], ) @pytest.mark.parametrize('backend', ['thread', 'process']) diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index 818e8f7f9d5..087382fc41f 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -15,6 +15,7 @@ from docarray.array.storage.annlite import AnnliteConfig from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig @pytest.mark.parametrize('keep_aspect_ratio', [True, False]) @@ -29,6 +30,7 @@ (DocumentArrayQdrant, QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_sprite_fail_tensor_success_uri( @@ -68,6 +70,7 @@ def test_sprite_fail_tensor_success_uri( (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=128)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) @pytest.mark.parametrize('canvas_size', [50, 512]) @@ -118,6 
+121,7 @@ def da_and_dam(start_storage): (DocumentArrayAnnlite, {'config': {'n_dim': 3}}), (DocumentArrayQdrant, {'config': {'n_dim': 3}}), (DocumentArrayRedis, {'config': {'n_dim': 3}}), + (DocumentArrayMilvus, {'config': {'n_dim': 3}}), ] ] @@ -154,6 +158,7 @@ def _test_plot_embeddings(da): (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=5)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=5)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=5)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=5)), ], ) def test_plot_embeddings_same_path(tmpdir, da_cls, config_gen, start_storage): @@ -184,6 +189,7 @@ def test_plot_embeddings_same_path(tmpdir, da_cls, config_gen, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_summary_homo_hetero(da_cls, config, start_storage): @@ -211,6 +217,7 @@ def test_summary_homo_hetero(da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_empty_get_attributes(da_cls, config, start_storage): diff --git a/tests/unit/array/mixins/test_sample.py b/tests/unit/array/mixins/test_sample.py index b4d1b6b4d17..fa75064fdb7 100644 --- a/tests/unit/array/mixins/test_sample.py +++ b/tests/unit/array/mixins/test_sample.py @@ -9,6 +9,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig @pytest.mark.parametrize( @@ -21,6 +22,7 @@ (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + 
(DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_sample(da_cls, config, start_storage): @@ -47,6 +49,7 @@ def test_sample(da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_sample_with_seed(da_cls, config, start_storage): @@ -72,6 +75,7 @@ def test_sample_with_seed(da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_shuffle(da_cls, config, start_storage): @@ -98,6 +102,7 @@ def test_shuffle(da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_shuffle_with_seed(da_cls, config, start_storage): diff --git a/tests/unit/array/mixins/test_text.py b/tests/unit/array/mixins/test_text.py index 0f7481a7e0d..9d5e42ac2b3 100644 --- a/tests/unit/array/mixins/test_text.py +++ b/tests/unit/array/mixins/test_text.py @@ -10,6 +10,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig from docarray.array.redis import DocumentArrayRedis, RedisConfig +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig @pytest.fixture(scope='function') @@ -32,6 +33,7 @@ def docs(): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_da_vocabulary(da_cls, config, docs, min_freq, start_storage): @@ -61,6 +63,7 @@ def test_da_vocabulary(da_cls, config, docs, min_freq, start_storage): 
(DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_da_text_to_tensor_non_max_len(docs, da_cls, config, start_storage): @@ -90,6 +93,7 @@ def test_da_text_to_tensor_non_max_len(docs, da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_da_text_to_tensor_max_len_3(docs, da_cls, config, start_storage): @@ -121,6 +125,7 @@ def test_da_text_to_tensor_max_len_3(docs, da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_da_text_to_tensor_max_len_1(docs, da_cls, config, start_storage): @@ -152,6 +157,7 @@ def test_da_text_to_tensor_max_len_1(docs, da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_convert_text_tensor_random_text(da_cls, docs, config, start_storage): diff --git a/tests/unit/array/mixins/test_traverse.py b/tests/unit/array/mixins/test_traverse.py index 9dad5475bcc..df38120d12b 100644 --- a/tests/unit/array/mixins/test_traverse.py +++ b/tests/unit/array/mixins/test_traverse.py @@ -10,6 +10,7 @@ from docarray.array.annlite import DocumentArrayAnnlite from docarray.array.elastic import DocumentArrayElastic from docarray.array.redis import DocumentArrayRedis +from docarray.array.milvus import DocumentArrayMilvus from tests import random_docs # some random prime number for sanity check @@ -44,6 +45,7 @@ def doc_req(): (DocumentArrayQdrant, {'config': 
{'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_type(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -64,6 +66,7 @@ def test_traverse_type(doc_req, filter_fn, da_cls, kwargs, start_storage): (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_root(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -84,6 +87,7 @@ def test_traverse_root(doc_req, filter_fn, da_cls, kwargs, start_storage): (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -104,6 +108,7 @@ def test_traverse_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_root_plus_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -125,6 +130,7 @@ def test_traverse_root_plus_chunk(doc_req, filter_fn, da_cls, kwargs, start_stor (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_chunk_plus_root(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -146,6 +152,7 @@ def test_traverse_chunk_plus_root(doc_req, filter_fn, da_cls, kwargs, start_stor (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), 
(DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_match(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -166,6 +173,7 @@ def test_traverse_match(doc_req, filter_fn, da_cls, kwargs, start_storage): (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -186,6 +194,7 @@ def test_traverse_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage) (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_root_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -205,6 +214,7 @@ def test_traverse_root_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_sto (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flatten_embedding(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -225,6 +235,7 @@ def test_traverse_flatten_embedding(doc_req, filter_fn, da_cls, kwargs, start_st (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flatten_root(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -244,6 +255,7 @@ def test_traverse_flatten_root(doc_req, filter_fn, da_cls, kwargs, start_storage (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), 
+ (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flatten_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -263,6 +275,7 @@ def test_traverse_flatten_chunk(doc_req, filter_fn, da_cls, kwargs, start_storag (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flatten_root_plus_chunk( @@ -284,6 +297,7 @@ def test_traverse_flatten_root_plus_chunk( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flatten_match(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -303,6 +317,7 @@ def test_traverse_flatten_match(doc_req, filter_fn, da_cls, kwargs, start_storag (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flatten_match_chunk( @@ -324,6 +339,7 @@ def test_traverse_flatten_match_chunk( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flatten_root_match_chunk( @@ -351,6 +367,7 @@ def test_traverse_flatten_root_match_chunk( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flattened_per_path_embedding( @@ -376,6 +393,7 @@ def test_traverse_flattened_per_path_embedding( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), 
(DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flattened_per_path_root( @@ -397,6 +415,7 @@ def test_traverse_flattened_per_path_root( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flattened_per_path_chunk( @@ -418,6 +437,7 @@ def test_traverse_flattened_per_path_chunk( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flattened_per_path_root_plus_chunk( @@ -440,6 +460,7 @@ def test_traverse_flattened_per_path_root_plus_chunk( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flattened_per_path_match( @@ -461,6 +482,7 @@ def test_traverse_flattened_per_path_match( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flattened_per_path_root_match_chunk( @@ -485,6 +507,7 @@ def test_traverse_flattened_per_path_root_match_chunk( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_docuset_traverse_over_iterator_HACKY(da_cls, kwargs, filter_fn): @@ -513,6 +536,7 @@ def test_docuset_traverse_over_iterator_HACKY(da_cls, kwargs, filter_fn): (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), 
(DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_docuset_traverse_over_iterator_CAVEAT(da_cls, kwargs, filter_fn): @@ -580,6 +604,7 @@ def test_traverse_chunkarray(filter_fn): (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) @pytest.mark.parametrize( @@ -626,6 +651,7 @@ def test_filter_fn_traverse_flat( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) @pytest.mark.parametrize( @@ -678,6 +704,7 @@ def test_filter_fn_traverse_flat_per_path( (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traversal_path(da_cls, kwargs): @@ -697,6 +724,7 @@ def test_traversal_path(da_cls, kwargs): (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_traverse_flat_root_itself(da_cls, kwargs): @@ -720,6 +748,7 @@ def da_and_dam(N): (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), (DocumentArrayRedis, {'config': {'n_dim': 10}}), + (DocumentArrayMilvus, {'config': {'n_dim': 10}}), ], ) def test_flatten(da_cls, kwargs): From 20b19a9732a56af00b6c05076285fd84d197f3ef Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 24 Oct 2022 13:52:17 +0200 Subject: [PATCH 40/88] feat: load and release collection in context manager --- docarray/array/storage/milvus/backend.py | 53 +++++++++++++++++----- docarray/array/storage/milvus/find.py | 33 
++++++++------ docarray/array/storage/milvus/getsetdel.py | 25 +++++----- 3 files changed, 72 insertions(+), 39 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index d9ffc5cb33d..c66b793fc48 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -12,6 +12,7 @@ DataType, CollectionSchema, has_collection, + MilvusException, ) from docarray import Document, DocumentArray @@ -250,7 +251,47 @@ def _update_consistency_level(self, **kwargs): ) return kwargs + def _map_embedding(self, embedding): + if embedding is not None: + from docarray.math.ndarray import to_numpy_array + + embedding = to_numpy_array(embedding) + + if embedding.ndim > 1: + embedding = np.asarray(embedding).squeeze() + else: + embedding = np.zeros(self._config.n_dim) + return embedding + + def __enter__(self): + _ = super().__enter__() + self._collection.load() + self._offset2id_collection.load() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._collection.release() + self._offset2id_collection.release() + super().__exit__(exc_type, exc_val, exc_tb) + + def _call_with_loaded_collection(self, fn, *fn_args, collection=None, **fn_kwargs): + # workaround since loaded_collection cntx manager cannot currently determine if coll was already loaded before + try: + return fn(*fn_args, **fn_kwargs) + except MilvusException: + with self.loaded_collection(collection): + return fn(*fn_args, **fn_kwargs) + def loaded_collection(self, collection=None): + """ + Context manager to load a collection and release it after the context is exited. + ## TODO 'If the collection is already loaded when entering, it will not be released.' This is not true currently, + ## talking to milvus team to enable this. + + :param collection: the collection to load. If None, the collection of this indexer is used. + :return: Context manager for the provided collection. 
+ """ + class LoadedCollectionMngr: def __init__(self, coll): self._collection = coll @@ -263,15 +304,3 @@ def __exit__(self, exc_type, exc_val, exc_tb): self._collection.release() return LoadedCollectionMngr(collection if collection else self._collection) - - def _map_embedding(self, embedding): - if embedding is not None: - from docarray.math.ndarray import to_numpy_array - - embedding = to_numpy_array(embedding) - - if embedding.ndim > 1: - embedding = np.asarray(embedding).squeeze() - else: - embedding = np.zeros(self._config.n_dim) - return embedding diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index 2057b08b5f5..d0c23a3642d 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -25,23 +25,26 @@ def _find( """ if param is None: param = dict() - with self.loaded_collection(): - kwargs = self._update_consistency_level(**kwargs) - results = self._collection.search( - data=query, - anns_field='embedding', - limit=limit, - expr=filter, - param=param, - output_fields=['serialized'], - **kwargs - ) + kwargs = self._update_consistency_level(**kwargs) + results = self._call_with_loaded_collection( + fn=self._collection.search, + data=query, + anns_field='embedding', + limit=limit, + expr=filter, + param=param, + output_fields=['serialized'], + **kwargs + ) return self._docs_from_search_response(results) def _filter(self, filter, limit=10, **kwargs): kwargs = self._update_consistency_level(**kwargs) - with self.loaded_collection(): - results = self._collection.query( - expr=filter, limit=limit, output_fields=['serialized'], **kwargs - ) + results = self._call_with_loaded_collection( + fn=self._collection.query, + expr=filter, + limit=limit, + output_fields=['serialized'], + **kwargs + ) return self._docs_from_query_response(results)[:limit] diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index b85480c57d4..702332e5897 100644 --- 
a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -26,12 +26,13 @@ def _set_doc_by_id(self, _id: str, value: 'Document', **kwargs): def _load_offset2ids(self): collection = self._offset2id_collection - with self.loaded_collection(collection): - res = collection.query( - expr=always_true_expr('document_id'), - output_fields=['offset', 'document_id'], - consistency_level=self._config.consistency_level, - ) + res = self._call_with_loaded_collection( + fn=collection.query, + collection=collection, + expr=always_true_expr('document_id'), + output_fields=['offset', 'document_id'], + consistency_level=self._config.consistency_level, + ) sorted_res = sorted(res, key=lambda k: int(k['offset'])) self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) @@ -51,12 +52,12 @@ def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': if not ids: return DocumentArray() kwargs = self._update_consistency_level(**kwargs) - with self.loaded_collection(): - res = self._collection.query( - expr=f'document_id in {ids_to_milvus_expr(ids)}', - output_fields=['serialized'], - **kwargs, - ) + res = self._call_with_loaded_collection( + fn=self._collection.query, + expr=f'document_id in {ids_to_milvus_expr(ids)}', + output_fields=['serialized'], + **kwargs, + ) if not res: raise KeyError(f'No documents found for ids {ids}') docs = self._docs_from_query_response(res) From c735702d7b3bde8d373c626be2056a2c6b11f7e9 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 24 Oct 2022 13:53:26 +0200 Subject: [PATCH 41/88] test: use context manager to speed up milvus tests --- tests/unit/array/mixins/test_getset.py | 24 +++++++++++++++--------- tests/unit/array/mixins/test_io.py | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index 6a755c5f90a..e73f6c13d5c 100644 --- a/tests/unit/array/mixins/test_getset.py +++ 
b/tests/unit/array/mixins/test_getset.py @@ -81,8 +81,9 @@ def test_da_get_embeddings(docs, config, da_cls, start_storage): else: da = da_cls() da.extend(docs) - np.testing.assert_almost_equal(da._get_attributes('embedding'), da.embeddings) - np.testing.assert_almost_equal(da[:, 'embedding'], da.embeddings) + with da: + np.testing.assert_almost_equal(da._get_attributes('embedding'), da.embeddings) + np.testing.assert_almost_equal(da[:, 'embedding'], da.embeddings) @pytest.mark.parametrize( @@ -106,7 +107,8 @@ def test_embeddings_setter_da(docs, config, da_cls, start_storage): da.extend(docs) emb = np.random.random((100, 10)) da[:, 'embedding'] = emb - np.testing.assert_almost_equal(da.embeddings, emb) + with da: + np.testing.assert_almost_equal(da.embeddings, emb) for x, doc in zip(emb, da): np.testing.assert_almost_equal(x, doc.embedding) @@ -114,7 +116,8 @@ def test_embeddings_setter_da(docs, config, da_cls, start_storage): da[:, 'embedding'] = None if hasattr(da, 'flush'): da.flush() - assert da.embeddings is None or not np.any(da.embeddings) + with da: + assert da.embeddings is None or not np.any(da.embeddings) @pytest.mark.parametrize( @@ -139,7 +142,8 @@ def test_embeddings_wrong_len(docs, config, da_cls, start_storage): embeddings = np.ones((2, 10)) with pytest.raises(ValueError): - da.embeddings = embeddings + with da: + da.embeddings = embeddings @pytest.mark.parametrize( @@ -583,13 +587,15 @@ def test_set_on_subindex(storage, config): embeddings_to_assign = np.random.random((5 * 3, 2)) with da: da['@c'].embeddings = embeddings_to_assign - assert (da['@c'].embeddings == embeddings_to_assign).all() - assert (da._subindices['@c'].embeddings == embeddings_to_assign).all() + with da: + assert (da['@c'].embeddings == embeddings_to_assign).all() + assert (da._subindices['@c'].embeddings == embeddings_to_assign).all() with da: da['@c'].texts = ['hello' for _ in range(5 * 3)] - assert da['@c'].texts == ['hello' for _ in range(5 * 3)] - assert 
da._subindices['@c'].texts == ['hello' for _ in range(5 * 3)] + with da: + assert da['@c'].texts == ['hello' for _ in range(5 * 3)] + assert da._subindices['@c'].texts == ['hello' for _ in range(5 * 3)] matches = da.find(query=np.random.random(2), on='@c') assert matches diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index b8403e5e048..3df0af3433a 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -219,7 +219,7 @@ def test_from_to_bytes(da_cls, config, start_storage): assert da2.tensors == [[1, 2], [2, 1]] import numpy as np - np.testing.assert_array_equal(da2.embeddings, [[1, 2, 3], [4, 5, 6]]) + np.testing.assert_array_equal(da2[:, 'embedding'], [[1, 2, 3], [4, 5, 6]]) # assert da2.embeddings == [[1, 2, 3], [4, 5, 6]] assert da2[0].tags == {'hello': 'world'} assert da2[1].tags == {} From d5c0221b9eebee9047b8c8758f1018576e50160f Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 25 Oct 2022 08:57:35 +0200 Subject: [PATCH 42/88] test: add milvus to test plot --- tests/unit/document/test_plot.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/document/test_plot.py b/tests/unit/document/test_plot.py index c14d7bbc51c..172e8c198e1 100644 --- a/tests/unit/document/test_plot.py +++ b/tests/unit/document/test_plot.py @@ -12,6 +12,7 @@ from docarray.array.storage.qdrant import QdrantConfig from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig @pytest.fixture() @@ -58,6 +59,7 @@ def test_empty_doc(embed_docs): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayMilvus, MilvusConfig(n_dim=128)), ], ) def test_matches_sprites( @@ -83,6 +85,7 @@ def test_matches_sprites( (DocumentArrayWeaviate, lambda: 
WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=128)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=128)), ], ) def test_matches_sprite_image_generator( From eefbe31d0579c24828d211d41765c2669f1dccfd Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 25 Oct 2022 15:36:43 +0200 Subject: [PATCH 43/88] test: fix plot tests for milvus --- tests/unit/document/test_plot.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/document/test_plot.py b/tests/unit/document/test_plot.py index 172e8c198e1..3e4cb18b2ae 100644 --- a/tests/unit/document/test_plot.py +++ b/tests/unit/document/test_plot.py @@ -98,7 +98,9 @@ def test_matches_sprite_image_generator( start_storage, ): da, das = embed_docs - if image_source == 'tensor': + if ( + image_source == 'tensor' and da_cls != DocumentArrayMilvus + ): # Milvus can't handle large tensors da.apply(lambda d: d.load_uri_to_image_tensor()) das.apply(lambda d: d.load_uri_to_image_tensor()) From 5bd9211eb99086e445d0d2175e6b39623bc0a752 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 25 Oct 2022 15:44:27 +0200 Subject: [PATCH 44/88] test: fix multimodal find test --- tests/unit/array/mixins/test_find.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 463c1e73330..2d1a2285373 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -791,12 +791,19 @@ def test_elastic_id_filter(storage, config, limit): ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) -def test_find_subindex(storage, config): +def test_find_subindex(storage, config, start_storage): n_dim = 3 subindex_configs = {'@c': None} if storage == 'sqlite': subindex_configs['@c'] = dict() - elif storage in ['weaviate', 'annlite', 'qdrant', 'elasticsearch', 'redis']: + elif 
storage in [ + 'weaviate', + 'annlite', + 'qdrant', + 'elasticsearch', + 'redis', + 'milvus', + ]: subindex_configs['@c'] = {'n_dim': 2} da = DocumentArray( @@ -847,7 +854,7 @@ def test_find_subindex(storage, config): ('milvus', {'n_dim': 3, 'distance': 'L2'}), ], ) -def test_find_subindex_multimodal(storage, config): +def test_find_subindex_multimodal(storage, config, start_storage): from docarray import dataclass from docarray.typing import Text From 0cab46d16363bfef07fa89506ef51a0ec06c70b3 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 25 Oct 2022 16:22:50 +0200 Subject: [PATCH 45/88] test: fix test embed --- tests/unit/array/mixins/test_embed.py | 49 ++++++++++++++++----------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/tests/unit/array/mixins/test_embed.py b/tests/unit/array/mixins/test_embed.py index 17a061b2ea5..9e42e622908 100644 --- a/tests/unit/array/mixins/test_embed.py +++ b/tests/unit/array/mixins/test_embed.py @@ -105,35 +105,44 @@ def test_embedding_on_random_network( else: da = da_cls.empty(N, config=None) + embed_model = random_embed_models[framework]() if da_cls == DocumentArrayMilvus and len(input_shape) == 3: input_shape = (3, 12, 12) # Milvus can't handle large tensors + if framework.startswith( + 'transformers' + ): # transformer model expects input shape (3, 224, 224), can't test with Milvus + return - da.tensors = np.random.random([N, *input_shape]).astype(np.float32) - embed_model = random_embed_models[framework]() - da.embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - - r = da.embeddings - if hasattr(r, 'numpy'): - r = r.numpy() - - embed1 = r.copy() + with da: # to speed up milvus by loading the collection + da.tensors = np.random.random([N, *input_shape]).astype(np.float32) + da.embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - # reset - da.embeddings = np.random.random([N, embedding_shape]).astype(np.float32) + r = da.embeddings + if hasattr(r, 'numpy'): + r = r.numpy() - # 
docs[a: b].embed is only supported for DocumentArrayInMemory - if isinstance(da, DocumentArrayInMemory): - # try it again, it should yield the same result - da.embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - np.testing.assert_array_almost_equal(da.embeddings, embed1) + embed1 = r.copy() # reset da.embeddings = np.random.random([N, embedding_shape]).astype(np.float32) - # now do this one by one - da[: int(N / 2)].embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - da[-int(N / 2) :].embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) - np.testing.assert_array_almost_equal(da.embeddings, embed1) + # docs[a: b].embed is only supported for DocumentArrayInMemory + if isinstance(da, DocumentArrayInMemory): + # try it again, it should yield the same result + da.embed(embed_model, batch_size=batch_size, to_numpy=to_numpy) + np.testing.assert_array_almost_equal(da.embeddings, embed1) + + # reset + da.embeddings = np.random.random([N, embedding_shape]).astype(np.float32) + + # now do this one by one + da[: int(N / 2)].embed( + embed_model, batch_size=batch_size, to_numpy=to_numpy + ) + da[-int(N / 2) :].embed( + embed_model, batch_size=batch_size, to_numpy=to_numpy + ) + np.testing.assert_array_almost_equal(da.embeddings, embed1) @pytest.fixture From d4a5f36f72d6d61a98ceb524fb01dd2e53970e2a Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 25 Oct 2022 16:29:46 +0200 Subject: [PATCH 46/88] test: use context manager to speed up milvus --- tests/unit/array/mixins/test_getset.py | 106 +++++++++++++------------ 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index e73f6c13d5c..26e35a9d4d1 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -166,12 +166,13 @@ def test_tensors_getter_da(docs, config, da_cls, start_storage): da = da_cls() da.extend(docs) tensors = np.random.random((100, 10, 10)) - 
da.tensors = tensors - assert len(da) == 100 - np.testing.assert_almost_equal(da.tensors, tensors) + with da: # speed up milvus by loading collection + da.tensors = tensors + assert len(da) == 100 + np.testing.assert_almost_equal(da.tensors, tensors) - da.tensors = None - assert da.tensors is None + da.tensors = None + assert da.tensors is None @pytest.mark.parametrize( @@ -193,22 +194,23 @@ def test_texts_getter_da(docs, config, da_cls, start_storage): else: da = da_cls() da.extend(docs) - assert len(da.texts) == 100 - assert da.texts == da[:, 'text'] - texts = ['text' for _ in range(100)] - da.texts = texts - assert da.texts == texts + with da: # speed up milvus by loading collection + assert len(da.texts) == 100 + assert da.texts == da[:, 'text'] + texts = ['text' for _ in range(100)] + da.texts = texts + assert da.texts == texts - for x, doc in zip(texts, da): - assert x == doc.text + for x, doc in zip(texts, da): + assert x == doc.text - da.texts = None - if hasattr(da, 'flush'): - da.flush() + da.texts = None + if hasattr(da, 'flush'): + da.flush() - # unfortunately protobuf does not distinguish None and '' on string - # so non-set str field in Pb is '' - assert set(da.texts) == set(['']) + # unfortunately protobuf does not distinguish None and '' on string + # so non-set str field in Pb is '' + assert set(da.texts) == set(['']) @pytest.mark.parametrize( @@ -272,7 +274,8 @@ def test_texts_wrong_len(docs, config, da_cls, start_storage): texts = ['hello'] with pytest.raises(ValueError): - da.texts = texts + with da: + da.texts = texts @pytest.mark.parametrize( @@ -297,7 +300,8 @@ def test_tensors_wrong_len(docs, config, da_cls, start_storage): tensors = np.ones((2, 10, 10)) with pytest.raises(ValueError): - da.tensors = tensors + with da: # speed up milvus by loading collection + da.tensors = tensors @pytest.mark.parametrize( @@ -319,15 +323,16 @@ def test_blobs_getter_setter(docs, da_cls, config, start_storage): else: da = da_cls() da.extend(docs) - with 
pytest.raises(ValueError): - da.blobs = [b'cc', b'bb', b'aa', b'dd'] + with da: # speed up milvus by loading collection + with pytest.raises(ValueError): + da.blobs = [b'cc', b'bb', b'aa', b'dd'] - da.blobs = [b'aa'] * len(da) - assert da.blobs == [b'aa'] * len(da) + da.blobs = [b'aa'] * len(da) + assert da.blobs == [b'aa'] * len(da) - da.blobs = None - if hasattr(da, 'flush'): - da.flush() + da.blobs = None + if hasattr(da, 'flush'): + da.flush() # unfortunately protobuf does not distinguish None and '' on string # so non-set str field in Pb is '' @@ -401,29 +406,30 @@ def test_zero_embeddings(da_cls, config, start_storage): else: da = da_cls.empty(10) - # all zero, dense - da[:, 'embedding'] = a - np.testing.assert_almost_equal(da.embeddings, a) - for d in da: - assert d.embedding.shape == (6,) - - # all zero, sparse - sp_a = scipy.sparse.coo_matrix(a) - da[:, 'embedding'] = sp_a - np.testing.assert_almost_equal(da.embeddings.todense(), sp_a.todense()) - for d in da: - # scipy sparse row-vector can only be a (1, m) not squeezible - assert d.embedding.shape == (1, 6) - - # near zero, sparse - a = np.random.random([10, 6]) - a[a > 0.1] = 0 - sp_a = scipy.sparse.coo_matrix(a) - da[:, 'embedding'] = sp_a - np.testing.assert_almost_equal(da.embeddings.todense(), sp_a.todense()) - for d in da: - # scipy sparse row-vector can only be a (1, m) not squeezible - assert d.embedding.shape == (1, 6) + with da: # speed up milvus by loading collection + # all zero, dense + da[:, 'embedding'] = a + np.testing.assert_almost_equal(da.embeddings, a) + for d in da: + assert d.embedding.shape == (6,) + + # all zero, sparse + sp_a = scipy.sparse.coo_matrix(a) + da[:, 'embedding'] = sp_a + np.testing.assert_almost_equal(da.embeddings.todense(), sp_a.todense()) + for d in da: + # scipy sparse row-vector can only be a (1, m) not squeezible + assert d.embedding.shape == (1, 6) + + # near zero, sparse + a = np.random.random([10, 6]) + a[a > 0.1] = 0 + sp_a = scipy.sparse.coo_matrix(a) + 
da[:, 'embedding'] = sp_a + np.testing.assert_almost_equal(da.embeddings.todense(), sp_a.todense()) + for d in da: + # scipy sparse row-vector can only be a (1, m) not squeezible + assert d.embedding.shape == (1, 6) def embeddings_eq(emb1, emb2): From 363bc08b998da4b0c2439ace5af0e9f0792cdb4b Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 25 Oct 2022 17:18:06 +0200 Subject: [PATCH 47/88] fix: implement state methods to enable pickling --- docarray/array/storage/milvus/backend.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index c66b793fc48..0db5a8d7fe8 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -263,6 +263,17 @@ def _map_embedding(self, embedding): embedding = np.zeros(self._config.n_dim) return embedding + def __getstate__(self): + d = dict(self.__dict__) + del d['_collection'] + del d['_offset2id_collection'] + return d + + def __setstate__(self, state): + self.__dict__ = state + self._collection = self._create_or_reuse_collection() + self._offset2id_collection = self._create_or_reuse_offset2id_collection() + def __enter__(self): _ = super().__enter__() self._collection.load() From 161dc1f806744c26f7e022bf3b19c55e5d2b1780 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 26 Oct 2022 10:11:34 +0200 Subject: [PATCH 48/88] test: fix more tests --- tests/unit/array/mixins/test_getset.py | 2 +- tests/unit/array/mixins/test_traverse.py | 103 +++++++++----- tests/unit/array/test_advance_indexing.py | 164 ++++++++++++---------- 3 files changed, 153 insertions(+), 116 deletions(-) diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index 26e35a9d4d1..c7d9d1455a0 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -396,7 +396,7 @@ def test_ellipsis_attribute_setter(nested_docs, da_cls, config, 
start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=6)), (DocumentArrayElastic, ElasticConfig(n_dim=6)), (DocumentArrayRedis, RedisConfig(n_dim=10)), - (DocumentArrayMilvus, MilvusConfig(n_dim=10)), + (DocumentArrayMilvus, MilvusConfig(n_dim=6)), ], ) def test_zero_embeddings(da_cls, config, start_storage): diff --git a/tests/unit/array/mixins/test_traverse.py b/tests/unit/array/mixins/test_traverse.py index df38120d12b..c68c000e02c 100644 --- a/tests/unit/array/mixins/test_traverse.py +++ b/tests/unit/array/mixins/test_traverse.py @@ -50,7 +50,8 @@ def doc_req(): ) def test_traverse_type(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = doc_req.traverse('r', filter_fn=filter_fn) + with doc_req: # speed up milvus by loading collection + ds = doc_req.traverse('r', filter_fn=filter_fn) assert isinstance(ds, types.GeneratorType) assert isinstance(list(ds)[0], DocumentArray) @@ -71,7 +72,8 @@ def test_traverse_type(doc_req, filter_fn, da_cls, kwargs, start_storage): ) def test_traverse_root(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse('r', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse('r', filter_fn=filter_fn)) assert len(ds) == 1 assert len(ds[0]) == num_docs @@ -92,7 +94,8 @@ def test_traverse_root(doc_req, filter_fn, da_cls, kwargs, start_storage): ) def test_traverse_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse('c', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse('c', filter_fn=filter_fn)) assert len(ds) == num_docs assert len(ds[0]) == num_chunks_per_doc @@ -113,7 +116,8 @@ def test_traverse_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): ) def test_traverse_root_plus_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = 
da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse('c,r', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse('c,r', filter_fn=filter_fn)) assert len(ds) == num_docs + 1 assert len(ds[0]) == num_chunks_per_doc assert len(ds[-1]) == num_docs @@ -135,7 +139,8 @@ def test_traverse_root_plus_chunk(doc_req, filter_fn, da_cls, kwargs, start_stor ) def test_traverse_chunk_plus_root(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse('r,c', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse('r,c', filter_fn=filter_fn)) assert len(ds) == 1 + num_docs assert len(ds[-1]) == num_chunks_per_doc assert len(ds[0]) == num_docs @@ -157,7 +162,8 @@ def test_traverse_chunk_plus_root(doc_req, filter_fn, da_cls, kwargs, start_stor ) def test_traverse_match(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse('m', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse('m', filter_fn=filter_fn)) assert len(ds) == num_docs assert len(ds[0]) == num_matches_per_doc @@ -178,7 +184,8 @@ def test_traverse_match(doc_req, filter_fn, da_cls, kwargs, start_storage): ) def test_traverse_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse('cm', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse('cm', filter_fn=filter_fn)) assert len(ds) == num_docs * num_chunks_per_doc assert len(ds[0]) == num_matches_per_chunk @@ -199,7 +206,8 @@ def test_traverse_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage) ) def test_traverse_root_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse('r,c,m,cm', 
filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse('r,c,m,cm', filter_fn=filter_fn)) assert len(ds) == 1 + num_docs + num_docs + num_docs * num_chunks_per_doc @@ -219,7 +227,8 @@ def test_traverse_root_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_sto ) def test_traverse_flatten_embedding(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - flattened_results = doc_req.traverse_flat('r,c', filter_fn=filter_fn) + with doc_req: # speed up milvus by loading collection + flattened_results = doc_req.traverse_flat('r,c', filter_fn=filter_fn) ds = flattened_results.embeddings assert ds.shape == (num_docs + num_chunks_per_doc * num_docs, 10) @@ -240,7 +249,8 @@ def test_traverse_flatten_embedding(doc_req, filter_fn, da_cls, kwargs, start_st ) def test_traverse_flatten_root(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat('r', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat('r', filter_fn=filter_fn)) assert len(ds) == num_docs @@ -260,7 +270,8 @@ def test_traverse_flatten_root(doc_req, filter_fn, da_cls, kwargs, start_storage ) def test_traverse_flatten_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat('c', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat('c', filter_fn=filter_fn)) assert len(ds) == num_docs * num_chunks_per_doc @@ -282,7 +293,8 @@ def test_traverse_flatten_root_plus_chunk( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat('c,r', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat('c,r', filter_fn=filter_fn)) assert len(ds) == num_docs + num_docs * 
num_chunks_per_doc @@ -302,7 +314,8 @@ def test_traverse_flatten_root_plus_chunk( ) def test_traverse_flatten_match(doc_req, filter_fn, da_cls, kwargs, start_storage): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat('m', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat('m', filter_fn=filter_fn)) assert len(ds) == num_docs * num_matches_per_doc @@ -324,7 +337,8 @@ def test_traverse_flatten_match_chunk( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat('cm', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat('cm', filter_fn=filter_fn)) assert len(ds) == num_docs * num_chunks_per_doc * num_matches_per_chunk @@ -346,7 +360,8 @@ def test_traverse_flatten_root_match_chunk( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat('r,c,m,cm', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat('r,c,m,cm', filter_fn=filter_fn)) assert ( len(ds) == num_docs @@ -374,7 +389,10 @@ def test_traverse_flattened_per_path_embedding( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - flattened_results = list(doc_req.traverse_flat_per_path('r,c', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + flattened_results = list( + doc_req.traverse_flat_per_path('r,c', filter_fn=filter_fn) + ) ds = flattened_results[0].embeddings assert ds.shape == (num_docs, 10) @@ -400,7 +418,8 @@ def test_traverse_flattened_per_path_root( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat_per_path('r', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat_per_path('r', 
filter_fn=filter_fn)) assert len(ds[0]) == num_docs @@ -422,7 +441,8 @@ def test_traverse_flattened_per_path_chunk( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat_per_path('c', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat_per_path('c', filter_fn=filter_fn)) assert len(ds[0]) == num_docs * num_chunks_per_doc @@ -444,7 +464,8 @@ def test_traverse_flattened_per_path_root_plus_chunk( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat_per_path('c,r', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat_per_path('c,r', filter_fn=filter_fn)) assert len(ds[0]) == num_docs * num_chunks_per_doc assert len(ds[1]) == num_docs @@ -467,7 +488,8 @@ def test_traverse_flattened_per_path_match( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat_per_path('m', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat_per_path('m', filter_fn=filter_fn)) assert len(ds[0]) == num_docs * num_matches_per_doc @@ -489,7 +511,8 @@ def test_traverse_flattened_per_path_root_match_chunk( doc_req, filter_fn, da_cls, kwargs, start_storage ): doc_req = da_cls(doc_req, **kwargs) - ds = list(doc_req.traverse_flat_per_path('r,c,m,cm', filter_fn=filter_fn)) + with doc_req: # speed up milvus by loading collection + ds = list(doc_req.traverse_flat_per_path('r,c,m,cm', filter_fn=filter_fn)) assert len(ds[0]) == num_docs assert len(ds[1]) == num_chunks_per_doc * num_docs assert len(ds[2]) == num_matches_per_doc * num_docs @@ -513,13 +536,14 @@ def test_traverse_flattened_per_path_root_match_chunk( def test_docuset_traverse_over_iterator_HACKY(da_cls, kwargs, filter_fn): # HACKY USAGE DO NOT RECOMMEND: can also 
traverse over "runtime"-documentarray da = da_cls(random_docs(num_docs, num_chunks_per_doc), **kwargs) - - ds = da.traverse('r', filter_fn=filter_fn) + with da: # speed up milvus by loading collection + ds = da.traverse('r', filter_fn=filter_fn) assert len(list(list(ds)[0])) == num_docs - ds = da_cls(random_docs(num_docs, num_chunks_per_doc), **kwargs).traverse( - 'c', filter_fn=filter_fn - ) + ds = da_cls(random_docs(num_docs, num_chunks_per_doc), **kwargs) + + with ds: # speed up milvus by loading collection + ds = ds.traverse('c', filter_fn=filter_fn) ds = list(ds) assert len(ds) == num_docs assert len(ds[0]) == num_chunks_per_doc @@ -541,16 +565,16 @@ def test_docuset_traverse_over_iterator_HACKY(da_cls, kwargs, filter_fn): ) def test_docuset_traverse_over_iterator_CAVEAT(da_cls, kwargs, filter_fn): # HACKY USAGE's CAVEAT: but it can not iterate over an iterator twice - ds = da_cls(random_docs(num_docs, num_chunks_per_doc), **kwargs).traverse( - 'r,c', filter_fn=filter_fn - ) + ds = da_cls(random_docs(num_docs, num_chunks_per_doc), **kwargs) + with ds: + ds = ds.traverse('r,c', filter_fn=filter_fn) # note that random_docs is a generator and can be only used once, # therefore whoever comes first wil get iterated, and then it becomes empty assert len(list(ds)) == 1 + num_docs - ds = da_cls(random_docs(num_docs, num_chunks_per_doc), **kwargs).traverse( - 'c,r', filter_fn=filter_fn - ) + ds = da_cls(random_docs(num_docs, num_chunks_per_doc), **kwargs) + with ds: + ds = ds.traverse('c,r', filter_fn=filter_fn) assert len(list(ds)) == num_docs + 1 @@ -636,7 +660,8 @@ def test_filter_fn_traverse_flat( filter_fn, docs_len, doc_req, da_cls, kwargs, tmp_path ): docs = da_cls(doc_req, **kwargs) - ds = list(docs.traverse_flat('r,c,m,cm', filter_fn=filter_fn)) + with docs: + ds = list(docs.traverse_flat('r,c,m,cm', filter_fn=filter_fn)) assert len(ds) == docs_len assert all(isinstance(d, Document) for d in ds) @@ -687,7 +712,8 @@ def test_filter_fn_traverse_flat_per_path( 
filter_fn, doc_req, docs_len, da_cls, kwargs, tmp_path ): docs = da_cls(doc_req, **kwargs) - ds = list(docs.traverse_flat_per_path('r,c,m,cm', filter_fn=filter_fn)) + with docs: + ds = list(docs.traverse_flat_per_path('r,c,m,cm', filter_fn=filter_fn)) assert len(ds) == 4 for seq, length in zip(ds, docs_len): assert isinstance(seq, DocumentArray) @@ -711,7 +737,8 @@ def test_traversal_path(da_cls, kwargs): da = da_cls([Document() for _ in range(6)], **kwargs) assert len(da) == 6 - da.traverse_flat('r') + with da: + da.traverse_flat('r') @pytest.mark.parametrize( @@ -729,7 +756,8 @@ def test_traversal_path(da_cls, kwargs): ) def test_traverse_flat_root_itself(da_cls, kwargs): da = da_cls([Document() for _ in range(100)], **kwargs) - res = da.traverse_flat('r') + with da: + res = da.traverse_flat('r') assert id(res) == id(da) @@ -753,7 +781,8 @@ def da_and_dam(N): ) def test_flatten(da_cls, kwargs): da = da_cls(random_docs(100), **kwargs) - daf = da.flatten() + with da: + daf = da.flatten() assert len(daf) == 600 assert isinstance(daf, DocumentArray) assert len(set(d.id for d in daf)) == 600 diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index c6bb0aa22f9..c47dbe790aa 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -331,19 +331,19 @@ def test_path_syntax_indexing(storage, config, start_storage): da = DocumentArray(da, storage=storage, config=config) else: da = DocumentArray(da, storage=storage) - - assert len(da['@c']) == 3 * 5 - assert len(da['@c:1']) == 3 - assert len(da['@c-1:']) == 3 - assert len(da['@c1']) == 3 - assert len(da['@c-2:']) == 3 * 2 - assert len(da['@c1:3']) == 3 * 2 - assert len(da['@c1:3c']) == (3 * 2) * 3 - assert len(da['@c1:3,c1:3c']) == (3 * 2) + (3 * 2) * 3 - assert len(da['@c 1:3 , c 1:3 c']) == (3 * 2) + (3 * 2) * 3 - assert len(da['@cc']) == 3 * 5 * 3 - assert len(da['@cc,m']) == 3 * 5 * 3 + 3 * 7 - assert len(da['@r:1cc,m']) == 1 * 
5 * 3 + 3 * 7 + with da: + assert len(da['@c']) == 3 * 5 + assert len(da['@c:1']) == 3 + assert len(da['@c-1:']) == 3 + assert len(da['@c1']) == 3 + assert len(da['@c-2:']) == 3 * 2 + assert len(da['@c1:3']) == 3 * 2 + assert len(da['@c1:3c']) == (3 * 2) * 3 + assert len(da['@c1:3,c1:3c']) == (3 * 2) + (3 * 2) * 3 + assert len(da['@c 1:3 , c 1:3 c']) == (3 * 2) + (3 * 2) * 3 + assert len(da['@cc']) == 3 * 5 * 3 + assert len(da['@cc,m']) == 3 * 5 * 3 + 3 * 7 + assert len(da['@r:1cc,m']) == 1 * 5 * 3 + 3 * 7 @pytest.mark.parametrize( @@ -383,44 +383,48 @@ def test_path_syntax_indexing_set(storage, config, use_subindex, start_storage): da, storage=storage, subindex_configs={'@c': None} if use_subindex else None ) - assert da['@c'].texts == repeat('a', 3 * 5) - assert da['@c', 'text'] == repeat('a', 3 * 5) - if use_subindex: - assert da._subindices['@c'].texts == repeat('a', 3 * 5) - assert da['@c:1', 'text'] == repeat('a', 3) - assert da['@c-1:', 'text'] == repeat('a', 3) - assert da['@c1', 'text'] == repeat('a', 3) - assert da['@c-2:', 'text'] == repeat('a', 3 * 2) - assert da['@c1:3', 'text'] == repeat('a', 3 * 2) - assert da['@c1:3c', 'text'] == repeat('a', (3 * 2) * 3) - assert da['@c1:3,c1:3c', 'text'] == repeat('a', (3 * 2) + (3 * 2) * 3) - assert da['@c 1:3 , c 1:3 c', 'text'] == repeat('a', (3 * 2) + (3 * 2) * 3) - assert da['@cc', 'text'] == repeat('a', 3 * 5 * 3) - assert da['@cc,m', 'text'] == repeat('a', 3 * 5 * 3 + 3 * 7) - assert da['@r:1cc,m', 'text'] == repeat('a', 1 * 5 * 3 + 3 * 7) - assert da[0, 'text'] == 'a' - assert da[[True for _ in da], 'text'] == repeat('a', 3) + with da: + assert da['@c'].texts == repeat('a', 3 * 5) + assert da['@c', 'text'] == repeat('a', 3 * 5) + if use_subindex: + assert da._subindices['@c'].texts == repeat('a', 3 * 5) + assert da['@c:1', 'text'] == repeat('a', 3) + assert da['@c-1:', 'text'] == repeat('a', 3) + assert da['@c1', 'text'] == repeat('a', 3) + assert da['@c-2:', 'text'] == repeat('a', 3 * 2) + assert 
da['@c1:3', 'text'] == repeat('a', 3 * 2) + assert da['@c1:3c', 'text'] == repeat('a', (3 * 2) * 3) + assert da['@c1:3,c1:3c', 'text'] == repeat('a', (3 * 2) + (3 * 2) * 3) + assert da['@c 1:3 , c 1:3 c', 'text'] == repeat('a', (3 * 2) + (3 * 2) * 3) + assert da['@cc', 'text'] == repeat('a', 3 * 5 * 3) + assert da['@cc,m', 'text'] == repeat('a', 3 * 5 * 3 + 3 * 7) + assert da['@r:1cc,m', 'text'] == repeat('a', 1 * 5 * 3 + 3 * 7) + assert da[0, 'text'] == 'a' + assert da[[True for _ in da], 'text'] == repeat('a', 3) da['@m,cc', 'text'] = repeat('b', 3 + 5 * 3 + 7 * 3 + 3 * 5 * 3) - assert da['@c', 'text'] == repeat('a', 3 * 5) - if use_subindex: - assert da._subindices['@c'].texts == repeat('a', 3 * 5) - assert da['@c:1', 'text'] == repeat('a', 3) - assert da['@c-1:', 'text'] == repeat('a', 3) - assert da['@c1', 'text'] == repeat('a', 3) - assert da['@c-2:', 'text'] == repeat('a', 3 * 2) - assert da['@c1:3', 'text'] == repeat('a', 3 * 2) - assert da['@c1:3c', 'text'] == repeat('b', (3 * 2) * 3) - assert da['@c1:3,c1:3c', 'text'] == repeat('a', (3 * 2)) + repeat('b', (3 * 2) * 3) - assert da['@c 1:3 , c 1:3 c', 'text'] == repeat('a', (3 * 2)) + repeat( - 'b', (3 * 2) * 3 - ) - assert da['@cc', 'text'] == repeat('b', 3 * 5 * 3) - assert da['@cc,m', 'text'] == repeat('b', 3 * 5 * 3 + 3 * 7) - assert da['@r:1cc,m', 'text'] == repeat('b', 1 * 5 * 3 + 3 * 7) - assert da[0, 'text'] == 'a' - assert da[[True for _ in da], 'text'] == repeat('a', 3) + with da: + assert da['@c', 'text'] == repeat('a', 3 * 5) + if use_subindex: + assert da._subindices['@c'].texts == repeat('a', 3 * 5) + assert da['@c:1', 'text'] == repeat('a', 3) + assert da['@c-1:', 'text'] == repeat('a', 3) + assert da['@c1', 'text'] == repeat('a', 3) + assert da['@c-2:', 'text'] == repeat('a', 3 * 2) + assert da['@c1:3', 'text'] == repeat('a', 3 * 2) + assert da['@c1:3c', 'text'] == repeat('b', (3 * 2) * 3) + assert da['@c1:3,c1:3c', 'text'] == repeat('a', (3 * 2)) + repeat( + 'b', (3 * 2) * 3 + ) + assert 
da['@c 1:3 , c 1:3 c', 'text'] == repeat('a', (3 * 2)) + repeat( + 'b', (3 * 2) * 3 + ) + assert da['@cc', 'text'] == repeat('b', 3 * 5 * 3) + assert da['@cc,m', 'text'] == repeat('b', 3 * 5 * 3 + 3 * 7) + assert da['@r:1cc,m', 'text'] == repeat('b', 1 * 5 * 3 + 3 * 7) + assert da[0, 'text'] == 'a' + assert da[[True for _ in da], 'text'] == repeat('a', 3) da[1, 'text'] = 'd' assert da[1, 'text'] == 'd' @@ -432,12 +436,13 @@ def test_path_syntax_indexing_set(storage, config, use_subindex, start_storage): assert da[doc_id].text == 'e' # setting matches is only possible if the IDs are the same - da['@m'] = [Document(id=f'm{i}', text='c') for i in range(3 * 7)] - assert da['@m', 'text'] == repeat('c', 3 * 7) + with da: + da['@m'] = [Document(id=f'm{i}', text='c') for i in range(3 * 7)] + assert da['@m', 'text'] == repeat('c', 3 * 7) - # setting by traversal paths with different IDs is not supported - with pytest.raises(ValueError): - da['@m'] = [Document() for _ in range(3 * 7)] + # setting by traversal paths with different IDs is not supported + with pytest.raises(ValueError): + da['@m'] = [Document() for _ in range(3 * 7)] da[2, ['text', 'id']] = ['new_text', 'new_id'] assert da[2].text == 'new_text' @@ -463,31 +468,34 @@ def test_getset_subindex(storage, config, start_storage): config=config, subindex_configs={'@c': {'n_dim': 123}} if config else {'@c': None}, ) - assert len(da['@c']) == 15 - assert len(da._subindices['@c']) == 15 - # set entire subindex - chunks_ids = [c.id for c in da['@c']] - new_chunks = [Document(id=cid, text=f'{i}') for i, cid in enumerate(chunks_ids)] - da['@c'] = new_chunks - new_chunks = DocumentArray(new_chunks) - assert da['@c'] == new_chunks - assert da._subindices['@c'] == new_chunks - collected_chunks = DocumentArray.empty(0) - for d in da: - collected_chunks.extend(d.chunks) - assert collected_chunks == new_chunks - # set part of a subindex - chunks_ids = [c.id for c in da['@c:3']] - new_chunks = [Document(id=cid, text=f'{2*i}') for 
i, cid in enumerate(chunks_ids)] - da['@c:3'] = new_chunks - new_chunks = DocumentArray(new_chunks) - assert da['@c:3'] == new_chunks - for d in new_chunks: - assert d in da._subindices['@c'] - collected_chunks = DocumentArray.empty(0) - for d in da: - collected_chunks.extend(d.chunks[:3]) - assert collected_chunks == new_chunks + with da: + assert len(da['@c']) == 15 + assert len(da._subindices['@c']) == 15 + # set entire subindex + chunks_ids = [c.id for c in da['@c']] + new_chunks = [Document(id=cid, text=f'{i}') for i, cid in enumerate(chunks_ids)] + da['@c'] = new_chunks + new_chunks = DocumentArray(new_chunks) + assert da['@c'] == new_chunks + assert da._subindices['@c'] == new_chunks + collected_chunks = DocumentArray.empty(0) + for d in da: + collected_chunks.extend(d.chunks) + assert collected_chunks == new_chunks + # set part of a subindex + chunks_ids = [c.id for c in da['@c:3']] + new_chunks = [ + Document(id=cid, text=f'{2*i}') for i, cid in enumerate(chunks_ids) + ] + da['@c:3'] = new_chunks + new_chunks = DocumentArray(new_chunks) + assert da['@c:3'] == new_chunks + for d in new_chunks: + assert d in da._subindices['@c'] + collected_chunks = DocumentArray.empty(0) + for d in da: + collected_chunks.extend(d.chunks[:3]) + assert collected_chunks == new_chunks @pytest.mark.parametrize('size', [1, 5]) From 7831b7770024e6436cef82d639556d02babb3091 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 26 Oct 2022 10:12:22 +0200 Subject: [PATCH 49/88] feat: add overloaded milvus init --- docarray/array/document.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docarray/array/document.py b/docarray/array/document.py index 4b59b156830..1c1869a1278 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -11,11 +11,13 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic from docarray.array.redis import DocumentArrayRedis + from docarray.array.milvus import 
DocumentArrayMilvus from docarray.array.storage.sqlite import SqliteConfig from docarray.array.storage.annlite import AnnliteConfig from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.storage.elastic import ElasticConfig from docarray.array.storage.redis import RedisConfig + from docarray.array.storage.milvus import MilvusConfig class DocumentArray(AllMixins, BaseDocumentArray): @@ -139,6 +141,16 @@ def __new__( """Create a Redis-powered DocumentArray object.""" ... + @overload + def __new__( + cls, + _docs: Optional['DocumentArraySourceType'] = None, + storage: str = 'milvus', + config: Optional[Union['MilvusConfig', Dict]] = None, + ) -> 'DocumentArrayMilvus': + """Create a Redis-powered DocumentArray object.""" + ... + def __enter__(self): return self From 21126da7a169819de127f793bfcf80ac20449802 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 26 Oct 2022 10:53:07 +0200 Subject: [PATCH 50/88] test: fix even more tests --- tests/unit/array/mixins/test_io.py | 2 +- tests/unit/array/mixins/test_plot.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index 3df0af3433a..232f3323292 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -203,7 +203,7 @@ def test_from_to_pd_dataframe(da_cls, config, start_storage): (DocumentArrayQdrant, QdrantConfig(n_dim=3)), (DocumentArrayElastic, ElasticConfig(n_dim=3)), (DocumentArrayRedis, RedisConfig(n_dim=3)), - (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=3)), + (DocumentArrayMilvus, MilvusConfig(n_dim=3)), ], ) def test_from_to_bytes(da_cls, config, start_storage): diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index fedeed92f70..710e0351a49 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -170,9 +170,11 @@ def test_plot_embeddings_same_path(tmpdir, da_cls, 
config_gen, start_storage): da1 = da_cls.empty(100) da2 = da_cls.empty(768) da1.embeddings = np.random.random([100, 5]) - p1 = da1.plot_embeddings(start_server=False, path=tmpdir) + with da1: + p1 = da1.plot_embeddings(start_server=False, path=tmpdir) da2.embeddings = np.random.random([768, 5]) - p2 = da2.plot_embeddings(start_server=False, path=tmpdir) + with da2: + p2 = da2.plot_embeddings(start_server=False, path=tmpdir) assert p1 == p2 assert os.path.exists(p1) with open(os.path.join(p1, 'config.json')) as fp: From f0832db22f9cd4b096ed39b07e79d42441ff40a3 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 26 Oct 2022 10:58:21 +0200 Subject: [PATCH 51/88] refactor: remove some comments --- docarray/array/storage/milvus/backend.py | 10 +++++----- docarray/array/storage/milvus/seqlike.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 0db5a8d7fe8..80703dbc2a8 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -59,7 +59,7 @@ class MilvusConfig: default_factory=lambda: { 'M': 4, 'efConstruction': 200, - } # TODO(johannes) check if these defaults are reasonable + } ) # passed to milvus at index creation time. 
The default assumes 'HNSW' index type collection_config: Dict = field( default_factory=dict @@ -130,13 +130,13 @@ def _create_or_reuse_collection(self): document_id = FieldSchema( name='document_id', dtype=DataType.VARCHAR, max_length=1024, is_primary=True - ) # TODO(johannes) this max_length is completely arbitrary + ) embedding = FieldSchema( name='embedding', dtype=DataType.FLOAT_VECTOR, dim=self._config.n_dim ) serialized = FieldSchema( name='serialized', dtype=DataType.VARCHAR, max_length=65_535 - ) # TODO(johannes) this is the maximus allowed length in milvus, could be optimized + ) additional_columns = [] for col, coltype in self._config.columns.items(): @@ -177,10 +177,10 @@ def _create_or_reuse_offset2id_collection(self): document_id = FieldSchema( name='document_id', dtype=DataType.VARCHAR, max_length=1024 - ) # TODO(johannes) this max_length is completely arbitrary + ) offset = FieldSchema( name='offset', dtype=DataType.VARCHAR, max_length=1024, is_primary=True - ) # TODO(johannes) this max_length is completely arbitrary + ) dummy_vector = FieldSchema( name='dummy_vector', dtype=DataType.FLOAT_VECTOR, dim=1 ) diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index b3a52a3305b..0d00b061125 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -24,7 +24,7 @@ def __contains__(self, x: Union[str, 'Document']): try: self._get_doc_by_id(x) return True - except: # TODO(johannes) make exception more specific + except: return False def __repr__(self): From 84f4bb0898598d506d778a61177593cb74acbda3 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 31 Oct 2022 15:13:12 +0100 Subject: [PATCH 52/88] test: use context manager even more --- tests/unit/array/mixins/test_io.py | 2 +- tests/unit/array/mixins/test_plot.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index 
232f3323292..b8a90b8183c 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -253,7 +253,7 @@ def test_push_pull_io(da_cls, config, show_progress, start_storage): da2 = da_cls.pull(name, show_progress=show_progress, config=config()) assert len(da1) == len(da2) == 10 - assert da1.texts == da2.texts == random_texts + assert da1[:, 'text'] == da2[:, 'text'] == random_texts all_names = DocumentArray.cloud_list() diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index 710e0351a49..c6aa0cb4176 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -169,11 +169,11 @@ def test_plot_embeddings_same_path(tmpdir, da_cls, config_gen, start_storage): else: da1 = da_cls.empty(100) da2 = da_cls.empty(768) - da1.embeddings = np.random.random([100, 5]) with da1: + da1.embeddings = np.random.random([100, 5]) p1 = da1.plot_embeddings(start_server=False, path=tmpdir) - da2.embeddings = np.random.random([768, 5]) with da2: + da2.embeddings = np.random.random([768, 5]) p2 = da2.plot_embeddings(start_server=False, path=tmpdir) assert p1 == p2 assert os.path.exists(p1) From 3bdfbb40ecb28d7d143470f86748ffafe9f4f351 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 31 Oct 2022 16:01:47 +0100 Subject: [PATCH 53/88] test: moar context manager usage --- tests/unit/array/mixins/test_plot.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index c6aa0cb4176..850b42b1fd9 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -140,7 +140,8 @@ def test_plot_sprites(tmpdir): def _test_plot_embeddings(da): - p = da.plot_embeddings(start_server=False) + with da: + p = da.plot_embeddings(start_server=False) assert os.path.exists(p) assert os.path.exists(os.path.join(p, 'config.json')) with open(os.path.join(p, 'config.json')) as 
fp: @@ -200,14 +201,16 @@ def test_summary_homo_hetero(da_cls, config, start_storage): da = da_cls.empty(100, config=config) else: da = da_cls.empty(100) - da._get_attributes() - da.summary() - da._get_raw_summary() + with da: + da._get_attributes() + da.summary() + da._get_raw_summary() da[0].pop('id') - da.summary() + with da: + da.summary() - da._get_raw_summary() + da._get_raw_summary() @pytest.mark.parametrize( From 752eb7ed1f8f362fef3062680fb201dd898bcc64 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 31 Oct 2022 17:33:10 +0100 Subject: [PATCH 54/88] test: fix test fixture input --- tests/unit/array/mixins/test_plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index 850b42b1fd9..ab7240d63f0 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -71,7 +71,7 @@ def test_sprite_fail_tensor_success_uri( (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=128)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=128)), - (DocumentArrayMilvus, MilvusConfig(n_dim=128)), + (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=128)), ], ) @pytest.mark.parametrize('canvas_size', [50, 512]) From 8b7b9a406e2082c2a9c2a47419e0abc421811d6a Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 31 Oct 2022 17:33:40 +0100 Subject: [PATCH 55/88] test: remove milvus from test that it can't handle --- tests/unit/array/mixins/test_plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index ab7240d63f0..a017bb19c91 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -31,7 +31,7 @@ (DocumentArrayQdrant, QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, 
RedisConfig(n_dim=128)), - (DocumentArrayMilvus, MilvusConfig(n_dim=128)), + # (DocumentArrayMilvus, MilvusConfig(n_dim=128)), # tensor is too large to handle ], ) def test_sprite_fail_tensor_success_uri( From 8f15868776f99c45deb6ca8c208fccb033c4f01b Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 1 Nov 2022 08:48:25 +0100 Subject: [PATCH 56/88] test: remove milvus from one more test --- tests/unit/array/mixins/test_plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index a017bb19c91..8248fc4c227 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -71,7 +71,7 @@ def test_sprite_fail_tensor_success_uri( (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=128)), (DocumentArrayRedis, lambda: RedisConfig(n_dim=128)), - (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=128)), + # (DocumentArrayMilvus, lambda: MilvusConfig(n_dim=128)), ], ) @pytest.mark.parametrize('canvas_size', [50, 512]) From 4f97db93d383f09a1f0d750dd38a464845bd20b1 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 11:02:27 +0100 Subject: [PATCH 57/88] refactor: better mechanism for automatic collection loading --- docarray/array/storage/milvus/backend.py | 29 +++++++++-------- docarray/array/storage/milvus/find.py | 36 +++++++++++----------- docarray/array/storage/milvus/getsetdel.py | 25 ++++++++------- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 80703dbc2a8..884ffb461c0 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -13,6 +13,7 @@ CollectionSchema, has_collection, MilvusException, + loading_progress, ) from docarray import Document, DocumentArray @@ -285,33 +286,35 @@ def __exit__(self, 
exc_type, exc_val, exc_tb): self._offset2id_collection.release() super().__exit__(exc_type, exc_val, exc_tb) - def _call_with_loaded_collection(self, fn, *fn_args, collection=None, **fn_kwargs): - # workaround since loaded_collection cntx manager cannot currently determine if coll was already loaded before - try: - return fn(*fn_args, **fn_kwargs) - except MilvusException: - with self.loaded_collection(collection): - return fn(*fn_args, **fn_kwargs) - def loaded_collection(self, collection=None): """ Context manager to load a collection and release it after the context is exited. - ## TODO 'If the collection is already loaded when entering, it will not be released.' This is not true currently, - ## talking to milvus team to enable this. + If the collection is already loaded when entering, it will not be released while exiting. :param collection: the collection to load. If None, the collection of this indexer is used. :return: Context manager for the provided collection. """ class LoadedCollectionMngr: - def __init__(self, coll): + def __init__(self, coll, connection_alias): self._collection = coll + self._loaded_when_enter = False + self._connection_alias = connection_alias def __enter__(self): + self._loaded_when_enter = ( + loading_progress( + self._collection.name, using=self._connection_alias + )['loading_progress'] + != '0%' + ) self._collection.load() return self def __exit__(self, exc_type, exc_val, exc_tb): - self._collection.release() + if not self._loaded_when_enter: + self._collection.release() - return LoadedCollectionMngr(collection if collection else self._collection) + return LoadedCollectionMngr( + collection if collection else self._collection, self._connection_alias + ) diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index d0c23a3642d..fdf7e5826b9 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -26,25 +26,25 @@ def _find( if param is None: param = dict() 
kwargs = self._update_consistency_level(**kwargs) - results = self._call_with_loaded_collection( - fn=self._collection.search, - data=query, - anns_field='embedding', - limit=limit, - expr=filter, - param=param, - output_fields=['serialized'], - **kwargs - ) + with self.loaded_collection(): + results = self._collection.search( + data=query, + anns_field='embedding', + limit=limit, + expr=filter, + param=param, + output_fields=['serialized'], + **kwargs, + ) return self._docs_from_search_response(results) def _filter(self, filter, limit=10, **kwargs): kwargs = self._update_consistency_level(**kwargs) - results = self._call_with_loaded_collection( - fn=self._collection.query, - expr=filter, - limit=limit, - output_fields=['serialized'], - **kwargs - ) - return self._docs_from_query_response(results)[:limit] + with self.loaded_collection(): + results = self._collection.query( + expr=filter, + limit=limit, + output_fields=['serialized'], + **kwargs, + ) + return self._docs_from_query_response(results) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 702332e5897..b85480c57d4 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -26,13 +26,12 @@ def _set_doc_by_id(self, _id: str, value: 'Document', **kwargs): def _load_offset2ids(self): collection = self._offset2id_collection - res = self._call_with_loaded_collection( - fn=collection.query, - collection=collection, - expr=always_true_expr('document_id'), - output_fields=['offset', 'document_id'], - consistency_level=self._config.consistency_level, - ) + with self.loaded_collection(collection): + res = collection.query( + expr=always_true_expr('document_id'), + output_fields=['offset', 'document_id'], + consistency_level=self._config.consistency_level, + ) sorted_res = sorted(res, key=lambda k: int(k['offset'])) self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) @@ -52,12 +51,12 @@ def 
_get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': if not ids: return DocumentArray() kwargs = self._update_consistency_level(**kwargs) - res = self._call_with_loaded_collection( - fn=self._collection.query, - expr=f'document_id in {ids_to_milvus_expr(ids)}', - output_fields=['serialized'], - **kwargs, - ) + with self.loaded_collection(): + res = self._collection.query( + expr=f'document_id in {ids_to_milvus_expr(ids)}', + output_fields=['serialized'], + **kwargs, + ) if not res: raise KeyError(f'No documents found for ids {ids}') docs = self._docs_from_query_response(res) From c4fcf4cf720acfebc13f1a1c761010ef6b4fa657 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 15:53:22 +0100 Subject: [PATCH 58/88] docs: add docs section for milvus --- docs/advanced/document-store/index.md | 1 + docs/advanced/document-store/milvus.md | 456 +++++++++++++++++++++++++ 2 files changed, 457 insertions(+) create mode 100644 docs/advanced/document-store/milvus.md diff --git a/docs/advanced/document-store/index.md b/docs/advanced/document-store/index.md index b023217dd8d..80b5009da02 100644 --- a/docs/advanced/document-store/index.md +++ b/docs/advanced/document-store/index.md @@ -10,6 +10,7 @@ qdrant elasticsearch weaviate redis +milvus extend benchmark ``` diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md new file mode 100644 index 00000000000..fa4eac72dcb --- /dev/null +++ b/docs/advanced/document-store/milvus.md @@ -0,0 +1,456 @@ +(milvus)= +# Milvus + +One can use [Milvus](https://milvus.io/) as the Document store for DocumentArray. It is useful when one wants to have faster Document retrieval on embeddings, i.e. `.match()`, `.find()`. + +````{tip} +This feature requires `pymilvus`. You can install it via `pip install "docarray[milvus]".` +```` + +## Usage + +### Start Milvus service + +To use Milvus as the storage backend, you need a running Milvus server. 
You can use the following `docker-compose.yml` +to start a Milvus server: + +`````{dropdown} docker-compose.yml + +```yaml +version: '3.5' + +services: + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.0 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2022-03-17T06-34-49Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.1.4 + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + +networks: + default: + name: milvus +``` + +````` + +Then + +```bash +docker-compose up +``` + +You can find more installation guidance in the [Milvus documentation](https://milvus.io/docs/v2.1.x/install_standalone-docker.md). + +### Create DocumentArray with Milvus backend + +Assuming service is started using the default configuration (i.e. 
the server's gRPC address `http://localhost:19530`), you can +instantiate a DocumentArray with Milvus storage like so: + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='milvus', config={'n_dim': 10}) +``` + +Here, `config` is configuration for the created Milvus collection, +and `n_dim` is a mandatory field that specifies the dimensionality of stored embeddings. +You can find a complete specification of the Milvus `config` {ref}`here `. + +To access a previously persisted DocumentArray, you can specify the `collection_name`, the `host` and the `port`. + + +```python +from docarray import DocumentArray + +da = DocumentArray( + storage='milvus', + config={ + 'collection_name': 'persisted', + 'host': 'localhost', + 'port': '19530', + 'n_dim': 10, + }, +) + +da.summary() +``` + +(milvus-config)= +## Config + +The following configs can be set: + +| Name | Description | Default | +|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------| +| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | +| `collection_name` | Milvus collection name | **Random collection name generated** | +| `host` | Hostname of the Milvus server | 'localhost' | +| `port` | Port of the Milvus server | 19530 | +| `distance` | Distance metric to be used during search. Can be 'IP', 'L2', 'JACCARD', 'TANIMOTO', 'HAMMING', 'SUPERSTRUCTURE' or 'SUBSTRUCTURE'. | 'IP' (inner product) | +| `index_type` | Type of the (ANN) search index. Can be 'HNSW', 'FLAT', 'ANNOY', or one of multiple variants of IVF and RHNSW. A full list of supported index types can be found [here](https://milvus.io/docs/v2.1.x/build_index.md#Prepare-index-parameter). 
| 'HNSW' | +| `index_params` | A dictionary of parameters used for index building. The allowed parameters depend on the index type, and can be found [here](https://milvus.io/docs/v2.1.x/index.md). | {'M': 4, 'efConstruction': 200} (assumes HNSW index) | +| `collection_config` | Configuration for the Milvus collection. Passed as **kwargs during collection creation (`Collection(...)`). | {} | +| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | {} | +| `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | +| `columns` | Additional columns to be stored in the database, taken from Document `tags`. | None | + +## Minimal example + +Create `docker-compose.yml`: + +`````{dropdown} docker-compose.yml + +```yaml +version: '3.5' + +services: + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.0 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2022-03-17T06-34-49Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.1.4 + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - 
${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + +networks: + default: + name: milvus +``` + +````` + +Install DocArray with Milvus and launch the Milvus server: + + +```bash +pip install -U docarray[milvus] +docker-compose up +``` + +Create a DocumentArray with some random data: + +```python +import numpy as np + +from docarray import DocumentArray + +N, D = 5, 128 + +da = DocumentArray.empty( + N, storage='milvus', config={'n_dim': D, 'distance': 'IP'} +) # init +with da: + da.embeddings = np.random.random([N, D]) +``` + +Perform an approximate nearest neighbor search: + +``` +print(da.find(np.random.random(D), limit=10)) +``` +Output: + +```bash + +``` + +(milvus-filter)= +## Vector search with filter + +Search with `.find` can be restricted by user-defined filters. + +Such filters can be constructed using the [filter expression language defined by Milvus](https://milvus.io/docs/v2.1.x/boolean.md). +Filters operate on the `tags` of a Document, which are stored as `columns` in the Milvus database. + + +### Example of `.find` with a filter + + +Consider Documents with embeddings `[0,0,0]` up to ` [9,9,9]` where the Document with embedding `[i,i,i]` +has as tag `price` with value `i`. We can create such example with the following code: + +```python +from docarray import Document, DocumentArray +import numpy as np + +n_dim = 3 +distance = 'L2' + +da = DocumentArray( + storage='milvus', + config={'n_dim': n_dim, 'columns': {'price': 'float'}, 'distance': distance}, +) + +print(f'\nDocumentArray distance: {distance}') + +with da: + da.extend( + [ + Document(id=f'r{i}', embedding=i * np.ones(n_dim), tags={'price': i}) + for i in range(10) + ] + ) + +print('\nIndexed Prices:\n') +for embedding, price in zip(da.embeddings, da[:, 'tags__price']): + print(f'\tembedding={embedding},\t price={price}') +``` + +Consider we want the nearest vectors to the embedding `[8. 8. 
8.]`, with the restriction that +prices must follow a filter. As an example, retrieved Documents must have `price` value lower than +or equal to `max_price`. We can encode this information in Milvus using `filter = f'price <= {max_price}'`. + +Then you can implement and use the search with the proposed filter: + +```python +max_price = 7 +n_limit = 4 + +np_query = np.ones(n_dim) * 8 +print(f'\nQuery vector: \t{np_query}') + +filter = f'price <= {max_price}' +results = da.find(np_query, filter=filter, limit=n_limit) + +print('\nEmbeddings Nearest Neighbours with "price" at most 7:\n') +for embedding, price in zip(results.embeddings, results[:, 'tags__price']): + print(f'\tembedding={embedding},\t price={price}') +``` + +This will print: + +``` +Query vector: [8. 8. 8.] + +Embeddings Nearest Neighbours with "price" at most 7: + + embedding=[7. 7. 7.], price=7 + embedding=[6. 6. 6.], price=6 + embedding=[5. 5. 5.], price=5 + embedding=[4. 4. 4.], price=4 +``` +### Example of `.filter` with a filter + +The following example shows how to use DocArray with Milvus Document Store in order to filter text documents. +Consider Documents have the tag `price` with a value of `i`. We can create these with the following code: + +```python +from docarray import Document, DocumentArray +import numpy as np + +n_dim = 3 + +da = DocumentArray( + storage='milvus', + config={'n_dim': n_dim, 'columns': {'price': 'float'}}, +) + +with da: + da.extend( + [ + Document(id=f'r{i}', embedding=i * np.ones(n_dim), tags={'price': i}) + for i in range(10) + ] + ) + +print('\nIndexed Prices:\n') +for embedding, price in zip(da.embeddings, da[:, 'tags__price']): + print(f'\tembedding={embedding},\t price={price}') +``` + +Suppose you want to filter results such that +retrieved Documents must have a `price` value less than or equal to `max_price`. You can encode +this information in Milvus using `filter = f'price <= {max_price}'`. 
+ +Then you can implement and use the search with the proposed filter: +```python +max_price = 7 +n_limit = 4 + +filter = f'price <= {max_price}' +results = da.find(filter=filter, limit=n_limit) + +print('\nPoints with "price" at most 7:\n') +for embedding, price in zip(results.embeddings, results[:, 'tags__price']): + print(f'\tembedding={embedding},\t price={price}') +``` +This prints: + +``` + +Points with "price" at most 7: + + embedding=[6. 6. 6.], price=6 + embedding=[7. 7. 7.], price=7 + embedding=[1. 1. 1.], price=1 + embedding=[2. 2. 2.], price=2 +``` + +(milvus-limitations)= +## Known limitations of the Milvus Document Store + +The Milvus Document Store implements the entire DocumentArray API, but there are some limitations that you should be aware of. + +(milvus-collection-loading)= +### Collection loading + +In Milvus, every search or query operation requires the index to be loaded into memory. +This includes simple Document access through DocArray, + +This loading operation can be costly, especially when performing multiple search or query operations in a row. + +To mitigate this, you should use the `with da:` context manager whenever you perform multiple reads, searches or queries +on a Milvus DocumentArray. +This context manager loads the index into memory only once, and releases it when the context is exited. 
+ +```python +from docarray import Document, DocumentArray +import numpy as np + +da = DocumentArray( + [Document(id=f'r{i}', embedding=i * np.ones(3)) for i in range(10)], + storage='milvus', + config={'n_dim': 3}, +) + +with da: + # index is loaded into memory + for d in da: + pass +# index is released from memory + +with da: + # index is loaded into memory + embs, texts = da.embeddings, da.texts +# index is released from memory +``` + +Not using the `with da:` context manager will return the same results for the same operations, but will incur significant performance penalties: + +````{dropdown} ⚠️ Bad code + +```python +from docarray import Document, DocumentArray +import numpy as np + +da = DocumentArray( + [Document(id=f'r{i}', embedding=i * np.ones(3)) for i in range(10)], + storage='milvus', + config={'n_dim': 3}, +) + +for d in da: # index is loaded and released at every iteration + pass + +embs, texts = ( + da.embeddings, + da.texts, +) # index is loaded and released for every Document in `da` +``` + +```` + +### Storing large tensors outside of `embedding` field + +It is currently not possible to persist Documents with a large `.tensor` field. + +A suitable workaround for this is to remove a Document's tensor after computing its embedding and before adding it to the +Document Store: + +```python +from docarray import Document, DocumentArray + +da = DocumentArray(storage='milvus', config={'n_dim': 128}) + +doc = Document(tensor=np.random.rand(224, 224)) +doc.embed(...) +doc.tensor = None + +da.append(doc) +``` + +````{dropdown} Why does this limitation exist? +By default, DocArray stores three columns in any Document Store: The Document id's, the Document embeddings and +a serialized (Base64 encoded) representation of the Document itself. + +In Milvus, the the serialized Document are stored in a column of type 'VARCHAR', which imposes a limit of allowed length +per entry. 
+If the Base64 encoded Document exceeds this limit - which is usually the case for Documents with large tensors - the +Document cannot be stored. + +The Milvus team is currently working on a 'STRING' column type that could solve this issue in the future. +```` + + From 7569c6d59a38fb2542603e5c6aeeb11566e48b51 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 15:53:38 +0100 Subject: [PATCH 59/88] chore: remove comment --- docarray/array/storage/milvus/getsetdel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index b85480c57d4..f8cc6ccfa63 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -73,7 +73,6 @@ def _del_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': def _set_docs_by_ids( self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict', **kwargs ): - # TODO(johannes) check if deletion is necesarry if ids already match # delete old entries kwargs = self._update_consistency_level(**kwargs) self._collection.delete( From b44c326855f6b439863a756e4304cf4bf092bb34 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 16:07:00 +0100 Subject: [PATCH 60/88] fix: enforce limit for filter only queries --- docarray/array/storage/milvus/find.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index fdf7e5826b9..14eae7a535c 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -47,4 +47,4 @@ def _filter(self, filter, limit=10, **kwargs): output_fields=['serialized'], **kwargs, ) - return self._docs_from_query_response(results) + return self._docs_from_query_response(results)[:limit] From 6496a4c7489ee83f2ee9f70448bec9cfaa5bad33 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 16:07:30 +0100 Subject: [PATCH 61/88] 
docs: fix typo --- docs/advanced/document-store/milvus.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index fa4eac72dcb..411cb5e04ea 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -235,7 +235,7 @@ Such filters can be constructed using the [filter expression language defined by Filters operate on the `tags` of a Document, which are stored as `columns` in the Milvus database. -### Example of `.find` with a filter +### Example of `.find` with filtered vector search Consider Documents with embeddings `[0,0,0]` up to ` [9,9,9]` where the Document with embedding `[i,i,i]` @@ -301,7 +301,7 @@ Embeddings Nearest Neighbours with "price" at most 7: embedding=[5. 5. 5.], price=5 embedding=[4. 4. 4.], price=4 ``` -### Example of `.filter` with a filter +### Example of `.find` with only a filter The following example shows how to use DocArray with Milvus Document Store in order to filter text documents. Consider Documents have the tag `price` with a value of `i`. We can create these with the following code: From 83d8ba08260eb19eb28eeccf7b31c19045df1433 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 16:13:25 +0100 Subject: [PATCH 62/88] docs: add milvus to comparison and add docstring --- docarray/array/milvus.py | 36 +++++++++++++++++++++++++++ docarray/array/qdrant.py | 4 +-- docs/advanced/document-store/index.md | 1 + 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/docarray/array/milvus.py b/docarray/array/milvus.py index 4a6f54c9fed..d7924d86f4f 100644 --- a/docarray/array/milvus.py +++ b/docarray/array/milvus.py @@ -6,5 +6,41 @@ class DocumentArrayMilvus(StorageMixins, DocumentArray): + """ + DocumentArray that stores Documents in a `Milvus `_ vector search engine. + + .. note:: + This DocumentArray requires `pymilvus`. You can install it via `pip install "docarray[milvus]"`. 
+ + To use Milvus as storage backend, a Milvus service needs to be running on your machine. + + With this implementation, :meth:`match` and :meth:`find` perform fast (approximate) vector search. + Additionally, search with filters is supported. + + Example usage: + + .. code-block:: python + + from docarray import DocumentArray + + # connect to running Milvus service with default configuration (address: http://localhost:19530) + da = DocumentArray(storage='milvus', config={'n_dim': 10}) + + # connect to a previously persisted DocumentArrayMilvus by specifying collection_name, host, and port + da = DocumentArray( + storage='milvus', + config={ + 'collection_name': 'persisted', + 'host': 'localhost', + 'port': '19530', + 'n_dim': 10, + }, + ) + + + .. seealso:: + For further details, see our :ref:`user guide `. + """ + def __new__(cls, *args, **kwargs): return super().__new__(cls) diff --git a/docarray/array/qdrant.py b/docarray/array/qdrant.py index d9d16e2ed48..92e662c54ce 100644 --- a/docarray/array/qdrant.py +++ b/docarray/array/qdrant.py @@ -6,7 +6,7 @@ class DocumentArrayQdrant(StorageMixins, DocumentArray): """ - DocumentArray that stores Documents in a `Qdrant `_ vector search engine. + DocumentArray that stores Documents in a `Qdrant `_ vector search engine. .. note:: This DocumentArray requires `qdrant-client`. You can install it via `pip install "docarray[qdrant]"`. 
@@ -25,7 +25,7 @@ class DocumentArrayQdrant(StorageMixins, DocumentArray): # connect to running Qdrant service with default configuration (address: http://localhost:6333) da = DocumentArray(storage='qdrant', config={'n_dim': 10}) - # connect to a previously persisted DocumentArrayWeaviate by specifying collection_name, host, and port + # connect to a previously persisted DocumentArrayQdrant by specifying collection_name, host, and port da = DocumentArray( storage='qdrant', config={ diff --git a/docs/advanced/document-store/index.md b/docs/advanced/document-store/index.md index 80b5009da02..722ac5074de 100644 --- a/docs/advanced/document-store/index.md +++ b/docs/advanced/document-store/index.md @@ -169,6 +169,7 @@ DocArray supports multiple storage backends with different search features. The | [`AnnLite`](./annlite.md) | `DocumentArray(storage='annlite')` | ✅ | ✅ | ✅ | | [`ElasticSearch`](./elasticsearch.md) | `DocumentArray(storage='elasticsearch')` | ✅ | ✅ | ✅ | | [`Redis`](./redis.md) | `DocumentArray(storage='redis')` | ✅ | ✅ | ✅ | +| [`Milvus`](./milvus.md) | `DocumentArray(storage='milvus')` | ✅ | ✅ | ✅ | The right backend choice depends on the scale of your data, the required performance and the desired ease of setup. For most use cases we recommend starting with [`AnnLite`](./annlite.md). [**Check our One Million Scale Benchmark for more details**](./benchmark#conclusion). 
From e00361844fee55873a4446b218c0afb97a92022f Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 16:35:01 +0100 Subject: [PATCH 63/88] feat: bulk extend --- docarray/array/storage/milvus/seqlike.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index 0d00b061125..34db09085a4 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -44,3 +44,12 @@ def insert(self, index: int, value: 'Document', **kwargs): def _append(self, value: 'Document', **kwargs): self._set_doc_by_id(value.id, value, **kwargs) self._offset2ids.append(value.id) + + def _extend(self, values: Iterable['Document'], **kwargs): + docs = list(values) + if not docs: + return + kwargs = self._update_consistency_level(**kwargs) + payload = self._docs_to_milvus_payload(docs) + self._collection.insert(payload, **kwargs) + self._offset2ids.extend([doc.id for doc in docs]) From fa4c0706202342454a2164a3d0921d51d845615c Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 17:34:29 +0100 Subject: [PATCH 64/88] docs: document advanced milvus options --- docs/advanced/document-store/milvus.md | 56 ++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index 411cb5e04ea..2480ff7c534 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -358,6 +358,62 @@ Points with "price" at most 7: embedding=[2. 2. 2.], price=2 ``` +## Advancded options + +The Milvus Document Store allows the user to pass additional parameters to the Milvus server for all main operations. + +Currently, the main use cases for this are dynamic setting of a consistency level, and passing of search parameters. 
+ +### Setting a consistency level + +By default, every operation on the Milvus Document Store is performed with a consistency level passed during intialization +as part of the {ref}`config `. + +When performing a specific operation, you can override this default consistency level by passing a `consistency_level` parameter: + +```python +from docarray import DocumentArray, Document +import numpy as np + +da = DocumentArray( + storage='milvus', + config={'consistency_level': 'Session', 'n_dim': 3}, +) + +da.append(Document(tensor=np.random.rand(3))) # consistency level is 'Session' +da.append( + Document(tensor=np.random.rand(3)), consistency_level='Strong' +) # consistency level is 'Strong' +``` + +Currently, dynamically setting a consistency level is supported for the following operations: +`.append()`, `.extend()`, `.find()`, and `.insert()`. + +### Passing search parameters + +In Milvus you can [pass parameters to the search operation](https://milvus.io/docs/v2.1.x/search.md#Conduct-a-vector-search) which [depend on the used index type](https://milvus.io/docs/v2.1.x/index.md). 
+ +In DocumentArray, this ability is exposed through the `param` argument in the `.find()` method: + +```python +import numpy as np + +from docarray import DocumentArray + +N, D = 5, 128 + +da = DocumentArray.empty( + N, storage='milvus', config={'n_dim': D, 'distance': 'IP'} +) # init +with da: + da.embeddings = np.random.random([N, D]) + +da.find( + np.random.random(D), limit=10, param={"metric_type": "L2", "params": {"nprobe": 10}} +) +``` + + (milvus-limitations)= ## Known limitations of the Milvus Document Store From 9dd3b74764c8ae98144badfaa1eadd67c80a95d5 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 17:34:50 +0100 Subject: [PATCH 65/88] feat: allo passing of kwargs to insert --- docarray/array/storage/base/seqlike.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index 73fa80a00b7..bab651c354a 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -14,13 +14,13 @@ def _update_subindices_append_extend(self, value): if len(docs_selector) > 0: da.extend(docs_selector) - def insert(self, index: int, value: 'Document'): + def insert(self, index: int, value: 'Document', **kwargs): """Insert `doc` at `index`. :param index: Position of the insertion. :param value: The doc needs to be inserted. 
""" - self._set_doc_by_id(value.id, value) + self._set_doc_by_id(value.id, value, **kwargs) self._offset2ids.insert(index, value.id) def append(self, value: 'Document', **kwargs): From 20da30b5dbad11abf1211ac5b5229c7a396b0fec Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 17:35:12 +0100 Subject: [PATCH 66/88] chore: remove comment --- docarray/array/mixins/find.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index 27de4e7c014..c1c02d9fa3d 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -146,9 +146,7 @@ def find( ) from docarray import Document, DocumentArray - if isinstance( - query, dict - ): # TODO(johannes) since filters in milvus are strings, the can't be passes as `query`, otherwise it will be confused for text matching query + if isinstance(query, dict): if filter is None: return self._filter(query, limit=limit) else: From 6d9dfe53f9c834952da17c0f278e644eefb881ac Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 2 Nov 2022 17:44:39 +0100 Subject: [PATCH 67/88] chore: update accepted array types --- docarray/array/storage/milvus/find.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index 14eae7a535c..bd4b497c85c 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -1,12 +1,18 @@ -from typing import TYPE_CHECKING, TypeVar, List, Union, Optional, Dict +from typing import TYPE_CHECKING, TypeVar, List, Union, Optional, Dict, Sequence if TYPE_CHECKING: import numpy as np + import tensorflow + import torch # Define the expected input type that your ANN search supports MilvusArrayType = TypeVar( - 'MilvusArrayType', np.ndarray, list - ) # TODO(johannes) test torch, tf, etc. 
+ 'MilvusArrayType', + np.ndarray, + tensorflow.Tensor, + torch.Tensor, + Sequence[float], + ) from docarray import Document, DocumentArray From 0b659c8837785e0a82188fdb640357603a3e3db9 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 3 Nov 2022 13:40:44 +0100 Subject: [PATCH 68/88] test: add milvus specific tests --- setup.py | 1 + tests/unit/array/storage/milvus/__init__.py | 0 .../unit/array/storage/milvus/test_milvus.py | 125 ++++++++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 tests/unit/array/storage/milvus/__init__.py create mode 100644 tests/unit/array/storage/milvus/test_milvus.py diff --git a/setup.py b/setup.py index 0b29cde6b96..f416f2e9b71 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,7 @@ 'redis>=4.3.0', 'pymilvus>=2.1.0', 'jina', + 'pytest-mock', ], }, classifiers=[ diff --git a/tests/unit/array/storage/milvus/__init__.py b/tests/unit/array/storage/milvus/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unit/array/storage/milvus/test_milvus.py b/tests/unit/array/storage/milvus/test_milvus.py new file mode 100644 index 00000000000..5308b0f2304 --- /dev/null +++ b/tests/unit/array/storage/milvus/test_milvus.py @@ -0,0 +1,125 @@ +import pytest +from docarray import Document +from docarray.array.milvus import DocumentArrayMilvus, MilvusConfig +from pymilvus import loading_progress +import numpy as np + + +def _is_fully_loaded(da): + collections = da._collection, da._offset2id_collection + fully_loaded = True + for coll in collections: + coll_loaded = ( + loading_progress(coll.name, using=da._connection_alias)['loading_progress'] + == '100%' + ) + fully_loaded = fully_loaded and coll_loaded + return fully_loaded + + +def _is_fully_released(da): + collections = da._collection, da._offset2id_collection + fully_released = True + for coll in collections: + coll_released = ( + loading_progress(coll.name, using=da._connection_alias)['loading_progress'] + == '0%' + ) + fully_released = 
fully_released and coll_released + return fully_released + + +def test_memory_release(start_storage): + da = DocumentArrayMilvus( + config={ + 'n_dim': 10, + }, + ) + da.extend([Document(embedding=np.random.random([10])) for _ in range(10)]) + da.find(Document(embedding=np.random.random([10]))) + assert _is_fully_released(da) + + +def test_memory_cntxt_mngr(start_storage): + da = DocumentArrayMilvus( + config={ + 'n_dim': 10, + }, + ) + + # `with da` context manager + assert _is_fully_released(da) + with da: + assert _is_fully_loaded(da) + pass + assert _is_fully_released(da) + + # `da.loaded_collection` context manager + with da.loaded_collection(), da.loaded_collection(da._offset2id_collection): + assert _is_fully_loaded(da) + pass + assert _is_fully_released(da) + + # both combined + with da: + assert _is_fully_loaded(da) + with da.loaded_collection(), da.loaded_collection(da._offset2id_collection): + assert _is_fully_loaded(da) + pass + assert _is_fully_loaded(da) + assert _is_fully_released(da) + + +@pytest.fixture() +def mock_response(): + class MockHit: + @property + def entity(self): + return {'serialized': Document().to_base64()} + + return [[MockHit()]] + + +@pytest.mark.parametrize( + 'method,meth_input', + [ + ('append', [Document(embedding=np.random.random([10]))]), + ('extend', [[Document(embedding=np.random.random([10]))]]), + ('find', [Document(embedding=np.random.random([10]))]), + ('insert', [0, Document(embedding=np.random.random([10]))]), + ], +) +def test_consistency_level(start_storage, mocker, method, meth_input, mock_response): + init_consistency = 'Session' + da = DocumentArrayMilvus( + config={ + 'n_dim': 10, + 'consistency_level': init_consistency, + }, + ) + + # patch Milvus collection + patch_methods = ['insert', 'search', 'delete', 'query'] + for m in patch_methods: + setattr(da._collection, m, mocker.Mock(return_value=mock_response)) + + # test consistency level set in config + getattr(da, method)(*meth_input) + for m in 
patch_methods: + mock_meth = getattr(da._collection, m) + for args, kwargs in mock_meth.call_args_list: + if 'consistency_level' in kwargs: + assert kwargs['consistency_level'] == init_consistency + + # reset the mocks + for m in patch_methods: + setattr(da._collection, m, mocker.Mock(return_value=mock_response)) + + # test dynamic consistency level + new_consistency = 'Strong' + getattr(da, method)(*meth_input, consistency_level=new_consistency) + for m in patch_methods: + mock_meth = getattr(da._collection, m) + for args, kwargs in mock_meth.call_args_list: + if 'consistency_level' in kwargs: + assert kwargs['consistency_level'] == new_consistency From 3bf691f82cc8baf7d5fc1ae97d47b51e3b0bec2f Mon Sep 17 00:00:00 2001 From: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Date: Thu, 3 Nov 2022 13:49:36 +0100 Subject: [PATCH 69/88] docs: apply changes from code review Co-authored-by: Nicholas Dunham <11730795+NicholasDunham@users.noreply.github.com> Signed-off-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> --- docs/advanced/document-store/milvus.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index 2480ff7c534..b6df3a1d74b 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -80,7 +80,7 @@ You can find more installation guidance in the [Milvus documentation](https://mi ### Create DocumentArray with Milvus backend -Assuming service is started using the default configuration (i.e. the server's gRPC address `http://localhost:19530`), you can +Assuming the service is started using the default configuration (i.e. 
the server's gRPC address is `http://localhost:19530`), you can instantiate a DocumentArray with Milvus storage like so: ```python @@ -89,11 +89,11 @@ from docarray import DocumentArray da = DocumentArray(storage='milvus', config={'n_dim': 10}) ``` -Here, `config` is configuration for the created Milvus collection, -and `n_dim` is a mandatory field that specified the dimensionality of stored embeddings. -You can find a complete specification of the Milvus `config` {ref}`here `. +Here, `config` is configuration for the new Milvus collection, +and `n_dim` is a mandatory field that specifies the dimensionality of stored embeddings. +For more information about the Milvus `config`, refer to the {ref}`specification `. -To access a previously persisted DocumentArray, you can specify the `collection_name`, the `host` and the `port`. +To access a previously persisted DocumentArray, specify the `collection_name`, the `host`, and the `port`. ```python @@ -122,10 +122,10 @@ The following configs can be set: | `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | | `collection_name` | Qdrant collection name client | **Random collection name generated** | | `host` | Hostname of the Milvus server | 'localhost' | -| `port` | port of the Milvus server | 6333 | +| `port` | Port of the Milvus server | 6333 | | `distance` | Distance metric to be used during search. Can be 'IP', 'L2', 'JACCARD', 'TANIMOTO', 'HAMMING', 'SUPERSTRUCTURE' or 'SUBSTRUCTURE'. | 'IP' (inner product) | -| `index_type` | Type of the (ANN) search index. Can be 'HNSW', 'FLAT', 'ANNOY', or one of multiple variants of IVF and RHNSW. A full list of supported index types can be found [here](https://milvus.io/docs/v2.1.x/build_index.md#Prepare-index-parameter). | 'HNSW | -| `index_params` | A dictionary of parameters used for index building. The allowed parameters depend on the index type, and can be found [here](https://milvus.io/docs/v2.1.x/index.md). 
| {'M': 4, 'efConstruction': 200} (assumes HNSW index) | +| `index_type` | Type of the (ANN) search index. Can be 'HNSW', 'FLAT', 'ANNOY', or one of multiple variants of IVF and RHNSW. Refer to the [list of supported index types](https://milvus.io/docs/v2.1.x/build_index.md#Prepare-index-parameter). | 'HNSW' | +| `index_params` | A dictionary of parameters used for index building. The [allowed parameters](https://milvus.io/docs/v2.1.x/index.md) depend on the index type. | {'M': 4, 'efConstruction': 200} (assumes HNSW index) | | `collection_config` | Configuration for the Milvus collection. Passed as **kwargs during collection creation (`Collection(...)`). | {} | | `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | {} | | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | @@ -239,7 +239,7 @@ Filters operate on the `tags` of a Document, which are stored as `columns` in th Consider Documents with embeddings `[0,0,0]` up to ` [9,9,9]` where the Document with embedding `[i,i,i]` -has as tag `price` with value `i`. We can create such example with the following code: +has as tag `price` with value `i`. We can create such an example with the following code: ```python from docarray import Document, DocumentArray @@ -358,7 +358,7 @@ Points with "price" at most 7: embedding=[2. 2. 2.], price=2 ``` -## Advancded options +## Advanced options The Milvus Document Store allows the user to pass additional parameters to the Milvus server for all main operations. @@ -423,7 +423,7 @@ The Milvus Document Store implements the entire DocumentArray API, but there are ### Collection loading In Milvus, every search or query operation requires the index to be loaded into memory. 
-This includes simple Document access through DocArray, +This includes simple Document access through DocArray. This loading operation can be costly, especially when performing multiple search or query operations in a row. @@ -498,10 +498,10 @@ da.append(doc) ``` ````{dropdown} Why does this limitation exist? -By default, DocArray stores three columns in any Document Store: The Document id's, the Document embeddings and +By default, DocArray stores three columns in any Document Store: The Document ids, the Document embeddings and a serialized (Base64 encoded) representation of the Document itself. -In Milvus, the the serialized Document are stored in a column of type 'VARCHAR', which imposes a limit of allowed length +In Milvus, the the serialized Documents are stored in a column of type 'VARCHAR', which imposes a limit of allowed length per entry. If the Base64 encoded Document exceeds this limit - which is usually the case for Documents with large tensors - the Document cannot be stored. From 6ba7facbcba1e3a555352279296a4b39bc0b627d Mon Sep 17 00:00:00 2001 From: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Date: Thu, 3 Nov 2022 15:21:14 +0100 Subject: [PATCH 70/88] docs: update docs/advanced/document-store/milvus.md Co-authored-by: Nicholas Dunham <11730795+NicholasDunham@users.noreply.github.com> Signed-off-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> --- docs/advanced/document-store/milvus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index b6df3a1d74b..025d2dc8391 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -239,7 +239,7 @@ Filters operate on the `tags` of a Document, which are stored as `columns` in th Consider Documents with embeddings `[0,0,0]` up to ` [9,9,9]` where the Document with embedding `[i,i,i]` -has as tag `price` with value `i`. 
We can create such an example with the following code: +has a tag `price` with value `i`. We can create such an example with the following code: ```python from docarray import Document, DocumentArray From e0218660b7c213e62854dfab581433740acf550f Mon Sep 17 00:00:00 2001 From: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Date: Fri, 4 Nov 2022 10:10:07 +0100 Subject: [PATCH 71/88] refactor: update type hint Co-authored-by: samsja <55492238+samsja@users.noreply.github.com> Signed-off-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> --- docarray/array/mixins/find.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index c1c02d9fa3d..c33ece76cc9 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -96,7 +96,7 @@ def find( limit: Optional[Union[int, float]] = 20, metric_name: Optional[str] = None, exclude_self: bool = False, - filter: Optional[Union[Dict, str]] = None, + filter: Union[Dict, str, None] = None, only_id: bool = False, index: str = 'text', on: Optional[str] = None, From 50752ce966d4cdd244dc27d27425fb4e53a3f67f Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 4 Nov 2022 10:20:09 +0100 Subject: [PATCH 72/88] refactor: apply suggestions from code review --- docarray/array/storage/base/seqlike.py | 1 + docarray/array/storage/milvus/backend.py | 36 +++++++++++++++++----- docarray/array/storage/milvus/getsetdel.py | 10 +++--- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index bab651c354a..d5ef0ebc50c 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -19,6 +19,7 @@ def insert(self, index: int, value: 'Document', **kwargs): :param index: Position of the insertion. :param value: The doc needs to be inserted. 
+ :param kwargs: Additional Arguments that are passed to the Document Store. This has no effect for in-memory DocumentArray. """ self._set_doc_by_id(value.id, value, **kwargs) self._offset2ids.insert(index, value.id) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 884ffb461c0..50a93f08128 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -26,7 +26,15 @@ ) -def always_true_expr(primary_key: str) -> str: +ID_VARCHAR_LEN = 1024 +SERIALIZED_VARCHAR_LEN = ( + 65_535 # 65_535 is the maximum that Milvus allows for a VARCHAR field +) +COLUMN_VARCHAR_LEN = 1024 +OFFSET_VARCHAR_LEN = 1024 + + +def _always_true_expr(primary_key: str) -> str: """ Returns a Milvus expression that is always true, thus allowing for the retrieval of all entries in a Collection Assumes that the primary key is of type DataType.VARCHAR @@ -37,12 +45,18 @@ def always_true_expr(primary_key: str) -> str: return f'({primary_key} in ["1"]) or ({primary_key} not in ["1"])' -def ids_to_milvus_expr(ids): +def _ids_to_milvus_expr(ids): ids = ['"' + _id + '"' for _id in ids] return '[' + ','.join(ids) + ']' def _sanitize_collection_name(name): + """Removes all chars that are not allowed in a Milvus collection name. + Thus, it removes all chars that are not alphanumeric or an underscore. + + :param name: the collection name to sanitize + :return: the sanitized collection name. 
+ """ return ''.join( re.findall('[a-zA-Z0-9_]', name) ) # remove everything that is not a letter, number or underscore @@ -130,20 +144,25 @@ def _create_or_reuse_collection(self): ) document_id = FieldSchema( - name='document_id', dtype=DataType.VARCHAR, max_length=1024, is_primary=True + name='document_id', + dtype=DataType.VARCHAR, + max_length=ID_VARCHAR_LEN, + is_primary=True, ) embedding = FieldSchema( name='embedding', dtype=DataType.FLOAT_VECTOR, dim=self._config.n_dim ) serialized = FieldSchema( - name='serialized', dtype=DataType.VARCHAR, max_length=65_535 + name='serialized', dtype=DataType.VARCHAR, max_length=SERIALIZED_VARCHAR_LEN ) additional_columns = [] for col, coltype in self._config.columns.items(): mapped_type = self._map_type(coltype) if mapped_type == DataType.VARCHAR: - field_ = FieldSchema(name=col, dtype=mapped_type, max_length=1024) + field_ = FieldSchema( + name=col, dtype=mapped_type, max_length=COLUMN_VARCHAR_LEN + ) else: field_ = FieldSchema(name=col, dtype=mapped_type) additional_columns.append(field_) @@ -177,10 +196,13 @@ def _create_or_reuse_offset2id_collection(self): ) document_id = FieldSchema( - name='document_id', dtype=DataType.VARCHAR, max_length=1024 + name='document_id', dtype=DataType.VARCHAR, max_length=ID_VARCHAR_LEN ) offset = FieldSchema( - name='offset', dtype=DataType.VARCHAR, max_length=1024, is_primary=True + name='offset', + dtype=DataType.VARCHAR, + max_length=OFFSET_VARCHAR_LEN, + is_primary=True, ) dummy_vector = FieldSchema( name='dummy_vector', dtype=DataType.FLOAT_VECTOR, dim=1 diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index f8cc6ccfa63..c26d2ad3b7f 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -5,7 +5,7 @@ from docarray import DocumentArray from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray.array.storage.base.helper import Offset2ID -from 
docarray.array.storage.milvus.backend import always_true_expr, ids_to_milvus_expr +from docarray.array.storage.milvus.backend import _always_true_expr, _ids_to_milvus_expr if TYPE_CHECKING: from docarray import Document, DocumentArray @@ -28,7 +28,7 @@ def _load_offset2ids(self): collection = self._offset2id_collection with self.loaded_collection(collection): res = collection.query( - expr=always_true_expr('document_id'), + expr=_always_true_expr('document_id'), output_fields=['offset', 'document_id'], consistency_level=self._config.consistency_level, ) @@ -53,7 +53,7 @@ def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': kwargs = self._update_consistency_level(**kwargs) with self.loaded_collection(): res = self._collection.query( - expr=f'document_id in {ids_to_milvus_expr(ids)}', + expr=f'document_id in {_ids_to_milvus_expr(ids)}', output_fields=['serialized'], **kwargs, ) @@ -67,7 +67,7 @@ def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': def _del_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': kwargs = self._update_consistency_level(**kwargs) self._collection.delete( - expr=f'document_id in {ids_to_milvus_expr(ids)}', **kwargs + expr=f'document_id in {_ids_to_milvus_expr(ids)}', **kwargs ) def _set_docs_by_ids( @@ -76,7 +76,7 @@ def _set_docs_by_ids( # delete old entries kwargs = self._update_consistency_level(**kwargs) self._collection.delete( - expr=f'document_id in {ids_to_milvus_expr(ids)}', + expr=f'document_id in {_ids_to_milvus_expr(ids)}', **kwargs, ) # insert new entries From 4c7df10175dc4c4354802a9ea5e9d258b729aa23 Mon Sep 17 00:00:00 2001 From: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Date: Fri, 4 Nov 2022 10:43:59 +0100 Subject: [PATCH 73/88] docs: clarify docstring Signed-off-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> --- docarray/array/storage/milvus/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 50a93f08128..1b5b2ad4ff4 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -313,7 +313,7 @@ def loaded_collection(self, collection=None): Context manager to load a collection and release it after the context is exited. If the collection is already loaded when entering, it will not be released while exiting. - :param collection: the collection to load. If None, the collection of this indexer is used. + :param collection: the collection to load. If None, the main collection of this indexer is used. :return: Context manager for the provided collection. """ From af8dddddd5430381cd7310f4e060faba7775fb9d Mon Sep 17 00:00:00 2001 From: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Date: Fri, 4 Nov 2022 10:44:40 +0100 Subject: [PATCH 74/88] docs: fix fautly docstring Signed-off-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> --- docarray/array/document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/document.py b/docarray/array/document.py index 1c1869a1278..871812f667c 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -148,7 +148,7 @@ def __new__( storage: str = 'milvus', config: Optional[Union['MilvusConfig', Dict]] = None, ) -> 'DocumentArrayMilvus': - """Create a Redis-powered DocumentArray object.""" + """Create a Milvus-powered DocumentArray object.""" ... 
def __enter__(self): From 91ec3bb572110a2bef0b355b2448a2e7a4fd860f Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 4 Nov 2022 16:11:50 +0100 Subject: [PATCH 75/88] refactor: another round of review changes --- docarray/array/storage/milvus/backend.py | 30 +++++--- docs/advanced/document-store/milvus.md | 73 +++---------------- tests/conftest.py | 10 --- .../unit/array/test_backend_configuration.py | 4 +- 4 files changed, 30 insertions(+), 87 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 50a93f08128..b4343c898f5 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -80,7 +80,7 @@ class MilvusConfig: default_factory=dict ) # passed to milvus at collection creation time serialize_config: Dict = field(default_factory=dict) - consistency_level: str = 'Session' + consistency_level: str = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None @@ -129,11 +129,11 @@ def _init_storage( # table and load the given `docs` if _docs is None: return - elif isinstance(_docs, Iterable): - self.clear() + + self.clear() + if isinstance(_docs, Iterable): self.extend(_docs) else: - self.clear() if isinstance(_docs, Document): self.append(_docs) @@ -169,7 +169,7 @@ def _create_or_reuse_collection(self): schema = CollectionSchema( fields=[document_id, embedding, serialized, *additional_columns], - description='DocumentArray collection', + description='DocumentArray collection schema', ) return Collection( name=self._config.collection_name, @@ -267,11 +267,14 @@ def _docs_from_search_response( def _update_consistency_level(self, **kwargs): kwargs_consistency_level = kwargs.get('consistency_level', None) - kwargs['consistency_level'] = ( - kwargs_consistency_level - if kwargs_consistency_level - else self._config.consistency_level - ) + config_consistency_level = self._config.consistency_level + + if ( + kwargs_consistency_level or not 
config_consistency_level + ): # no need to update + return kwargs + + kwargs['consistency_level'] = config_consistency_level return kwargs def _map_embedding(self, embedding): @@ -294,6 +297,9 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__ = state + connections.connect( + alias=self._connection_alias, host=self._config.host, port=self._config.port + ) self._collection = self._create_or_reuse_collection() self._offset2id_collection = self._create_or_reuse_offset2id_collection() @@ -317,7 +323,7 @@ def loaded_collection(self, collection=None): :return: Context manager for the provided collection. """ - class LoadedCollectionMngr: + class LoadedCollectionManager: def __init__(self, coll, connection_alias): self._collection = coll self._loaded_when_enter = False @@ -337,6 +343,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): if not self._loaded_when_enter: self._collection.release() - return LoadedCollectionMngr( + return LoadedCollectionManager( collection if collection else self._collection, self._connection_alias ) diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index 025d2dc8391..766a3759c7c 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -123,74 +123,23 @@ The following configs can be set: | `collection_name` | Qdrant collection name client | **Random collection name generated** | | `host` | Hostname of the Milvus server | 'localhost' | | `port` | Port of the Milvus server | 6333 | -| `distance` | Distance metric to be used during search. Can be 'IP', 'L2', 'JACCARD', 'TANIMOTO', 'HAMMING', 'SUPERSTRUCTURE' or 'SUBSTRUCTURE'. | 'IP' (inner product) | +| `distance` | [Distance metric](https://milvus.io/docs/v2.1.x/metric.md) to be used during search. Can be 'IP', 'L2', 'JACCARD', 'TANIMOTO', 'HAMMING', 'SUPERSTRUCTURE' or 'SUBSTRUCTURE'. | 'IP' (inner product) | | `index_type` | Type of the (ANN) search index. 
Can be 'HNSW', 'FLAT', 'ANNOY', or one of multiple variants of IVF and RHNSW. Refer to the [list of supported index types](https://milvus.io/docs/v2.1.x/build_index.md#Prepare-index-parameter). | 'HNSW' | | `index_params` | A dictionary of parameters used for index building. The [allowed parameters](https://milvus.io/docs/v2.1.x/index.md) depend on the index type. | {'M': 4, 'efConstruction': 200} (assumes HNSW index) | | `collection_config` | Configuration for the Milvus collection. Passed as **kwargs during collection creation (`Collection(...)`). | {} | | `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | {} | - | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | + | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | Default defined by Milvus | | `columns` | Additional columns to be stored in the datbase, taken from Document `tags`. 
| None | ## Minimal example -Create `docker-compose.yml`: +Download `docker-compose.yml`: -`````{dropdown} docker-compose.yml - -```yaml -version: '3.5' -services: - etcd: - container_name: milvus-etcd - image: quay.io/coreos/etcd:v3.5.0 - environment: - - ETCD_AUTO_COMPACTION_MODE=revision - - ETCD_AUTO_COMPACTION_RETENTION=1000 - - ETCD_QUOTA_BACKEND_BYTES=4294967296 - - ETCD_SNAPSHOT_COUNT=50000 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd - command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd - - minio: - container_name: milvus-minio - image: minio/minio:RELEASE.2022-03-17T06-34-49Z - environment: - MINIO_ACCESS_KEY: minioadmin - MINIO_SECRET_KEY: minioadmin - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data - command: minio server /minio_data - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] - interval: 30s - timeout: 20s - retries: 3 - - standalone: - container_name: milvus-standalone - image: milvusdb/milvus:v2.1.4 - command: ["milvus", "run", "standalone"] - environment: - ETCD_ENDPOINTS: etcd:2379 - MINIO_ADDRESS: minio:9000 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus - ports: - - "19530:19530" - - "9091:9091" - depends_on: - - "etcd" - - "minio" - -networks: - default: - name: milvus +```text +wget https://github.com/milvus-io/milvus/releases/download/v2.1.4/milvus-standalone-docker-compose.yml -O docker-compose.yml ``` -````` - Install DocArray with Milvus and launch the Milvus server: @@ -239,7 +188,7 @@ Filters operate on the `tags` of a Document, which are stored as `columns` in th Consider Documents with embeddings `[0,0,0]` up to ` [9,9,9]` where the Document with embedding `[i,i,i]` -has a tag `price` with value `i`. We can create such an example with the following code: +has a tag `price` with value `i`. 
You can create such an example with the following code: ```python from docarray import Document, DocumentArray @@ -268,9 +217,9 @@ for embedding, price in zip(da.embeddings, da[:, 'tags__price']): print(f'\tembedding={embedding},\t price={price}') ``` -Consider we want the nearest vectors to the embedding `[8. 8. 8.]`, with the restriction that +Consider you want the nearest vectors to the embedding `[8. 8. 8.]`, with the restriction that prices must follow a filter. As an example, retrieved Documents must have `price` value lower than -or equal to `max_price`. We can encode this information in Milvus using `filter = f'price <= {max_price}'`. +or equal to `max_price`. You can express this information in Milvus using `filter = f'price <= {max_price}'`. Then you can implement and use the search with the proposed filter: @@ -304,7 +253,7 @@ Embeddings Nearest Neighbours with "price" at most 7: ### Example of `.find` with only a filter The following example shows how to use DocArray with Milvus Document Store in order to filter text documents. -Consider Documents have the tag `price` with a value of `i`. We can create these with the following code: +Consider Documents have the tag `price` with a value of `i`. You can create these with the following code: ```python from docarray import Document, DocumentArray @@ -331,7 +280,7 @@ for embedding, price in zip(da.embeddings, da[:, 'tags__price']): ``` Suppose you want to filter results such that -retrieved Documents must have a `price` value less than or equal to `max_price`. You can encode +retrieved Documents must have a `price` value less than or equal to `max_price`. You can express this information in Milvus using `filter = f'price <= {max_price}'`. 
Then you can implement and use the search with the proposed filter: @@ -422,7 +371,7 @@ The Milvus Document Store implements the entire DocumentArray API, but there are (milvus-collection-loading)= ### Collection loading -In Milvus, every search or query operation requires the index to be loaded into memory. +In Milvus, every search or query operation requires the index to be [loaded into memory](https://milvus.io/api-reference/pymilvus/v2.1.3/Collection/load().md). This includes simple Document access through DocArray. This loading operation can be costly, especially when performing multiple search or query operations in a row. diff --git a/tests/conftest.py b/tests/conftest.py index 761053acc16..bd12271345c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -64,13 +64,3 @@ def set_env_vars(request): yield os.environ.clear() os.environ.update(_old_environ) - - -@pytest.fixture -def milvus_cleanup(): - yield - from pymilvus import list_collections, drop_collection - - alias = f'docarray_localhost_19530' # assumes default host and port are used - for c in list_collections(using=alias): - drop_collection(c, using=alias) diff --git a/tests/unit/array/test_backend_configuration.py b/tests/unit/array/test_backend_configuration.py index e1871f8e4a8..d6d5f8fd678 100644 --- a/tests/unit/array/test_backend_configuration.py +++ b/tests/unit/array/test_backend_configuration.py @@ -155,9 +155,7 @@ def test_cast_columns_qdrant(start_storage, type_da, type_column, request): @pytest.mark.parametrize('type_da', [int, float, str, bool]) @pytest.mark.parametrize('type_column', ['int', 'str', 'float', 'double', 'bool']) -def test_cast_columns_milvus( - start_storage, type_da, type_column, request, milvus_cleanup -): +def test_cast_columns_milvus(start_storage, type_da, type_column, request): test_id = request.node.callspec.id.replace( '-', '' ) # remove '-' from the test id for the milvus name From 4920635f1282ef4e2cf0eb173ae36e8bc3d4e70b Mon Sep 17 00:00:00 2001 From: 
Johannes Messner Date: Fri, 4 Nov 2022 17:22:29 +0100 Subject: [PATCH 76/88] fix: set consistency level for offset id loading --- docarray/array/storage/milvus/getsetdel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index c26d2ad3b7f..6807a06cb90 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -26,11 +26,12 @@ def _set_doc_by_id(self, _id: str, value: 'Document', **kwargs): def _load_offset2ids(self): collection = self._offset2id_collection + kwargs = self._update_consistency_level(**dict()) with self.loaded_collection(collection): res = collection.query( expr=_always_true_expr('document_id'), output_fields=['offset', 'document_id'], - consistency_level=self._config.consistency_level, + **kwargs, ) sorted_res = sorted(res, key=lambda k: int(k['offset'])) self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) From cb4a39c21a8dbc16025bb5ea89442b318aae966b Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 8 Nov 2022 11:34:18 +0100 Subject: [PATCH 77/88] fix: set stricter default consistency level --- docarray/array/storage/milvus/backend.py | 2 +- docs/advanced/document-store/milvus.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 4f2dace1d9e..940065cd29f 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -80,7 +80,7 @@ class MilvusConfig: default_factory=dict ) # passed to milvus at collection creation time serialize_config: Dict = field(default_factory=dict) - consistency_level: str = None + consistency_level: str = 'Session' columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index 
766a3759c7c..fd1461f1f16 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -128,9 +128,9 @@ The following configs can be set: | `index_params` | A dictionary of parameters used for index building. The [allowed parameters](https://milvus.io/docs/v2.1.x/index.md) depend on the index type. | {'M': 4, 'efConstruction': 200} (assumes HNSW index) | | `collection_config` | Configuration for the Milvus collection. Passed as **kwargs during collection creation (`Collection(...)`). | {} | | `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | {} | - | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | Default defined by Milvus | + | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | | `columns` | Additional columns to be stored in the datbase, taken from Document `tags`. 
| None | - +l ## Minimal example Download `docker-compose.yml`: From 4b9e473a71d53c1c4dc1575419503d8abc9f17e3 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 8 Nov 2022 13:12:23 +0100 Subject: [PATCH 78/88] perf: optimize sorting of retrieved documents --- docarray/array/storage/milvus/getsetdel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 6807a06cb90..fa2cfbaac94 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -62,8 +62,8 @@ def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': raise KeyError(f'No documents found for ids {ids}') docs = self._docs_from_query_response(res) # sort output docs according to input id sorting - ids_list = list(ids) - return DocumentArray(sorted(docs, key=lambda d: ids_list.index(d.id))) + id_to_index = {id_: i for i, id_ in enumerate(ids)} + return DocumentArray(sorted(docs, key=lambda d: id_to_index[d.id])) def _del_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': kwargs = self._update_consistency_level(**kwargs) From 70828e437aac7c7814f0ef4c916e7f888cf37be8 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 8 Nov 2022 13:37:24 +0100 Subject: [PATCH 79/88] docs: document loading context manager --- docs/advanced/document-store/index.md | 1 + docs/advanced/document-store/milvus.md | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/advanced/document-store/index.md b/docs/advanced/document-store/index.md index 722ac5074de..024a64d3c9f 100644 --- a/docs/advanced/document-store/index.md +++ b/docs/advanced/document-store/index.md @@ -356,6 +356,7 @@ array([[7., 7., 7.], [4., 4., 4.]]) ``` +(backend-context-mngr)= ## Persistence, mutations and context manager Having DocumentArrays that are backed by a document store introduces an extra consideration into the way you 
think about DocumentArrays. diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index fd1461f1f16..0418bde9a2f 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -130,7 +130,7 @@ The following configs can be set: | `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | {} | | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | | `columns` | Additional columns to be stored in the datbase, taken from Document `tags`. | None | -l + ## Minimal example Download `docker-compose.yml`: @@ -402,7 +402,14 @@ with da: # index is released from memory ``` -Not using the `with da:` context manager will return the same results for the same operations, but will incur significant performance penalties: +The `with da:` context manager also {ref}`manages persistence of the list-like interface ` of a DocumentArray, +which can introduce a small overhead when leaving the context. + +If you want to _only_ manage the loading and releasing behavior of your DocumentArray, you can use the `with da.loaded_collection()` +context manager instead. +In the example above it can be used as a drop-in replacement. 
+ +Not using the `with da:` or `with da.loaded_collection()` context manager will return the same results for the same operations, but will incur significant performance penalties: ````{dropdown} ⚠️ Bad code From d672398cd05278307448397b1aca05afb37412a3 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 8 Nov 2022 14:29:16 +0100 Subject: [PATCH 80/88] refactor: find unboxing done by base class --- docarray/array/storage/milvus/backend.py | 4 ++-- docarray/array/storage/milvus/find.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 940065cd29f..6367a55f9d4 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -255,7 +255,7 @@ def _docs_from_query_response(response): @staticmethod def _docs_from_search_response( responses, - ) -> 'Union[List[DocumentArray], DocumentArray]': + ) -> 'List[DocumentArray]': das = [] for r in responses: das.append( @@ -263,7 +263,7 @@ def _docs_from_search_response( [Document.from_base64(hit.entity.get('serialized')) for hit in r] ) ) - return das if len(das) > 0 else das[0] + return das def _update_consistency_level(self, **kwargs): kwargs_consistency_level = kwargs.get('consistency_level', None) diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index bd4b497c85c..390011bc50a 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -24,7 +24,7 @@ def _find( filter: Optional[Dict] = None, param=None, **kwargs - ) -> Union['DocumentArray', List['DocumentArray']]: + ) -> List['DocumentArray']: """Returns `limit` approximate nearest neighbors given a batch of input queries. If the query is a single query, should return a DocumentArray, otherwise a list of DocumentArrays containing the closest Documents for each query. 
From 00b4de4afaa4240c31559a9038929945f84e620c Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 8 Nov 2022 17:20:36 +0100 Subject: [PATCH 81/88] feat: add batching --- docarray/array/storage/milvus/backend.py | 22 +++++--- docarray/array/storage/milvus/find.py | 4 +- docarray/array/storage/milvus/getsetdel.py | 59 +++++++++++++--------- docarray/array/storage/milvus/seqlike.py | 11 ++-- docs/advanced/document-store/milvus.md | 51 ++++++++++++++----- 5 files changed, 100 insertions(+), 47 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 6367a55f9d4..d77d01fb404 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -12,7 +12,6 @@ DataType, CollectionSchema, has_collection, - MilvusException, loading_progress, ) @@ -50,6 +49,16 @@ def _ids_to_milvus_expr(ids): return '[' + ','.join(ids) + ']' +def _batch_list(l: List, batch_size: int): + """Iterates over a list in batches of size batch_size""" + if batch_size < 1: + yield l + return + l_len = len(l) + for ndx in range(0, l_len, batch_size): + yield l[ndx : min(ndx + batch_size, l_len)] + + def _sanitize_collection_name(name): """Removes all chars that are not allowed in a Milvus collection name. Thus, it removes all chars that are not alphanumeric or an underscore. 
@@ -81,6 +90,7 @@ class MilvusConfig: ) # passed to milvus at collection creation time serialize_config: Dict = field(default_factory=dict) consistency_level: str = 'Session' + batch_size: int = -1 columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None @@ -265,16 +275,16 @@ def _docs_from_search_response( ) return das - def _update_consistency_level(self, **kwargs): - kwargs_consistency_level = kwargs.get('consistency_level', None) - config_consistency_level = self._config.consistency_level + def _update_kwargs_from_config(self, field_to_update, **kwargs): + kwargs_field_value = kwargs.get(field_to_update, None) + config_field_value = getattr(self._config, field_to_update, None) if ( - kwargs_consistency_level or not config_consistency_level + kwargs_field_value is not None or config_field_value is None ): # no need to update return kwargs - kwargs['consistency_level'] = config_consistency_level + kwargs[field_to_update] = config_field_value return kwargs def _map_embedding(self, embedding): diff --git a/docarray/array/storage/milvus/find.py b/docarray/array/storage/milvus/find.py index 390011bc50a..fa9f42709ad 100644 --- a/docarray/array/storage/milvus/find.py +++ b/docarray/array/storage/milvus/find.py @@ -31,7 +31,7 @@ def _find( """ if param is None: param = dict() - kwargs = self._update_consistency_level(**kwargs) + kwargs = self._update_kwargs_from_config('consistency_level', **kwargs) with self.loaded_collection(): results = self._collection.search( data=query, @@ -45,7 +45,7 @@ def _find( return self._docs_from_search_response(results) def _filter(self, filter, limit=10, **kwargs): - kwargs = self._update_consistency_level(**kwargs) + kwargs = self._update_kwargs_from_config('consistency_level', **kwargs) with self.loaded_collection(): results = self._collection.query( expr=filter, diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index fa2cfbaac94..15b4b5f9de8 100644 --- 
a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -5,7 +5,11 @@ from docarray import DocumentArray from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray.array.storage.base.helper import Offset2ID -from docarray.array.storage.milvus.backend import _always_true_expr, _ids_to_milvus_expr +from docarray.array.storage.milvus.backend import ( + _always_true_expr, + _ids_to_milvus_expr, + _batch_list, +) if TYPE_CHECKING: from docarray import Document, DocumentArray @@ -26,7 +30,7 @@ def _set_doc_by_id(self, _id: str, value: 'Document', **kwargs): def _load_offset2ids(self): collection = self._offset2id_collection - kwargs = self._update_consistency_level(**dict()) + kwargs = self._update_kwargs_from_config('consistency_level', **dict()) with self.loaded_collection(collection): res = collection.query( expr=_always_true_expr('document_id'), @@ -51,38 +55,47 @@ def _save_offset2ids(self): def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': if not ids: return DocumentArray() - kwargs = self._update_consistency_level(**kwargs) + ids = list(ids) + kwargs = self._update_kwargs_from_config('consistency_level', **kwargs) + kwargs = self._update_kwargs_from_config('batch_size', **kwargs) with self.loaded_collection(): - res = self._collection.query( - expr=f'document_id in {_ids_to_milvus_expr(ids)}', - output_fields=['serialized'], - **kwargs, - ) - if not res: - raise KeyError(f'No documents found for ids {ids}') - docs = self._docs_from_query_response(res) + docs = DocumentArray() + for id_batch in _batch_list(ids, kwargs['batch_size']): + res = self._collection.query( + expr=f'document_id in {_ids_to_milvus_expr(id_batch)}', + output_fields=['serialized'], + **kwargs, + ) + if not res: + raise KeyError(f'No documents found for ids {ids}') + docs.extend(self._docs_from_query_response(res)) # sort output docs according to input id sorting id_to_index = {id_: i for i, id_ in 
enumerate(ids)} return DocumentArray(sorted(docs, key=lambda d: id_to_index[d.id])) def _del_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': - kwargs = self._update_consistency_level(**kwargs) - self._collection.delete( - expr=f'document_id in {_ids_to_milvus_expr(ids)}', **kwargs - ) + kwargs = self._update_kwargs_from_config('consistency_level', **kwargs) + kwargs = self._update_kwargs_from_config('batch_size', **kwargs) + for id_batch in _batch_list(list(ids), kwargs['batch_size']): + self._collection.delete( + expr=f'document_id in {_ids_to_milvus_expr(id_batch)}', **kwargs + ) def _set_docs_by_ids( self, ids, docs: 'Iterable[Document]', mismatch_ids: 'Dict', **kwargs ): + kwargs = self._update_kwargs_from_config('consistency_level', **kwargs) + kwargs = self._update_kwargs_from_config('batch_size', **kwargs) # delete old entries - kwargs = self._update_consistency_level(**kwargs) - self._collection.delete( - expr=f'document_id in {_ids_to_milvus_expr(ids)}', - **kwargs, - ) - # insert new entries - payload = self._docs_to_milvus_payload(docs) - self._collection.insert(payload, **kwargs) + for id_batch in _batch_list(list(ids), kwargs['batch_size']): + self._collection.delete( + expr=f'document_id in {_ids_to_milvus_expr(id_batch)}', + **kwargs, + ) + for docs_batch in _batch_list(list(docs), kwargs['batch_size']): + # insert new entries + payload = self._docs_to_milvus_payload(docs_batch) + self._collection.insert(payload, **kwargs) def _clear_storage(self): self._collection.drop() diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py index 34db09085a4..1711c5b8080 100644 --- a/docarray/array/storage/milvus/seqlike.py +++ b/docarray/array/storage/milvus/seqlike.py @@ -1,5 +1,6 @@ from typing import Iterable, Iterator, Union, TYPE_CHECKING from docarray.array.storage.base.seqlike import BaseSequenceLikeMixin +from docarray.array.storage.milvus.backend import _batch_list from docarray import 
Document @@ -49,7 +50,9 @@ def _extend(self, values: Iterable['Document'], **kwargs): docs = list(values) if not docs: return - kwargs = self._update_consistency_level(**kwargs) - payload = self._docs_to_milvus_payload(docs) - self._collection.insert(payload, **kwargs) - self._offset2ids.extend([doc.id for doc in docs]) + kwargs = self._update_kwargs_from_config('consistency_level', **kwargs) + kwargs = self._update_kwargs_from_config('batch_size', **kwargs) + for docs_batch in _batch_list(list(docs), kwargs['batch_size']): + payload = self._docs_to_milvus_payload(docs_batch) + self._collection.insert(payload, **kwargs) + self._offset2ids.extend([doc.id for doc in docs_batch]) diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index 0418bde9a2f..0962d6fd81a 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -117,19 +117,20 @@ da.summary() The following configs can be set: -| Name | Description | Default | -|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------| -| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | -| `collection_name` | Qdrant collection name client | **Random collection name generated** | -| `host` | Hostname of the Milvus server | 'localhost' | -| `port` | Port of the Milvus server | 6333 | -| `distance` | [Distance metric](https://milvus.io/docs/v2.1.x/metric.md) to be used during search. Can be 'IP', 'L2', 'JACCARD', 'TANIMOTO', 'HAMMING', 'SUPERSTRUCTURE' or 'SUBSTRUCTURE'. 
| 'IP' (inner product) | +| Name | Description | Default | +|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------| +| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | +| `collection_name` | Qdrant collection name client | **Random collection name generated** | +| `host` | Hostname of the Milvus server | 'localhost' | +| `port` | Port of the Milvus server | 6333 | +| `distance` | [Distance metric](https://milvus.io/docs/v2.1.x/metric.md) to be used during search. Can be 'IP', 'L2', 'JACCARD', 'TANIMOTO', 'HAMMING', 'SUPERSTRUCTURE' or 'SUBSTRUCTURE'. | 'IP' (inner product) | | `index_type` | Type of the (ANN) search index. Can be 'HNSW', 'FLAT', 'ANNOY', or one of multiple variants of IVF and RHNSW. Refer to the [list of supported index types](https://milvus.io/docs/v2.1.x/build_index.md#Prepare-index-parameter). | 'HNSW' | -| `index_params` | A dictionary of parameters used for index building. The [allowed parameters](https://milvus.io/docs/v2.1.x/index.md) depend on the index type. | {'M': 4, 'efConstruction': 200} (assumes HNSW index) | -| `collection_config` | Configuration for the Milvus collection. Passed as **kwargs during collection creation (`Collection(...)`). | {} | -| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | {} | - | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | -| `columns` | Additional columns to be stored in the datbase, taken from Document `tags`. 
| None | +| `index_params` | A dictionary of parameters used for index building. The [allowed parameters](https://milvus.io/docs/v2.1.x/index.md) depend on the index type. | {'M': 4, 'efConstruction': 200} (assumes HNSW index) | +| `collection_config` | Configuration for the Milvus collection. Passed as **kwargs during collection creation (`Collection(...)`). | {} | +| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | {} | + | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | +| `batch_size` | Default batch size for CRUD operations. | -1 (no batching) | +| `columns` | Additional columns to be stored in the datbase, taken from Document `tags`. | None | ## Minimal example @@ -338,6 +339,32 @@ da.append( Currently, dynamically setting a consistency level is supported for the following operations: `.append()`, `.extend()`, `.find()`, and `.insert()`. +### Setting a batch size + +You can configure your DocumentArray to, on every relevant operation, send Documents to the Milvus database in batches. +This default `batch_size` can be specified in the DocumentArray {ref}`config `. + +If you do not specify a default batch size, no batching will be performed. + + +When performing a specific operation, you can override this default batch size by passing a `batch_size` parameter: + +```python +from docarray import DocumentArray, Document +import numpy as np + +da = DocumentArray( + storage='milvus', + config={'batch_size': 100, 'n_dim': 3}, +) + +da.append(Document(tensor=np.random.rand(3))) # batch size is 100 +da.append(Document(tensor=np.random.rand(3)), batch_size=5) # batch size is 5 +``` + +Currently, dynamically setting a consistency level is supported for the following operations: +`.append()`, `.extend()`, and `.insert()`. 
+ ### Passing search parameters In Milvus you can [pass parameters to the search operation](https://milvus.io/docs/v2.1.x/search.md#Conduct-a-vector-search) which [depend on the used index type](https://milvus.io/docs/v2.1.x/index.md). From b45d214c5dffea2b4997362eb80dccc4f64eaf25 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 8 Nov 2022 17:23:38 +0100 Subject: [PATCH 82/88] test: add test for batch size --- .../unit/array/storage/milvus/test_milvus.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/unit/array/storage/milvus/test_milvus.py b/tests/unit/array/storage/milvus/test_milvus.py index 5308b0f2304..42ac17173ea 100644 --- a/tests/unit/array/storage/milvus/test_milvus.py +++ b/tests/unit/array/storage/milvus/test_milvus.py @@ -123,3 +123,47 @@ def test_consistency_level(start_storage, mocker, method, meth_input, mock_respo for args, kwargs in mock_meth.call_args_list: if 'consistency_level' in kwargs: assert kwargs['consistency_level'] == new_consistency + + +@pytest.mark.parametrize( + 'method,meth_input', + [ + ('append', [Document(embedding=np.random.random([10]))]), + ('extend', [[Document(embedding=np.random.random([10]))]]), + ('insert', [0, Document(embedding=np.random.random([10]))]), + ], +) +def test_batching(start_storage, mocker, method, meth_input, mock_response): + init_batch_size = 5 + da = DocumentArrayMilvus( + config={ + 'n_dim': 10, + 'batch_size': init_batch_size, + }, + ) + + # patch Milvus collection + patch_methods = ['insert', 'search', 'delete', 'query'] + for m in patch_methods: + setattr(da._collection, m, mocker.Mock(return_value=mock_response)) + + # test batch_size set in config + getattr(da, method)(*meth_input) + for m in patch_methods: + mock_meth = getattr(da._collection, m) + for args, kwargs in mock_meth.call_args_list: + if 'batch_size' in kwargs: + assert kwargs['batch_size'] == init_batch_size + + # reset the mocks + for m in patch_methods: + setattr(da._collection, m, 
mocker.Mock(return_value=mock_response)) + + # test dynamic consistency level + new_batch_size = 100 + getattr(da, method)(*meth_input, batch_size=new_batch_size) + for m in patch_methods: + mock_meth = getattr(da._collection, m) + for args, kwargs in mock_meth.call_args_list: + if 'batch_size' in kwargs: + assert kwargs['batch_size'] == new_batch_size From 005b2e553d3202a7ae2a2b5cb29f14de5ea3d1d5 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 11 Nov 2022 10:37:18 +0100 Subject: [PATCH 83/88] test: restart milvus if it breaks --- tests/conftest.py | 53 +++++++++++++++++----- tests/unit/array/docker-compose.yml | 43 ------------------ tests/unit/array/milvus-docker-compose.yml | 44 ++++++++++++++++++ 3 files changed, 86 insertions(+), 54 deletions(-) create mode 100644 tests/unit/array/milvus-docker-compose.yml diff --git a/tests/conftest.py b/tests/conftest.py index bd12271345c..b58d3489a4f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,12 +4,14 @@ from typing import Dict import pytest -from pymilvus import MilvusUnavailableException cur_dir = os.path.dirname(os.path.abspath(__file__)) compose_yml = os.path.abspath( os.path.join(cur_dir, 'unit', 'array', 'docker-compose.yml') ) +milvus_compose_yml = os.path.abspath( + os.path.join(cur_dir, 'unit', 'array', 'milvus-docker-compose.yml') +) @pytest.fixture(autouse=True) @@ -24,6 +26,11 @@ def start_storage(): f"docker-compose -f {compose_yml} --project-directory . up --build -d " f"--remove-orphans" ) + os.system( + f"docker-compose -f {milvus_compose_yml} --project-directory . up --build -d " + f"--remove-orphans" + ) + _wait_for_es() _wait_for_milvus() @@ -32,6 +39,22 @@ def start_storage(): f"docker-compose -f {compose_yml} --project-directory . down " f"--remove-orphans" ) + os.system( + f"docker-compose -f {milvus_compose_yml} --project-directory . down " + f"--remove-orphans" + ) + + +def restart_milvus(): + os.system( + f"docker-compose -f {milvus_compose_yml} --project-directory . 
down " + f"--remove-orphans" + ) + os.system( + f"docker-compose -f {milvus_compose_yml} --project-directory . up --build -d " + f"--remove-orphans" + ) + _wait_for_milvus(restart_on_failure=False) def _wait_for_es(): @@ -42,19 +65,27 @@ def _wait_for_es(): time.sleep(0.5) -def _wait_for_milvus(): +def _wait_for_milvus(restart_on_failure=True): from pymilvus import connections, has_collection - from pymilvus.exceptions import MilvusUnavailableException + from pymilvus.exceptions import MilvusUnavailableException, MilvusException milvus_conn_alias = f'pytest_localhost_19530' - connections.connect(alias=milvus_conn_alias, host='localhost', port=19530) - milvus_ready = False - while not milvus_ready: - try: - has_collection('ping', using=milvus_conn_alias) - milvus_ready = True - except MilvusUnavailableException as e: - time.sleep(0.5) + try: + connections.connect(alias=milvus_conn_alias, host='localhost', port=19530) + milvus_ready = False + while not milvus_ready: + try: + has_collection('ping', using=milvus_conn_alias) + milvus_ready = True + except MilvusUnavailableException: + # Milvus is not ready yet, just wait + time.sleep(0.5) + except MilvusException as e: + if e.code == 1 and restart_on_failure: + # something went wrong with the docker container, restart and retry once + restart_milvus() + else: + raise e @pytest.fixture(scope='session') diff --git a/tests/unit/array/docker-compose.yml b/tests/unit/array/docker-compose.yml index 544c352976d..bd7ed1ef410 100644 --- a/tests/unit/array/docker-compose.yml +++ b/tests/unit/array/docker-compose.yml @@ -32,49 +32,6 @@ services: ports: - "6379:6379" - etcd: - container_name: milvus-etcd - image: quay.io/coreos/etcd:v3.5.0 - environment: - - ETCD_AUTO_COMPACTION_MODE=revision - - ETCD_AUTO_COMPACTION_RETENTION=1000 - - ETCD_QUOTA_BACKEND_BYTES=4294967296 - - ETCD_SNAPSHOT_COUNT=50000 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd - command: etcd -advertise-client-urls=http://127.0.0.1:2379 
-listen-client-urls http://0.0.0.0:2379 --data-dir /etcd - - minio: - container_name: milvus-minio - image: minio/minio:RELEASE.2022-03-17T06-34-49Z - environment: - MINIO_ACCESS_KEY: minioadmin - MINIO_SECRET_KEY: minioadmin - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data - command: minio server /minio_data - healthcheck: - test: [ "CMD", "curl", "-f", "http://localhost:9000/minio/health/live" ] - interval: 30s - timeout: 20s - retries: 3 - - standalone: - container_name: milvus-standalone - image: milvusdb/milvus:v2.1.4 - command: [ "milvus", "run", "standalone" ] - environment: - ETCD_ENDPOINTS: etcd:2379 - MINIO_ADDRESS: minio:9000 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus - ports: - - "19530:19530" - - "9091:9091" - depends_on: - - "etcd" - - "minio" - networks: elastic: diff --git a/tests/unit/array/milvus-docker-compose.yml b/tests/unit/array/milvus-docker-compose.yml new file mode 100644 index 00000000000..0e2fecfbb84 --- /dev/null +++ b/tests/unit/array/milvus-docker-compose.yml @@ -0,0 +1,44 @@ +version: "3.3" +services: + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.0 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2022-03-17T06-34-49Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data + healthcheck: + test: [ "CMD", "curl", "-f", "http://localhost:9000/minio/health/live" ] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: 
milvusdb/milvus:v2.1.4 + command: [ "milvus", "run", "standalone" ] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" \ No newline at end of file From 012d01d50535117c68273dcece47cb2be14157b2 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 11 Nov 2022 11:58:29 +0100 Subject: [PATCH 84/88] test: dont remove-orphans where it is not needed --- tests/conftest.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b58d3489a4f..729117fa329 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,8 +27,7 @@ def start_storage(): f"--remove-orphans" ) os.system( - f"docker-compose -f {milvus_compose_yml} --project-directory . up --build -d " - f"--remove-orphans" + f"docker-compose -f {milvus_compose_yml} --project-directory . up --build -d" ) _wait_for_es() @@ -46,13 +45,9 @@ def start_storage(): def restart_milvus(): + os.system(f"docker-compose -f {milvus_compose_yml} --project-directory . down") os.system( - f"docker-compose -f {milvus_compose_yml} --project-directory . down " - f"--remove-orphans" - ) - os.system( - f"docker-compose -f {milvus_compose_yml} --project-directory . up --build -d " - f"--remove-orphans" + f"docker-compose -f {milvus_compose_yml} --project-directory . 
up --build -d" ) _wait_for_milvus(restart_on_failure=False) From 412d79e2a95d36201def33ed333c877a2733e02a Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 14 Nov 2022 11:42:15 +0100 Subject: [PATCH 85/88] feat: add ability to disble list like behaviour Signed-off-by: Johannes Messner --- docarray/array/storage/milvus/backend.py | 2 ++ docarray/array/storage/milvus/getsetdel.py | 42 +++++++++++----------- docs/advanced/document-store/milvus.md | 1 + 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index d77d01fb404..35154ca3c66 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -92,6 +92,7 @@ class MilvusConfig: consistency_level: str = 'Session' batch_size: int = -1 columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None + list_like: bool = True class BackendMixin(BaseBackendMixin): @@ -121,6 +122,7 @@ def _init_storage( if config.collection_name is None: id = uuid.uuid4().hex config.collection_name = 'docarray__' + id + self._list_like = config.list_like self._config = config self._config.columns = self._normalize_columns(self._config.columns) diff --git a/docarray/array/storage/milvus/getsetdel.py b/docarray/array/storage/milvus/getsetdel.py index 15b4b5f9de8..08757f0f715 100644 --- a/docarray/array/storage/milvus/getsetdel.py +++ b/docarray/array/storage/milvus/getsetdel.py @@ -29,28 +29,30 @@ def _set_doc_by_id(self, _id: str, value: 'Document', **kwargs): self._set_docs_by_ids([_id], [value], None, **kwargs) def _load_offset2ids(self): - collection = self._offset2id_collection - kwargs = self._update_kwargs_from_config('consistency_level', **dict()) - with self.loaded_collection(collection): - res = collection.query( - expr=_always_true_expr('document_id'), - output_fields=['offset', 'document_id'], - **kwargs, - ) - sorted_res = sorted(res, key=lambda k: int(k['offset'])) - 
self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) + if self._list_like: + collection = self._offset2id_collection + kwargs = self._update_kwargs_from_config('consistency_level', **dict()) + with self.loaded_collection(collection): + res = collection.query( + expr=_always_true_expr('document_id'), + output_fields=['offset', 'document_id'], + **kwargs, + ) + sorted_res = sorted(res, key=lambda k: int(k['offset'])) + self._offset2ids = Offset2ID([r['document_id'] for r in sorted_res]) def _save_offset2ids(self): - # delete old entries - self._clear_offset2ids_milvus() - # insert current entries - ids = self._offset2ids.ids - if not ids: - return - offsets = [str(i) for i in range(len(ids))] - dummy_vectors = [np.zeros(1) for _ in range(len(ids))] - collection = self._offset2id_collection - collection.insert([offsets, ids, dummy_vectors]) + if self._list_like: + # delete old entries + self._clear_offset2ids_milvus() + # insert current entries + ids = self._offset2ids.ids + if not ids: + return + offsets = [str(i) for i in range(len(ids))] + dummy_vectors = [np.zeros(1) for _ in range(len(ids))] + collection = self._offset2id_collection + collection.insert([offsets, ids, dummy_vectors]) def _get_docs_by_ids(self, ids: 'Iterable[str]', **kwargs) -> 'DocumentArray': if not ids: diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index 0962d6fd81a..af991a26fc5 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -131,6 +131,7 @@ The following configs can be set: | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | | `batch_size` | Default batch size for CRUD operations. | -1 (no batching) | | `columns` | Additional columns to be stored in the datbase, taken from Document `tags`. 
| None | +| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | ## Minimal example From dcad6391b487c49e9097e645ada8388f8242acf4 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Tue, 15 Nov 2022 14:05:36 +0100 Subject: [PATCH 86/88] ci: increase timeout to see how long it really takes --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37bab63aaa9..bf580491574 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -187,7 +187,7 @@ jobs: pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \ -v -s -m "not gpu" ${{ matrix.test-path }} echo "codecov_flag=docarray" >> $GITHUB_OUTPUT - timeout-minutes: 45 + timeout-minutes: 120 env: JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file From 3aecd6b843bcf5d6d71b220572eff4f406f3ba4e Mon Sep 17 00:00:00 2001 From: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:53:13 +0100 Subject: [PATCH 87/88] docs: apply suggestions from code review Co-authored-by: AlaeddineAbdessalem Signed-off-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> --- docs/advanced/document-store/milvus.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index af991a26fc5..af4722aa6c7 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -4,7 +4,7 @@ One can use [Milvus](https://milvus.io/) as the Document store for DocumentArray. It is useful when one wants to have faster Document retrieval on embeddings, i.e. `.match()`, `.find()`. ````{tip} -This feature requires `pymilvus`. You can install it via `pip install "docarray[milvus]".` +This feature requires `pymilvus`. 
You can install it via `pip install "docarray[milvus]"`. ```` ## Usage @@ -91,7 +91,7 @@ da = DocumentArray(storage='milvus', config={'n_dim': 10}) Here, `config` is configuration for the new Milvus collection, and `n_dim` is a mandatory field that specifies the dimensionality of stored embeddings. -For more information about the Milvus `config`, refer to the {ref}`specification `. +For more information about the Milvus `config`, refer to the {ref}`config `. To access a previously persisted DocumentArray, specify the `collection_name`, the `host`, and the `port`. @@ -370,7 +370,7 @@ Currently, dynamically setting a consistency level is supported for the followin In Milvus you can [pass parameters to the search operation](https://milvus.io/docs/v2.1.x/search.md#Conduct-a-vector-search) which [depend on the used index type](https://milvus.io/docs/v2.1.x/index.md). -In DocumentArray, this ability is exposed through the `param` argument in the `.find()` method: +In DocumentArray, this ability is exposed through the `param` argument in the `~docarray.array.mixins.find` method: ```python import numpy as np From 14c4a8fba1cdeb9dd05220a06742192e19a623dc Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 17 Nov 2022 10:48:33 +0100 Subject: [PATCH 88/88] ci: change timeouts Signed-off-by: Johannes Messner --- .github/workflows/ci.yml | 4 ++-- docs/advanced/document-store/milvus.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf580491574..6996e768b45 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -187,7 +187,7 @@ jobs: pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \ -v -s -m "not gpu" ${{ matrix.test-path }} echo "codecov_flag=docarray" >> $GITHUB_OUTPUT - timeout-minutes: 120 + timeout-minutes: 60 env: JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file @@ -238,7 +238,7 @@ jobs: pytest --suppress-no-test-exit-code 
--cov=docarray --cov-report=xml \ -v -s -m "not gpu" ${{ matrix.test-path }} echo "::set-output name=codecov_flag::docarray" - timeout-minutes: 40 + timeout-minutes: 60 env: JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index af4722aa6c7..a93c9ccaed5 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -299,7 +299,7 @@ for embedding, price in zip(results.embeddings, results[:, 'tags__price']): ``` This prints: -``` +```text Points with "price" at most 7: