From 5f7acf0804554147d3422ec377955219fe3fb4b2 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 17 Nov 2022 21:58:58 +0800 Subject: [PATCH 01/37] feat: support root_id for subindex with redis Signed-off-by: AnneY --- docarray/array/mixins/find.py | 16 ++++++++-- docarray/array/storage/base/seqlike.py | 7 +++++ docarray/array/storage/redis/backend.py | 1 + tests/unit/array/mixins/test_find.py | 40 +++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index b467e945445..ec96aad1c45 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -99,6 +99,7 @@ def find( filter: Optional[Dict] = None, only_id: bool = False, index: str = 'text', + return_root: Optional[bool] = False, on: Optional[str] = None, **kwargs, ) -> Union['DocumentArray', List['DocumentArray']]: @@ -131,9 +132,11 @@ def find( :return: a list of DocumentArrays containing the closest Document objects for each of the queries in `query`. """ + from docarray import Document, DocumentArray + index_da = self._get_index(subindex_name=on) if index_da is not self: - return index_da.find( + results = index_da.find( query, metric, limit, @@ -144,7 +147,16 @@ def find( index, on=None, ) - from docarray import Document, DocumentArray + + # print(results[:, 'id']) + + if return_root: + da = self[results[:, 'tags__root_id']] + return da + + # print(da[:,'id']) + + return results if isinstance(query, dict): if filter is None: diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index 73fa80a00b7..0b009f451c1 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -10,6 +10,13 @@ class BaseSequenceLikeMixin(MutableSequence[Document]): def _update_subindices_append_extend(self, value): if getattr(self, '_subindices', None): for selector, da in self._subindices.items(): + + if getattr(da, '_config', None): + if da._config.root_id: + for i in range(len(value)): + for doc in DocumentArray(value[i])[selector]: + doc.tags['root_id'] = value[i].id + docs_selector = DocumentArray(value)[selector] if len(docs_selector) > 0: da.extend(docs_selector) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 8652f0fefee..677ff0481dc 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -35,6 +35,7 @@ class RedisConfig: block_size: Optional[int] = None initial_cap: Optional[int] = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None + root_id: bool = False class BackendMixin(BaseBackendMixin): diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 0c3dd165923..0130551c467 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -949,3 +949,43 @@ class MMDoc: assert (closest_docs[0].embedding == np.array([3, 3])).all() for d in closest_docs: assert d.id.endswith('_2') + + +def test_find_return_root(start_storage): + with DocumentArray( + storage='redis', + config={ + 'n_dim': 128, + 'index_name': 'idx', + }, + subindex_configs={'@c': {'n_dim': 3}, '@cc': {'n_dim': 3, 'root_id': True}}, + ) as da: + da.extend( + [ + Document( + id=f'{i}', + chunks=[ + Document( + id=f'sub{i}_0', + chunks=[ + Document( + id=f'sub2_{i}_0', embedding=np.random.random(3) + ) + ], + ), + Document( + id=f'sub{i}_1', + chunks=[ + Document( + id=f'sub2_{i}_1', embedding=np.random.random(3) + ) + ], + ), + ], + ) + for i in range(10) + ] + ) + res = da.find(np.random.random(3), on='@cc', return_root=True, limit=5) + assert len(res) == 5 + assert all(d.id in [f'{i}' for i in range(10)] for d in res) From e031227e71e8984113153fedef7d9c0af0d4cac9 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 17 Nov 2022 22:47:41 +0800 Subject: [PATCH 02/37] feat: add root_id to config of backends Signed-off-by: AnneY --- docarray/array/storage/annlite/backend.py | 1 + docarray/array/storage/elastic/backend.py | 1 + docarray/array/storage/qdrant/backend.py | 1 + docarray/array/storage/sqlite/backend.py | 1 + docarray/array/storage/weaviate/backend.py | 1 + 5 files changed, 5 insertions(+) diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py index 477741e49ab..eb947f44ad6 100644 --- a/docarray/array/storage/annlite/backend.py +++ b/docarray/array/storage/annlite/backend.py @@ -31,6 +31,7 @@ class AnnliteConfig: max_connection: Optional[int] = None n_components: Optional[int] = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None + root_id: bool = False class BackendMixin(BaseBackendMixin): diff --git a/docarray/array/storage/elastic/backend.py b/docarray/array/storage/elastic/backend.py index 99b6b27fddc..c758dafb56f 100644 --- a/docarray/array/storage/elastic/backend.py +++ b/docarray/array/storage/elastic/backend.py @@ -46,6 +46,7 @@ class ElasticConfig: ef_construction: Optional[int] = None m: Optional[int] = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None + root_id: bool = False _banned_indexname_chars = ['[', ' ', '"', '*', '\\', '<', '|', ',', '>', '/', '?', ']'] diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index c67847c57b8..09c32e8a54d 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -51,6 +51,7 @@ class QdrantConfig: full_scan_threshold: Optional[int] = None m: Optional[int] = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None + root_id: bool = False class BackendMixin(BaseBackendMixin): diff --git a/docarray/array/storage/sqlite/backend.py b/docarray/array/storage/sqlite/backend.py index 688158f0f91..5cc7bd38a3e 100644 --- a/docarray/array/storage/sqlite/backend.py +++ b/docarray/array/storage/sqlite/backend.py @@ -29,6 +29,7 @@ class SqliteConfig: conn_config: Dict = field(default_factory=dict) journal_mode: str = 'WAL' synchronous: str = 'OFF' + root_id: bool = False class BackendMixin(BaseBackendMixin): diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index c16eee41b38..9bcc7374c09 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -52,6 +52,7 @@ class WeaviateConfig: # weaviate python client parameters batch_size: Optional[int] = field(default=50) dynamic_batching: Optional[bool] = field(default=False) + root_id: bool = False def __post_init__(self): if isinstance(self.timeout_config, list): From 365052ddc380faedbb5d578a91fbe29060cfdd7c Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 21 Nov 2022 11:31:30 +0800 Subject: [PATCH 03/37] feat: store find scores Signed-off-by: AnneY --- docarray/array/mixins/find.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index ec96aad1c45..f7caf5b1b9b 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -152,6 +152,7 @@ def find( if return_root: da = self[results[:, 'tags__root_id']] + da[:, 'scores'] = results[:, 'scores'] return da # print(da[:,'id']) From 0f5f105b515e6e7560a2993429f8d34cccf4b1cf Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 21 Nov 2022 13:26:49 +0800 Subject: [PATCH 04/37] feat: add root_id in _update_subindices_set Signed-off-by: AnneY --- docarray/array/storage/base/getsetdel.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py index 0c672fde963..dce673eb706 100644 --- a/docarray/array/storage/base/getsetdel.py +++ b/docarray/array/storage/base/getsetdel.py @@ -200,6 +200,12 @@ def _update_subindices_set(self, set_index, docs): _check_valid_values_nested_set(self[set_index], docs) if set_index in subindices: subindex_da = subindices[set_index] + + if getattr(subindex_da, '_config', None): + if subindex_da._config.root_id: + for i in range(len(docs)): + docs[i].tags['root_id'] = subindex_da[i].tags['root_id'] + subindex_da.clear() subindex_da.extend(docs) else: # root level set, update subindices iteratively From e290cff9d260d7d806f1c1876490031443c1e036 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 21 Nov 2022 13:33:04 +0800 Subject: [PATCH 05/37] refactor: remove comments Signed-off-by: AnneY --- docarray/array/mixins/find.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index f7caf5b1b9b..e519b264e5a 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -148,15 +148,11 @@ def find( on=None, ) - # print(results[:, 'id']) - if return_root: da = self[results[:, 'tags__root_id']] da[:, 'scores'] = results[:, 'scores'] return da - # print(da[:,'id']) - return results if isinstance(query, dict): From aef6b32d0abbe443a5ed7b351ab881fb835ccba0 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 21 Nov 2022 21:49:25 +0800 Subject: [PATCH 06/37] feat: set default root_id to True Signed-off-by: AnneY --- docarray/array/storage/annlite/backend.py | 2 +- docarray/array/storage/elastic/backend.py | 2 +- docarray/array/storage/qdrant/backend.py | 2 +- docarray/array/storage/redis/backend.py | 2 +- docarray/array/storage/sqlite/backend.py | 2 +- docarray/array/storage/weaviate/backend.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py index eb947f44ad6..bdb3c0f294d 100644 --- a/docarray/array/storage/annlite/backend.py +++ b/docarray/array/storage/annlite/backend.py @@ -31,7 +31,7 @@ class AnnliteConfig: max_connection: Optional[int] = None n_components: Optional[int] = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None - root_id: bool = False + root_id: bool = True class BackendMixin(BaseBackendMixin): diff --git a/docarray/array/storage/elastic/backend.py b/docarray/array/storage/elastic/backend.py index c758dafb56f..b8f559bc6f3 100644 --- a/docarray/array/storage/elastic/backend.py +++ b/docarray/array/storage/elastic/backend.py @@ -46,7 +46,7 @@ class ElasticConfig: ef_construction: Optional[int] = None m: Optional[int] = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None - root_id: bool = False + root_id: bool = True _banned_indexname_chars = ['[', ' ', '"', '*', '\\', '<', '|', ',', '>', '/', '?', ']'] diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index 09c32e8a54d..d08519b9706 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -51,7 +51,7 @@ class QdrantConfig: full_scan_threshold: Optional[int] = None m: Optional[int] = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None - root_id: bool = False + root_id: bool = True class BackendMixin(BaseBackendMixin): diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 677ff0481dc..c0b95fa291c 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -35,7 +35,7 @@ class RedisConfig: block_size: Optional[int] = None initial_cap: Optional[int] = None columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None - root_id: bool = False + root_id: bool = True class BackendMixin(BaseBackendMixin): diff --git a/docarray/array/storage/sqlite/backend.py b/docarray/array/storage/sqlite/backend.py index 5cc7bd38a3e..7692f941523 100644 --- a/docarray/array/storage/sqlite/backend.py +++ b/docarray/array/storage/sqlite/backend.py @@ -29,7 +29,7 @@ class SqliteConfig: conn_config: Dict = field(default_factory=dict) journal_mode: str = 'WAL' synchronous: str = 'OFF' - root_id: bool = False + root_id: bool = True class BackendMixin(BaseBackendMixin): diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index 9bcc7374c09..57c5c811740 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -52,7 +52,7 @@ class WeaviateConfig: # weaviate python client parameters batch_size: Optional[int] = field(default=50) dynamic_batching: Optional[bool] = field(default=False) - root_id: bool = False + root_id: bool = True def __post_init__(self): if isinstance(self.timeout_config, list): From dc3da504dd689dd32c2877a9b23678a6d5713d30 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 21 Nov 2022 22:01:04 +0800 Subject: [PATCH 07/37] feat: keep user pre-defined root_id Signed-off-by: AnneY --- docarray/array/storage/base/seqlike.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index 0b009f451c1..afe297f24f7 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -11,11 +11,14 @@ def _update_subindices_append_extend(self, value): if getattr(self, '_subindices', None): for selector, da in self._subindices.items(): - if getattr(da, '_config', None): - if da._config.root_id: - for i in range(len(value)): - for doc in DocumentArray(value[i])[selector]: - doc.tags['root_id'] = value[i].id + if getattr(da, '_config', None) and da._config.root_id: + for i in range(len(value)): + for doc in DocumentArray(value[i])[selector]: + doc.tags['root_id'] = ( + doc.tags['root_id'] + if 'root_id' in doc.tags + else value[i].id + ) docs_selector = DocumentArray(value)[selector] if len(docs_selector) > 0: From 2e5a4854a06b3a5343a8363e52b089767873d638 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 21 Nov 2022 22:53:59 +0800 Subject: [PATCH 08/37] feat: add warning for redis _extend about root_id Signed-off-by: AnneY --- docarray/array/storage/redis/seqlike.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py index ad60024434e..cec97841652 100644 --- a/docarray/array/storage/redis/seqlike.py +++ b/docarray/array/storage/redis/seqlike.py @@ -1,3 +1,4 @@ +import warnings from typing import Iterable, Union from docarray import Document, DocumentArray @@ -59,6 +60,17 @@ def _upload_batch(self, batch_of_docs: Iterable['Document']): def _extend(self, docs: Iterable['Document']): da = DocumentArray(docs) + + if ( + self._config.root_id + and self._config.index_name.find('_subindex_') != -1 + and not all([doc.tags.get('root_id', None) for doc in da]) + ): + warnings.warn( + "Not all documents have root_id set. This may cause unexpected behavior.", + UserWarning, + ) + for batch_of_docs in da.batch(self._config.batch_size): self._upload_batch(batch_of_docs) if self._list_like: From 7081f7cfb7fcf8cccb77c87c113f813302dd4f22 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 21 Nov 2022 22:55:14 +0800 Subject: [PATCH 09/37] feat: raise error when not all res have root_id Signed-off-by: AnneY --- docarray/array/mixins/find.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index e519b264e5a..86e5392fe90 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -149,6 +149,11 @@ def find( ) if return_root: + if not all(results[:, 'tags__root_id']): + raise ValueError( + f'Not all Documents in the subindex {on} have the "tags__root_id" attribute set.' + ) + da = self[results[:, 'tags__root_id']] da[:, 'scores'] = results[:, 'scores'] return da From 865e02f43d9851f2812f9265b2d140e182b3bf66 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 21 Nov 2022 23:52:00 +0800 Subject: [PATCH 10/37] test: add test_find_return_root Signed-off-by: AnneY --- tests/unit/array/mixins/test_find.py | 60 ++++++++++++++-------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 0130551c467..5e484d2a3db 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -951,41 +951,43 @@ class MMDoc: assert d.id.endswith('_2') -def test_find_return_root(start_storage): - with DocumentArray( - storage='redis', - config={ - 'n_dim': 128, - 'index_name': 'idx', - }, - subindex_configs={'@c': {'n_dim': 3}, '@cc': {'n_dim': 3, 'root_id': True}}, - ) as da: +@pytest.mark.parametrize( + 'storage, config, subindex_configs', + [ + ( + 'weaviate', + { + 'n_dim': 3, + }, + {'@c': {'n_dim': 3}}, + ), + ('annlite', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('qdrant', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('elasticsearch', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('redis', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ], +) +def test_find_return_root(storage, config, subindex_configs, start_storage): + da = DocumentArray( + storage=storage, + config=config, + subindex_configs=subindex_configs, + ) + + with da: da.extend( [ Document( id=f'{i}', chunks=[ - Document( - id=f'sub{i}_0', - chunks=[ - Document( - id=f'sub2_{i}_0', embedding=np.random.random(3) - ) - ], - ), - Document( - id=f'sub{i}_1', - chunks=[ - Document( - id=f'sub2_{i}_1', embedding=np.random.random(3) - ) - ], - ), + Document(id=f'sub{i}_0', embedding=np.random.random(3)), + Document(id=f'sub{i}_1', embedding=np.random.random(3)), ], ) - for i in range(10) + for i in range(5) ] ) - res = da.find(np.random.random(3), on='@cc', return_root=True, limit=5) - assert len(res) == 5 - assert all(d.id in [f'{i}' for i in range(10)] for d in res) + + res = da.find(np.random.random(3), on='@c', return_root=True, limit=5) + assert len(res) == 5 + assert all(d.id in [f'{i}' for i in range(10)] for d in res) From 0a0c51769d7cc7ed23f139a166d03fe18b4dda49 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 22 Nov 2022 00:19:45 +0800 Subject: [PATCH 11/37] fix: convert value to docarray first Signed-off-by: AnneY --- docarray/array/storage/base/seqlike.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index afe297f24f7..3bbedbb5bd4 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -11,16 +11,16 @@ def _update_subindices_append_extend(self, value): if getattr(self, '_subindices', None): for selector, da in self._subindices.items(): + value = DocumentArray(value) + if getattr(da, '_config', None) and da._config.root_id: - for i in range(len(value)): - for doc in DocumentArray(value[i])[selector]: + for v in value: + for doc in DocumentArray(v)[selector]: doc.tags['root_id'] = ( - doc.tags['root_id'] - if 'root_id' in doc.tags - else value[i].id + doc.tags['root_id'] if 'root_id' in doc.tags else v.id ) - docs_selector = DocumentArray(value)[selector] + docs_selector = value[selector] if len(docs_selector) > 0: da.extend(docs_selector) From 601c881b4c378095240d947195066615c12372e3 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 22 Nov 2022 14:49:15 +0800 Subject: [PATCH 12/37] feat: add no root_id warning Signed-off-by: AnneY --- docarray/array/storage/base/backend.py | 1 + docarray/array/storage/base/seqlike.py | 18 +++++++++++++++++- docarray/array/storage/redis/seqlike.py | 12 ------------ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py index 7105b568b3a..6baf101e96c 100644 --- a/docarray/array/storage/base/backend.py +++ b/docarray/array/storage/base/backend.py @@ -41,6 +41,7 @@ def _init_subindices( config, config_subindex, config_joined, name ) self._subindices[name] = self.__class__(config=config_joined) + self._subindices[name]._is_subindex = True if _docs: from docarray import DocumentArray diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index 3bbedbb5bd4..7acbc415842 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -1,5 +1,6 @@ +import warnings from abc import abstractmethod -from typing import Iterator, Iterable, MutableSequence +from typing import Iterable, Iterator, MutableSequence from docarray import Document, DocumentArray @@ -72,6 +73,21 @@ def __bool__(self): return len(self) > 0 def extend(self, values: Iterable['Document'], **kwargs) -> None: + + # TODO whether to add warning for DocumentArrayInMemory + from docarray.array.memory import DocumentArrayInMemory + + if ( + getattr(self, '_is_subindex', False) + and not isinstance(self, DocumentArrayInMemory) + and self._config.root_id + and not all([doc.tags.get('root_id', None) for doc in values]) + ): + warnings.warn( + "root_id is enabled but not all documents have root_id set. This may cause unexpected behavior.", + UserWarning, + ) + self._extend(values, **kwargs) self._update_subindices_append_extend(values) diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py index cec97841652..ad60024434e 100644 --- a/docarray/array/storage/redis/seqlike.py +++ b/docarray/array/storage/redis/seqlike.py @@ -1,4 +1,3 @@ -import warnings from typing import Iterable, Union from docarray import Document, DocumentArray @@ -60,17 +59,6 @@ def _upload_batch(self, batch_of_docs: Iterable['Document']): def _extend(self, docs: Iterable['Document']): da = DocumentArray(docs) - - if ( - self._config.root_id - and self._config.index_name.find('_subindex_') != -1 - and not all([doc.tags.get('root_id', None) for doc in da]) - ): - warnings.warn( - "Not all documents have root_id set. This may cause unexpected behavior.", - UserWarning, - ) - for batch_of_docs in da.batch(self._config.batch_size): self._upload_batch(batch_of_docs) if self._list_like: From 6d1fcbe7876eccd0d3d5533380247abec76875b5 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 22 Nov 2022 15:43:29 +0800 Subject: [PATCH 13/37] feat: add root_id for root level set Signed-off-by: AnneY --- docarray/array/storage/base/getsetdel.py | 29 ++++++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py index dce673eb706..d5ada423e26 100644 --- a/docarray/array/storage/base/getsetdel.py +++ b/docarray/array/storage/base/getsetdel.py @@ -201,10 +201,16 @@ def _update_subindices_set(self, set_index, docs): if set_index in subindices: subindex_da = subindices[set_index] - if getattr(subindex_da, '_config', None): - if subindex_da._config.root_id: - for i in range(len(docs)): - docs[i].tags['root_id'] = subindex_da[i].tags['root_id'] + if ( + getattr(subindex_da, '_config', None) + and subindex_da._config.root_id + ): + for doc, subindex_doc in zip(docs, subindex_da): + doc.tags['root_id'] = ( + doc.tags['root_id'] + if 'root_id' in doc.tags + else subindex_doc.tags['root_id'] + ) subindex_da.clear() subindex_da.extend(docs) @@ -212,7 +218,20 @@ def _update_subindices_set(self, set_index, docs): for subindex_selector, subindex_da in subindices.items(): old_ids = DocumentArray(self[set_index])[subindex_selector, 'id'] del subindex_da[old_ids] - subindex_da.extend(DocumentArray(docs)[subindex_selector]) + + value = DocumentArray(docs) + + if ( + getattr(subindex_da, '_config', None) + and subindex_da._config.root_id + ): + for v in value: + for doc in DocumentArray(v)[subindex_selector]: + doc.tags['root_id'] = ( + doc.tags['root_id'] if 'root_id' in doc.tags else v.id + ) + + subindex_da.extend(value[subindex_selector]) def _set_docs(self, ids, docs: Iterable['Document']): docs = list(docs) From dffcb2b815efe3038ae8031a52e588d215714e18 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 22 Nov 2022 17:36:46 +0800 Subject: [PATCH 14/37] fix: do not assign root_id automatically Signed-off-by: AnneY --- docarray/array/storage/base/getsetdel.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py index d5ada423e26..6b1f54ce7c8 100644 --- a/docarray/array/storage/base/getsetdel.py +++ b/docarray/array/storage/base/getsetdel.py @@ -201,17 +201,6 @@ def _update_subindices_set(self, set_index, docs): if set_index in subindices: subindex_da = subindices[set_index] - if ( - getattr(subindex_da, '_config', None) - and subindex_da._config.root_id - ): - for doc, subindex_doc in zip(docs, subindex_da): - doc.tags['root_id'] = ( - doc.tags['root_id'] - if 'root_id' in doc.tags - else subindex_doc.tags['root_id'] - ) - subindex_da.clear() subindex_da.extend(docs) else: # root level set, update subindices iteratively From bcbfd4b510b71f0c36d7a6176f13bfd624958034 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 22 Nov 2022 17:55:06 +0800 Subject: [PATCH 15/37] fix: keep scores when return_root is True Signed-off-by: AnneY --- docarray/array/mixins/find.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index 86e5392fe90..90d6d5b37d3 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -154,8 +154,9 @@ def find( f'Not all Documents in the subindex {on} have the "tags__root_id" attribute set.' ) - da = self[results[:, 'tags__root_id']] - da[:, 'scores'] = results[:, 'scores'] + da = DocumentArray(self[results[:, 'tags__root_id']]) + for d, s in zip(da, results[:, 'scores']): + d.scores = s return da return results From 6eb6eba195825b36260b144ac76429fcff3c945f Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 22 Nov 2022 18:31:25 +0800 Subject: [PATCH 16/37] test: add tests for root_id Signed-off-by: AnneY --- tests/unit/array/mixins/test_find.py | 70 ++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 5e484d2a3db..ad6cebd4df2 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -962,12 +962,13 @@ class MMDoc: {'@c': {'n_dim': 3}}, ), ('annlite', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('sqlite', dict(), {'@c': dict()}), ('qdrant', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ('elasticsearch', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ('redis', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ], ) -def test_find_return_root(storage, config, subindex_configs, start_storage): +def test_find_return_root(storage, config, subindex_configs): da = DocumentArray( storage=storage, config=config, @@ -988,6 +989,67 @@ def test_find_return_root(storage, config, subindex_configs, start_storage): ] ) - res = da.find(np.random.random(3), on='@c', return_root=True, limit=5) - assert len(res) == 5 - assert all(d.id in [f'{i}' for i in range(10)] for d in res) + da[0] = Document( + id='5', + embedding=np.random.random(3), + chunks=[ + Document(id=f'sub5_0', embedding=np.random.random(3)), + Document(id=f'sub5_1', embedding=np.random.random(3)), + ], + ) + + assert all(d.tags['root_id'] in [f'{i}' for i in range(1, 6)] for d in da['@c']) + + res = da.find(np.random.random(3), on='@c', return_root=True) + assert len(res) > 0 + assert all(d.id in [f'{i}' for i in range(1, 6)] for d in res) + assert len(res[:, 'scores'][0]) > 0 + + +@pytest.mark.parametrize( + 'storage, config, subindex_configs', + [ + ( + 'weaviate', + { + 'n_dim': 3, + }, + {'@c': {'n_dim': 3}}, + ), + ('annlite', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('sqlite', dict(), {'@c': dict()}), + ('qdrant', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('elasticsearch', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('redis', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ], +) +def test_subindex_root_id(storage, config, subindex_configs, start_storage): + da = DocumentArray( + storage=storage, + config=config, + subindex_configs=subindex_configs, + ) + + with da: + da.extend( + [ + Document( + id=f'{i}', + chunks=[ + Document(id=f'sub{i}_0', embedding=np.random.random(3)), + Document(id=f'sub{i}_1', embedding=np.random.random(3)), + ], + ) + for i in range(5) + ] + ) + + with pytest.warns(UserWarning): + new_da = DocumentArray( + [Document(id=f'temp{i}', embedding=np.random.random(3)) for i in range(10)] + ) + new_da[:, 'id'] = da['@c'][:, 'id'] + da['@c'] = new_da + + with pytest.warns(UserWarning): + da['@c'].extend([Document(id='sub_extra', embedding=np.random.random(3))]) From 65e9715b15d7c8d4e921904646d9c1ec65bef7d0 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 22 Nov 2022 23:45:09 +0800 Subject: [PATCH 17/37] feat: add root-id support for DAInMemory Signed-off-by: AnneY --- docarray/array/mixins/find.py | 22 ++++++++++++++++++++-- docarray/array/storage/base/seqlike.py | 24 ++++++++++++++---------- tests/unit/array/mixins/test_find.py | 7 +++++-- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index 28904b9cbab..d5dc3621d37 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -149,12 +149,30 @@ def find( ) if return_root: - if not all(results[:, 'tags__root_id']): + from docarray.array.memory import DocumentArrayInMemory + + def get_root_docs(da, docs): + root_da_flat = da[...] + da = DocumentArray() + for doc in docs: + result = doc + while getattr(result, 'parent_id', None): + result = root_da_flat[result.parent_id] + da.append(result) + return da + + if not isinstance(self, DocumentArrayInMemory) and not all( + results[:, 'tags__root_id'] + ): raise ValueError( f'Not all Documents in the subindex {on} have the "tags__root_id" attribute set.' ) - da = DocumentArray(self[results[:, 'tags__root_id']]) + if isinstance(self, DocumentArrayInMemory): + da = get_root_docs(self, results) + else: + da = DocumentArray(self[results[:, 'tags__root_id']]) + for d, s in zip(da, results[:, 'scores']): d.scores = s return da diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index 2165985828b..8f12f48fdc3 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -78,16 +78,20 @@ def extend(self, values: Iterable['Document'], **kwargs) -> None: # TODO whether to add warning for DocumentArrayInMemory from docarray.array.memory import DocumentArrayInMemory - if ( - getattr(self, '_is_subindex', False) - and not isinstance(self, DocumentArrayInMemory) - and self._config.root_id - and not all([doc.tags.get('root_id', None) for doc in values]) - ): - warnings.warn( - "root_id is enabled but not all documents have root_id set. This may cause unexpected behavior.", - UserWarning, - ) + if getattr(self, '_is_subindex', False): + if isinstance(self, DocumentArrayInMemory): + if not all([getattr(doc, 'parent_id', None) for doc in values]): + warnings.warn( + "Not all documents have parent_id set. This may cause unexpected behavior.", + UserWarning, + ) + elif self._config.root_id and not all( + [doc.tags.get('root_id', None) for doc in values] + ): + warnings.warn( + "root_id is enabled but not all documents have root_id set. This may cause unexpected behavior.", + UserWarning, + ) self._extend(values, **kwargs) self._update_subindices_append_extend(values) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 67c056c3631..bcc1d7b3da0 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -992,6 +992,7 @@ class MMDoc: @pytest.mark.parametrize( 'storage, config, subindex_configs', [ + ('memory', None, {'@c': None}), ( 'weaviate', { @@ -1006,7 +1007,7 @@ class MMDoc: ('redis', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ], ) -def test_find_return_root(storage, config, subindex_configs): +def test_find_return_root(storage, config, subindex_configs, start_storage): da = DocumentArray( storage=storage, config=config, @@ -1036,7 +1037,8 @@ def test_find_return_root(storage, config, subindex_configs): ], ) - assert all(d.tags['root_id'] in [f'{i}' for i in range(1, 6)] for d in da['@c']) + if storage != 'memory': + assert all(d.tags['root_id'] in [f'{i}' for i in range(1, 6)] for d in da['@c']) res = da.find(np.random.random(3), on='@c', return_root=True) assert len(res) > 0 @@ -1047,6 +1049,7 @@ def test_find_return_root(storage, config, subindex_configs): @pytest.mark.parametrize( 'storage, config, subindex_configs', [ + ('memory', None, {'@c': None}), ( 'weaviate', { From 3997b7ec1af196abd6bb479a23d520550f6a5037 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 23 Nov 2022 17:22:11 +0800 Subject: [PATCH 18/37] docs: add doc for find return_root Signed-off-by: AnneY --- docarray/array/mixins/find.py | 1 + docs/fundamentals/documentarray/subindex.md | 28 +++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index d5dc3621d37..27548c0794c 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -127,6 +127,7 @@ def find( parameter is ignored. By default, the Document `text` attribute will be used for search, otherwise the tag field specified by `index` will be used. You can only use this parameter if the storage backend supports searching by text. + :param return_root: if set, then the root-level DocumentArray will be returned :param on: specifies a subindex to search on. If set, the returned DocumentArray will be retrieved from the given subindex. :param kwargs: other kwargs. diff --git a/docs/fundamentals/documentarray/subindex.md b/docs/fundamentals/documentarray/subindex.md index 405f7b0f19b..810d3baca30 100644 --- a/docs/fundamentals/documentarray/subindex.md +++ b/docs/fundamentals/documentarray/subindex.md @@ -217,8 +217,7 @@ Document(embedding=np.random.rand(512)).match(da, on='@c') ``` ```` -Such a search will return Documents from the subindex. If you are interested in the top-level Documents associated with -a match, you can retrieve them using `parent_id`: +Such a search will return Documents from the subindex. If you are interested in the top-level Documents associated with a match, you can retrieve them using `parent_id`: ````{tab} Subindex with dataclass modalities ```python @@ -232,3 +231,28 @@ top_image_matches = da.find(query=np.random.rand(512), on='@c') top_level_matches = da[top_image_matches[:, 'parent_id']] ``` ```` + +Or you can set `return_root=True` in `find` for DocumentArray in a {ref}`Document Store`: + +````{tab} Subindex with dataclass modalities +```python +top_level_matches = da.find(query=np.random.rand(512), on='@.[image]', return_root=True) +``` +```` +````{tab} Subindex with chunks +```python +top_level_matches = da.find(query=np.random.rand(512), on='@c', return_root=True) +``` +```` + +````{admonition} Note +:class: note +When you add more Documents to nested level, the `root_id` of new Documents should be set manually for `return_root=True` to work: + +```python +da['@c'].extend( + Document(embedding=np.random.random(512), tags={'root_id': 'your_root_id'}) +) +``` + +```` From a00bdbd3adfda09f06f20d187c39115a0343eb17 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 23 Nov 2022 17:44:10 +0800 Subject: [PATCH 19/37] fix: annlite _append Signed-off-by: AnneY --- docarray/array/storage/annlite/seqlike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/annlite/seqlike.py b/docarray/array/storage/annlite/seqlike.py index 93628ab3cbd..4c5a7fff6f0 100644 --- a/docarray/array/storage/annlite/seqlike.py +++ b/docarray/array/storage/annlite/seqlike.py @@ -20,7 +20,7 @@ def _extend(self, values: Iterable['Document']) -> None: self._offset2ids.extend([doc.id for doc in docs]) def _append(self, value: 'Document'): - self.extend([value]) + self._extend([value]) def __eq__(self, other): """In annlite backend, data are considered as identical if configs point to the same database source""" From faa09eca90714a7ece93583c4402308d6d3197fa Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 23 Nov 2022 18:00:45 +0800 Subject: [PATCH 20/37] feat: add root_id support for milvus Signed-off-by: AnneY --- docarray/array/storage/milvus/backend.py | 1 + tests/unit/array/mixins/test_find.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index 35154ca3c66..ad39e5896ad 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -93,6 +93,7 @@ class MilvusConfig: batch_size: int = -1 columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None list_like: bool = True + root_id: bool = True class BackendMixin(BaseBackendMixin): diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index bcc1d7b3da0..fdffd478ad9 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -1005,6 +1005,7 @@ class MMDoc: ('qdrant', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ('elasticsearch', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ('redis', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('milvus', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ], ) def test_find_return_root(storage, config, subindex_configs, start_storage): @@ -1043,7 +1044,6 @@ def test_find_return_root(storage, config, subindex_configs, start_storage): res = da.find(np.random.random(3), on='@c', return_root=True) assert len(res) > 0 assert all(d.id in [f'{i}' for i in range(1, 6)] for d in res) - assert len(res[:, 'scores'][0]) > 0 @pytest.mark.parametrize( @@ -1062,6 +1062,7 @@ def test_find_return_root(storage, config, subindex_configs, start_storage): ('qdrant', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ('elasticsearch', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ('redis', {'n_dim': 3}, {'@c': {'n_dim': 3}}), + ('milvus', {'n_dim': 3}, {'@c': {'n_dim': 3}}), ], ) def test_subindex_root_id(storage, config, subindex_configs, start_storage): From bd4194f228a2ef58cbda4a04b64c9a920db79d28 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 23 Nov 2022 18:01:38 +0800 Subject: [PATCH 21/37] docs: add root_id to config for backends Signed-off-by: AnneY --- docs/advanced/document-store/annlite.md | 1 + docs/advanced/document-store/elasticsearch.md | 1 + docs/advanced/document-store/milvus.md | 3 ++- docs/advanced/document-store/qdrant.md | 2 ++ docs/advanced/document-store/redis.md | 2 +- docs/advanced/document-store/sqlite.md | 19 ++++++++++--------- 6 files changed, 17 insertions(+), 11 deletions(-) diff --git a/docs/advanced/document-store/annlite.md b/docs/advanced/document-store/annlite.md index 922b4938306..c764a157b93 100644 --- a/docs/advanced/document-store/annlite.md +++ b/docs/advanced/document-store/annlite.md @@ -48,6 +48,7 @@ The following configs can be set: | `max_connection` | The number of bi-directional links created for every new element during construction. | `None`, defaults to the default value in the AnnLite package* | | `n_components` | The output dimension of PCA model. Should be a positive number and less than `n_dim` if it's not `None` | `None`, defaults to the default value in the AnnLite package* | | `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | True | *You can check the default values in [the AnnLite source code](https://github.com/jina-ai/annlite/blob/main/annlite/core/index/hnsw/index.py) diff --git a/docs/advanced/document-store/elasticsearch.md b/docs/advanced/document-store/elasticsearch.md index b55e3ba3172..20af11fd6a7 100644 --- a/docs/advanced/document-store/elasticsearch.md +++ b/docs/advanced/document-store/elasticsearch.md @@ -404,6 +404,7 @@ The following configs can be set: | `tag_indices` | List of tags to index | False | | `batch_size` | Batch size used to handle storage refreshes/updates | 64 | | `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | True | ```{tip} You can read more about HNSW parameters and their default values [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params) diff --git a/docs/advanced/document-store/milvus.md b/docs/advanced/document-store/milvus.md index a93c9ccaed5..6fdff41eb0f 100644 --- a/docs/advanced/document-store/milvus.md +++ b/docs/advanced/document-store/milvus.md @@ -128,10 +128,11 @@ The following configs can be set: | `index_params` | A dictionary of parameters used for index building. The [allowed parameters](https://milvus.io/docs/v2.1.x/index.md) depend on the index type. | {'M': 4, 'efConstruction': 200} (assumes HNSW index) | | `collection_config` | Configuration for the Milvus collection. Passed as **kwargs during collection creation (`Collection(...)`). | {} | | `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | {} | - | `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | +| `consistency_level` | [Consistency level](https://milvus.io/docs/v2.1.x/consistency.md#Consistency-levels) for Milvus database operations. Can be 'Session', 'Strong', 'Bounded' or 'Eventually'. | 'Session' | | `batch_size` | Default batch size for CRUD operations. | -1 (no batching) | | `columns` | Additional columns to be stored in the datbase, taken from Document `tags`. | None | | `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | True | ## Minimal example diff --git a/docs/advanced/document-store/qdrant.md b/docs/advanced/document-store/qdrant.md index b7438f5499d..179fde41596 100644 --- a/docs/advanced/document-store/qdrant.md +++ b/docs/advanced/document-store/qdrant.md @@ -94,6 +94,8 @@ The following configs can be set: | `m` | Number of edges per node in the index graph. Larger = more accurate search, more space required | `None`, defaults to the default value in Qdrant* | | `columns` | Other fields to store in Document | `None` | | `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | True | + *You can read more about the HNSW parameters and their default values [here](https://qdrant.tech/documentation/indexing/#vector-index) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index aee5d43560b..664a0490b3e 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -136,7 +136,7 @@ The following configs can be set: | `block_size` | Optional parameter for Redis FLAT algorithm | `1048576` | | `initial_cap` | Optional parameter for Redis HNSW and FLAT algorithm | `None`, defaults to the default value in Redis | | `columns` | Other fields to store in Document and build schema | `None` | -| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | `True` | You can check the default values in [the docarray source code](https://github.com/jina-ai/docarray/blob/main/docarray/array/storage/redis/backend.py). For vector search configurations, default values are those of the database backend, which you can find in the [Redis documentation](https://redis.io/docs/stack/search/reference/vectors/). diff --git a/docs/advanced/document-store/sqlite.md b/docs/advanced/document-store/sqlite.md index 6184588d1de..980af482527 100644 --- a/docs/advanced/document-store/sqlite.md +++ b/docs/advanced/document-store/sqlite.md @@ -33,12 +33,13 @@ Other functions behave the same as in-memory DocumentArray. The following configs can be set: -| Name | Description | Default | -|--------------------|------------------------------------------------------------------------------------------------------------------|--| -| `connection` | SQLite database filename | a random temp file | -| `table_name` | SQLite table name | a random name | -| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | None | -| `conn_config` | [Connection config pass to `sqlite3.connect`](https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection) | None | -| `journal_mode` | [SQLite Pragma: journal mode](https://www.sqlite.org/pragma.html#pragma_journal_mode) | `'DELETE'` | -| `synchronous` | [SQLite Pragma: synchronous](https://www.sqlite.org/pragma.html#pragma_synchronous) | `'OFF'` | -| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| Name | Description | Default | +|--------------------|----------------------------------------------------------------------------------------------------------------------------------------|--------------------| +| `connection` | SQLite database filename | a random temp file | +| `table_name` | SQLite table name | a random name | +| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | None | +| `conn_config` | [Connection config pass to `sqlite3.connect`](https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection) | None | +| `journal_mode` | [SQLite Pragma: journal mode](https://www.sqlite.org/pragma.html#pragma_journal_mode) | `'DELETE'` | +| `synchronous` | [SQLite Pragma: synchronous](https://www.sqlite.org/pragma.html#pragma_synchronous) | `'OFF'` | +| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | True | From da5e972e808ed6943afa1ddfdf56199ef492b17f Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 24 Nov 2022 18:33:58 +0800 Subject: [PATCH 22/37] refactor: change warning message Signed-off-by: AnneY --- docarray/array/mixins/find.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index 27548c0794c..5e0e290f04b 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -166,7 +166,7 @@ def get_root_docs(da, docs): results[:, 'tags__root_id'] ): raise ValueError( - f'Not all Documents in the subindex {on} have the "tags__root_id" attribute set.' + f'Not all Documents in the subindex {on} have the "root_id" attribute set in all `tags`.' ) if isinstance(self, DocumentArrayInMemory): From b3df9f35145671a40c94b2cb56c2a740b2b839ca Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 24 Nov 2022 18:34:39 +0800 Subject: [PATCH 23/37] feat: init _is_subindex in base _init_storage Signed-off-by: AnneY --- docarray/array/storage/base/backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py index 6baf101e96c..d06f271e2f3 100644 --- a/docarray/array/storage/base/backend.py +++ b/docarray/array/storage/base/backend.py @@ -20,6 +20,7 @@ def _init_storage( *args, **kwargs, ): + self._is_subindex = False self._load_offset2ids() def _init_subindices( From 6e33f98a62a759d50b0b1e77e7cb9013d3d4dad6 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 24 Nov 2022 19:16:02 +0800 Subject: [PATCH 24/37] feat: change root_id to private _root_id_ Signed-off-by: AnneY --- docarray/array/mixins/find.py | 6 +++--- docarray/array/storage/base/getsetdel.py | 4 +--- docarray/array/storage/base/seqlike.py | 8 +++----- tests/unit/array/mixins/test_find.py | 4 +++- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index 5e0e290f04b..f28784b4a4a 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -163,16 +163,16 @@ def get_root_docs(da, docs): return da if not isinstance(self, DocumentArrayInMemory) and not all( - results[:, 'tags__root_id'] + results[:, 'tags___root_id_'] ): raise ValueError( - f'Not all Documents in the subindex {on} have the "root_id" attribute set in all `tags`.' + f'Not all Documents in the subindex {on} have the "_root_id_" attribute set in all `tags`.' ) if isinstance(self, DocumentArrayInMemory): da = get_root_docs(self, results) else: - da = DocumentArray(self[results[:, 'tags__root_id']]) + da = DocumentArray(self[results[:, 'tags___root_id_']]) for d, s in zip(da, results[:, 'scores']): d.scores = s diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py index 6b1f54ce7c8..8899ed7b913 100644 --- a/docarray/array/storage/base/getsetdel.py +++ b/docarray/array/storage/base/getsetdel.py @@ -216,9 +216,7 @@ def _update_subindices_set(self, set_index, docs): ): for v in value: for doc in DocumentArray(v)[subindex_selector]: - doc.tags['root_id'] = ( - doc.tags['root_id'] if 'root_id' in doc.tags else v.id - ) + doc.tags['_root_id_'] = v.id subindex_da.extend(value[subindex_selector]) diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index 8f12f48fdc3..d93b6c065e3 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -17,9 +17,7 @@ def _update_subindices_append_extend(self, value): if getattr(da, '_config', None) and da._config.root_id: for v in value: for doc in DocumentArray(v)[selector]: - doc.tags['root_id'] = ( - doc.tags['root_id'] if 'root_id' in doc.tags else v.id - ) + doc.tags['_root_id_'] = v.id docs_selector = value[selector] if len(docs_selector) > 0: @@ -86,10 +84,10 @@ def extend(self, values: Iterable['Document'], **kwargs) -> None: UserWarning, ) elif self._config.root_id and not all( - [doc.tags.get('root_id', None) for doc in values] + [doc.tags.get('_root_id_', None) for doc in values] ): warnings.warn( - "root_id is enabled but not all documents have root_id set. This may cause unexpected behavior.", + "root_id is enabled but not all documents have _root_id_ set. This may cause unexpected behavior.", UserWarning, ) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index fdffd478ad9..55c1ed84b6b 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -1039,7 +1039,9 @@ def test_find_return_root(storage, config, subindex_configs, start_storage): ) if storage != 'memory': - assert all(d.tags['root_id'] in [f'{i}' for i in range(1, 6)] for d in da['@c']) + assert all( + d.tags['_root_id_'] in [f'{i}' for i in range(1, 6)] for d in da['@c'] + ) res = da.find(np.random.random(3), on='@c', return_root=True) assert len(res) > 0 From 56cb9d07c49df902abcf4edbea4a5636e1b0a139 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 24 Nov 2022 21:17:29 +0800 Subject: [PATCH 25/37] refactor: simplify code for find return root Signed-off-by: AnneY --- docarray/array/mixins/find.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index f28784b4a4a..93b431309e6 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -162,20 +162,18 @@ def get_root_docs(da, docs): da.append(result) return da - if not isinstance(self, DocumentArrayInMemory) and not all( - results[:, 'tags___root_id_'] - ): - raise ValueError( - f'Not all Documents in the subindex {on} have the "_root_id_" attribute set in all `tags`.' - ) - if isinstance(self, DocumentArrayInMemory): da = get_root_docs(self, results) else: + if not all(results[:, 'tags___root_id_']): + raise ValueError( + f'Not all Documents in the subindex {on} have the "_root_id_" attribute set in all `tags`.' + ) da = DocumentArray(self[results[:, 'tags___root_id_']]) for d, s in zip(da, results[:, 'scores']): d.scores = s + return da return results From 81824e735fb7a44104d83b3e54ce8d2e78d7c045 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 24 Nov 2022 21:40:08 +0800 Subject: [PATCH 26/37] refactor: move get_root_docs to common helper Signed-off-by: AnneY --- docarray/array/mixins/find.py | 17 +++-------------- docarray/helper.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index 93b431309e6..4fbd327f6ee 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -1,15 +1,13 @@ import abc -from typing import overload, Optional, Union, Dict, List, Tuple, Callable, TYPE_CHECKING +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union, overload import numpy as np - from docarray.math import ndarray from docarray.score import NamedScore if TYPE_CHECKING: # pragma: no cover - from docarray.typing import T, ArrayType - from docarray import Document, DocumentArray + from docarray.typing import ArrayType, T class FindMixin: @@ -151,16 +149,7 @@ def find( if return_root: from docarray.array.memory import DocumentArrayInMemory - - def get_root_docs(da, docs): - root_da_flat = da[...] - da = DocumentArray() - for doc in docs: - result = doc - while getattr(result, 'parent_id', None): - result = root_da_flat[result.parent_id] - da.append(result) - return da + from docarray.helper import get_root_docs if isinstance(self, DocumentArrayInMemory): da = get_root_docs(self, results) diff --git a/docarray/helper.py b/docarray/helper.py index b5858724517..8d619eee9b6 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -491,6 +491,19 @@ def _get_array_info(da: 'DocumentArray'): return is_homo, _nested_in, _nested_items, attr_counter, all_attrs_names +def get_root_docs(da: 'DocumentArray', docs: 'DocumentArray'): + from docarray import DocumentArray + + root_da_flat = da[...] + da = DocumentArray() + for doc in docs: + result = doc + while getattr(result, 'parent_id', None): + result = root_da_flat[result.parent_id] + da.append(result) + return da + + def login(interactive: Optional[bool] = None, force: bool = False, **kwargs): """Login to Jina AI Cloud account. :param interactive: If set to true, login will support notebook environments, otherwise the enviroment will be inferred. From 455e9c20c2ed27cc734949d1428a2fa75cba5e7d Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 24 Nov 2022 23:15:14 +0800 Subject: [PATCH 27/37] feat: add warning for subindex __setitem__ Signed-off-by: AnneY --- docarray/array/mixins/setitem.py | 4 ++++ docarray/array/storage/base/seqlike.py | 17 ++--------------- docarray/helper.py | 25 ++++++++++++++++++++++++- tests/unit/array/mixins/test_find.py | 17 ++++++++++------- 4 files changed, 40 insertions(+), 23 deletions(-) diff --git a/docarray/array/mixins/setitem.py b/docarray/array/mixins/setitem.py index 33968402c36..de1712c5683 100644 --- a/docarray/array/mixins/setitem.py +++ b/docarray/array/mixins/setitem.py @@ -63,6 +63,10 @@ def __setitem__( index: 'DocumentArrayIndexType', value: Union['Document', Sequence['Document']], ): + from docarray.helper import check_root_id + + if getattr(self, '_is_subindex', False): + check_root_id(self, value) self._update_subindices_set(index, value) # set by offset diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index d93b6c065e3..eedb8c45b2e 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -73,23 +73,10 @@ def __bool__(self): def extend(self, values: Iterable['Document'], **kwargs) -> None: - # TODO whether to add warning for DocumentArrayInMemory - from docarray.array.memory import DocumentArrayInMemory + from docarray.helper import check_root_id if getattr(self, '_is_subindex', False): - if isinstance(self, DocumentArrayInMemory): - if not all([getattr(doc, 'parent_id', None) for doc in values]): - warnings.warn( - "Not all documents have parent_id set. This may cause unexpected behavior.", - UserWarning, - ) - elif self._config.root_id and not all( - [doc.tags.get('_root_id_', None) for doc in values] - ): - warnings.warn( - "root_id is enabled but not all documents have _root_id_ set. This may cause unexpected behavior.", - UserWarning, - ) + check_root_id(self, values) self._extend(values, **kwargs) self._update_subindices_append_extend(values) diff --git a/docarray/helper.py b/docarray/helper.py index 8d619eee9b6..a6862bf2487 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -12,7 +12,7 @@ import hubble if TYPE_CHECKING: # pragma: no cover - from docarray import DocumentArray + from docarray import Document, DocumentArray __resources_path__ = os.path.join( os.path.dirname( @@ -504,6 +504,29 @@ def get_root_docs(da: 'DocumentArray', docs: 'DocumentArray'): return da +def check_root_id(da: 'DocumentArray', value: Union['Document', Sequence['Document']]): + + from docarray import Document + from docarray.array.memory import DocumentArrayInMemory + + if isinstance(value, Document): + value = [value] + + if isinstance(da, DocumentArrayInMemory): + if not all([getattr(doc, 'parent_id', None) for doc in value]): + warnings.warn( + "Not all documents have parent_id set. This may cause unexpected behavior.", + UserWarning, + ) + elif da._config.root_id and not all( + [doc.tags.get('_root_id_', None) for doc in value] + ): + warnings.warn( + "root_id is enabled but not all documents have _root_id_ set. This may cause unexpected behavior.", + UserWarning, + ) + + def login(interactive: Optional[bool] = None, force: bool = False, **kwargs): """Login to Jina AI Cloud account. :param interactive: If set to true, login will support notebook environments, otherwise the enviroment will be inferred. diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 55c1ed84b6b..8068cfe15f5 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -1080,8 +1080,8 @@ def test_subindex_root_id(storage, config, subindex_configs, start_storage): Document( id=f'{i}', chunks=[ - Document(id=f'sub{i}_0', embedding=np.random.random(3)), - Document(id=f'sub{i}_1', embedding=np.random.random(3)), + Document(id=f'sub{i}_0'), + Document(id=f'sub{i}_1'), ], ) for i in range(5) @@ -1089,11 +1089,14 @@ def test_subindex_root_id(storage, config, subindex_configs, start_storage): ) with pytest.warns(UserWarning): - new_da = DocumentArray( - [Document(id=f'temp{i}', embedding=np.random.random(3)) for i in range(10)] - ) + new_da = DocumentArray([Document(id=f'temp{i}') for i in range(10)]) new_da[:, 'id'] = da['@c'][:, 'id'] da['@c'] = new_da - with pytest.warns(UserWarning): - da['@c'].extend([Document(id='sub_extra', embedding=np.random.random(3))]) + da['@c'].extend([Document(id='sub_extra')]) + with pytest.warns(UserWarning): + da['@c']['sub0_0'] = Document(id='sub0_new') + with pytest.warns(UserWarning): + da['@c'][0] = Document(id='sub0_new') + with pytest.warns(UserWarning): + da['@c'][0:] = [Document(id='sub0_new'), Document(id='sub0_new2')] From 8bf42564fd3e07c98397800a82f23e10f7e1114f Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 24 Nov 2022 23:20:35 +0800 Subject: [PATCH 28/37] docs: add note for find return_root Signed-off-by: AnneY --- docs/fundamentals/documentarray/subindex.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/fundamentals/documentarray/subindex.md b/docs/fundamentals/documentarray/subindex.md index 810d3baca30..5ef387c369d 100644 --- a/docs/fundamentals/documentarray/subindex.md +++ b/docs/fundamentals/documentarray/subindex.md @@ -247,12 +247,11 @@ top_level_matches = da.find(query=np.random.rand(512), on='@c', return_root=True ````{admonition} Note :class: note -When you add more Documents to nested level, the `root_id` of new Documents should be set manually for `return_root=True` to work: +When you add or change Documents on nested level directly, the `_root_id_` (or `parent_id` for DocumentArrayInMemory) of new Documents should be set manually for `return_root=True` to work: ```python da['@c'].extend( - Document(embedding=np.random.random(512), tags={'root_id': 'your_root_id'}) + Document(embedding=np.random.random(512), tags={'_root_id_': 'your_root_id'}) ) ``` - ```` From f5dc4373a94340c81de1ccc88d9e44e64720736f Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 25 Nov 2022 14:37:44 +0800 Subject: [PATCH 29/37] refactor: use _is_subindex directly Signed-off-by: AnneY --- docarray/array/mixins/setitem.py | 2 +- docarray/array/storage/base/seqlike.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docarray/array/mixins/setitem.py b/docarray/array/mixins/setitem.py index de1712c5683..18832893567 100644 --- a/docarray/array/mixins/setitem.py +++ b/docarray/array/mixins/setitem.py @@ -65,7 +65,7 @@ def __setitem__( ): from docarray.helper import check_root_id - if getattr(self, '_is_subindex', False): + if self._is_subindex: check_root_id(self, value) self._update_subindices_set(index, value) diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py index eedb8c45b2e..5e46cafe607 100644 --- a/docarray/array/storage/base/seqlike.py +++ b/docarray/array/storage/base/seqlike.py @@ -75,7 +75,7 @@ def extend(self, values: Iterable['Document'], **kwargs) -> None: from docarray.helper import check_root_id - if getattr(self, '_is_subindex', False): + if self._is_subindex: check_root_id(self, values) self._extend(values, **kwargs) From 3b703b6eb1f081a5562a2c5a530cfce1829ed1c5 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 25 Nov 2022 18:04:52 +0800 Subject: [PATCH 30/37] refactor: move _get_root_docs to mixins Signed-off-by: AnneY --- docarray/array/mixins/find.py | 25 +++++++++++++------------ docarray/array/storage/memory/find.py | 16 ++++++++++++++++ docarray/helper.py | 13 ------------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/docarray/array/mixins/find.py b/docarray/array/mixins/find.py index 4fbd327f6ee..e028f7c5550 100644 --- a/docarray/array/mixins/find.py +++ b/docarray/array/mixins/find.py @@ -148,18 +148,7 @@ def find( ) if return_root: - from docarray.array.memory import DocumentArrayInMemory - from docarray.helper import get_root_docs - - if isinstance(self, DocumentArrayInMemory): - da = get_root_docs(self, results) - else: - if not all(results[:, 'tags___root_id_']): - raise ValueError( - f'Not all Documents in the subindex {on} have the "_root_id_" attribute set in all `tags`.' - ) - da = DocumentArray(self[results[:, 'tags___root_id_']]) - + da = self._get_root_docs(results) for d, s in zip(da, results[:, 'scores']): d.scores = s @@ -322,3 +311,15 @@ def _find_by_text(self, *args, **kwargs): raise NotImplementedError( f'Search by text is not supported with this backend {self.__class__.__name__}' ) + + def _get_root_docs(self, docs: 'DocumentArray') -> 'DocumentArray': + """Get the root documents of the current DocumentArray. + + :return: a `DocumentArray` containing the root documents. + """ + + if not all(docs[:, 'tags___root_id_']): + raise ValueError( + f'Not all Documents in this subindex have the "_root_id_" attribute set in all `tags`.' + ) + return self[docs[:, 'tags___root_id_']] diff --git a/docarray/array/storage/memory/find.py b/docarray/array/storage/memory/find.py index 9cbd07dfc32..ccf1855fa90 100644 --- a/docarray/array/storage/memory/find.py +++ b/docarray/array/storage/memory/find.py @@ -180,3 +180,19 @@ def _get_dist(da: 'DocumentArray'): idx = np.take_along_axis(top_inds, permutation, axis=1) return dist, idx + + def _get_root_docs(self, docs: 'DocumentArray') -> 'DocumentArray': + """Get the root documents of the current DocumentArray. + + :return: a `DocumentArray` containing the root documents. + """ + from docarray import DocumentArray + + root_da_flat = self[...] + da = DocumentArray() + for doc in docs: + result = doc + while getattr(result, 'parent_id', None): + result = root_da_flat[result.parent_id] + da.append(result) + return da diff --git a/docarray/helper.py b/docarray/helper.py index a6862bf2487..636da91dfb6 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -491,19 +491,6 @@ def _get_array_info(da: 'DocumentArray'): return is_homo, _nested_in, _nested_items, attr_counter, all_attrs_names -def get_root_docs(da: 'DocumentArray', docs: 'DocumentArray'): - from docarray import DocumentArray - - root_da_flat = da[...] - da = DocumentArray() - for doc in docs: - result = doc - while getattr(result, 'parent_id', None): - result = root_da_flat[result.parent_id] - da.append(result) - return da - - def check_root_id(da: 'DocumentArray', value: Union['Document', Sequence['Document']]): from docarray import Document From 736a4e639fea9802cf9a7c4dd0d3e1a1dbe2ec7e Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 25 Nov 2022 21:51:33 +0800 Subject: [PATCH 31/37] fix: check_root_id only when value is Doc Signed-off-by: AnneY --- docarray/helper.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docarray/helper.py b/docarray/helper.py index 636da91dfb6..14a204a5c4b 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -496,6 +496,11 @@ def check_root_id(da: 'DocumentArray', value: Union['Document', Sequence['Docume from docarray import Document from docarray.array.memory import DocumentArrayInMemory + if not isinstance(value, Document) or ( + isinstance(value, Sequence) and not isinstance(value[0], Document) + ): + return + if isinstance(value, Document): value = [value] From 8431aacb6d59c3c167b0eef8d14b46f77d2e83ad Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 25 Nov 2022 22:48:03 +0800 Subject: [PATCH 32/37] fix: fix logic in check_root_id Signed-off-by: AnneY --- docarray/helper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docarray/helper.py b/docarray/helper.py index 14a204a5c4b..6d5b676ada6 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -496,8 +496,9 @@ def check_root_id(da: 'DocumentArray', value: Union['Document', Sequence['Docume from docarray import Document from docarray.array.memory import DocumentArrayInMemory - if not isinstance(value, Document) or ( - isinstance(value, Sequence) and not isinstance(value[0], Document) + if not ( + isinstance(value, Document) + or (isinstance(value, Sequence) and isinstance(value[0], Document)) ): return From 5f8ca221f1c1fb0b575c0a9cbb93e3a15ca0ba36 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 28 Nov 2022 20:14:14 +0800 Subject: [PATCH 33/37] refactor: add comments Signed-off-by: AnneY --- docarray/array/storage/base/getsetdel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py index 8899ed7b913..97ce50f9a71 100644 --- a/docarray/array/storage/base/getsetdel.py +++ b/docarray/array/storage/base/getsetdel.py @@ -211,7 +211,7 @@ def _update_subindices_set(self, set_index, docs): value = DocumentArray(docs) if ( - getattr(subindex_da, '_config', None) + getattr(subindex_da, '_config', None) # checks if in-memory da and subindex_da._config.root_id ): for v in value: From b1fbb270eda812de6713cf3c69db70a91dfd7444 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 28 Nov 2022 20:26:14 +0800 Subject: [PATCH 34/37] docs: doc modification Signed-off-by: AnneY --- docs/advanced/document-store/weaviate.md | 4 +++- docs/fundamentals/documentarray/subindex.md | 19 ++----------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index 3563a9232cd..a60c2268ed2 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -103,7 +103,9 @@ The following configs can be set: | `flat_search_cutoff` | Absolute number of objects configured as the threshold for a flat-search cutoff. If a filter on a filtered vector search matches fewer than the specified elements, the HNSW index is bypassed entirely and a flat (brute-force) search is performed instead. This can speed up queries with very restrictive filters considerably. Optional, defaults to 40000. Set to 0 to turn off flat-search cutoff entirely. | `None`, defaults to the default value in Weaviate* | | `cleanup_interval_seconds` | How often the async process runs that “repairs” the HNSW graph after deletes and updates. (Prior to the repair/cleanup process, deleted objects are simply marked as deleted, but still a fully connected member of the HNSW graph. After the repair has run, the edges are reassigned and the datapoints deleted for good). Typically this value does not need to be adjusted, but if deletes or updates are very frequent it might make sense to adjust the value up or down. (Higher value means it runs less frequently, but cleans up more in a single batch. Lower value means it runs more frequently, but might not be as efficient with each run). | `None`, defaults to the default value in Weaviate* | | `skip` | There are situations where it doesn’t make sense to vectorize a class. For example if the class is just meant as glue between two other class (consisting only of references) or if the class contains mostly duplicate elements (Note that importing duplicate vectors into HNSW is very expensive as the algorithm uses a check whether a candidate’s distance is higher than the worst candidate’s distance for an early exit condition. With (mostly) identical vectors, this early exit condition is never met leading to an exhaustive search on each import or query). In this case, you can skip indexing a vector all-together. To do so, set "skip" to "true". skip defaults to false; if not set to true, classes will be indexed normally. This setting is immutable after class initialization. | `None`, defaults to the default value in Weaviate* | -| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | True | + *You can read more about the HNSW parameters and their default values [here](https://weaviate.io/developers/weaviate/current/vector-index-plugins/hnsw.html#how-to-use-hnsw-and-parameters) diff --git a/docs/fundamentals/documentarray/subindex.md b/docs/fundamentals/documentarray/subindex.md index 5ef387c369d..1091c7d6749 100644 --- a/docs/fundamentals/documentarray/subindex.md +++ b/docs/fundamentals/documentarray/subindex.md @@ -217,22 +217,7 @@ Document(embedding=np.random.rand(512)).match(da, on='@c') ``` ```` -Such a search will return Documents from the subindex. If you are interested in the top-level Documents associated with a match, you can retrieve them using `parent_id`: - -````{tab} Subindex with dataclass modalities -```python -top_image_matches = da.find(query=np.random.rand(512), on='@.[image]') -top_level_matches = da[top_image_matches[:, 'parent_id']] -``` -```` -````{tab} Subindex with chunks -```python -top_image_matches = da.find(query=np.random.rand(512), on='@c') -top_level_matches = da[top_image_matches[:, 'parent_id']] -``` -```` - -Or you can set `return_root=True` in `find` for DocumentArray in a {ref}`Document Store`: +Such a search will return Documents from the subindex. If you are interested in the top-level Documents associated with a match, you can retrieve them by setting `return_root=True` in `find`: ````{tab} Subindex with dataclass modalities ```python @@ -247,7 +232,7 @@ top_level_matches = da.find(query=np.random.rand(512), on='@c', return_root=True ````{admonition} Note :class: note -When you add or change Documents on nested level directly, the `_root_id_` (or `parent_id` for DocumentArrayInMemory) of new Documents should be set manually for `return_root=True` to work: +When you add or change Documents directly on a subindex, the `_root_id_` (or `parent_id` for DocumentArrayInMemory) of new Documents should be set manually for `return_root=True` to work: ```python da['@c'].extend( From 1c119a458550c34ae28e1abd0c6d817be1973eb8 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 28 Nov 2022 21:17:25 +0800 Subject: [PATCH 35/37] test: add test cases for test_find_return_root Signed-off-by: AnneY --- tests/unit/array/mixins/test_find.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 8068cfe15f5..0c87ae34858 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -1021,31 +1021,36 @@ def test_find_return_root(storage, config, subindex_configs, start_storage): Document( id=f'{i}', chunks=[ - Document(id=f'sub{i}_0', embedding=np.random.random(3)), - Document(id=f'sub{i}_1', embedding=np.random.random(3)), + Document(id=f'sub{i}', embedding=np.random.random(3)), ], ) - for i in range(5) + for i in range(9) ] ) da[0] = Document( - id='5', + id='9', embedding=np.random.random(3), chunks=[ - Document(id=f'sub5_0', embedding=np.random.random(3)), - Document(id=f'sub5_1', embedding=np.random.random(3)), + Document(id=f'sub9', embedding=np.random.random(3)), ], ) if storage != 'memory': assert all( - d.tags['_root_id_'] in [f'{i}' for i in range(1, 6)] for d in da['@c'] + d.tags['_root_id_'] in [f'{i}' for i in range(1, 10)] for d in da['@c'] ) - res = da.find(np.random.random(3), on='@c', return_root=True) - assert len(res) > 0 - assert all(d.id in [f'{i}' for i in range(1, 6)] for d in res) + query = np.random.random(3) + res = da.find(query, on='@c') + root_level_res = da.find(query, on='@c', return_root=True) + + res_root_id = [i.id[3] for i in res] + assert res_root_id == root_level_res[:, 'id'] + assert res[:, 'scores'] == root_level_res[:, 'scores'] + + assert len(root_level_res) > 0 + assert all(d.id in [f'{i}' for i in range(1, 10)] for d in root_level_res) @pytest.mark.parametrize( From 34d43ce07ae7cb9e7fe006666ed1ed8f3f83ccca Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 28 Nov 2022 22:18:22 +0800 Subject: [PATCH 36/37] docs: add root_id to redis doc Signed-off-by: AnneY --- docs/advanced/document-store/redis.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 664a0490b3e..c5267b4e61c 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -137,6 +137,7 @@ The following configs can be set: | `initial_cap` | Optional parameter for Redis HNSW and FLAT algorithm | `None`, defaults to the default value in Redis | | `columns` | Other fields to store in Document and build schema | `None` | | `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | `True` | +| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | `True` | You can check the default values in [the docarray source code](https://github.com/jina-ai/docarray/blob/main/docarray/array/storage/redis/backend.py). For vector search configurations, default values are those of the database backend, which you can find in the [Redis documentation](https://redis.io/docs/stack/search/reference/vectors/). From 98a12703cadbbb36b80644e26aeb30d23d844984 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 28 Nov 2022 23:16:50 +0800 Subject: [PATCH 37/37] refactor: add _is_subindex to constructor Signed-off-by: AnneY --- docarray/array/storage/annlite/backend.py | 2 +- docarray/array/storage/base/backend.py | 8 +++++--- docarray/array/storage/elastic/backend.py | 2 +- docarray/array/storage/milvus/backend.py | 2 +- docarray/array/storage/qdrant/backend.py | 2 +- docarray/array/storage/redis/backend.py | 2 +- docarray/array/storage/sqlite/backend.py | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py index bdb3c0f294d..49fc493be9f 100644 --- a/docarray/array/storage/annlite/backend.py +++ b/docarray/array/storage/annlite/backend.py @@ -105,7 +105,7 @@ def _init_storage( self._annlite = AnnLite(self.n_dim, lock=False, **filter_dict(config)) - super()._init_storage() + super()._init_storage(**kwargs) if _docs is None: return diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py index d06f271e2f3..effc784b9ec 100644 --- a/docarray/array/storage/base/backend.py +++ b/docarray/array/storage/base/backend.py @@ -17,10 +17,11 @@ def _init_storage( self, _docs: Optional['DocumentArraySourceType'] = None, copy: bool = False, + _is_subindex: bool = False, *args, **kwargs, ): - self._is_subindex = False + self._is_subindex = _is_subindex self._load_offset2ids() def _init_subindices( @@ -41,8 +42,9 @@ def _init_subindices( config_joined = self._ensure_unique_config( config, config_subindex, config_joined, name ) - self._subindices[name] = self.__class__(config=config_joined) - self._subindices[name]._is_subindex = True + self._subindices[name] = self.__class__( + config=config_joined, _is_subindex=True + ) if _docs: from docarray import DocumentArray diff --git a/docarray/array/storage/elastic/backend.py b/docarray/array/storage/elastic/backend.py index 439a114f893..1513987a4a5 100644 --- a/docarray/array/storage/elastic/backend.py +++ b/docarray/array/storage/elastic/backend.py @@ -101,7 +101,7 @@ def _init_storage( self._build_offset2id_index() # Note super()._init_storage() calls _load_offset2ids which calls _get_offset2ids_meta - super()._init_storage() + super()._init_storage(**kwargs) if _docs is None: return diff --git a/docarray/array/storage/milvus/backend.py b/docarray/array/storage/milvus/backend.py index ad39e5896ad..e09a24c093b 100644 --- a/docarray/array/storage/milvus/backend.py +++ b/docarray/array/storage/milvus/backend.py @@ -135,7 +135,7 @@ def _init_storage( self._collection = self._create_or_reuse_collection() self._offset2id_collection = self._create_or_reuse_offset2id_collection() self._build_index() - super()._init_storage() + super()._init_storage(**kwargs) # To align with Sqlite behavior; if `docs` is not `None` and table name # is provided, :class:`DocumentArraySqlite` will clear the existing diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index d08519b9706..13d27bb72ee 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -129,7 +129,7 @@ def _init_storage( self._initialize_qdrant_schema() - super()._init_storage() + super()._init_storage(**kwargs) if docs is None and config.collection_name: return diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 2f3ed67ef17..bb72fdab405 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -88,7 +88,7 @@ def _init_storage( self._client = self._build_client() self._build_index() - super()._init_storage() + super()._init_storage(**kwargs) if _docs is None: return diff --git a/docarray/array/storage/sqlite/backend.py b/docarray/array/storage/sqlite/backend.py index 7692f941523..5c98ef1e93b 100644 --- a/docarray/array/storage/sqlite/backend.py +++ b/docarray/array/storage/sqlite/backend.py @@ -102,7 +102,7 @@ def _init_storage( self._connection.commit() self._config = config self._list_like = config.list_like - super()._init_storage() + super()._init_storage(**kwargs) if _docs is None: return