From e28f8d78c52dfb6dacdaf2b163c742632d5bc503 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 24 Apr 2023 17:23:56 +0200 Subject: [PATCH 01/18] feat: add in-memory doc index Signed-off-by: anna-charlotte --- docarray/index/backends/hnswlib.py | 2 +- docarray/index/backends/in_memory.py | 265 ++++++++++++++++++++++++ tests/index/in_memory/__init__.py | 0 tests/index/in_memory/test_in_memory.py | 69 ++++++ 4 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 docarray/index/backends/in_memory.py create mode 100644 tests/index/in_memory/__init__.py create mode 100644 tests/index/in_memory/test_in_memory.py diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 4c66dc52de8..648217f035e 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -232,7 +232,7 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: """ - Execute a query on the WeaviateDocumentIndex. + Execute a query on the HnswDocumentIndex. Can take two kinds of inputs: diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py new file mode 100644 index 00000000000..e240b9fceee --- /dev/null +++ b/docarray/index/backends/in_memory.py @@ -0,0 +1,265 @@ +from dataclasses import dataclass, field +from typing import ( + Any, + Dict, + Generator, + Generic, + List, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, +) + +import numpy as np + +from docarray import BaseDoc, DocList +from docarray.index.abstract import BaseDocIndex, _raise_not_supported +from docarray.index.backends.hnswlib import _collect_query_args +from docarray.typing import ID, AnyTensor, NdArray +from docarray.typing.tensor.abstract_tensor import AbstractTensor +from docarray.utils.filter import filter_docs +from docarray.utils.find import ( + FindResult, + FindResultBatched, + _FindResult, + _FindResultBatched, + find, + find_batched, +) + +TSchema = TypeVar('TSchema', bound=BaseDoc) + + +class InMemoryDocIndex(BaseDocIndex, Generic[TSchema]): + def __init__(self, docs: Optional[DocList] = None, **kwargs): + super().__init__(db_config=None, **kwargs) + if docs is None: + self._docs = DocList[self._schema]() + else: + self._docs = docs + + def python_type_to_db_type(self, python_type: Type) -> Any: + """Map python type to database type. + Takes any python type and returns the corresponding database column type. + + :param python_type: a python type. + :return: the corresponding database column type, + or None if ``python_type`` is not supported. + """ + return python_type + + class QueryBuilder(BaseDocIndex.QueryBuilder): + def __init__(self, query: Optional[List[Tuple[str, Dict]]] = None): + super().__init__() + # list of tuples (method name, kwargs) + self._queries: List[Tuple[str, Dict]] = query or [] + + def build(self, *args, **kwargs) -> Any: + """Build the query object.""" + return self._queries + + find = _collect_query_args('find') + filter = _collect_query_args('filter') + text_search = _raise_not_supported('text_search') + find_batched = _raise_not_supported('find_batched') + filter_batched = _raise_not_supported('find_batched') + text_search_batched = _raise_not_supported('text_search') + + @dataclass + class DBConfig(BaseDocIndex.DBConfig): + """Dataclass that contains all "static" configurations of InMemoryDocIndex.""" + + pass + + @dataclass + class RuntimeConfig(BaseDocIndex.RuntimeConfig): + """Dataclass that contains all "dynamic" configurations of InMemoryDocIndex.""" + + default_column_config: Dict[Type, Dict[str, Any]] = field( + default_factory=lambda: { + np.ndarray: {}, + str: {}, + int: {}, + float: {}, + list: {}, + set: {}, + dict: {}, + ID: {}, + AbstractTensor: {}, + # `None` is not a Type, but we allow it here anyway + None: {}, # type: ignore + } + ) + + def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): + # implementing the public option because conversion to column dict is not needed + docs = self._validate_docs(docs) + self._docs.extend(docs) + + def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]): + raise NotImplementedError + + def num_docs(self) -> int: + return len(self._docs) + + def _del_items(self, doc_ids: Sequence[str]): + indices = [] + for i, doc in enumerate(self._docs): + if doc.id in doc_ids: + indices.append(i) + + for idx in reversed(indices): + self._docs.pop(idx) + + def _get_items( + self, doc_ids: Sequence[str] + ) -> Union[Sequence[TSchema], Sequence[Dict[str, Any]]]: + indices = [] + for i, doc in enumerate(self._docs): + if doc.id in doc_ids: + indices.append(i) + return self._docs[indices] + + def execute_query(self, query: List[str], *args, **kwargs) -> Any: + """ + Execute a query on the HnswDocumentIndex. + + Can take two kinds of inputs: + + 1. A native query of the underlying database. This is meant as a passthrough so that you + can enjoy any functionality that is not available through the Document index API. + 2. The output of this Document index' `QueryBuilder.build()` method. + + :param query: the query to execute + :param args: positional arguments to pass to the query + :param kwargs: keyword arguments to pass to the query + :return: the result of the query + """ + if args or kwargs: + raise ValueError( + f'args and kwargs not supported for `execute_query` on {type(self)}' + ) + + ann_docs = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) + filter_conditions = [] + doc_to_score: Dict[BaseDoc, Any] = {} + for op, op_kwargs in query: + if op == 'find': + docs, scores = self.find(**op_kwargs) + ann_docs.extend(docs) + doc_to_score.update(zip(docs.__getattribute__('id'), scores)) + elif op == 'filter': + filter_conditions.append(op_kwargs['filter_query']) + + self._logger.debug(f'Executing query {query}') + docs_filtered = ann_docs + for cond in filter_conditions: + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + docs_filtered = docs_cls(filter_docs(docs_filtered, cond)) + + self._logger.debug(f'{len(docs_filtered)} results found') + docs_and_scores = zip( + docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) + ) + docs_sorted = sorted(docs_and_scores, key=lambda x: x[1]) + out_docs, out_scores = zip(*docs_sorted) + return FindResult(documents=out_docs, scores=out_scores) + + def find( + self, + query: Union[AnyTensor, BaseDoc], + search_field: str = '', + limit: int = 10, + **kwargs, + ) -> FindResult: + + self._logger.debug(f'Executing `find` for search field {search_field}') + self._validate_search_field(search_field) + + docs, scores = find( + index=self._docs, + query=query, + search_field=search_field, + limit=limit, + ) + return FindResult(documents=DocList[self._schema](docs), scores=scores) + + def _find( + self, query: np.ndarray, limit: int, search_field: str = '' + ) -> _FindResult: + pass + + def find_batched( + self, + queries: Union[AnyTensor, DocList], + search_field: str = '', + limit: int = 10, + **kwargs, + ) -> FindResultBatched: + """Find documents in the index using nearest neighbor search. + + :param queries: query vector for KNN/ANN search. + Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, + or a DocList. + If a tensor-like is passed, it should have shape (batch_size, vector_dim) + :param search_field: name of the field to search on. + Documents in the index are retrieved based on this similarity + of this field to the query. + :param limit: maximum number of documents to return per query + :return: a named tuple containing `documents` and `scores` + """ + self._logger.debug(f'Executing `find_batched` for search field {search_field}') + self._validate_search_field(search_field) + + find_res = find_batched( + index=self._docs, + query=cast(NdArray, queries), + search_field=search_field, + limit=limit, + ) + + return find_res + + def _find_batched( + self, queries: np.ndarray, limit: int, search_field: str = '' + ) -> _FindResultBatched: + pass + + def filter( + self, + filter_query: Any, + limit: int = 10, + **kwargs, + ) -> DocList: + """Find documents in the index based on a filter query + + :param filter_query: the DB specific filter query to execute + :param limit: maximum number of documents to return + :return: a DocList containing the documents that match the filter query + """ + self._logger.debug(f'Executing `filter` for the query {filter_query}') + + docs = filter_docs(docs=self._docs, query=filter_query) + return cast(DocList, docs) + + def _filter(self, filter_query: Any, limit: int) -> Union[DocList, List[Dict]]: + pass + + def _filter_batched( + self, filter_queries: Any, limit: int + ) -> Union[List[DocList], List[List[Dict]]]: + raise NotImplementedError(f'{type(self)} does not support filtering.') + + def _text_search( + self, query: str, limit: int, search_field: str = '' + ) -> _FindResult: + raise NotImplementedError(f'{type(self)} does not support text search.') + + def _text_search_batched( + self, queries: Sequence[str], limit: int, search_field: str = '' + ) -> _FindResultBatched: + raise NotImplementedError(f'{type(self)} does not support text search.') diff --git a/tests/index/in_memory/__init__.py b/tests/index/in_memory/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/index/in_memory/test_in_memory.py b/tests/index/in_memory/test_in_memory.py new file mode 100644 index 00000000000..97a0b3abda4 --- /dev/null +++ b/tests/index/in_memory/test_in_memory.py @@ -0,0 +1,69 @@ +import numpy as np +import pytest + +from docarray import BaseDoc, DocList +from docarray.index.backends.in_memory import InMemoryDocIndex +from docarray.typing import NdArray + + +class SchemaDoc(BaseDoc): + text: str + price: int + tensor: NdArray[10] + + +@pytest.fixture +def docs(): + docs = DocList[SchemaDoc]( + [ + SchemaDoc(text=f'hello {i}', price=i, tensor=np.array([i] * 10)) + for i in range(9) + ] + ) + docs.append(SchemaDoc(text='good bye', price=100, tensor=np.array([100.0] * 10))) + return docs + + +def test_indexing(docs): + doc_index = InMemoryDocIndex[SchemaDoc]() + assert doc_index.num_docs() == 0 + + doc_index.index(docs) + assert doc_index.num_docs() == 10 + + +@pytest.fixture +def doc_index(docs): + doc_index = InMemoryDocIndex[SchemaDoc]() + doc_index.index(docs) + return doc_index + + +def test_del_item(docs, doc_index): + to_remove = [docs[0].id, docs[1].id] + doc_index._del_items(to_remove) + assert doc_index.num_docs() == 8 + + +def test_find(doc_index): + query = SchemaDoc(text='query', price=0, tensor=np.ones(10)) + docs, scores = doc_index.find(query, search_field='tensor', limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + assert doc_index.num_docs() == 10 + + +def test_concatenated_queries(doc_index): + query = SchemaDoc(text='query', price=0, tensor=np.ones(10)) + + q = ( + doc_index.build_query() + .find(query=query, search_field='tensor', limit=5) + .filter(filter_query={'price': {'$neq': 5}}) + .build() + ) + + docs, scores = doc_index.execute_query(q) + + assert len(docs) == 4 From 2436df1804775a97006acfad6cda289ae3bc7f49 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 08:09:58 +0200 Subject: [PATCH 02/18] fix: mypy Signed-off-by: anna-charlotte --- docarray/index/backends/in_memory.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index e240b9fceee..aedf302e068 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -37,6 +37,8 @@ class InMemoryDocIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, docs: Optional[DocList] = None, **kwargs): super().__init__(db_config=None, **kwargs) + + self._docs: DocList if docs is None: self._docs = DocList[self._schema]() else: @@ -63,10 +65,10 @@ def build(self, *args, **kwargs) -> Any: return self._queries find = _collect_query_args('find') + find_batched = _collect_query_args('find_batched') filter = _collect_query_args('filter') - text_search = _raise_not_supported('text_search') - find_batched = _raise_not_supported('find_batched') filter_batched = _raise_not_supported('find_batched') + text_search = _raise_not_supported('text_search') text_search_batched = _raise_not_supported('text_search') @dataclass @@ -104,6 +106,9 @@ def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]): raise NotImplementedError def num_docs(self) -> int: + """ + Get the number of documents. + """ return len(self._docs) def _del_items(self, doc_ids: Sequence[str]): @@ -124,9 +129,9 @@ def _get_items( indices.append(i) return self._docs[indices] - def execute_query(self, query: List[str], *args, **kwargs) -> Any: + def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: """ - Execute a query on the HnswDocumentIndex. + Execute a query on the InMemoryDocIndex. Can take two kinds of inputs: @@ -191,7 +196,7 @@ def find( def _find( self, query: np.ndarray, limit: int, search_field: str = '' ) -> _FindResult: - pass + raise NotImplementedError def find_batched( self, @@ -227,7 +232,7 @@ def find_batched( def _find_batched( self, queries: np.ndarray, limit: int, search_field: str = '' ) -> _FindResultBatched: - pass + raise NotImplementedError def filter( self, @@ -247,7 +252,7 @@ def filter( return cast(DocList, docs) def _filter(self, filter_query: Any, limit: int) -> Union[DocList, List[Dict]]: - pass + raise NotImplementedError def _filter_batched( self, filter_queries: Any, limit: int From c074e9ea65e8a121be6fcbdb12fd46c904e94d01 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 09:28:47 +0200 Subject: [PATCH 03/18] fix: add space param Signed-off-by: anna-charlotte --- docarray/index/backends/in_memory.py | 21 +++++++++++-- tests/index/in_memory/test_in_memory.py | 42 +++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index aedf302e068..2fa876b15ff 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -83,7 +83,6 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig): default_column_config: Dict[Type, Dict[str, Any]] = field( default_factory=lambda: { - np.ndarray: {}, str: {}, int: {}, float: {}, @@ -91,7 +90,8 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig): set: {}, dict: {}, ID: {}, - AbstractTensor: {}, + np.ndarray: {'space': 'cosine_sim'}, + AbstractTensor: {'space': 'cosine_sim'}, # `None` is not a Type, but we allow it here anyway None: {}, # type: ignore } @@ -181,15 +181,27 @@ def find( limit: int = 10, **kwargs, ) -> FindResult: + """Find documents in the index using nearest neighbor search. + :param query: query vector for KNN/ANN search. + Can be either a tensor-like (np.array, torch.Tensor, etc.) + with a single axis, or a Document + :param search_field: name of the field to search on. + Documents in the index are retrieved based on this similarity + of this field to the query. + :param limit: maximum number of documents to return + :return: a named tuple containing `documents` and `scores` + """ self._logger.debug(f'Executing `find` for search field {search_field}') self._validate_search_field(search_field) + config = self._column_infos[search_field].config docs, scores = find( index=self._docs, query=query, search_field=search_field, limit=limit, + metric=config['space'], ) return FindResult(documents=DocList[self._schema](docs), scores=scores) @@ -219,12 +231,14 @@ def find_batched( """ self._logger.debug(f'Executing `find_batched` for search field {search_field}') self._validate_search_field(search_field) + config = self._column_infos[search_field].config find_res = find_batched( index=self._docs, query=cast(NdArray, queries), search_field=search_field, limit=limit, + metric=config['space'], ) return find_res @@ -242,7 +256,8 @@ def filter( ) -> DocList: """Find documents in the index based on a filter query - :param filter_query: the DB specific filter query to execute + :param filter_query: the filter query to execute following the query + language of :param limit: maximum number of documents to return :return: a DocList containing the documents that match the filter query """ diff --git a/tests/index/in_memory/test_in_memory.py b/tests/index/in_memory/test_in_memory.py index 97a0b3abda4..6550a89220b 100644 --- a/tests/index/in_memory/test_in_memory.py +++ b/tests/index/in_memory/test_in_memory.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from pydantic import Field from docarray import BaseDoc, DocList from docarray.index.backends.in_memory import InMemoryDocIndex @@ -45,8 +46,19 @@ def test_del_item(docs, doc_index): assert doc_index.num_docs() == 8 -def test_find(doc_index): - query = SchemaDoc(text='query', price=0, tensor=np.ones(10)) +@pytest.mark.parametrize('space', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +@pytest.mark.parametrize('is_query_doc', [True, False]) +def test_find(doc_index, space, is_query_doc): + class MyDoc(BaseDoc): + text: str + price: int + tensor: NdArray[10] = Field(space=space) + + if is_query_doc: + query = MyDoc(text='query', price=0, tensor=np.ones(10)) + else: + query = np.ones(10) + docs, scores = doc_index.find(query, search_field='tensor', limit=5) assert len(docs) == 5 @@ -54,6 +66,32 @@ def test_find(doc_index): assert doc_index.num_docs() == 10 +@pytest.mark.parametrize('space', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +@pytest.mark.parametrize('is_query_doc', [True, False]) +def test_find_batched(doc_index, space, is_query_doc): + class MyDoc(BaseDoc): + text: str + price: int + tensor: NdArray[10] = Field(space=space) + + if is_query_doc: + query = DocList[MyDoc]( + [ + MyDoc(text='query 0', price=0, tensor=np.zeros(10)), + MyDoc(text='query 1', price=1, tensor=np.ones(10)), + ] + ) + else: + query = np.ones((2, 10)) + + docs, scores = doc_index.find_batched(query, search_field='tensor', limit=5) + + assert len(docs) == 2 + for result in docs: + assert len(result) == 5 + assert doc_index.num_docs() == 10 + + def test_concatenated_queries(doc_index): query = SchemaDoc(text='query', price=0, tensor=np.ones(10)) From 616d043f3dc3644d38ccf061a075c1248437aa40 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 10:03:56 +0200 Subject: [PATCH 04/18] fix: clean up docstrings Signed-off-by: anna-charlotte --- docarray/index/backends/in_memory.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index 2fa876b15ff..015baae31b2 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -36,6 +36,7 @@ class InMemoryDocIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, docs: Optional[DocList] = None, **kwargs): + """Initialize InMemoryDocIndex""" super().__init__(db_config=None, **kwargs) self._docs: DocList @@ -98,6 +99,17 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig): ) def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): + """index Documents into the index. + + !!! note + Passing a sequence of Documents that is not a DocList + (such as a List of Docs) comes at a performance penalty. + This is because the Index needs to check compatibility between itself and + the data. With a DocList as input this is a single check; for other inputs + compatibility needs to be checked for every Document individually. + + :param docs: Documents to index. + """ # implementing the public option because conversion to column dict is not needed docs = self._validate_docs(docs) self._docs.extend(docs) @@ -112,6 +124,10 @@ def num_docs(self) -> int: return len(self._docs) def _del_items(self, doc_ids: Sequence[str]): + """Delete Documents from the index. + + :param doc_ids: ids to delete from the Document Store + """ indices = [] for i, doc in enumerate(self._docs): if doc.id in doc_ids: @@ -123,6 +139,12 @@ def _del_items(self, doc_ids: Sequence[str]): def _get_items( self, doc_ids: Sequence[str] ) -> Union[Sequence[TSchema], Sequence[Dict[str, Any]]]: + """Get Documents from the index, by `id`. + If no document is found, a KeyError is raised. + + :param doc_ids: ids to get from the Document index + :return: Sequence of Documents, sorted corresponding to the order of `doc_ids`. Duplicate `doc_ids` can be omitted in the output. + """ indices = [] for i, doc in enumerate(self._docs): if doc.id in doc_ids: From 0e378fcbafe5938f99f3d0d438968fbce35faf69 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 10:10:34 +0200 Subject: [PATCH 05/18] docs: add in memory doc index Signed-off-by: anna-charlotte --- docs/user_guide/storing/index_in_memory.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/user_guide/storing/index_in_memory.md diff --git a/docs/user_guide/storing/index_in_memory.md b/docs/user_guide/storing/index_in_memory.md new file mode 100644 index 00000000000..3771e515368 --- /dev/null +++ b/docs/user_guide/storing/index_in_memory.md @@ -0,0 +1 @@ +# In-Memory Document Index \ No newline at end of file From 172a9a1d9191d3c1c44cf388c69399302e5208c7 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 10:18:08 +0200 Subject: [PATCH 06/18] docs: add in memory index to mkdocs.yml Signed-off-by: anna-charlotte --- mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mkdocs.yml b/mkdocs.yml index 8cc549c4f9a..23a9cb62eb1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -102,6 +102,7 @@ nav: - user_guide/storing/first_step.md - DocIndex: - user_guide/storing/docindex.md + - user_guide/storing/index_in_memory.md - user_guide/storing/index_hnswlib.md - user_guide/storing/index_weaviate.md - user_guide/storing/index_elastic.md From 4ca60d70714c6eda9f9732dddc23c62a62d51392 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 12:00:23 +0200 Subject: [PATCH 07/18] fix: use defaultdict in runtime config Signed-off-by: anna-charlotte --- docarray/index/backends/in_memory.py | 30 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index 015baae31b2..6df7bdcbfa5 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -1,3 +1,4 @@ +from collections import defaultdict from dataclasses import dataclass, field from typing import ( Any, @@ -19,7 +20,7 @@ from docarray import BaseDoc, DocList from docarray.index.abstract import BaseDocIndex, _raise_not_supported from docarray.index.backends.hnswlib import _collect_query_args -from docarray.typing import ID, AnyTensor, NdArray +from docarray.typing import AnyTensor, NdArray from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils.filter import filter_docs from docarray.utils.find import ( @@ -34,11 +35,17 @@ TSchema = TypeVar('TSchema', bound=BaseDoc) +def _get_default_dict() -> dict: + d = defaultdict(lambda: {}) + d[AbstractTensor] = {'space': 'cosine_sim'} + return d + + class InMemoryDocIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, docs: Optional[DocList] = None, **kwargs): """Initialize InMemoryDocIndex""" super().__init__(db_config=None, **kwargs) - + self._runtime_config = self.RuntimeConfig() self._docs: DocList if docs is None: self._docs = DocList[self._schema]() @@ -83,19 +90,12 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig): """Dataclass that contains all "dynamic" configurations of InMemoryDocIndex.""" default_column_config: Dict[Type, Dict[str, Any]] = field( - default_factory=lambda: { - str: {}, - int: {}, - float: {}, - list: {}, - set: {}, - dict: {}, - ID: {}, - np.ndarray: {'space': 'cosine_sim'}, - AbstractTensor: {'space': 'cosine_sim'}, - # `None` is not a Type, but we allow it here anyway - None: {}, # type: ignore - } + default_factory=lambda: defaultdict( + dict, + { + AbstractTensor: {'space': 'cosine_sim'}, + }, + ) ) def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): From 50b5590bb3e66288ac1c00436d75cb65942a171f Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 13:00:18 +0200 Subject: [PATCH 08/18] fix: mypy Signed-off-by: anna-charlotte --- docarray/index/__init__.py | 3 ++- docarray/index/backends/in_memory.py | 13 +++++-------- tests/index/in_memory/test_in_memory.py | 5 +++++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/docarray/index/__init__.py b/docarray/index/__init__.py index 9d6f2e469ba..30f6b3edf58 100644 --- a/docarray/index/__init__.py +++ b/docarray/index/__init__.py @@ -1,6 +1,7 @@ import types from typing import TYPE_CHECKING +from docarray.index.backends.in_memory import InMemoryDocIndex from docarray.utils._internal.misc import ( _get_path_from_docarray_root_level, import_library, @@ -13,7 +14,7 @@ from docarray.index.backends.qdrant import QdrantDocumentIndex # noqa: F401 from docarray.index.backends.weaviate import WeaviateDocumentIndex # noqa: F401 -__all__ = [] +__all__ = ['InMemoryDocIndex'] def __getattr__(name: str): diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index 6df7bdcbfa5..58efb59e407 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -35,12 +35,6 @@ TSchema = TypeVar('TSchema', bound=BaseDoc) -def _get_default_dict() -> dict: - d = defaultdict(lambda: {}) - d[AbstractTensor] = {'space': 'cosine_sim'} - return d - - class InMemoryDocIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, docs: Optional[DocList] = None, **kwargs): """Initialize InMemoryDocIndex""" @@ -48,7 +42,7 @@ def __init__(self, docs: Optional[DocList] = None, **kwargs): self._runtime_config = self.RuntimeConfig() self._docs: DocList if docs is None: - self._docs = DocList[self._schema]() + self._docs = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))() else: self._docs = docs @@ -225,7 +219,10 @@ def find( limit=limit, metric=config['space'], ) - return FindResult(documents=DocList[self._schema](docs), scores=scores) + docs_with_schema = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))( + docs + ) + return FindResult(documents=docs_with_schema, scores=scores) def _find( self, query: np.ndarray, limit: int, search_field: str = '' diff --git a/tests/index/in_memory/test_in_memory.py b/tests/index/in_memory/test_in_memory.py index 6550a89220b..e999ac15f30 100644 --- a/tests/index/in_memory/test_in_memory.py +++ b/tests/index/in_memory/test_in_memory.py @@ -46,6 +46,11 @@ def test_del_item(docs, doc_index): assert doc_index.num_docs() == 8 +def test_del(docs, doc_index): + del doc_index[docs[0].id] + assert doc_index.num_docs() == 9 + + @pytest.mark.parametrize('space', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) @pytest.mark.parametrize('is_query_doc', [True, False]) def test_find(doc_index, space, is_query_doc): From 32c35e6e3539b18aa9d5756aae705a703076e1fc Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 13:00:41 +0200 Subject: [PATCH 09/18] fix: ruff Signed-off-by: anna-charlotte --- tests/index/qdrant/fixtures.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/index/qdrant/fixtures.py b/tests/index/qdrant/fixtures.py index 33bfd862101..d44a0950d35 100644 --- a/tests/index/qdrant/fixtures.py +++ b/tests/index/qdrant/fixtures.py @@ -1,5 +1,3 @@ -import uuid - import pytest import qdrant_client From 1c77f7edf74eb85670c652b9d4a1c8349c187291 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 13:07:55 +0200 Subject: [PATCH 10/18] docs: add documentation Signed-off-by: anna-charlotte --- .../doc_index/backends/in_memory.md | 3 + docs/user_guide/storing/index_in_memory.md | 225 +++++++++++++++++- 2 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 docs/API_reference/doc_index/backends/in_memory.md diff --git a/docs/API_reference/doc_index/backends/in_memory.md b/docs/API_reference/doc_index/backends/in_memory.md new file mode 100644 index 00000000000..6fe76893440 --- /dev/null +++ b/docs/API_reference/doc_index/backends/in_memory.md @@ -0,0 +1,3 @@ +# InMemoryDocIndex + +::: docarray.index.backends.in_memory.InMemoryDocIndex diff --git a/docs/user_guide/storing/index_in_memory.md b/docs/user_guide/storing/index_in_memory.md index 3771e515368..cb9cdede99e 100644 --- a/docs/user_guide/storing/index_in_memory.md +++ b/docs/user_guide/storing/index_in_memory.md @@ -1 +1,224 @@ -# In-Memory Document Index \ No newline at end of file +# In-Memory Document Index + + + +[InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] stores all Documents in DocLists in memory. +It is a great starting point for small-sized datasets, but it is not battle tested in production. + +!!! note "Production readiness" + If scalability, uptime, etc. are important to you, we recommend you eventually transition to one of our + database-backed Document Index implementations: + + - [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex] + - [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex] + - [ElasticDocumentIndex][docarray.index.backends.elastic.ElasticDocIndex] + +## Basic Usage + +To see how to create a [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] instance, add Documents, +perform search, etc. see the [general user guide](./docindex.md). + +You can initialize the index as follows: + +```python +from docarray import BaseDoc, DocList +from docarray.index.backends.in_memory import InMemoryDocIndex +from docarray.typing import NdArray + + +class MyDoc(BaseDoc): + tensor: NdArray = None + + +docs = DocList[MyDoc](MyDoc() for _ in range(10)) + +doc_index = InMemoryDocIndex[MyDoc]() +doc_index.index(docs) + +# or in one step: +doc_index = InMemoryDocIndex[MyDoc](docs) +``` + +## Configuration + +This section lays out the configurations and options that are specific to [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex]. + +### RuntimeConfig + +The `RuntimeConfig` of [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] contains only one entry: +the default mapping from Python types to column configurations. + +You can see in the [section below](#field-wise-configurations) how to override configurations for specific fields. +If you want to set configurations globally, i.e. for all vector fields in your documents, you can do that using `RuntimeConfig`: + +```python +from collections import defaultdict +from docarray.typing import AbstractTensor + +index.configure( + default_column_config=defaultdict( + dict, + { + AbstractTensor: {'space': 'cosine_sim'}, + }, + ) +) +``` + +This will set the default configuration for all vector fields to the one specified in the example above. + +For more information on these settings, see [below](#field-wise-configurations). + +Fields that are not vector fields (e.g. of type `str` or `int` etc.) do not offer any configuration. + + +### Field-wise configurations + +For a vector field you can adjust the `space` parameter. It can be one of: + +- `'cosine_sim'` (default) +- `'euclidean_dist'` +- `'sqeuclidean_dist'` + +You pass it using the `field: Type = Field(...)` syntax: + +```python +from docarray import BaseDoc +from pydantic import Field + + +class Schema(BaseDoc): + tensor_1: NdArray[100] = Field(space='euclidean_dist') + tensor_2: NdArray[100] = Field(space='sqeuclidean_dist') +``` + +In the example above you can see how to configure two different vector fields, with two different sets of settings. + +## Nested Index + +When using the index, you can define multiple fields and their nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. `YouTubeVideoDoc` has `thumbnail` and `video` fields, each with their own `tensor`. + +```python +import numpy as np +from docarray import BaseDoc +from docarray.index.backends.in_memory import InMemoryDocIndex +from docarray.typing import ImageUrl, VideoUrl, AnyTensor +from pydantic import Field + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: AnyTensor = Field(space='cosine_sim') + + +class VideoDoc(BaseDoc): + url: VideoUrl + tensor: AnyTensor = Field(space='cosine_sim') + + +class YouTubeVideoDoc(BaseDoc): + title: str + description: str + thumbnail: ImageDoc + video: VideoDoc + tensor: AnyTensor = Field(space='cosine_sim') + + +doc_index = InMemoryDocIndex[YouTubeVideoDoc]() +index_docs = [ + YouTubeVideoDoc( + title=f'video {i+1}', + description=f'this is video from author {10*i}', + thumbnail=ImageDoc(url=f'http://example.ai/images/{i}', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/{i}', tensor=np.ones(128)), + tensor=np.ones(256), + ) + for i in range(8) +] +doc_index.index(index_docs) +``` + +## Search docs + +You can use the `search_field` to specify which field to use when performing the vector search. You can use the dunder operator to specify the field defined in the nested data. In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` and `video` field: + +```python +# find by the youtubevideo tensor +query = parse_obj_as(NdArray, np.ones(256)) +docs, scores = doc_index.find(query, search_field='tensor', limit=3) + +# find by the thumbnail tensor +query = parse_obj_as(NdArray, np.ones(64)) +docs, scores = doc_index.find(query, search_field='thumbnail__tensor', limit=3) + +# find by the video tensor +query = parse_obj_as(NdArray, np.ones(128)) +docs, scores = doc_index.find(query, search_field='video__tensor', limit=3) +``` + +## Filter docs + +You can filter your documents by using the `filter()` or `filter_batched()` method with a corresponding filter query. +The query should follow the query language of the DocArray's [`filter_docs()`][docarray.utils.filter.filter_docs] function. + +In the following example let filter for all the books that are cheaper than 29 dollars: + +```python +from docarray import BaseDoc, DocList + + +class Book(BaseDoc): + title: str + price: int + + +books = DocList[Book]([Book(title=f'title {i}', price=i * 10) for i in range(10)]) +book_index = InMemoryDocIndex[Book](books) + +# filter for books that are cheaper than 29 dollars +query = {'price': {'$lte': 29}} +cheap_books = book_index.filter(query) + +assert len(cheap_books) == 3 +for doc in cheap_books: + doc.summary() +``` + +
+ Output + ```text + 📄 Book : 1f7da15 ... + ╭──────────────────────┬───────────────╮ + │ Attribute │ Value │ + ├──────────────────────┼───────────────┤ + │ title: str │ title 0 │ + │ price: int │ 0 │ + ╰──────────────────────┴───────────────╯ + 📄 Book : 63fd13a ... + ╭──────────────────────┬───────────────╮ + │ Attribute │ Value │ + ├──────────────────────┼───────────────┤ + │ title: str │ title 1 │ + │ price: int │ 10 │ + ╰──────────────────────┴───────────────╯ + 📄 Book : 49b21de ... + ╭──────────────────────┬───────────────╮ + │ Attribute │ Value │ + ├──────────────────────┼───────────────┤ + │ title: str │ title 2 │ + │ price: int │ 20 │ + ╰──────────────────────┴───────────────╯ + ``` +
+ +## Delete docs + +To delete nested data, you need to specify the `id`. + +!!! note + You can only delete Documents at the top level. Deletion of Documents on lower levels is not yet supported. + +```python +# example of deleting nested and flat index +del doc_index[index_docs[6].id] +``` From 8d094f590337a2b68563488bfe20f89e3c6f5200 Mon Sep 17 00:00:00 2001 From: Charlotte Gerhaher Date: Tue, 25 Apr 2023 14:24:27 +0200 Subject: [PATCH 11/18] fix: apply suggestions from code review Co-authored-by: Alex Cureton-Griffiths Signed-off-by: Charlotte Gerhaher --- docarray/index/backends/in_memory.py | 6 +++--- docs/user_guide/storing/index_in_memory.md | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index 58efb59e407..1d3d7100eca 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -197,7 +197,7 @@ def find( limit: int = 10, **kwargs, ) -> FindResult: - """Find documents in the index using nearest neighbor search. + """Find Documents in the index using nearest-neighbor search. :param query: query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) @@ -205,7 +205,7 @@ def find( :param search_field: name of the field to search on. Documents in the index are retrieved based on this similarity of this field to the query. - :param limit: maximum number of documents to return + :param limit: maximum number of Documents to return :return: a named tuple containing `documents` and `scores` """ self._logger.debug(f'Executing `find` for search field {search_field}') @@ -236,7 +236,7 @@ def find_batched( limit: int = 10, **kwargs, ) -> FindResultBatched: - """Find documents in the index using nearest neighbor search. + """Find Documents in the index using nearest-neighbor search. :param queries: query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, diff --git a/docs/user_guide/storing/index_in_memory.md b/docs/user_guide/storing/index_in_memory.md index cb9cdede99e..37bc94b6fe1 100644 --- a/docs/user_guide/storing/index_in_memory.md +++ b/docs/user_guide/storing/index_in_memory.md @@ -3,10 +3,10 @@ [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] stores all Documents in DocLists in memory. -It is a great starting point for small-sized datasets, but it is not battle tested in production. +It is a great starting point for small datasets, but is not battle-tested in production. !!! note "Production readiness" - If scalability, uptime, etc. are important to you, we recommend you eventually transition to one of our + If scalability, uptime, etc. are important, we recommend you eventually transition to one of our database-backed Document Index implementations: - [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex] @@ -94,7 +94,7 @@ class Schema(BaseDoc): In the example above you can see how to configure two different vector fields, with two different sets of settings. -## Nested Index +## Nested index When using the index, you can define multiple fields and their nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. `YouTubeVideoDoc` has `thumbnail` and `video` fields, each with their own `tensor`. @@ -140,7 +140,7 @@ doc_index.index(index_docs) ## Search docs -You can use the `search_field` to specify which field to use when performing the vector search. You can use the dunder operator to specify the field defined in the nested data. In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` and `video` field: +You can use the `search_field` to specify which field to use when performing the vector search. You can use the dunder operator to specify the field defined in nested data. In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or the `tensor` field of the `thumbnail` and `video` field: ```python # find by the youtubevideo tensor @@ -156,12 +156,12 @@ query = parse_obj_as(NdArray, np.ones(128)) docs, scores = doc_index.find(query, search_field='video__tensor', limit=3) ``` -## Filter docs +## Filter Documents You can filter your documents by using the `filter()` or `filter_batched()` method with a corresponding filter query. The query should follow the query language of the DocArray's [`filter_docs()`][docarray.utils.filter.filter_docs] function. -In the following example let filter for all the books that are cheaper than 29 dollars: +In the following example let's filter for all the books that are cheaper than 29 dollars: ```python from docarray import BaseDoc, DocList From 7dd4d4734ec678dfb5adaf6d0b65fb0797beb027 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 14:52:01 +0200 Subject: [PATCH 12/18] fix: apply suggestion from samis code review Signed-off-by: anna-charlotte --- docarray/index/backends/in_memory.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index 1d3d7100eca..11265878c07 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -40,11 +40,11 @@ def __init__(self, docs: Optional[DocList] = None, **kwargs): """Initialize InMemoryDocIndex""" super().__init__(db_config=None, **kwargs) self._runtime_config = self.RuntimeConfig() - self._docs: DocList - if docs is None: - self._docs = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))() - else: - self._docs = docs + self._docs = ( + docs + if docs is not None + else DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))() + ) def python_type_to_db_type(self, python_type: Type) -> Any: """Map python type to database type. From 5ee6d595737320083dbe85520547fe938600633a Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 15:34:01 +0200 Subject: [PATCH 13/18] fix: apply johannes suggestion Signed-off-by: anna-charlotte --- docarray/index/backends/in_memory.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index 11265878c07..638fdbaa922 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -127,8 +127,7 @@ def _del_items(self, doc_ids: Sequence[str]): if doc.id in doc_ids: indices.append(i) - for idx in reversed(indices): - self._docs.pop(idx) + del self._docs[indices] def _get_items( self, doc_ids: Sequence[str] From 1874f98a84a7d46b3a795e7efdb4fbe745c2c049 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 16:09:41 +0200 Subject: [PATCH 14/18] fix: apply suggestions Signed-off-by: anna-charlotte --- docs/user_guide/storing/index_in_memory.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/user_guide/storing/index_in_memory.md b/docs/user_guide/storing/index_in_memory.md index 37bc94b6fe1..6c6382204ac 100644 --- a/docs/user_guide/storing/index_in_memory.md +++ b/docs/user_guide/storing/index_in_memory.md @@ -1,17 +1,11 @@ # In-Memory Document Index - [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] stores all Documents in DocLists in memory. -It is a great starting point for small datasets, but is not battle-tested in production. - -!!! note "Production readiness" - If scalability, uptime, etc. are important, we recommend you eventually transition to one of our - database-backed Document Index implementations: +It is a great starting point for small datasets, where you may not want to launch a database server. - - [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex] - - [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex] - - [ElasticDocumentIndex][docarray.index.backends.elastic.ElasticDocIndex] +For vector search and filtering the InMemoryDocIndex utilizes DocArray's [`find()`][docarray.utils.find.find] and +[`filter_docs()`][docarray.utils.filter.filter_docs] functions. ## Basic Usage @@ -140,7 +134,12 @@ doc_index.index(index_docs) ## Search docs -You can use the `search_field` to specify which field to use when performing the vector search. You can use the dunder operator to specify the field defined in nested data. In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or the `tensor` field of the `thumbnail` and `video` field: +To search Documents, the `InMemoryDocIndex` uses DocArray's [`find`][docarray.utils.find.find] function. + +You can use the `search_field` to specify which field to use when performing the vector search. +You can use the dunder operator to specify the field defined in nested data. +In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` +or the `tensor` field of the `thumbnail` and `video` field: ```python # find by the youtubevideo tensor @@ -158,6 +157,8 @@ docs, scores = doc_index.find(query, search_field='video__tensor', limit=3) ## Filter Documents +To filter Documents, the `InMemoryDocIndex` uses DocArray's [`filter_docs()`][docarray.utils.filter.filter_docs] function. + You can filter your documents by using the `filter()` or `filter_batched()` method with a corresponding filter query. The query should follow the query language of the DocArray's [`filter_docs()`][docarray.utils.filter.filter_docs] function. From ea86dda4b985f6de76a960263017d0d30d09bfd6 Mon Sep 17 00:00:00 2001 From: Charlotte Gerhaher Date: Tue, 25 Apr 2023 16:11:03 +0200 Subject: [PATCH 15/18] fix: apply suggestions from code review Co-authored-by: Alex Cureton-Griffiths Signed-off-by: Charlotte Gerhaher --- docs/user_guide/storing/index_in_memory.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/storing/index_in_memory.md b/docs/user_guide/storing/index_in_memory.md index 6c6382204ac..99db41a5c65 100644 --- a/docs/user_guide/storing/index_in_memory.md +++ b/docs/user_guide/storing/index_in_memory.md @@ -7,7 +7,7 @@ It is a great starting point for small datasets, where you may not want to launc For vector search and filtering the InMemoryDocIndex utilizes DocArray's [`find()`][docarray.utils.find.find] and [`filter_docs()`][docarray.utils.filter.filter_docs] functions. -## Basic Usage +## Basic usage To see how to create a [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] instance, add Documents, perform search, etc. see the [general user guide](./docindex.md). @@ -43,7 +43,7 @@ The `RuntimeConfig` of [InMemoryDocIndex][docarray.index.backends.in_memory.InMe the default mapping from Python types to column configurations. You can see in the [section below](#field-wise-configurations) how to override configurations for specific fields. -If you want to set configurations globally, i.e. for all vector fields in your documents, you can do that using `RuntimeConfig`: +If you want to set configurations globally, i.e. for all vector fields in your Documents, you can do that using `RuntimeConfig`: ```python from collections import defaultdict From 0155af3bc647058cf61edb31519ccc2a6542040b Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 16:13:09 +0200 Subject: [PATCH 16/18] fix: apply suggestions Signed-off-by: anna-charlotte --- docs/user_guide/storing/index_in_memory.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user_guide/storing/index_in_memory.md b/docs/user_guide/storing/index_in_memory.md index 99db41a5c65..a45373ce5b0 100644 --- a/docs/user_guide/storing/index_in_memory.md +++ b/docs/user_guide/storing/index_in_memory.md @@ -139,7 +139,7 @@ To search Documents, the `InMemoryDocIndex` uses DocArray's [`find`][docarray.ut You can use the `search_field` to specify which field to use when performing the vector search. You can use the dunder operator to specify the field defined in nested data. In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` -or the `tensor` field of the `thumbnail` and `video` field: +or the `tensor` field of the `thumbnail` and `video` fields: ```python # find by the youtubevideo tensor From a6146fe33d50c383000c09a7371c360638c0fc36 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 17:02:33 +0200 Subject: [PATCH 17/18] fix: extract duplicate code to helper.py Signed-off-by: anna-charlotte --- docarray/index/backends/helper.py | 58 ++++++++++++++++++++++++++++ docarray/index/backends/hnswlib.py | 47 ++++------------------ docarray/index/backends/in_memory.py | 33 ++++------------ 3 files changed, 74 insertions(+), 64 deletions(-) create mode 100644 docarray/index/backends/helper.py diff --git a/docarray/index/backends/helper.py b/docarray/index/backends/helper.py new file mode 100644 index 00000000000..e8739fdfcb4 --- /dev/null +++ b/docarray/index/backends/helper.py @@ -0,0 +1,58 @@ +from typing import Any, Dict, List, Tuple, Type, cast + +from docarray import BaseDoc, DocList +from docarray.index.abstract import BaseDocIndex +from docarray.utils.filter import filter_docs +from docarray.utils.find import FindResult + + +def _collect_query_args(method_name: str): # TODO: use partialmethod instead + def inner(self, *args, **kwargs): + if args: + raise ValueError( + f'Positional arguments are not supported for ' + f'`{type(self)}.{method_name}`.' + f' Use keyword arguments instead.' + ) + updated_query = self._queries + [(method_name, kwargs)] + return type(self)(updated_query) + + return inner + + +def _execute_find_and_filter_query( + doc_index: BaseDocIndex, query: List[Tuple[str, Dict]] +) -> FindResult: + """ + Executes all find calls from query first using `doc_index.find()`, + and filtering queries after that using DocArray's `filter_docs()`. + + Text search is not supported. + """ + docs_found = DocList.__class_getitem__(cast(Type[BaseDoc], doc_index._schema))([]) + filter_conditions = [] + doc_to_score: Dict[BaseDoc, Any] = {} + for op, op_kwargs in query: + if op == 'find': + docs, scores = doc_index.find(**op_kwargs) + docs_found.extend(docs) + doc_to_score.update(zip(docs.__getattribute__('id'), scores)) + elif op == 'filter': + filter_conditions.append(op_kwargs['filter_query']) + else: + raise ValueError(f'Query operation is not supported: {op}') + + doc_index._logger.debug(f'Executing query {query}') + docs_filtered = docs_found + for cond in filter_conditions: + docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], doc_index._schema)) + docs_filtered = docs_cls(filter_docs(docs_filtered, cond)) + + doc_index._logger.debug(f'{len(docs_filtered)} results found') + docs_and_scores = zip( + docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) + ) + docs_sorted = sorted(docs_and_scores, key=lambda x: x[1]) + out_docs, out_scores = zip(*docs_sorted) + + return FindResult(documents=out_docs, scores=out_scores) diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index 648217f035e..6caf5817272 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -27,11 +27,14 @@ _raise_not_composable, _raise_not_supported, ) +from docarray.index.backends.helper import ( + _collect_query_args, + _execute_find_and_filter_query, +) from docarray.proto import DocProto from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.ndarray import NdArray from docarray.utils._internal.misc import import_library, is_np_int -from docarray.utils.filter import filter_docs from docarray.utils.find import _FindResult, _FindResultBatched if TYPE_CHECKING: @@ -61,20 +64,6 @@ T = TypeVar('T', bound='HnswDocumentIndex') -def _collect_query_args(method_name: str): # TODO: use partialmethod instead - def inner(self, *args, **kwargs): - if args: - raise ValueError( - f'Positional arguments are not supported for ' - f'`{type(self)}.{method_name}`.' - f' Use keyword arguments instead.' - ) - updated_query = self._queries + [(method_name, kwargs)] - return type(self)(updated_query) - - return inner - - class HnswDocumentIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, db_config=None, **kwargs): """Initialize HnswDocumentIndex""" @@ -249,31 +238,11 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: raise ValueError( f'args and kwargs not supported for `execute_query` on {type(self)}' ) - - ann_docs = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) - filter_conditions = [] - doc_to_score: Dict[BaseDoc, Any] = {} - for op, op_kwargs in query: - if op == 'find': - docs, scores = self.find(**op_kwargs) - ann_docs.extend(docs) - doc_to_score.update(zip(docs.__getattribute__('id'), scores)) - elif op == 'filter': - filter_conditions.append(op_kwargs['filter_query']) - - self._logger.debug(f'Executing query {query}') - docs_filtered = ann_docs - for cond in filter_conditions: - docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - docs_filtered = docs_cls(filter_docs(docs_filtered, cond)) - - self._logger.debug(f'{len(docs_filtered)} results found') - docs_and_scores = zip( - docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) + find_res = _execute_find_and_filter_query( + doc_index=self, + query=query, ) - docs_sorted = sorted(docs_and_scores, key=lambda x: x[1]) - out_docs, out_scores = zip(*docs_sorted) - return _FindResult(documents=out_docs, scores=out_scores) + return find_res def _find_batched( self, diff --git a/docarray/index/backends/in_memory.py b/docarray/index/backends/in_memory.py index 638fdbaa922..853cc47d2b1 100644 --- a/docarray/index/backends/in_memory.py +++ b/docarray/index/backends/in_memory.py @@ -19,7 +19,10 @@ from docarray import BaseDoc, DocList from docarray.index.abstract import BaseDocIndex, _raise_not_supported -from docarray.index.backends.hnswlib import _collect_query_args +from docarray.index.backends.helper import ( + _collect_query_args, + _execute_find_and_filter_query, +) from docarray.typing import AnyTensor, NdArray from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.utils.filter import filter_docs @@ -163,31 +166,11 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: raise ValueError( f'args and kwargs not supported for `execute_query` on {type(self)}' ) - - ann_docs = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) - filter_conditions = [] - doc_to_score: Dict[BaseDoc, Any] = {} - for op, op_kwargs in query: - if op == 'find': - docs, scores = self.find(**op_kwargs) - ann_docs.extend(docs) - doc_to_score.update(zip(docs.__getattribute__('id'), scores)) - elif op == 'filter': - filter_conditions.append(op_kwargs['filter_query']) - - self._logger.debug(f'Executing query {query}') - docs_filtered = ann_docs - for cond in filter_conditions: - docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - docs_filtered = docs_cls(filter_docs(docs_filtered, cond)) - - self._logger.debug(f'{len(docs_filtered)} results found') - docs_and_scores = zip( - docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) + find_res = _execute_find_and_filter_query( + doc_index=self, + query=query, ) - docs_sorted = sorted(docs_and_scores, key=lambda x: x[1]) - out_docs, out_scores = zip(*docs_sorted) - return FindResult(documents=out_docs, scores=out_scores) + return find_res def find( self, From 6cd6114e7f8d3f356b811aa2078244cc27efc458 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 25 Apr 2023 17:02:50 +0200 Subject: [PATCH 18/18] Fix: docs Signed-off-by: anna-charlotte --- docs/user_guide/storing/index_in_memory.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/storing/index_in_memory.md b/docs/user_guide/storing/index_in_memory.md index a45373ce5b0..88bab5ce5c2 100644 --- a/docs/user_guide/storing/index_in_memory.md +++ b/docs/user_guide/storing/index_in_memory.md @@ -132,7 +132,7 @@ index_docs = [ doc_index.index(index_docs) ``` -## Search docs +## Search Documents To search Documents, the `InMemoryDocIndex` uses DocArray's [`find`][docarray.utils.find.find] function. @@ -212,7 +212,7 @@ for doc in cheap_books: ``` -## Delete docs +## Delete Documents To delete nested data, you need to specify the `id`.