from typing import Any, Dict, List, Tuple, Type, cast

from docarray import BaseDoc, DocList
from docarray.index.abstract import BaseDocIndex
from docarray.utils.filter import filter_docs
from docarray.utils.find import FindResult


def _collect_query_args(method_name: str):  # TODO: use partialmethod instead
    """Return a QueryBuilder method that records a call instead of executing it.

    The generated method appends ``(method_name, kwargs)`` to ``self._queries``
    and returns a new builder instance carrying the extended query list, so
    calls can be chained fluently.

    :param method_name: name of the query operation to record (e.g. ``'find'``).
    :return: a function suitable for use as a ``QueryBuilder`` method.
    """

    def inner(self, *args, **kwargs):
        # Positional args are rejected: the recorded kwargs must be
        # re-playable by keyword when the query is finally executed.
        if args:
            raise ValueError(
                f'Positional arguments are not supported for '
                f'`{type(self)}.{method_name}`.'
                f' Use keyword arguments instead.'
            )
        updated_query = self._queries + [(method_name, kwargs)]
        return type(self)(updated_query)

    return inner


def _execute_find_and_filter_query(
    doc_index: BaseDocIndex, query: List[Tuple[str, Dict]]
) -> FindResult:
    """
    Executes all find calls from query first using `doc_index.find()`,
    and filtering queries after that using DocArray's `filter_docs()`.

    Text search is not supported.

    :param doc_index: the Document index to run the ``find`` calls against.
    :param query: list of ``(operation, kwargs)`` tuples; supported operations
        are ``'find'`` and ``'filter'``.
    :return: a ``FindResult`` of the matching documents, sorted by score.
    :raises ValueError: if the query contains an unsupported operation.
    """
    docs_found = DocList.__class_getitem__(cast(Type[BaseDoc], doc_index._schema))([])
    filter_conditions = []
    # maps document id -> score assigned by the `find` call that retrieved it
    doc_to_score: Dict[str, Any] = {}
    for op, op_kwargs in query:
        if op == 'find':
            docs, scores = doc_index.find(**op_kwargs)
            docs_found.extend(docs)
            doc_to_score.update(zip(docs.id, scores))
        elif op == 'filter':
            filter_conditions.append(op_kwargs['filter_query'])
        else:
            raise ValueError(f'Query operation is not supported: {op}')

    doc_index._logger.debug(f'Executing query {query}')
    docs_filtered = docs_found
    for cond in filter_conditions:
        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], doc_index._schema))
        docs_filtered = docs_cls(filter_docs(docs_filtered, cond))

    doc_index._logger.debug(f'{len(docs_filtered)} results found')
    docs_and_scores = zip(
        docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered)
    )
    docs_sorted = sorted(docs_and_scores, key=lambda x: x[1])
    # Guard the empty case: `zip(*[])` yields nothing, so unpacking into two
    # names would raise ValueError when no document matched.
    if not docs_sorted:
        return FindResult(documents=docs_filtered, scores=[])
    out_docs, out_scores = zip(*docs_sorted)

    return FindResult(documents=out_docs, scores=out_scores)
ValueError( - f'Positional arguments are not supported for ' - f'`{type(self)}.{method_name}`.' - f' Use keyword arguments instead.' - ) - updated_query = self._queries + [(method_name, kwargs)] - return type(self)(updated_query) - - return inner - - class HnswDocumentIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, db_config=None, **kwargs): """Initialize HnswDocumentIndex""" @@ -232,7 +221,7 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs): def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: """ - Execute a query on the WeaviateDocumentIndex. + Execute a query on the HnswDocumentIndex. Can take two kinds of inputs: @@ -249,31 +238,11 @@ def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any: raise ValueError( f'args and kwargs not supported for `execute_query` on {type(self)}' ) - - ann_docs = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))([]) - filter_conditions = [] - doc_to_score: Dict[BaseDoc, Any] = {} - for op, op_kwargs in query: - if op == 'find': - docs, scores = self.find(**op_kwargs) - ann_docs.extend(docs) - doc_to_score.update(zip(docs.__getattribute__('id'), scores)) - elif op == 'filter': - filter_conditions.append(op_kwargs['filter_query']) - - self._logger.debug(f'Executing query {query}') - docs_filtered = ann_docs - for cond in filter_conditions: - docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) - docs_filtered = docs_cls(filter_docs(docs_filtered, cond)) - - self._logger.debug(f'{len(docs_filtered)} results found') - docs_and_scores = zip( - docs_filtered, (doc_to_score[doc.id] for doc in docs_filtered) + find_res = _execute_find_and_filter_query( + doc_index=self, + query=query, ) - docs_sorted = sorted(docs_and_scores, key=lambda x: x[1]) - out_docs, out_scores = zip(*docs_sorted) - return _FindResult(documents=out_docs, scores=out_scores) + return find_res def _find_batched( self, diff --git 
from collections import defaultdict
from dataclasses import dataclass, field
from typing import (
    Any,
    Dict,
    Generator,
    Generic,
    List,
    Optional,
    Sequence,
    Tuple,
    Type,
    TypeVar,
    Union,
    cast,
)

import numpy as np

from docarray import BaseDoc, DocList
from docarray.index.abstract import BaseDocIndex, _raise_not_supported
from docarray.index.backends.helper import (
    _collect_query_args,
    _execute_find_and_filter_query,
)
from docarray.typing import AnyTensor, NdArray
from docarray.typing.tensor.abstract_tensor import AbstractTensor
from docarray.utils.filter import filter_docs
from docarray.utils.find import (
    FindResult,
    FindResultBatched,
    _FindResult,
    _FindResultBatched,
    find,
    find_batched,
)

TSchema = TypeVar('TSchema', bound=BaseDoc)


class InMemoryDocIndex(BaseDocIndex, Generic[TSchema]):
    """Document index that keeps all Documents in a DocList in memory.

    Vector search and filtering are delegated to DocArray's in-memory
    `find()` / `find_batched()` and `filter_docs()` utilities; no database
    server is involved.
    """

    def __init__(self, docs: Optional[DocList] = None, **kwargs):
        """Initialize InMemoryDocIndex

        :param docs: initial content of the index; an empty DocList of the
            schema type is created if omitted.
        """
        super().__init__(db_config=None, **kwargs)
        self._runtime_config = self.RuntimeConfig()
        self._docs = (
            docs
            if docs is not None
            else DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))()
        )

    def python_type_to_db_type(self, python_type: Type) -> Any:
        """Map python type to database type.
        Takes any python type and returns the corresponding database column type.

        :param python_type: a python type.
        :return: the corresponding database column type,
            or None if ``python_type`` is not supported.
        """
        # In-memory storage keeps native Python objects, so the mapping
        # is the identity.
        return python_type

    class QueryBuilder(BaseDocIndex.QueryBuilder):
        def __init__(self, query: Optional[List[Tuple[str, Dict]]] = None):
            super().__init__()
            # list of tuples (method name, kwargs)
            self._queries: List[Tuple[str, Dict]] = query or []

        def build(self, *args, **kwargs) -> Any:
            """Build the query object."""
            return self._queries

        find = _collect_query_args('find')
        find_batched = _collect_query_args('find_batched')
        filter = _collect_query_args('filter')
        # fix: each unsupported method must report its own name, otherwise the
        # raised error message points the user at the wrong method
        filter_batched = _raise_not_supported('filter_batched')
        text_search = _raise_not_supported('text_search')
        text_search_batched = _raise_not_supported('text_search_batched')

    @dataclass
    class DBConfig(BaseDocIndex.DBConfig):
        """Dataclass that contains all "static" configurations of InMemoryDocIndex."""

        pass

    @dataclass
    class RuntimeConfig(BaseDocIndex.RuntimeConfig):
        """Dataclass that contains all "dynamic" configurations of InMemoryDocIndex."""

        # default mapping from Python types to per-column configuration;
        # only tensor-like fields carry a setting (the distance metric)
        default_column_config: Dict[Type, Dict[str, Any]] = field(
            default_factory=lambda: defaultdict(
                dict,
                {
                    AbstractTensor: {'space': 'cosine_sim'},
                },
            )
        )

    def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
        """index Documents into the index.

        !!! note
            Passing a sequence of Documents that is not a DocList
            (such as a List of Docs) comes at a performance penalty.
            This is because the Index needs to check compatibility between itself and
            the data. With a DocList as input this is a single check; for other inputs
            compatibility needs to be checked for every Document individually.

        :param docs: Documents to index.
        """
        # implementing the public option because conversion to column dict is not needed
        docs = self._validate_docs(docs)
        self._docs.extend(docs)

    def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]):
        # column-wise indexing is bypassed; the public `index()` stores
        # documents directly
        raise NotImplementedError

    def num_docs(self) -> int:
        """
        Get the number of documents.
        """
        return len(self._docs)

    def _del_items(self, doc_ids: Sequence[str]):
        """Delete Documents from the index.

        :param doc_ids: ids to delete from the Document Store
        """
        indices = []
        for i, doc in enumerate(self._docs):
            if doc.id in doc_ids:
                indices.append(i)

        del self._docs[indices]

    def _get_items(
        self, doc_ids: Sequence[str]
    ) -> Union[Sequence[TSchema], Sequence[Dict[str, Any]]]:
        """Get Documents from the index, by `id`.
        If no document is found, a KeyError is raised.

        :param doc_ids: ids to get from the Document index
        :return: Sequence of Documents, sorted corresponding to the order of `doc_ids`. Duplicate `doc_ids` can be omitted in the output.
        """
        indices = []
        for i, doc in enumerate(self._docs):
            if doc.id in doc_ids:
                indices.append(i)
        return self._docs[indices]

    def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any:
        """
        Execute a query on the InMemoryDocIndex.

        Can take two kinds of inputs:

        1. A native query of the underlying database. This is meant as a passthrough so that you
        can enjoy any functionality that is not available through the Document index API.
        2. The output of this Document index' `QueryBuilder.build()` method.

        :param query: the query to execute
        :param args: positional arguments to pass to the query
        :param kwargs: keyword arguments to pass to the query
        :return: the result of the query
        """
        if args or kwargs:
            raise ValueError(
                f'args and kwargs not supported for `execute_query` on {type(self)}'
            )
        find_res = _execute_find_and_filter_query(
            doc_index=self,
            query=query,
        )
        return find_res

    def find(
        self,
        query: Union[AnyTensor, BaseDoc],
        search_field: str = '',
        limit: int = 10,
        **kwargs,
    ) -> FindResult:
        """Find Documents in the index using nearest-neighbor search.

        :param query: query vector for KNN/ANN search.
            Can be either a tensor-like (np.array, torch.Tensor, etc.)
            with a single axis, or a Document
        :param search_field: name of the field to search on.
            Documents in the index are retrieved based on the similarity
            of this field to the query.
        :param limit: maximum number of Documents to return
        :return: a named tuple containing `documents` and `scores`
        """
        self._logger.debug(f'Executing `find` for search field {search_field}')
        self._validate_search_field(search_field)
        config = self._column_infos[search_field].config

        docs, scores = find(
            index=self._docs,
            query=query,
            search_field=search_field,
            limit=limit,
            metric=config['space'],
        )
        # re-wrap in a schema-typed DocList so callers get the index schema back
        docs_with_schema = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))(
            docs
        )
        return FindResult(documents=docs_with_schema, scores=scores)

    def _find(
        self, query: np.ndarray, limit: int, search_field: str = ''
    ) -> _FindResult:
        # the public `find()` handles search directly
        raise NotImplementedError

    def find_batched(
        self,
        queries: Union[AnyTensor, DocList],
        search_field: str = '',
        limit: int = 10,
        **kwargs,
    ) -> FindResultBatched:
        """Find Documents in the index using nearest-neighbor search.

        :param queries: query vectors for KNN/ANN search.
            Can be either a tensor-like (np.array, torch.Tensor, etc.),
            or a DocList.
            If a tensor-like is passed, it should have shape (batch_size, vector_dim)
        :param search_field: name of the field to search on.
            Documents in the index are retrieved based on the similarity
            of this field to the query.
        :param limit: maximum number of documents to return per query
        :return: a named tuple containing `documents` and `scores`
        """
        self._logger.debug(f'Executing `find_batched` for search field {search_field}')
        self._validate_search_field(search_field)
        config = self._column_infos[search_field].config

        find_res = find_batched(
            index=self._docs,
            query=cast(NdArray, queries),
            search_field=search_field,
            limit=limit,
            metric=config['space'],
        )

        return find_res

    def _find_batched(
        self, queries: np.ndarray, limit: int, search_field: str = ''
    ) -> _FindResultBatched:
        # the public `find_batched()` handles search directly
        raise NotImplementedError

    def filter(
        self,
        filter_query: Any,
        limit: int = 10,
        **kwargs,
    ) -> DocList:
        """Find documents in the index based on a filter query

        :param filter_query: the filter query to execute, following the query
            language of DocArray's `filter_docs()`
        :param limit: maximum number of documents to return
        :return: a DocList containing the documents that match the filter query
        """
        self._logger.debug(f'Executing `filter` for the query {filter_query}')

        docs = filter_docs(docs=self._docs, query=filter_query)
        return cast(DocList, docs)

    def _filter(self, filter_query: Any, limit: int) -> Union[DocList, List[Dict]]:
        # the public `filter()` handles filtering directly
        raise NotImplementedError

    def _filter_batched(
        self, filter_queries: Any, limit: int
    ) -> Union[List[DocList], List[List[Dict]]]:
        raise NotImplementedError(f'{type(self)} does not support batched filtering.')

    def _text_search(
        self, query: str, limit: int, search_field: str = ''
    ) -> _FindResult:
        raise NotImplementedError(f'{type(self)} does not support text search.')

    def _text_search_batched(
        self, queries: Sequence[str], limit: int, search_field: str = ''
    ) -> _FindResultBatched:
        raise NotImplementedError(f'{type(self)} does not support text search.')
/dev/null +++ b/docs/API_reference/doc_index/backends/in_memory.md @@ -0,0 +1,3 @@ +# InMemoryDocIndex + +::: docarray.index.backends.in_memory.InMemoryDocIndex diff --git a/docs/user_guide/storing/index_in_memory.md b/docs/user_guide/storing/index_in_memory.md new file mode 100644 index 00000000000..88bab5ce5c2 --- /dev/null +++ b/docs/user_guide/storing/index_in_memory.md @@ -0,0 +1,225 @@ +# In-Memory Document Index + + +[InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] stores all Documents in DocLists in memory. +It is a great starting point for small datasets, where you may not want to launch a database server. + +For vector search and filtering the InMemoryDocIndex utilizes DocArray's [`find()`][docarray.utils.find.find] and +[`filter_docs()`][docarray.utils.filter.filter_docs] functions. + +## Basic usage + +To see how to create a [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] instance, add Documents, +perform search, etc. see the [general user guide](./docindex.md). + +You can initialize the index as follows: + +```python +from docarray import BaseDoc, DocList +from docarray.index.backends.in_memory import InMemoryDocIndex +from docarray.typing import NdArray + + +class MyDoc(BaseDoc): + tensor: NdArray = None + + +docs = DocList[MyDoc](MyDoc() for _ in range(10)) + +doc_index = InMemoryDocIndex[MyDoc]() +doc_index.index(docs) + +# or in one step: +doc_index = InMemoryDocIndex[MyDoc](docs) +``` + +## Configuration + +This section lays out the configurations and options that are specific to [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex]. + +### RuntimeConfig + +The `RuntimeConfig` of [InMemoryDocIndex][docarray.index.backends.in_memory.InMemoryDocIndex] contains only one entry: +the default mapping from Python types to column configurations. + +You can see in the [section below](#field-wise-configurations) how to override configurations for specific fields. 
+If you want to set configurations globally, i.e. for all vector fields in your Documents, you can do that using `RuntimeConfig`: + +```python +from collections import defaultdict +from docarray.typing import AbstractTensor + +index.configure( + default_column_config=defaultdict( + dict, + { + AbstractTensor: {'space': 'cosine_sim'}, + }, + ) +) +``` + +This will set the default configuration for all vector fields to the one specified in the example above. + +For more information on these settings, see [below](#field-wise-configurations). + +Fields that are not vector fields (e.g. of type `str` or `int` etc.) do not offer any configuration. + + +### Field-wise configurations + +For a vector field you can adjust the `space` parameter. It can be one of: + +- `'cosine_sim'` (default) +- `'euclidean_dist'` +- `'sqeuclidean_dist'` + +You pass it using the `field: Type = Field(...)` syntax: + +```python +from docarray import BaseDoc +from pydantic import Field + + +class Schema(BaseDoc): + tensor_1: NdArray[100] = Field(space='euclidean_dist') + tensor_2: NdArray[100] = Field(space='sqeuclidean_dist') +``` + +In the example above you can see how to configure two different vector fields, with two different sets of settings. + +## Nested index + +When using the index, you can define multiple fields and their nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. `YouTubeVideoDoc` has `thumbnail` and `video` fields, each with their own `tensor`. 
+ +```python +import numpy as np +from docarray import BaseDoc +from docarray.index.backends.in_memory import InMemoryDocIndex +from docarray.typing import ImageUrl, VideoUrl, AnyTensor +from pydantic import Field + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: AnyTensor = Field(space='cosine_sim') + + +class VideoDoc(BaseDoc): + url: VideoUrl + tensor: AnyTensor = Field(space='cosine_sim') + + +class YouTubeVideoDoc(BaseDoc): + title: str + description: str + thumbnail: ImageDoc + video: VideoDoc + tensor: AnyTensor = Field(space='cosine_sim') + + +doc_index = InMemoryDocIndex[YouTubeVideoDoc]() +index_docs = [ + YouTubeVideoDoc( + title=f'video {i+1}', + description=f'this is video from author {10*i}', + thumbnail=ImageDoc(url=f'http://example.ai/images/{i}', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/{i}', tensor=np.ones(128)), + tensor=np.ones(256), + ) + for i in range(8) +] +doc_index.index(index_docs) +``` + +## Search Documents + +To search Documents, the `InMemoryDocIndex` uses DocArray's [`find`][docarray.utils.find.find] function. + +You can use the `search_field` to specify which field to use when performing the vector search. +You can use the dunder operator to specify the field defined in nested data. 
+In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` +or the `tensor` field of the `thumbnail` and `video` fields: + +```python +# find by the youtubevideo tensor +query = parse_obj_as(NdArray, np.ones(256)) +docs, scores = doc_index.find(query, search_field='tensor', limit=3) + +# find by the thumbnail tensor +query = parse_obj_as(NdArray, np.ones(64)) +docs, scores = doc_index.find(query, search_field='thumbnail__tensor', limit=3) + +# find by the video tensor +query = parse_obj_as(NdArray, np.ones(128)) +docs, scores = doc_index.find(query, search_field='video__tensor', limit=3) +``` + +## Filter Documents + +To filter Documents, the `InMemoryDocIndex` uses DocArray's [`filter_docs()`][docarray.utils.filter.filter_docs] function. + +You can filter your documents by using the `filter()` or `filter_batched()` method with a corresponding filter query. +The query should follow the query language of the DocArray's [`filter_docs()`][docarray.utils.filter.filter_docs] function. + +In the following example let's filter for all the books that are cheaper than 29 dollars: + +```python +from docarray import BaseDoc, DocList + + +class Book(BaseDoc): + title: str + price: int + + +books = DocList[Book]([Book(title=f'title {i}', price=i * 10) for i in range(10)]) +book_index = InMemoryDocIndex[Book](books) + +# filter for books that are cheaper than 29 dollars +query = {'price': {'$lte': 29}} +cheap_books = book_index.filter(query) + +assert len(cheap_books) == 3 +for doc in cheap_books: + doc.summary() +``` + +
+ Output + ```text + 📄 Book : 1f7da15 ... + ╭──────────────────────┬───────────────╮ + │ Attribute │ Value │ + ├──────────────────────┼───────────────┤ + │ title: str │ title 0 │ + │ price: int │ 0 │ + ╰──────────────────────┴───────────────╯ + 📄 Book : 63fd13a ... + ╭──────────────────────┬───────────────╮ + │ Attribute │ Value │ + ├──────────────────────┼───────────────┤ + │ title: str │ title 1 │ + │ price: int │ 10 │ + ╰──────────────────────┴───────────────╯ + 📄 Book : 49b21de ... + ╭──────────────────────┬───────────────╮ + │ Attribute │ Value │ + ├──────────────────────┼───────────────┤ + │ title: str │ title 2 │ + │ price: int │ 20 │ + ╰──────────────────────┴───────────────╯ + ``` +
import numpy as np
import pytest
from pydantic import Field

from docarray import BaseDoc, DocList
from docarray.index.backends.in_memory import InMemoryDocIndex
from docarray.typing import NdArray


class SchemaDoc(BaseDoc):
    text: str
    price: int
    tensor: NdArray[10]


@pytest.fixture
def docs():
    """Ten documents: prices 0..8 plus one outlier priced 100."""
    collection = DocList[SchemaDoc](
        SchemaDoc(text=f'hello {i}', price=i, tensor=np.array([i] * 10))
        for i in range(9)
    )
    collection.append(
        SchemaDoc(text='good bye', price=100, tensor=np.array([100.0] * 10))
    )
    return collection


@pytest.fixture
def doc_index(docs):
    """An InMemoryDocIndex pre-populated with the `docs` fixture."""
    index = InMemoryDocIndex[SchemaDoc]()
    index.index(docs)
    return index


def test_indexing(docs):
    index = InMemoryDocIndex[SchemaDoc]()
    assert index.num_docs() == 0

    index.index(docs)
    assert index.num_docs() == 10


def test_del_item(docs, doc_index):
    # removing two of the ten indexed documents by id
    doc_index._del_items([docs[0].id, docs[1].id])
    assert doc_index.num_docs() == 8


def test_del(docs, doc_index):
    del doc_index[docs[0].id]
    assert doc_index.num_docs() == 9


@pytest.mark.parametrize('space', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist'])
@pytest.mark.parametrize('is_query_doc', [True, False])
def test_find(doc_index, space, is_query_doc):
    # NOTE(review): `space` only parametrizes the query-side schema; the index
    # was built from SchemaDoc (default metric) — confirm this is intended.
    class MyDoc(BaseDoc):
        text: str
        price: int
        tensor: NdArray[10] = Field(space=space)

    query = (
        MyDoc(text='query', price=0, tensor=np.ones(10))
        if is_query_doc
        else np.ones(10)
    )

    docs, scores = doc_index.find(query, search_field='tensor', limit=5)

    assert len(docs) == 5
    assert len(scores) == 5
    assert doc_index.num_docs() == 10


@pytest.mark.parametrize('space', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist'])
@pytest.mark.parametrize('is_query_doc', [True, False])
def test_find_batched(doc_index, space, is_query_doc):
    class MyDoc(BaseDoc):
        text: str
        price: int
        tensor: NdArray[10] = Field(space=space)

    if is_query_doc:
        query = DocList[MyDoc](
            [
                MyDoc(text='query 0', price=0, tensor=np.zeros(10)),
                MyDoc(text='query 1', price=1, tensor=np.ones(10)),
            ]
        )
    else:
        query = np.ones((2, 10))

    docs, scores = doc_index.find_batched(query, search_field='tensor', limit=5)

    assert len(docs) == 2
    for per_query_docs in docs:
        assert len(per_query_docs) == 5
    assert doc_index.num_docs() == 10


def test_concatenated_queries(doc_index):
    query = SchemaDoc(text='query', price=0, tensor=np.ones(10))

    q = (
        doc_index.build_query()
        .find(query=query, search_field='tensor', limit=5)
        .filter(filter_query={'price': {'$neq': 5}})
        .build()
    )

    docs, scores = doc_index.execute_query(q)

    # five nearest neighbours minus the single document priced exactly 5
    assert len(docs) == 4