From 4d13c4c33321ac22a68139df81e272f282ca23a9 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 17 Mar 2023 13:01:09 +0800 Subject: [PATCH 01/14] feat: elastic store based on version 8 Signed-off-by: AnneY --- .../doc_index/backends/elasticv8_doc_index.py | 453 ++++++++++++++++++ poetry.lock | 47 +- pyproject.toml | 5 +- tests/doc_index/elastic/fixture.py | 58 +++ tests/doc_index/elastic/v8/docker-compose.yml | 16 + tests/doc_index/elastic/v8/test_find.py | 278 +++++++++++ .../elastic/v8/test_index_get_del.py | 232 +++++++++ 7 files changed, 1084 insertions(+), 5 deletions(-) create mode 100644 docarray/doc_index/backends/elasticv8_doc_index.py create mode 100644 tests/doc_index/elastic/fixture.py create mode 100644 tests/doc_index/elastic/v8/docker-compose.yml create mode 100644 tests/doc_index/elastic/v8/test_find.py create mode 100644 tests/doc_index/elastic/v8/test_index_get_del.py diff --git a/docarray/doc_index/backends/elasticv8_doc_index.py b/docarray/doc_index/backends/elasticv8_doc_index.py new file mode 100644 index 00000000000..ee10d20c0d2 --- /dev/null +++ b/docarray/doc_index/backends/elasticv8_doc_index.py @@ -0,0 +1,453 @@ +import uuid +import warnings +from collections import defaultdict +from dataclasses import dataclass, field +from typing import ( + Any, + Dict, + Generator, + Generic, + Iterable, + List, + Mapping, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, +) + +import numpy as np +from elastic_transport import NodeConfig +from elasticsearch import Elasticsearch +from elasticsearch.helpers import parallel_bulk + +import docarray.typing +from docarray import BaseDocument +from docarray.doc_index.abstract_doc_index import ( + BaseDocumentIndex, + _ColumnInfo, + _FindResultBatched, + _raise_not_composable, +) +from docarray.typing import AnyTensor +from docarray.utils.find import _FindResult +from docarray.utils.misc import torch_imported + +TSchema = TypeVar('TSchema', bound=BaseDocument) +T = TypeVar('T', bound='ElasticDocumentV8Index') + +ELASTIC_PY_VEC_TYPES = [list, tuple, np.ndarray] +ELASTIC_PY_TYPES = [bool, int, float, str, docarray.typing.ID] +if torch_imported: + import torch + + ELASTIC_PY_VEC_TYPES.append(torch.Tensor) + + +class ElasticDocumentV8Index(BaseDocumentIndex, Generic[TSchema]): + def __init__(self, db_config=None, **kwargs): + super().__init__(db_config=db_config, **kwargs) + self._db_config = cast(ElasticDocumentV8Index.DBConfig, self._db_config) + + if self._db_config.index_name is None: + id = uuid.uuid4().hex + self._db_config.index_name = 'index__' + id + + self._index_name = self._db_config.index_name + + self._client = Elasticsearch( + hosts=self._db_config.hosts, + **self._db_config.es_config, + ) + + # ElasticSearh index setup + self._index_init_params = ('type',) + self._index_vector_params = ('dims', 'similarity', 'index') + self._index_vector_options = ('m', 'ef_construction') + + mappings: Dict[str, Any] = { + 'dynamic': True, + '_source': {'enabled': 'true'}, + 'properties': {}, + } + + for col_name, col in self._column_infos.items(): + if not col.config: + continue # do not create column index if no config is given + mappings['properties'][col_name] = self._create_index(col) + + if self._client.indices.exists(index=self._index_name): # type: ignore + self._client.indices.put_mapping( + index=self._index_name, properties=mappings['properties'] + ) + else: + self._client.indices.create(index=self._index_name, mappings=mappings) + + if len(self._db_config.index_settings): + self._client.indices.put_settings( + index=self._index_name, settings=self._db_config.index_settings + ) + + self._refresh(self._index_name) + + ############################################### + # Inner classes for query builder and configs # + ############################################### + class QueryBuilder(BaseDocumentIndex.QueryBuilder): + def __init__(self, outer_instance, **kwargs): + super().__init__() + self._outer_instance = outer_instance + self._query: Dict[str, Any] = { + 'query': defaultdict(lambda: defaultdict(list)) + } + + def build(self, *args, **kwargs) -> Any: + if len(self._query['query']) == 0: + del self._query['query'] + elif 'knn' in self._query: + self._query['knn']['filter'] = self._query['query'] + del self._query['query'] + + return self._query + + def find( + self, + query: Union[AnyTensor, BaseDocument], + search_field: str = 'embedding', + limit: int = 10, + ): + if isinstance(query, BaseDocument): + query_vec = BaseDocumentIndex._get_values_by_column( + [query], search_field + )[0] + else: + query_vec = query + query_vec_np = BaseDocumentIndex._to_numpy(self._outer_instance, query_vec) + self._query['knn'] = { + 'field': search_field, + 'query_vector': query_vec_np, + 'k': limit, + 'num_candidates': self._outer_instance._runtime_config.default_column_config[ + np.ndarray + ][ + 'num_candidates' + ], + } + return self + + # filter accrpts Leaf/Compound query clauses + # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html + def filter(self, query: Dict[str, Any], limit: int = 10): + self._query['size'] = limit + self._query['query']['bool']['filter'].append(query) + return self + + def text_search(self, query: str, search_field: str = 'text', limit: int = 10): + self._query['size'] = limit + self._query['query']['bool']['must'].append( + {'match': {search_field: query}} + ) + return self + + find_batched = _raise_not_composable('find_batched') + filter_batched = _raise_not_composable('find_batched') + text_search_batched = _raise_not_composable('text_search') + + def build_query(self, **kwargs) -> QueryBuilder: + """ + Build a query for this DocumentIndex. + """ + return self.QueryBuilder(self, **kwargs) # type: ignore + + @dataclass + class DBConfig(BaseDocumentIndex.DBConfig): + + hosts: Union[ + str, List[Union[str, Mapping[str, Union[str, int]], NodeConfig]], None + ] = 'http://localhost:9200' + index_name: Optional[str] = None + es_config: Dict[str, Any] = field(default_factory=dict) + index_settings: Dict[str, Any] = field(default_factory=dict) + + @dataclass + class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): + default_column_config: Dict[Type, Dict[str, Any]] = field( + default_factory=lambda: { + np.ndarray: { + 'type': 'dense_vector', + 'index': True, + 'dims': 128, + 'similarity': 'cosine', # 'l2_norm', 'dot_product', 'cosine' + 'm': 16, + 'ef_construction': 100, + 'num_candidates': 10000, + }, + docarray.typing.ID: {'type': 'keyword'}, + bool: {'type': 'boolean'}, + int: {'type': 'integer'}, + float: {'type': 'float'}, + str: {'type': 'text'}, + # `None` is not a Type, but we allow it here anyway + None: {}, # type: ignore + } + ) + + ############################################### + # Implementation of abstract methods # + ############################################### + + def python_type_to_db_type(self, python_type: Type) -> Any: + """Map python type to database type.""" + for allowed_type in ELASTIC_PY_VEC_TYPES: + if issubclass(python_type, allowed_type): + return np.ndarray + + if python_type in ELASTIC_PY_TYPES: + return python_type + + raise ValueError(f'Unsupported column type for {type(self)}: {python_type}') + + def _index( + self, + column_to_data: Dict[str, Generator[Any, None, None]], + refresh: bool = True, + ): + + data = self._transpose_col_value_dict(column_to_data) # type: ignore + requests = [] + + for row in data: + request = { + '_index': self._index_name, + '_id': row['id'], + } + for col_name, col in self._column_infos.items(): + if not col.config: + continue + if col.db_type == np.ndarray and np.all(row[col_name] == 0): + row[col_name] = row[col_name] + 1.0e-9 + request[col_name] = row[col_name] + requests.append(request) + + _, warning_info = self._send_requests(requests) + for info in warning_info: + warnings.warn(str(info)) + + if refresh: + self._refresh(self._index_name) + + def num_docs(self) -> int: + return self._client.count(index=self._index_name)['count'] + + def _del_items(self, doc_ids: Sequence[str]): + requests = [] + for _id in doc_ids: + requests.append( + {'_op_type': 'delete', '_index': self._index_name, '_id': _id} + ) + + _, warning_info = self._send_requests(requests) + + # raise warning if some ids are not found + if warning_info: + ids = [info['delete']['_id'] for info in warning_info] + warnings.warn(f'No document with id {ids} found') + + self._refresh(self._index_name) + + def _get_items(self, doc_ids: Sequence[str]) -> Sequence[TSchema]: + accumulated_docs = [] + accumulated_docs_id_not_found = [] + + es_rows = self._client.mget( + index=self._index_name, + ids=doc_ids, # type: ignore + )['docs'] + + for row in es_rows: + if row['found']: + doc_dict = row['_source'] + accumulated_docs.append(doc_dict) + else: + accumulated_docs_id_not_found.append(row['_id']) + + # raise warning if some ids are not found + if accumulated_docs_id_not_found: + warnings.warn(f'No document with id {accumulated_docs_id_not_found} found') + + return accumulated_docs + + def _find( + self, + query: np.ndarray, + limit: int, + search_field: str = '', + ) -> _FindResult: + knn_query = { + 'field': search_field, + 'query_vector': query, + 'k': limit, + 'num_candidates': self._runtime_config.default_column_config[np.ndarray][ + 'num_candidates' + ], + } + + resp = self._client.search( + index=self._index_name, + knn=knn_query, + size=limit, + ) + + docs, scores = self._format_response(resp) + + return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore + + def _find_batched( + self, + queries: np.ndarray, + limit: int, + search_field: str = '', + ) -> _FindResultBatched: + result_das = [] + result_scores = [] + + for query in queries: + documents, scores = self._find(query, limit, search_field) + result_das.append(documents) + result_scores.append(scores) + + return _FindResultBatched(documents=result_das, scores=np.array(result_scores)) # type: ignore + + def _filter( + self, + filter_query: Dict[str, Any], + limit: int, + ) -> List[Dict]: + resp = self._client.search( + index=self._index_name, + query=filter_query, + size=limit, + ) + + docs, _ = self._format_response(resp) + + return docs + + def _filter_batched( + self, + filter_queries: Any, + limit: int, + ) -> List[List[Dict]]: + result_das = [] + for query in filter_queries: + result_das.append(self._filter(query, limit)) + return result_das + + def _text_search( + self, + query: str, + limit: int, + search_field: str = '', + ) -> _FindResult: + search_query = { + "bool": { + "must": [ + {"match": {search_field: query}}, + ], + } + } + + resp = self._client.search( + index=self._index_name, + query=search_query, + size=limit, + ) + + docs, scores = self._format_response(resp) + + return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore + + def _text_search_batched( + self, + queries: Sequence[str], + limit: int, + search_field: str = '', + ) -> _FindResultBatched: + result_das = [] + result_scores = [] + + for query in queries: + documents, scores = self._text_search(query, limit, search_field) + result_das.append(documents) + result_scores.append(scores) + + return _FindResultBatched(documents=result_das, scores=np.array(result_scores, dtype=object)) # type: ignore + + def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: + if args or kwargs: + raise ValueError( + f'args and kwargs not supported for `execute_query` on {type(self)}' + ) + + resp = self._client.search(index=self._index_name, **query) + docs, scores = self._format_response(resp) + return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore + + ############################################### + # Helpers # + ############################################### + + # ElasticSearch helpers + def _create_index(self, col: '_ColumnInfo') -> Dict[str, Any]: + """Create a new HNSW index for a column, and initialize it.""" + index = dict((k, col.config[k]) for k in self._index_init_params) + if col.db_type == np.ndarray: + for k in self._index_vector_params: + index[k] = col.config[k] + if col.n_dim: + index['dims'] = col.n_dim + index['index_options'] = dict( + (k, col.config[k]) for k in self._index_vector_options + ) + index['index_options']['type'] = 'hnsw' + return index + + def _send_requests( + self, request: Iterable[Dict[str, Any]], **kwargs + ) -> Tuple[List[Dict], List[Any]]: + """Send bulk request to Elastic and gather the successful info""" + + # TODO chunk_size + + accumulated_info = [] + warning_info = [] + for success, info in parallel_bulk( + self._client, + request, + raise_on_error=False, + raise_on_exception=False, + **kwargs, + ): + if not success: + warning_info.append(info) + else: + accumulated_info.append(info) + + return accumulated_info, warning_info + + def _format_response(self, response: Any) -> Tuple[List[Dict], List[float]]: + docs = [] + scores = [] + for result in response['hits']['hits']: + doc_dict = result['_source'] + doc_dict['id'] = result['_id'] + docs.append(doc_dict) + scores.append(result['_score']) + + return docs, scores + + def _refresh(self, index_name: str): + self._client.indices.refresh(index=index_name) diff --git a/poetry.lock b/poetry.lock index 8b8d2526924..2ef1f5edf90 100644 --- a/poetry.lock +++ b/poetry.lock @@ -155,7 +155,7 @@ dev = ["Sphinx (==4.3.2)", "black (==22.3.0)", "build (==0.8.0)", "flake8 (==4.0 name = "certifi" version = "2022.9.24" description = "Python package for providing Mozilla's CA Bundle." -category = "dev" +category = "main" optional = false python-versions = ">=3.6" @@ -252,6 +252,36 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "elastic-transport" +version = "8.4.0" +description = "Transport classes and utilities shared among Python Elastic client libraries" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.26.2,<2" + +[package.extras] +develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "trustme"] + +[[package]] +name = "elasticsearch" +version = "8.6.2" +description = "Python client for Elasticsearch" +category = "main" +optional = false +python-versions = ">=3.6, <4" + +[package.dependencies] +elastic-transport = ">=8,<9" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +requests = ["requests (>=2.4.0,<3.0.0)"] + [[package]] name = "entrypoints" version = "0.4" @@ -1626,7 +1656,7 @@ typing-extensions = ">=3.7.4" name = "urllib3" version = "1.26.14" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "dev" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" @@ -1724,6 +1754,7 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [extras] audio = ["pydub"] common = ["protobuf", "lz4"] +elasticsearch = ["elasticsearch", "elastic-transport"] hnswlib = ["hnswlib"] image = ["pillow", "types-pillow"] mesh = ["trimesh"] @@ -1734,8 +1765,8 @@ web = ["fastapi"] [metadata] lock-version = "1.1" -python-versions = ">=3.7" -content-hash = "3ec1e886d794ed803736bc2b49a626c600a97bda8a3db734b53604c10f08d252" +python-versions = ">=3.7, <4" +content-hash = "b4a74c6ca07e3ebe3beda8ba5909257646a41410ac45c39d7277bc1bfa9e37a2" [metadata.files] anyio = [ @@ -1984,6 +2015,14 @@ distlib = [ {file = "distlib-0.3.6-py2.py3-none-any.whl", hash = "sha256:f35c4b692542ca110de7ef0bea44d73981caeb34ca0b9b6b2e6d7790dda8f80e"}, {file = "distlib-0.3.6.tar.gz", hash = "sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46"}, ] +elastic-transport = [ + {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, + {file = "elastic_transport-8.4.0-py3-none-any.whl", hash = "sha256:19db271ab79c9f70f8c43f8f5b5111408781a6176b54ab2e54d713b6d9ceb815"}, +] +elasticsearch = [ + {file = "elasticsearch-8.6.2-py3-none-any.whl", hash = "sha256:8ccbebd9a0f6f523c7db67bb54863dde8bdb93daae4ff97f7c814e0500a73e84"}, + {file = "elasticsearch-8.6.2.tar.gz", hash = "sha256:084458e84caa91e3ad807b68aa82c022e785bead853a3b125641a25e894a1d47"}, +] entrypoints = [ {file = "entrypoints-0.4-py3-none-any.whl", hash = "sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f"}, {file = "entrypoints-0.4.tar.gz", hash = "sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4"}, diff --git a/pyproject.toml b/pyproject.toml index 22636b46fa5..66ba648117c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors=['DocArray'] license='Apache 2.0' [tool.poetry.dependencies] -python = ">=3.7" +python = ">=3.7, <4" pydantic = ">=1.10.2" numpy = ">=1.17.3" protobuf = { version = ">=3.19.0", optional = true } @@ -24,6 +24,8 @@ hnswlib = {version = ">=0.6.2", optional = true } lz4 = {version= ">=1.0.0", optional = true} pydub = {version = "^0.25.1", optional = true } pandas = {version = ">=1.1.0", optional = true } +elastic-transport = "^8.4.0" +elasticsearch = "^8.6.2" [tool.poetry.extras] common = ["protobuf", "lz4"] @@ -35,6 +37,7 @@ mesh = ["trimesh"] web = ["fastapi"] hnswlib = ["hnswlib"] pandas = ["pandas"] +elasticsearch = ["elasticsearch", "elastic_transport"] [tool.poetry.dev-dependencies] pytest = ">=7.0" diff --git a/tests/doc_index/elastic/fixture.py b/tests/doc_index/elastic/fixture.py new file mode 100644 index 00000000000..3862b04f8d4 --- /dev/null +++ b/tests/doc_index/elastic/fixture.py @@ -0,0 +1,58 @@ +import os +import time + +import pytest +from pydantic import Field + +from docarray import BaseDocument +from docarray.typing import NdArray + +pytestmark = [pytest.mark.slow, pytest.mark.doc_index] + + +class SimpleDoc(BaseDocument): + tens: NdArray[10] = Field(dims=1000) + + +class FlatDoc(BaseDocument): + tens_one: NdArray = Field(dims=10) + tens_two: NdArray = Field(dims=50) + + +class NestedDoc(BaseDocument): + d: SimpleDoc + + +class DeepNestedDoc(BaseDocument): + d: NestedDoc + + +cur_dir = os.path.dirname(os.path.abspath(__file__)) +compose_yml_v7 = os.path.abspath(os.path.join(cur_dir, 'v7/docker-compose.yml')) +compose_yml_v8 = os.path.abspath(os.path.join(cur_dir, 'v8/docker-compose.yml')) + + +@pytest.fixture(scope='module', autouse=True) +def start_storage_v7(): + os.system(f"docker-compose -f {compose_yml_v7} up -d --remove-orphans") + _wait_for_es() + + yield + os.system(f"docker-compose -f {compose_yml_v7} down --remove-orphans") + + +@pytest.fixture(scope='module', autouse=True) +def start_storage_v8(): + os.system(f"docker-compose -f {compose_yml_v8} up -d --remove-orphans") + _wait_for_es() + + yield + os.system(f"docker-compose -f {compose_yml_v8} down --remove-orphans") + + +def _wait_for_es(): + from elasticsearch import Elasticsearch + + es = Elasticsearch(hosts='http://localhost:9200/') + while not es.ping(): + time.sleep(0.5) diff --git a/tests/doc_index/elastic/v8/docker-compose.yml b/tests/doc_index/elastic/v8/docker-compose.yml new file mode 100644 index 00000000000..70eedba34f5 --- /dev/null +++ b/tests/doc_index/elastic/v8/docker-compose.yml @@ -0,0 +1,16 @@ +version: "3.3" +services: + elastic: + image: docker.elastic.co/elasticsearch/elasticsearch:8.6.2 + environment: + - xpack.security.enabled=false + - discovery.type=single-node + - ES_JAVA_OPTS=-Xmx1024m + ports: + - "9200:9200" + networks: + - elastic + +networks: + elastic: + name: elastic \ No newline at end of file diff --git a/tests/doc_index/elastic/v8/test_find.py b/tests/doc_index/elastic/v8/test_find.py new file mode 100644 index 00000000000..944ed9887d7 --- /dev/null +++ b/tests/doc_index/elastic/v8/test_find.py @@ -0,0 +1,278 @@ +import numpy as np +import pytest +from pydantic import Field + +from docarray import BaseDocument +from docarray.doc_index.backends.elasticv8_doc_index import ElasticDocumentV8Index +from docarray.typing import NdArray +from tests.doc_index.elastic.fixture import start_storage_v8 # noqa: F401 +from tests.doc_index.elastic.fixture import FlatDoc, SimpleDoc + + +@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +def test_find_simple_schema(similarity): + class SimpleSchema(BaseDocument): + tens: NdArray[10] = Field(similarity=similarity) + + store = ElasticDocumentV8Index[SimpleSchema]() + + index_docs = [] + for _ in range(10): + vec = np.random.rand(10) + if similarity == 'dot_product': + vec = vec / np.linalg.norm(vec) + index_docs.append(SimpleDoc(tens=vec)) + store.index(index_docs) + + query = index_docs[-1] + docs, scores = store.find(query, search_field='tens', limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].tens, index_docs[-1].tens) + + +@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +def test_find_flat_schema(similarity): + class FlatSchema(BaseDocument): + tens_one: NdArray = Field(dims=10, similarity=similarity) + tens_two: NdArray = Field(dims=50, similarity=similarity) + + store = ElasticDocumentV8Index[FlatSchema]() + + index_docs = [] + for _ in range(10): + vec_one = np.random.rand(10) + vec_two = np.random.rand(50) + if similarity == 'dot_product': + vec_one = vec_one / np.linalg.norm(vec_one) + vec_two = vec_two / np.linalg.norm(vec_two) + index_docs.append(FlatDoc(tens_one=vec_one, tens_two=vec_two)) + + store.index(index_docs) + + query = index_docs[-1] + + # find on tens_one + docs, scores = store.find(query, search_field='tens_one', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) + assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) + + # find on tens_two + docs, scores = store.find(query, search_field='tens_two', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) + assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) + + +@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +def test_find_nested_schema(similarity): + class SimpleDoc(BaseDocument): + tens: NdArray[10] = Field(similarity=similarity) + + class NestedDoc(BaseDocument): + d: SimpleDoc + tens: NdArray[10] = Field(similarity=similarity) + + class DeepNestedDoc(BaseDocument): + d: NestedDoc + tens: NdArray = Field(similarity=similarity, dims=10) + + store = ElasticDocumentV8Index[DeepNestedDoc]() + + index_docs = [] + for _ in range(10): + vec_simple = np.random.rand(10) + vec_nested = np.random.rand(10) + vec_deep = np.random.rand(10) + if similarity == 'dot_product': + vec_simple = vec_simple / np.linalg.norm(vec_simple) + vec_nested = vec_nested / np.linalg.norm(vec_nested) + vec_deep = vec_deep / np.linalg.norm(vec_deep) + index_docs.append( + DeepNestedDoc( + d=NestedDoc(d=SimpleDoc(tens=vec_simple), tens=vec_nested), + tens=vec_deep, + ) + ) + + store.index(index_docs) + + query = index_docs[-1] + + # find on root level + docs, scores = store.find(query, search_field='tens', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].tens, index_docs[-1].tens) + + # find on first nesting level + docs, scores = store.find(query, search_field='d__tens', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].d.tens, index_docs[-1].d.tens) + + # find on second nesting level + docs, scores = store.find(query, search_field='d__d__tens', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].d.d.tens, index_docs[-1].d.d.tens) + + +def test_find_batched(): + store = ElasticDocumentV8Index[SimpleDoc]() + + index_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(10)] + store.index(index_docs) + + queries = index_docs[-2:] + docs_batched, scores_batched = store.find_batched( + queries, search_field='tens', limit=5 + ) + + for docs, scores, query in zip(docs_batched, scores_batched, queries): + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == query.id + assert np.allclose(docs[0].tens, query.tens) + + +def test_filter(): + import itertools + + class MyDoc(BaseDocument): + A: bool + B: int + C: float + + store = ElasticDocumentV8Index[MyDoc]() + + A_list = [True, False] + B_list = [1, 2] + C_list = [1.5, 2.5] + + # cross product of all possible combinations + combinations = itertools.product(A_list, B_list, C_list) + index_docs = [MyDoc(A=A, B=B, C=C) for A, B, C in combinations] + store.index(index_docs) + + filter_query = {'term': {'A': True}} + docs = store.filter(filter_query) + assert len(docs) > 0 + for doc in docs: + assert doc.A + + filter_query = {'term': {'B': 1}} + docs = store.filter(filter_query) + assert len(docs) > 0 + for doc in docs: + assert doc.B == 1 + + filter_query = {'term': {'C': 1.5}} + docs = store.filter(filter_query) + assert len(docs) > 0 + for doc in docs: + assert doc.C == 1.5 + + +def test_text_search(): + class MyDoc(BaseDocument): + text: str + + store = ElasticDocumentV8Index[MyDoc]() + index_docs = [ + MyDoc(text='hello world'), + MyDoc(text='never gonna give you up'), + MyDoc(text='we are the world'), + ] + store.index(index_docs) + + query = 'world' + docs, scores = store.text_search(query, search_field='text') + + assert len(docs) == 2 + assert len(scores) == 2 + assert docs[0].text.index(query) >= 0 + assert docs[1].text.index(query) >= 0 + + queries = ['world', 'never'] + docs, scores = store.text_search_batched(queries, search_field='text') + for query, da, score in zip(queries, docs, scores): + assert len(da) > 0 + assert len(score) > 0 + for doc in da: + assert doc.text.index(query) >= 0 + + +def test_query_builder(): + class MyDoc(BaseDocument): + tens: NdArray[10] = Field(similarity='l2_norm') + num: int + text: str + + store = ElasticDocumentV8Index[MyDoc]() + index_docs = [ + MyDoc(id=f'{i}', tens=np.ones(10) * i, num=int(i / 2), text=f'text {int(i/2)}') + for i in range(10) + ] + store.index(index_docs) + + # build_query + q = store.build_query() + assert isinstance(q, store.QueryBuilder) + + # filter + q = store.build_query().filter({'term': {'num': 0}}).build() + docs, _ = store.execute_query(q) + assert [doc['id'] for doc in docs] == ['0', '1'] + + # find + q = store.build_query().find(index_docs[-1], search_field='tens', limit=3).build() + docs, _ = store.execute_query(q) + assert [doc['id'] for doc in docs] == ['9', '8', '7'] + + # text_search + q = store.build_query().text_search('0', search_field='text').build() + docs, _ = store.execute_query(q) + assert [doc['id'] for doc in docs] == ['0', '1'] + + # combination + q = ( + store.build_query() + .filter({'range': {'num': {'lte': 3}}}) + .find(index_docs[-1], search_field='tens') + .text_search('0', search_field='text') + .build() + ) + docs, _ = store.execute_query(q) + assert [doc['id'] for doc in docs] == ['1', '0'] + + # direct + query = { + 'knn': { + 'field': 'tens', + 'query_vector': [9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0], + 'k': 10, + 'num_candidates': 10000, + 'filter': { + 'bool': { + 'filter': [ + {'range': {'num': {'gte': 2}}}, + {'range': {'num': {'lte': 3}}}, + ] + } + }, + }, + } + + docs, _ = store.execute_query(query) + assert [doc['id'] for doc in docs] == ['7', '6', '5', '4'] diff --git a/tests/doc_index/elastic/v8/test_index_get_del.py b/tests/doc_index/elastic/v8/test_index_get_del.py new file mode 100644 index 00000000000..c826ed344c4 --- /dev/null +++ b/tests/doc_index/elastic/v8/test_index_get_del.py @@ -0,0 +1,232 @@ +import numpy as np +import pytest + +from docarray import DocumentArray +from docarray.doc_index.backends.elasticv8_doc_index import ElasticDocumentV8Index +from tests.doc_index.elastic.fixture import start_storage_v8 # noqa: F401 +from tests.doc_index.elastic.fixture import DeepNestedDoc, FlatDoc, NestedDoc, SimpleDoc + + +@pytest.fixture +def ten_simple_docs(): + return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] + + +@pytest.fixture +def ten_flat_docs(): + return [ + FlatDoc(tens_one=np.random.randn(10), tens_two=np.random.randn(50)) + for _ in range(10) + ] + + +@pytest.fixture +def ten_nested_docs(): + return [NestedDoc(d=SimpleDoc(tens=np.random.randn(10))) for _ in range(10)] + + +@pytest.fixture +def ten_deep_nested_docs(): + return [ + DeepNestedDoc(d=NestedDoc(d=SimpleDoc(tens=np.random.randn(10)))) + for _ in range(10) + ] + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_simple_schema(ten_simple_docs, use_docarray): + store = ElasticDocumentV8Index[SimpleDoc]() + if use_docarray: + ten_simple_docs = DocumentArray[SimpleDoc](ten_simple_docs) + + store.index(ten_simple_docs) + assert store.num_docs() == 10 + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_flat_schema(ten_flat_docs, use_docarray): + store = ElasticDocumentV8Index[FlatDoc]() + if use_docarray: + ten_flat_docs = DocumentArray[FlatDoc](ten_flat_docs) + + store.index(ten_flat_docs) + assert store.num_docs() == 10 + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_nested_schema(ten_nested_docs, use_docarray): + store = ElasticDocumentV8Index[NestedDoc]() + if use_docarray: + ten_nested_docs = DocumentArray[NestedDoc](ten_nested_docs) + + store.index(ten_nested_docs) + assert store.num_docs() == 10 + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): + store = ElasticDocumentV8Index[DeepNestedDoc]() + if use_docarray: + ten_deep_nested_docs = DocumentArray[DeepNestedDoc](ten_deep_nested_docs) + + store.index(ten_deep_nested_docs) + assert store.num_docs() == 10 + + +def test_get_single(ten_simple_docs, ten_flat_docs, ten_nested_docs): + # simple + store = ElasticDocumentV8Index[SimpleDoc]() + store.index(ten_simple_docs) + + assert store.num_docs() == 10 + for d in ten_simple_docs: + id_ = d.id + assert store[id_].id == id_ + assert np.all(store[id_].tens == d.tens) + + # flat + store = ElasticDocumentV8Index[FlatDoc]() + store.index(ten_flat_docs) + + assert store.num_docs() == 10 + for d in ten_flat_docs: + id_ = d.id + assert store[id_].id == id_ + assert np.all(store[id_].tens_one == d.tens_one) + assert np.all(store[id_].tens_two == d.tens_two) + + # nested + store = ElasticDocumentV8Index[NestedDoc]() + store.index(ten_nested_docs) + + assert store.num_docs() == 10 + for d in ten_nested_docs: + id_ = d.id + assert store[id_].id == id_ + assert store[id_].d.id == d.d.id + assert np.all(store[id_].d.tens == d.d.tens) + + +def test_get_multiple(ten_simple_docs, ten_flat_docs, ten_nested_docs): + docs_to_get_idx = [0, 2, 4, 6, 8] + + # simple + store = ElasticDocumentV8Index[SimpleDoc]() + store.index(ten_simple_docs) + + assert store.num_docs() == 10 + docs_to_get = [ten_simple_docs[i] for i in docs_to_get_idx] + ids_to_get = [d.id for d in docs_to_get] + retrieved_docs = store[ids_to_get] + for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): + assert d_out.id == id_ + assert np.all(d_out.tens == d_in.tens) + + # flat + store = ElasticDocumentV8Index[FlatDoc]() + store.index(ten_flat_docs) + + assert store.num_docs() == 10 + docs_to_get = [ten_flat_docs[i] for i in docs_to_get_idx] + ids_to_get = [d.id for d in docs_to_get] + retrieved_docs = store[ids_to_get] + for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): + assert d_out.id == id_ + assert np.all(d_out.tens_one == d_in.tens_one) + assert np.all(d_out.tens_two == d_in.tens_two) + + # nested + store = ElasticDocumentV8Index[NestedDoc]() + store.index(ten_nested_docs) + + assert store.num_docs() == 10 + docs_to_get = [ten_nested_docs[i] for i in docs_to_get_idx] + ids_to_get = [d.id for d in docs_to_get] + retrieved_docs = store[ids_to_get] + for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): + assert d_out.id == id_ + assert d_out.d.id == d_in.d.id + assert np.all(d_out.d.tens == d_in.d.tens) + + +def test_get_key_error(ten_simple_docs): + store = ElasticDocumentV8Index[SimpleDoc]() + store.index(ten_simple_docs) + + with pytest.raises(KeyError): + store['not_a_real_id'] + + +def test_del_single(ten_simple_docs): + store = ElasticDocumentV8Index[SimpleDoc]() + store.index(ten_simple_docs) + # delete once + assert store.num_docs() == 10 + del store[ten_simple_docs[0].id] + assert store.num_docs() == 9 + for i, d in enumerate(ten_simple_docs): + id_ = d.id + if i == 0: # deleted + with pytest.raises(KeyError): + store[id_] + else: + assert store[id_].id == id_ + assert np.all(store[id_].tens == d.tens) + # delete again + del store[ten_simple_docs[3].id] + assert store.num_docs() == 8 + for i, d in enumerate(ten_simple_docs): + id_ = d.id + if i in (0, 3): # deleted + with pytest.raises(KeyError): + store[id_] + else: + assert store[id_].id == id_ + assert np.all(store[id_].tens == d.tens) + + +def test_del_multiple(ten_simple_docs): + docs_to_del_idx = [0, 2, 4, 6, 8] + + store = ElasticDocumentV8Index[SimpleDoc]() + store.index(ten_simple_docs) + + assert store.num_docs() == 10 + docs_to_del = [ten_simple_docs[i] for i in docs_to_del_idx] + ids_to_del = [d.id for d in docs_to_del] + del store[ids_to_del] + for i, doc in enumerate(ten_simple_docs): + if i in docs_to_del_idx: + with pytest.raises(KeyError): + store[doc.id] + else: + assert store[doc.id].id == doc.id + assert np.all(store[doc.id].tens == doc.tens) + + +def test_del_key_error(ten_simple_docs): + store = ElasticDocumentV8Index[SimpleDoc]() + store.index(ten_simple_docs) + + with pytest.warns(UserWarning): + del store['not_a_real_id'] + + +def test_num_docs(ten_simple_docs): + store = ElasticDocumentV8Index[SimpleDoc]() + store.index(ten_simple_docs) + + assert store.num_docs() == 10 + + del store[ten_simple_docs[0].id] + assert store.num_docs() == 9 + + del store[ten_simple_docs[3].id, ten_simple_docs[5].id] + assert store.num_docs() == 7 + + more_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(5)] + store.index(more_docs) + assert store.num_docs() == 12 + + del store[more_docs[2].id, ten_simple_docs[7].id] + assert store.num_docs() == 10 From 6ba2f32ccd327f861b9be722cfbf1c2ff4f0f3df Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 23 Mar 2023 14:12:52 +0800 Subject: [PATCH 02/14] fix: update Signed-off-by: AnneY --- docarray/index/__init__.py | 3 +- .../{elasticv8_doc_index.py => elasticv8.py} | 110 ++++++++++-------- poetry.lock | 49 +++++++- tests/{doc_index => index}/elastic/fixture.py | 0 .../elastic/v8/docker-compose.yml | 0 .../elastic/v8/test_find.py | 35 +++--- .../elastic/v8/test_index_get_del.py | 6 +- 7 files changed, 121 insertions(+), 82 deletions(-) rename docarray/index/backends/{elasticv8_doc_index.py => elasticv8.py} (87%) rename tests/{doc_index => index}/elastic/fixture.py (100%) rename tests/{doc_index => index}/elastic/v8/docker-compose.yml (100%) rename tests/{doc_index => index}/elastic/v8/test_find.py (91%) rename tests/{doc_index => index}/elastic/v8/test_index_get_del.py (96%) diff --git a/docarray/index/__init__.py b/docarray/index/__init__.py index 5fdbf8ad736..dd348fda606 100644 --- a/docarray/index/__init__.py +++ b/docarray/index/__init__.py @@ -1,3 +1,4 @@ +from docarray.index.backends.elasticv8 import ElasticDocumentV8Index from docarray.index.backends.hnswlib import HnswDocumentIndex -__all__ = ['HnswDocumentIndex'] +__all__ = ['HnswDocumentIndex', 'ElasticDocumentV8Index'] diff --git a/docarray/index/backends/elasticv8_doc_index.py b/docarray/index/backends/elasticv8.py similarity index 87% rename from docarray/index/backends/elasticv8_doc_index.py rename to docarray/index/backends/elasticv8.py index ee10d20c0d2..2b84f640272 100644 --- a/docarray/index/backends/elasticv8_doc_index.py +++ b/docarray/index/backends/elasticv8.py @@ -26,7 +26,7 @@ import docarray.typing from docarray import BaseDocument -from docarray.doc_index.abstract_doc_index import ( +from docarray.index.abstract import ( BaseDocumentIndex, _ColumnInfo, _FindResultBatched, @@ -39,8 +39,7 @@ TSchema = TypeVar('TSchema', bound=BaseDocument) T = TypeVar('T', bound='ElasticDocumentV8Index') -ELASTIC_PY_VEC_TYPES = [list, tuple, np.ndarray] -ELASTIC_PY_TYPES = [bool, int, float, str, docarray.typing.ID] +ELASTIC_PY_VEC_TYPES: List[Any] = [np.ndarray] if torch_imported: import torch @@ -64,7 +63,6 @@ def __init__(self, db_config=None, **kwargs): ) # ElasticSearh index setup - self._index_init_params = ('type',) self._index_vector_params = ('dims', 'similarity', 'index') self._index_vector_options = ('m', 'ef_construction') @@ -75,11 +73,9 @@ def __init__(self, db_config=None, **kwargs): } for col_name, col in self._column_infos.items(): - if not col.config: - continue # do not create column index if no config is given - mappings['properties'][col_name] = self._create_index(col) + mappings['properties'][col_name] = self._create_index_mapping(col) - if self._client.indices.exists(index=self._index_name): # type: ignore + if self._client.indices.exists(index=self._index_name): self._client.indices.put_mapping( index=self._index_name, properties=mappings['properties'] ) @@ -131,7 +127,7 @@ def find( 'query_vector': query_vec_np, 'k': limit, 'num_candidates': self._outer_instance._runtime_config.default_column_config[ - np.ndarray + 'dense_vector' ][ 'num_candidates' ], @@ -160,7 +156,7 @@ def build_query(self, **kwargs) -> QueryBuilder: """ Build a query for this DocumentIndex. """ - return self.QueryBuilder(self, **kwargs) # type: ignore + return self.QueryBuilder(self, **kwargs) @dataclass class DBConfig(BaseDocumentIndex.DBConfig): @@ -174,10 +170,9 @@ class DBConfig(BaseDocumentIndex.DBConfig): @dataclass class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): - default_column_config: Dict[Type, Dict[str, Any]] = field( + default_column_config: Dict[Any, Dict[str, Any]] = field( default_factory=lambda: { - np.ndarray: { - 'type': 'dense_vector', + 'dense_vector': { 'index': True, 'dims': 128, 'similarity': 'cosine', # 'l2_norm', 'dot_product', 'cosine' @@ -185,15 +180,16 @@ class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): 'ef_construction': 100, 'num_candidates': 10000, }, - docarray.typing.ID: {'type': 'keyword'}, - bool: {'type': 'boolean'}, - int: {'type': 'integer'}, - float: {'type': 'float'}, - str: {'type': 'text'}, + 'keyword': {}, + 'boolean': {}, + 'integer': {}, + 'float': {}, + 'text': {}, # `None` is not a Type, but we allow it here anyway None: {}, # type: ignore } ) + chunk_size: int = 500 ############################################### # Implementation of abstract methods # @@ -203,10 +199,18 @@ def python_type_to_db_type(self, python_type: Type) -> Any: """Map python type to database type.""" for allowed_type in ELASTIC_PY_VEC_TYPES: if issubclass(python_type, allowed_type): - return np.ndarray + return 'dense_vector' + + elastic_py_types = { + bool: 'boolean', + int: 'integer', + float: 'float', + str: 'text', + docarray.typing.ID: 'keyword', + } - if python_type in ELASTIC_PY_TYPES: - return python_type + if python_type in elastic_py_types: + return elastic_py_types[python_type] raise ValueError(f'Unsupported column type for {type(self)}: {python_type}') @@ -214,6 +218,7 @@ def _index( self, column_to_data: Dict[str, Generator[Any, None, None]], refresh: bool = True, + chunk_size: Optional[int] = None, ): data = self._transpose_col_value_dict(column_to_data) # type: ignore @@ -225,14 +230,12 @@ def _index( '_id': row['id'], } for col_name, col in self._column_infos.items(): - if not col.config: - continue - if col.db_type == np.ndarray and np.all(row[col_name] == 0): + if col.db_type == 'dense_vector' and np.all(row[col_name] == 0): row[col_name] = row[col_name] + 1.0e-9 request[col_name] = row[col_name] requests.append(request) - _, warning_info = self._send_requests(requests) + _, warning_info = self._send_requests(requests, chunk_size) for info in warning_info: warnings.warn(str(info)) @@ -242,14 +245,18 @@ def _index( def num_docs(self) -> int: return self._client.count(index=self._index_name)['count'] - def _del_items(self, doc_ids: Sequence[str]): + def _del_items( + self, + doc_ids: Sequence[str], + chunk_size: Optional[int] = None, + ): requests = [] for _id in doc_ids: requests.append( {'_op_type': 'delete', '_index': self._index_name, '_id': _id} ) - _, warning_info = self._send_requests(requests) + _, warning_info = self._send_requests(requests, chunk_size) # raise warning if some ids are not found if warning_info: @@ -280,17 +287,24 @@ def _get_items(self, doc_ids: Sequence[str]) -> Sequence[TSchema]: return accumulated_docs + def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: + if args or kwargs: + raise ValueError( + f'args and kwargs not supported for `execute_query` on {type(self)}' + ) + + resp = self._client.search(index=self._index_name, **query) + docs, scores = self._format_response(resp) + return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore + def _find( - self, - query: np.ndarray, - limit: int, - search_field: str = '', + self, query: np.ndarray, limit: int, search_field: str = '' ) -> _FindResult: knn_query = { 'field': search_field, 'query_vector': query, 'k': limit, - 'num_candidates': self._runtime_config.default_column_config[np.ndarray][ + 'num_candidates': self._runtime_config.default_column_config['dense_vector'][ # type: ignore 'num_candidates' ], } @@ -353,9 +367,9 @@ def _text_search( search_field: str = '', ) -> _FindResult: search_query = { - "bool": { - "must": [ - {"match": {search_field: query}}, + 'bool': { + 'must': [ + {'match': {search_field: query}}, ], } } @@ -386,25 +400,17 @@ def _text_search_batched( return _FindResultBatched(documents=result_das, scores=np.array(result_scores, dtype=object)) # type: ignore - def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: - if args or kwargs: - raise ValueError( - f'args and kwargs not supported for `execute_query` on {type(self)}' - ) - - resp = self._client.search(index=self._index_name, **query) - docs, scores = self._format_response(resp) - return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore - ############################################### # Helpers # ############################################### # ElasticSearch helpers - def _create_index(self, col: '_ColumnInfo') -> Dict[str, Any]: + def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: """Create a new HNSW index for a column, and initialize it.""" - index = dict((k, col.config[k]) for k in self._index_init_params) - if col.db_type == np.ndarray: + + index = {'type': col.config['type'] if 'type' in col.config else col.db_type} + + if col.db_type == 'dense_vector': for k in self._index_vector_params: index[k] = col.config[k] if col.n_dim: @@ -416,12 +422,13 @@ def _create_index(self, col: '_ColumnInfo') -> Dict[str, Any]: return index def _send_requests( - self, request: Iterable[Dict[str, Any]], **kwargs + self, + request: Iterable[Dict[str, Any]], + chunk_size: Optional[int] = None, + **kwargs, ) -> Tuple[List[Dict], List[Any]]: """Send bulk request to Elastic and gather the successful info""" - # TODO chunk_size - accumulated_info = [] warning_info = [] for success, info in parallel_bulk( @@ -429,6 +436,7 @@ def _send_requests( request, raise_on_error=False, raise_on_exception=False, + chunk_size=chunk_size if chunk_size else self._runtime_config.chunk_size, # type: ignore **kwargs, ): if not success: diff --git a/poetry.lock b/poetry.lock index d1889e68e3c..5f38ec62978 100644 --- a/poetry.lock +++ b/poetry.lock @@ -293,7 +293,7 @@ files = [ name = "certifi" version = "2022.9.24" description = "Python package for providing Mozilla's CA Bundle." -category = "dev" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -512,6 +512,44 @@ files = [ {file = "distlib-0.3.6.tar.gz", hash = "sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46"}, ] +[[package]] +name = "elastic-transport" +version = "8.4.0" +description = "Transport classes and utilities shared among Python Elastic client libraries" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, + {file = "elastic_transport-8.4.0-py3-none-any.whl", hash = "sha256:19db271ab79c9f70f8c43f8f5b5111408781a6176b54ab2e54d713b6d9ceb815"}, +] + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.26.2,<2" + +[package.extras] +develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "trustme"] + +[[package]] +name = "elasticsearch" +version = "8.6.2" +description = "Python client for Elasticsearch" +category = "main" +optional = false +python-versions = ">=3.6, <4" +files = [ + {file = "elasticsearch-8.6.2-py3-none-any.whl", hash = "sha256:8ccbebd9a0f6f523c7db67bb54863dde8bdb93daae4ff97f7c814e0500a73e84"}, + {file = "elasticsearch-8.6.2.tar.gz", hash = "sha256:084458e84caa91e3ad807b68aa82c022e785bead853a3b125641a25e894a1d47"}, +] + +[package.dependencies] +elastic-transport = ">=8,<9" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +requests = ["requests (>=2.4.0,<3.0.0)"] + [[package]] name = "entrypoints" version = "0.4" @@ -3134,7 +3172,7 @@ typing-extensions = ">=3.7.4" name = "urllib3" version = "1.26.14" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "dev" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -3304,7 +3342,8 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [extras] audio = ["pydub"] -common = ["protobuf", "lz4"] +common = ["lz4", "protobuf"] +elasticsearch = ["elastic-transport", "elasticsearch"] hnswlib = ["hnswlib"] image = ["pillow", "types-pillow"] mesh = ["trimesh"] @@ -3315,5 +3354,5 @@ web = ["fastapi"] [metadata] lock-version = "2.0" -python-versions = ">=3.7" -content-hash = "60dc7dedebd775c6fe3f45ddd2869a07df2c28bbc83420e875eb61e118b064b2" +python-versions = ">=3.7, <4" +content-hash = "379b7b8c2142c02057a764e6adf06128da9b4917744fa30bbebf9a2c129abd03" diff --git a/tests/doc_index/elastic/fixture.py b/tests/index/elastic/fixture.py similarity index 100% rename from tests/doc_index/elastic/fixture.py rename to tests/index/elastic/fixture.py diff --git a/tests/doc_index/elastic/v8/docker-compose.yml b/tests/index/elastic/v8/docker-compose.yml similarity index 100% rename from tests/doc_index/elastic/v8/docker-compose.yml rename to tests/index/elastic/v8/docker-compose.yml diff --git a/tests/doc_index/elastic/v8/test_find.py b/tests/index/elastic/v8/test_find.py similarity index 91% rename from tests/doc_index/elastic/v8/test_find.py rename to tests/index/elastic/v8/test_find.py index 944ed9887d7..14c5d5ef20a 100644 --- a/tests/doc_index/elastic/v8/test_find.py +++ b/tests/index/elastic/v8/test_find.py @@ -3,10 +3,10 @@ from pydantic import Field from docarray import BaseDocument -from docarray.doc_index.backends.elasticv8_doc_index import ElasticDocumentV8Index +from docarray.index import ElasticDocumentV8Index from docarray.typing import NdArray -from tests.doc_index.elastic.fixture import start_storage_v8 # noqa: F401 -from tests.doc_index.elastic.fixture import FlatDoc, SimpleDoc +from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 +from tests.index.elastic.fixture import FlatDoc, SimpleDoc @pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) @@ -147,8 +147,6 @@ def test_find_batched(): def test_filter(): - import itertools - class MyDoc(BaseDocument): A: bool B: int @@ -156,13 +154,7 @@ class MyDoc(BaseDocument): store = ElasticDocumentV8Index[MyDoc]() - A_list = [True, False] - B_list = [1, 2] - C_list = [1.5, 2.5] - - # cross product of all possible combinations - combinations = itertools.product(A_list, B_list, C_list) - index_docs = [MyDoc(A=A, B=B, C=C) for A, B, C in combinations] + index_docs = [MyDoc(id=f'{i}', A=(i % 2 == 0), B=i, C=i + 0.5) for i in range(10)] store.index(index_docs) filter_query = {'term': {'A': True}} @@ -171,17 +163,16 @@ class MyDoc(BaseDocument): for doc in docs: assert doc.A - filter_query = {'term': {'B': 1}} - docs = store.filter(filter_query) - assert len(docs) > 0 - for doc in docs: - assert doc.B == 1 - - filter_query = {'term': {'C': 1.5}} + filter_query = { + "bool": { + "filter": [ + {"terms": {"B": [3, 4, 7, 8]}}, + {"range": {"C": {"gte": 3, "lte": 5}}}, + ] + } + } docs = store.filter(filter_query) - assert len(docs) > 0 - for doc in docs: - assert doc.C == 1.5 + assert [doc.id for doc in docs] == ['3', '4'] def test_text_search(): diff --git a/tests/doc_index/elastic/v8/test_index_get_del.py b/tests/index/elastic/v8/test_index_get_del.py similarity index 96% rename from tests/doc_index/elastic/v8/test_index_get_del.py rename to tests/index/elastic/v8/test_index_get_del.py index c826ed344c4..5777a3bc977 100644 --- a/tests/doc_index/elastic/v8/test_index_get_del.py +++ b/tests/index/elastic/v8/test_index_get_del.py @@ -2,9 +2,9 @@ import pytest from docarray import DocumentArray -from docarray.doc_index.backends.elasticv8_doc_index import ElasticDocumentV8Index -from tests.doc_index.elastic.fixture import start_storage_v8 # noqa: F401 -from tests.doc_index.elastic.fixture import DeepNestedDoc, FlatDoc, NestedDoc, SimpleDoc +from docarray.index import ElasticDocumentV8Index +from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 +from tests.index.elastic.fixture import DeepNestedDoc, FlatDoc, NestedDoc, SimpleDoc @pytest.fixture From 9470e624cc17bfe30038702a4b4cad636379c85d Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 31 Mar 2023 11:03:56 +0800 Subject: [PATCH 03/14] refactor: elastic v7 inherits v8 Signed-off-by: AnneY --- docarray/index/__init__.py | 6 +- docarray/index/backends/elastic.py | 205 +++---- docarray/index/backends/elasticv7.py | 133 +++++ docarray/index/backends/elasticv8.py | 461 ---------------- tests/index/elastic/v8/test_find.py | 538 +++++++++---------- tests/index/elastic/v8/test_index_get_del.py | 424 +++++++-------- 6 files changed, 728 insertions(+), 1039 deletions(-) create mode 100644 docarray/index/backends/elasticv7.py delete mode 100644 docarray/index/backends/elasticv8.py diff --git a/docarray/index/__init__.py b/docarray/index/__init__.py index 1e4f3ad8f7c..a678a03a415 100644 --- a/docarray/index/__init__.py +++ b/docarray/index/__init__.py @@ -1,5 +1,5 @@ -from docarray.index.backends.elastic import ElasticV7DocIndex -from docarray.index.backends.elasticv8 import ElasticDocumentIndex +from docarray.index.backends.elastic import ElasticDocIndex +from docarray.index.backends.elasticv7 import ElasticV7DocIndex from docarray.index.backends.hnswlib import HnswDocumentIndex -__all__ = ['HnswDocumentIndex', 'ElasticDocumentIndex', 'ElasticV7DocIndex'] +__all__ = ['HnswDocumentIndex', 'ElasticDocIndex', 'ElasticV7DocIndex'] diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index deefc3b2a86..462b3a56591 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -1,4 +1,4 @@ -import os +# mypy: ignore-errors import uuid import warnings from collections import defaultdict @@ -21,6 +21,7 @@ ) import numpy as np +from elastic_transport import NodeConfig from elasticsearch import Elasticsearch from elasticsearch.helpers import parallel_bulk from pydantic import parse_obj_as @@ -40,7 +41,7 @@ from docarray.utils.find import _FindResult TSchema = TypeVar('TSchema', bound=BaseDoc) -T = TypeVar('T', bound='ElasticV7DocIndex') +T = TypeVar('T', bound='ElasticDocIndex') ELASTIC_PY_VEC_TYPES: List[Any] = [list, tuple, np.ndarray, AbstractTensor] @@ -58,11 +59,15 @@ ELASTIC_PY_VEC_TYPES.append(TensorFlowTensor) -class ElasticV7DocIndex(BaseDocIndex, Generic[TSchema]): +# toml +# elastic-transport = "^8.4.0" +# elasticsearch = "^8.6.2" +class ElasticDocIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, db_config=None, **kwargs): super().__init__(db_config=db_config, **kwargs) - self._db_config = cast(ElasticV7DocIndex.DBConfig, self._db_config) + self._db_config = cast(self.DBConfig, self._db_config) + # ElasticSearch client creation if self._db_config.index_name is None: id = uuid.uuid4().hex self._db_config.index_name = 'index__' + id @@ -74,40 +79,32 @@ def __init__(self, db_config=None, **kwargs): **self._db_config.es_config, ) - # compatibility - self._server_version = self._client.info()['version']['number'] - if int(self._server_version.split('.')[0]) >= 8: - os.environ['ELASTIC_CLIENT_APIVERSIONING'] = '1' + # ElasticSearh index setup + self._index_vector_params = ('dims', 'similarity', 'index') + self._index_vector_options = ('m', 'ef_construction') - body: Dict[str, Any] = { - 'mappings': { - 'dynamic': True, - '_source': {'enabled': 'true'}, - 'properties': {}, - } + mappings: Dict[str, Any] = { + 'dynamic': True, + '_source': {'enabled': 'true'}, + 'properties': {}, } for col_name, col in self._column_infos.items(): - body['mappings']['properties'][col_name] = self._create_index_mapping(col) + mappings['properties'][col_name] = self._create_index_mapping(col) if self._client.indices.exists(index=self._index_name): - self._client.indices.put_mapping( - index=self._index_name, body=body['mappings'] - ) + self._client_put_mapping(mappings) else: - self._client.indices.create(index=self._index_name, body=body) + self._client_create(mappings) if len(self._db_config.index_settings): - self._client.indices.put_settings( - index=self._index_name, body=self._db_config.index_settings - ) + self._client_put_settings(self._db_config.index_settings) self._refresh(self._index_name) ############################################### # Inner classes for query builder and configs # ############################################### - class QueryBuilder(BaseDocIndex.QueryBuilder): def __init__(self, outer_instance, **kwargs): super().__init__() @@ -117,16 +114,11 @@ def __init__(self, outer_instance, **kwargs): } def build(self, *args, **kwargs) -> Any: - if ( - 'script_score' in self._query['query'] - and 'bool' in self._query['query'] - and len(self._query['query']['bool']) > 0 - ): - self._query['query']['script_score']['query'] = {} - self._query['query']['script_score']['query']['bool'] = self._query[ - 'query' - ]['bool'] - del self._query['query']['bool'] + if len(self._query['query']) == 0: + del self._query['query'] + elif 'knn' in self._query: + self._query['knn']['filter'] = self._query['query'] + del self._query['query'] return self._query @@ -141,13 +133,19 @@ def find( else: query_vec = query query_vec_np = BaseDocIndex._to_numpy(self._outer_instance, query_vec) - self._query['size'] = limit - self._query['query']['script_score'] = ElasticV7DocIndex._form_search_body( - query_vec_np, limit, search_field - )['query']['script_score'] + self._query['knn'] = ElasticDocIndex._form_search_body( + query_vec_np, + limit, + search_field, + self._outer_instance._runtime_config.default_column_config[ + 'dense_vector' + ]['num_candidates'], + )['knn'] return self + # filter accrpts Leaf/Compound query clauses + # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html def filter(self, query: Dict[str, Any], limit: int = 10): self._query['size'] = limit self._query['query']['bool']['filter'].append(query) @@ -172,15 +170,20 @@ def build_query(self, **kwargs) -> QueryBuilder: @dataclass class DBConfig(BaseDocIndex.DBConfig): - hosts: Union[str, List[str], None] = 'http://localhost:9200' + hosts: Union[ + str, List[Union[str, Mapping[str, Union[str, int]], NodeConfig]], None + ] = 'http://localhost:9200' index_name: Optional[str] = None es_config: Dict[str, Any] = field(default_factory=dict) index_settings: Dict[str, Any] = field(default_factory=dict) @dataclass class RuntimeConfig(BaseDocIndex.RuntimeConfig): - default_column_config: Dict[Any, Dict[str, Any]] = field( - default_factory=lambda: { + default_column_config: Dict[Any, Dict[str, Any]] = field(default_factory=dict) + chunk_size: int = 500 + + def __post_init__(self): + self.default_column_config = { 'binary': {}, 'boolean': {}, 'keyword': {}, @@ -212,7 +215,6 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig): 'completion': {}, 'search_as_you_type': {}, 'token_count': {}, - 'dense_vector': {'dims': 128}, 'sparse_vector': {}, 'rank_feature': {}, 'rank_features': {}, @@ -224,8 +226,19 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig): # `None` is not a Type, but we allow it here anyway None: {}, # type: ignore } - ) - chunk_size: int = 500 + self.default_column_config['dense_vector'] = self.dense_vector_config() + + def dense_vector_config(self): + config = { + 'index': True, + 'dims': 128, + 'similarity': 'cosine', # 'l2_norm', 'dot_product', 'cosine' + 'm': 16, + 'ef_construction': 100, + 'num_candidates': 10000, + } + + return config ############################################### # Implementation of abstract methods # @@ -233,7 +246,6 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig): def python_type_to_db_type(self, python_type: Type) -> Any: """Map python type to database type.""" - for allowed_type in ELASTIC_PY_VEC_TYPES: if issubclass(python_type, allowed_type): return 'dense_vector' @@ -312,10 +324,7 @@ def _get_items(self, doc_ids: Sequence[str]) -> Sequence[TSchema]: accumulated_docs = [] accumulated_docs_id_not_found = [] - es_rows = self._client.mget( - index=self._index_name, - body={'ids': doc_ids}, - )['docs'] + es_rows = self._client_mget(doc_ids)['docs'] for row in es_rows: if row['found']: @@ -336,7 +345,7 @@ def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: f'args and kwargs not supported for `execute_query` on {type(self)}' ) - resp = self._client.search(index=self._index_name, body=query) + resp = self._client.search(index=self._index_name, **query) docs, scores = self._format_response(resp) return _FindResult(documents=docs, scores=scores) @@ -344,17 +353,9 @@ def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: def _find( self, query: np.ndarray, limit: int, search_field: str = '' ) -> _FindResult: - if int(self._server_version.split('.')[0]) >= 8: - warnings.warn( - 'You are using Elasticsearch 8.0+ and the current client is 7.10.1. HNSW based vector search is not supported and the find method has a default implementation using exhaustive KNN search with cosineSimilarity, which may result in slow performance.' - ) - body = self._form_search_body(query, limit, search_field) - resp = self._client.search( - index=self._index_name, - body=body, - ) + resp = self._client_search(**body) docs, scores = self._format_response(resp) @@ -372,7 +373,7 @@ def _find_batched( body = self._form_search_body(query, limit, search_field) request.extend([head, body]) - responses = self._client.msearch(body=request) + responses = self._client_msearch(request) das, scores = zip( *[self._format_response(resp) for resp in responses['responses']] @@ -384,15 +385,7 @@ def _filter( filter_query: Dict[str, Any], limit: int, ) -> List[Dict]: - body = { - 'size': limit, - 'query': filter_query, - } - - resp = self._client.search( - index=self._index_name, - body=body, - ) + resp = self._client_search(query=filter_query, size=limit) docs, _ = self._format_response(resp) @@ -409,7 +402,7 @@ def _filter_batched( body = {'query': query, 'size': limit} request.extend([head, body]) - responses = self._client.msearch(body=request) + responses = self._client_msearch(request) das, _ = zip(*[self._format_response(resp) for resp in responses['responses']]) return list(das) @@ -422,15 +415,11 @@ def _text_search( ) -> _FindResult: body = self._form_text_search_body(query, limit, search_field) - - resp = self._client.search( - index=self._index_name, - body=body, - ) + resp = self._client_search(**body) docs, scores = self._format_response(resp) - return _FindResult(documents=docs, scores=scores) + return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore def _text_search_batched( self, @@ -444,28 +433,32 @@ def _text_search_batched( body = self._form_text_search_body(query, limit, search_field) request.extend([head, body]) - responses = self._client.msearch(body=request) - + responses = self._client_msearch(request) das, scores = zip( *[self._format_response(resp) for resp in responses['responses']] ) - return _FindResultBatched(documents=list(das), scores=np.array(scores)) + return _FindResultBatched( + documents=list(das), scores=np.array(scores, dtype=object) + ) ############################################### # Helpers # ############################################### - # ElasticSearch helpers def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: """Create a new HNSW index for a column, and initialize it.""" - index = col.config.copy() - if 'type' not in index: - index['type'] = col.db_type - - if col.db_type == 'dense_vector' and col.n_dim: - index['dims'] = col.n_dim + index = {'type': col.config['type'] if 'type' in col.config else col.db_type} + if col.db_type == 'dense_vector': + for k in self._index_vector_params: + index[k] = col.config[k] + if col.n_dim: + index['dims'] = col.n_dim + index['index_options'] = dict( + (k, col.config[k]) for k in self._index_vector_options + ) + index['index_options']['type'] = 'hnsw' return index def _send_requests( @@ -495,18 +488,18 @@ def _send_requests( @staticmethod def _form_search_body( - query: np.ndarray, limit: int, search_field: str = '' + query: np.ndarray, + limit: int, + search_field: str = '', + num_candidates: int = 10000, ) -> Dict[str, Any]: body = { 'size': limit, - 'query': { - 'script_score': { - 'query': {'match_all': {}}, - 'script': { - 'source': f'cosineSimilarity(params.query_vector, \'{search_field}\') + 1.0', - 'params': {'query_vector': query}, - }, - } + 'knn': { + 'field': search_field, + 'query_vector': query, + 'k': limit, + 'num_candidates': num_candidates, }, } return body @@ -544,3 +537,27 @@ def _format_response(self, response: Any) -> Tuple[List[Dict], NdArray]: def _refresh(self, index_name: str): self._client.indices.refresh(index=index_name) + + ############################################### + # API Wrappers # + ############################################### + + def _client_put_mapping(self, mappings: Dict[str, Any]): + self._client.indices.put_mapping( + index=self._index_name, properties=mappings['properties'] + ) + + def _client_create(self, mappings: Dict[str, Any]): + self._client.indices.create(index=self._index_name, mappings=mappings) + + def _client_put_settings(self, settings: Dict[str, Any]): + self._client.indices.put_settings(index=self._index_name, settings=settings) + + def _client_mget(self, ids: Sequence[str]): + return self._client.mget(index=self._index_name, ids=ids) + + def _client_search(self, **kwargs): + return self._client.search(index=self._index_name, **kwargs) + + def _client_msearch(self, request: List[Dict[str, Any]]): + return self._client.msearch(index=self._index_name, searches=request) diff --git a/docarray/index/backends/elasticv7.py b/docarray/index/backends/elasticv7.py new file mode 100644 index 00000000000..5f80379f85e --- /dev/null +++ b/docarray/index/backends/elasticv7.py @@ -0,0 +1,133 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Sequence, TypeVar, Union + +import numpy as np + +from docarray import BaseDoc +from docarray.index import ElasticDocIndex +from docarray.index.abstract import BaseDocIndex, _ColumnInfo +from docarray.typing import AnyTensor +from docarray.utils.find import _FindResult + +TSchema = TypeVar('TSchema', bound=BaseDoc) +T = TypeVar('T', bound='ElasticV7DocIndex') + + +class ElasticV7DocIndex(ElasticDocIndex): + + ############################################### + # Inner classes for query builder and configs # + ############################################### + + class QueryBuilder(ElasticDocIndex.QueryBuilder): + def build(self, *args, **kwargs) -> Any: + if ( + 'script_score' in self._query['query'] + and 'bool' in self._query['query'] + and len(self._query['query']['bool']) > 0 + ): + self._query['query']['script_score']['query'] = {} + self._query['query']['script_score']['query']['bool'] = self._query[ + 'query' + ]['bool'] + del self._query['query']['bool'] + + return self._query + + def find( + self, + query: Union[AnyTensor, BaseDoc], + search_field: str = 'embedding', + limit: int = 10, + ): + if isinstance(query, BaseDoc): + query_vec = BaseDocIndex._get_values_by_column([query], search_field)[0] + else: + query_vec = query + query_vec_np = BaseDocIndex._to_numpy(self._outer_instance, query_vec) + self._query['size'] = limit + self._query['query']['script_score'] = ElasticV7DocIndex._form_search_body( + query_vec_np, limit, search_field + )['query']['script_score'] + + return self + + @dataclass + class DBConfig(ElasticDocIndex.DBConfig): + hosts: Union[str, List[str], None] = 'http://localhost:9200' # type: ignore + + @dataclass + class RuntimeConfig(ElasticDocIndex.RuntimeConfig): + def dense_vector_config(self): + return {'dims': 128} + + ############################################### + # Implementation of abstract methods # + ############################################### + + def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: + if args or kwargs: + raise ValueError( + f'args and kwargs not supported for `execute_query` on {type(self)}' + ) + + resp = self._client.search(index=self._index_name, body=query) + docs, scores = self._format_response(resp) + + return _FindResult(documents=docs, scores=scores) + + ############################################### + # Helpers # + ############################################### + + # ElasticSearch helpers + def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: + """Create a new HNSW index for a column, and initialize it.""" + + index = col.config.copy() + if 'type' not in index: + index['type'] = col.db_type + + if col.db_type == 'dense_vector' and col.n_dim: + index['dims'] = col.n_dim + + return index + + @staticmethod + def _form_search_body(query: np.ndarray, limit: int, search_field: str = '') -> Dict[str, Any]: # type: ignore + body = { + 'size': limit, + 'query': { + 'script_score': { + 'query': {'match_all': {}}, + 'script': { + 'source': f'cosineSimilarity(params.query_vector, \'{search_field}\') + 1.0', + 'params': {'query_vector': query}, + }, + } + }, + } + return body + + ############################################### + # API Wrappers # + ############################################### + + def _client_put_mapping(self, mappings: Dict[str, Any]): + self._client.indices.put_mapping(index=self._index_name, body=mappings) + + def _client_create(self, mappings: Dict[str, Any]): + body = {'mappings': mappings} + self._client.indices.create(index=self._index_name, body=body) + + def _client_put_settings(self, settings: Dict[str, Any]): + self._client.indices.put_settings(index=self._index_name, body=settings) + + def _client_mget(self, ids: Sequence[str]): + return self._client.mget(index=self._index_name, body={'ids': ids}) + + def _client_search(self, **kwargs): + return self._client.search(index=self._index_name, body=kwargs) + + def _client_msearch(self, request: List[Dict[str, Any]]): + return self._client.msearch(index=self._index_name, body=request) diff --git a/docarray/index/backends/elasticv8.py b/docarray/index/backends/elasticv8.py deleted file mode 100644 index 2b84f640272..00000000000 --- a/docarray/index/backends/elasticv8.py +++ /dev/null @@ -1,461 +0,0 @@ -import uuid -import warnings -from collections import defaultdict -from dataclasses import dataclass, field -from typing import ( - Any, - Dict, - Generator, - Generic, - Iterable, - List, - Mapping, - Optional, - Sequence, - Tuple, - Type, - TypeVar, - Union, - cast, -) - -import numpy as np -from elastic_transport import NodeConfig -from elasticsearch import Elasticsearch -from elasticsearch.helpers import parallel_bulk - -import docarray.typing -from docarray import BaseDocument -from docarray.index.abstract import ( - BaseDocumentIndex, - _ColumnInfo, - _FindResultBatched, - _raise_not_composable, -) -from docarray.typing import AnyTensor -from docarray.utils.find import _FindResult -from docarray.utils.misc import torch_imported - -TSchema = TypeVar('TSchema', bound=BaseDocument) -T = TypeVar('T', bound='ElasticDocumentV8Index') - -ELASTIC_PY_VEC_TYPES: List[Any] = [np.ndarray] -if torch_imported: - import torch - - ELASTIC_PY_VEC_TYPES.append(torch.Tensor) - - -class ElasticDocumentV8Index(BaseDocumentIndex, Generic[TSchema]): - def __init__(self, db_config=None, **kwargs): - super().__init__(db_config=db_config, **kwargs) - self._db_config = cast(ElasticDocumentV8Index.DBConfig, self._db_config) - - if self._db_config.index_name is None: - id = uuid.uuid4().hex - self._db_config.index_name = 'index__' + id - - self._index_name = self._db_config.index_name - - self._client = Elasticsearch( - hosts=self._db_config.hosts, - **self._db_config.es_config, - ) - - # ElasticSearh index setup - self._index_vector_params = ('dims', 'similarity', 'index') - self._index_vector_options = ('m', 'ef_construction') - - mappings: Dict[str, Any] = { - 'dynamic': True, - '_source': {'enabled': 'true'}, - 'properties': {}, - } - - for col_name, col in self._column_infos.items(): - mappings['properties'][col_name] = self._create_index_mapping(col) - - if self._client.indices.exists(index=self._index_name): - self._client.indices.put_mapping( - index=self._index_name, properties=mappings['properties'] - ) - else: - self._client.indices.create(index=self._index_name, mappings=mappings) - - if len(self._db_config.index_settings): - self._client.indices.put_settings( - index=self._index_name, settings=self._db_config.index_settings - ) - - self._refresh(self._index_name) - - ############################################### - # Inner classes for query builder and configs # - ############################################### - class QueryBuilder(BaseDocumentIndex.QueryBuilder): - def __init__(self, outer_instance, **kwargs): - super().__init__() - self._outer_instance = outer_instance - self._query: Dict[str, Any] = { - 'query': defaultdict(lambda: defaultdict(list)) - } - - def build(self, *args, **kwargs) -> Any: - if len(self._query['query']) == 0: - del self._query['query'] - elif 'knn' in self._query: - self._query['knn']['filter'] = self._query['query'] - del self._query['query'] - - return self._query - - def find( - self, - query: Union[AnyTensor, BaseDocument], - search_field: str = 'embedding', - limit: int = 10, - ): - if isinstance(query, BaseDocument): - query_vec = BaseDocumentIndex._get_values_by_column( - [query], search_field - )[0] - else: - query_vec = query - query_vec_np = BaseDocumentIndex._to_numpy(self._outer_instance, query_vec) - self._query['knn'] = { - 'field': search_field, - 'query_vector': query_vec_np, - 'k': limit, - 'num_candidates': self._outer_instance._runtime_config.default_column_config[ - 'dense_vector' - ][ - 'num_candidates' - ], - } - return self - - # filter accrpts Leaf/Compound query clauses - # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html - def filter(self, query: Dict[str, Any], limit: int = 10): - self._query['size'] = limit - self._query['query']['bool']['filter'].append(query) - return self - - def text_search(self, query: str, search_field: str = 'text', limit: int = 10): - self._query['size'] = limit - self._query['query']['bool']['must'].append( - {'match': {search_field: query}} - ) - return self - - find_batched = _raise_not_composable('find_batched') - filter_batched = _raise_not_composable('find_batched') - text_search_batched = _raise_not_composable('text_search') - - def build_query(self, **kwargs) -> QueryBuilder: - """ - Build a query for this DocumentIndex. - """ - return self.QueryBuilder(self, **kwargs) - - @dataclass - class DBConfig(BaseDocumentIndex.DBConfig): - - hosts: Union[ - str, List[Union[str, Mapping[str, Union[str, int]], NodeConfig]], None - ] = 'http://localhost:9200' - index_name: Optional[str] = None - es_config: Dict[str, Any] = field(default_factory=dict) - index_settings: Dict[str, Any] = field(default_factory=dict) - - @dataclass - class RuntimeConfig(BaseDocumentIndex.RuntimeConfig): - default_column_config: Dict[Any, Dict[str, Any]] = field( - default_factory=lambda: { - 'dense_vector': { - 'index': True, - 'dims': 128, - 'similarity': 'cosine', # 'l2_norm', 'dot_product', 'cosine' - 'm': 16, - 'ef_construction': 100, - 'num_candidates': 10000, - }, - 'keyword': {}, - 'boolean': {}, - 'integer': {}, - 'float': {}, - 'text': {}, - # `None` is not a Type, but we allow it here anyway - None: {}, # type: ignore - } - ) - chunk_size: int = 500 - - ############################################### - # Implementation of abstract methods # - ############################################### - - def python_type_to_db_type(self, python_type: Type) -> Any: - """Map python type to database type.""" - for allowed_type in ELASTIC_PY_VEC_TYPES: - if issubclass(python_type, allowed_type): - return 'dense_vector' - - elastic_py_types = { - bool: 'boolean', - int: 'integer', - float: 'float', - str: 'text', - docarray.typing.ID: 'keyword', - } - - if python_type in elastic_py_types: - return elastic_py_types[python_type] - - raise ValueError(f'Unsupported column type for {type(self)}: {python_type}') - - def _index( - self, - column_to_data: Dict[str, Generator[Any, None, None]], - refresh: bool = True, - chunk_size: Optional[int] = None, - ): - - data = self._transpose_col_value_dict(column_to_data) # type: ignore - requests = [] - - for row in data: - request = { - '_index': self._index_name, - '_id': row['id'], - } - for col_name, col in self._column_infos.items(): - if col.db_type == 'dense_vector' and np.all(row[col_name] == 0): - row[col_name] = row[col_name] + 1.0e-9 - request[col_name] = row[col_name] - requests.append(request) - - _, warning_info = self._send_requests(requests, chunk_size) - for info in warning_info: - warnings.warn(str(info)) - - if refresh: - self._refresh(self._index_name) - - def num_docs(self) -> int: - return self._client.count(index=self._index_name)['count'] - - def _del_items( - self, - doc_ids: Sequence[str], - chunk_size: Optional[int] = None, - ): - requests = [] - for _id in doc_ids: - requests.append( - {'_op_type': 'delete', '_index': self._index_name, '_id': _id} - ) - - _, warning_info = self._send_requests(requests, chunk_size) - - # raise warning if some ids are not found - if warning_info: - ids = [info['delete']['_id'] for info in warning_info] - warnings.warn(f'No document with id {ids} found') - - self._refresh(self._index_name) - - def _get_items(self, doc_ids: Sequence[str]) -> Sequence[TSchema]: - accumulated_docs = [] - accumulated_docs_id_not_found = [] - - es_rows = self._client.mget( - index=self._index_name, - ids=doc_ids, # type: ignore - )['docs'] - - for row in es_rows: - if row['found']: - doc_dict = row['_source'] - accumulated_docs.append(doc_dict) - else: - accumulated_docs_id_not_found.append(row['_id']) - - # raise warning if some ids are not found - if accumulated_docs_id_not_found: - warnings.warn(f'No document with id {accumulated_docs_id_not_found} found') - - return accumulated_docs - - def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: - if args or kwargs: - raise ValueError( - f'args and kwargs not supported for `execute_query` on {type(self)}' - ) - - resp = self._client.search(index=self._index_name, **query) - docs, scores = self._format_response(resp) - return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore - - def _find( - self, query: np.ndarray, limit: int, search_field: str = '' - ) -> _FindResult: - knn_query = { - 'field': search_field, - 'query_vector': query, - 'k': limit, - 'num_candidates': self._runtime_config.default_column_config['dense_vector'][ # type: ignore - 'num_candidates' - ], - } - - resp = self._client.search( - index=self._index_name, - knn=knn_query, - size=limit, - ) - - docs, scores = self._format_response(resp) - - return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore - - def _find_batched( - self, - queries: np.ndarray, - limit: int, - search_field: str = '', - ) -> _FindResultBatched: - result_das = [] - result_scores = [] - - for query in queries: - documents, scores = self._find(query, limit, search_field) - result_das.append(documents) - result_scores.append(scores) - - return _FindResultBatched(documents=result_das, scores=np.array(result_scores)) # type: ignore - - def _filter( - self, - filter_query: Dict[str, Any], - limit: int, - ) -> List[Dict]: - resp = self._client.search( - index=self._index_name, - query=filter_query, - size=limit, - ) - - docs, _ = self._format_response(resp) - - return docs - - def _filter_batched( - self, - filter_queries: Any, - limit: int, - ) -> List[List[Dict]]: - result_das = [] - for query in filter_queries: - result_das.append(self._filter(query, limit)) - return result_das - - def _text_search( - self, - query: str, - limit: int, - search_field: str = '', - ) -> _FindResult: - search_query = { - 'bool': { - 'must': [ - {'match': {search_field: query}}, - ], - } - } - - resp = self._client.search( - index=self._index_name, - query=search_query, - size=limit, - ) - - docs, scores = self._format_response(resp) - - return _FindResult(documents=docs, scores=np.array(scores)) # type: ignore - - def _text_search_batched( - self, - queries: Sequence[str], - limit: int, - search_field: str = '', - ) -> _FindResultBatched: - result_das = [] - result_scores = [] - - for query in queries: - documents, scores = self._text_search(query, limit, search_field) - result_das.append(documents) - result_scores.append(scores) - - return _FindResultBatched(documents=result_das, scores=np.array(result_scores, dtype=object)) # type: ignore - - ############################################### - # Helpers # - ############################################### - - # ElasticSearch helpers - def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: - """Create a new HNSW index for a column, and initialize it.""" - - index = {'type': col.config['type'] if 'type' in col.config else col.db_type} - - if col.db_type == 'dense_vector': - for k in self._index_vector_params: - index[k] = col.config[k] - if col.n_dim: - index['dims'] = col.n_dim - index['index_options'] = dict( - (k, col.config[k]) for k in self._index_vector_options - ) - index['index_options']['type'] = 'hnsw' - return index - - def _send_requests( - self, - request: Iterable[Dict[str, Any]], - chunk_size: Optional[int] = None, - **kwargs, - ) -> Tuple[List[Dict], List[Any]]: - """Send bulk request to Elastic and gather the successful info""" - - accumulated_info = [] - warning_info = [] - for success, info in parallel_bulk( - self._client, - request, - raise_on_error=False, - raise_on_exception=False, - chunk_size=chunk_size if chunk_size else self._runtime_config.chunk_size, # type: ignore - **kwargs, - ): - if not success: - warning_info.append(info) - else: - accumulated_info.append(info) - - return accumulated_info, warning_info - - def _format_response(self, response: Any) -> Tuple[List[Dict], List[float]]: - docs = [] - scores = [] - for result in response['hits']['hits']: - doc_dict = result['_source'] - doc_dict['id'] = result['_id'] - docs.append(doc_dict) - scores.append(result['_score']) - - return docs, scores - - def _refresh(self, index_name: str): - self._client.indices.refresh(index=index_name) diff --git a/tests/index/elastic/v8/test_find.py b/tests/index/elastic/v8/test_find.py index 14c5d5ef20a..d61ae643ae0 100644 --- a/tests/index/elastic/v8/test_find.py +++ b/tests/index/elastic/v8/test_find.py @@ -1,269 +1,269 @@ -import numpy as np -import pytest -from pydantic import Field - -from docarray import BaseDocument -from docarray.index import ElasticDocumentV8Index -from docarray.typing import NdArray -from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 -from tests.index.elastic.fixture import FlatDoc, SimpleDoc - - -@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) -def test_find_simple_schema(similarity): - class SimpleSchema(BaseDocument): - tens: NdArray[10] = Field(similarity=similarity) - - store = ElasticDocumentV8Index[SimpleSchema]() - - index_docs = [] - for _ in range(10): - vec = np.random.rand(10) - if similarity == 'dot_product': - vec = vec / np.linalg.norm(vec) - index_docs.append(SimpleDoc(tens=vec)) - store.index(index_docs) - - query = index_docs[-1] - docs, scores = store.find(query, search_field='tens', limit=5) - - assert len(docs) == 5 - assert len(scores) == 5 - assert docs[0].id == index_docs[-1].id - assert np.allclose(docs[0].tens, index_docs[-1].tens) - - -@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) -def test_find_flat_schema(similarity): - class FlatSchema(BaseDocument): - tens_one: NdArray = Field(dims=10, similarity=similarity) - tens_two: NdArray = Field(dims=50, similarity=similarity) - - store = ElasticDocumentV8Index[FlatSchema]() - - index_docs = [] - for _ in range(10): - vec_one = np.random.rand(10) - vec_two = np.random.rand(50) - if similarity == 'dot_product': - vec_one = vec_one / np.linalg.norm(vec_one) - vec_two = vec_two / np.linalg.norm(vec_two) - index_docs.append(FlatDoc(tens_one=vec_one, tens_two=vec_two)) - - store.index(index_docs) - - query = index_docs[-1] - - # find on tens_one - docs, scores = store.find(query, search_field='tens_one', limit=5) - assert len(docs) == 5 - assert len(scores) == 5 - assert docs[0].id == index_docs[-1].id - assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) - assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) - - # find on tens_two - docs, scores = store.find(query, search_field='tens_two', limit=5) - assert len(docs) == 5 - assert len(scores) == 5 - assert docs[0].id == index_docs[-1].id - assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) - assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) - - -@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) -def test_find_nested_schema(similarity): - class SimpleDoc(BaseDocument): - tens: NdArray[10] = Field(similarity=similarity) - - class NestedDoc(BaseDocument): - d: SimpleDoc - tens: NdArray[10] = Field(similarity=similarity) - - class DeepNestedDoc(BaseDocument): - d: NestedDoc - tens: NdArray = Field(similarity=similarity, dims=10) - - store = ElasticDocumentV8Index[DeepNestedDoc]() - - index_docs = [] - for _ in range(10): - vec_simple = np.random.rand(10) - vec_nested = np.random.rand(10) - vec_deep = np.random.rand(10) - if similarity == 'dot_product': - vec_simple = vec_simple / np.linalg.norm(vec_simple) - vec_nested = vec_nested / np.linalg.norm(vec_nested) - vec_deep = vec_deep / np.linalg.norm(vec_deep) - index_docs.append( - DeepNestedDoc( - d=NestedDoc(d=SimpleDoc(tens=vec_simple), tens=vec_nested), - tens=vec_deep, - ) - ) - - store.index(index_docs) - - query = index_docs[-1] - - # find on root level - docs, scores = store.find(query, search_field='tens', limit=5) - assert len(docs) == 5 - assert len(scores) == 5 - assert docs[0].id == index_docs[-1].id - assert np.allclose(docs[0].tens, index_docs[-1].tens) - - # find on first nesting level - docs, scores = store.find(query, search_field='d__tens', limit=5) - assert len(docs) == 5 - assert len(scores) == 5 - assert docs[0].id == index_docs[-1].id - assert np.allclose(docs[0].d.tens, index_docs[-1].d.tens) - - # find on second nesting level - docs, scores = store.find(query, search_field='d__d__tens', limit=5) - assert len(docs) == 5 - assert len(scores) == 5 - assert docs[0].id == index_docs[-1].id - assert np.allclose(docs[0].d.d.tens, index_docs[-1].d.d.tens) - - -def test_find_batched(): - store = ElasticDocumentV8Index[SimpleDoc]() - - index_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(10)] - store.index(index_docs) - - queries = index_docs[-2:] - docs_batched, scores_batched = store.find_batched( - queries, search_field='tens', limit=5 - ) - - for docs, scores, query in zip(docs_batched, scores_batched, queries): - assert len(docs) == 5 - assert len(scores) == 5 - assert docs[0].id == query.id - assert np.allclose(docs[0].tens, query.tens) - - -def test_filter(): - class MyDoc(BaseDocument): - A: bool - B: int - C: float - - store = ElasticDocumentV8Index[MyDoc]() - - index_docs = [MyDoc(id=f'{i}', A=(i % 2 == 0), B=i, C=i + 0.5) for i in range(10)] - store.index(index_docs) - - filter_query = {'term': {'A': True}} - docs = store.filter(filter_query) - assert len(docs) > 0 - for doc in docs: - assert doc.A - - filter_query = { - "bool": { - "filter": [ - {"terms": {"B": [3, 4, 7, 8]}}, - {"range": {"C": {"gte": 3, "lte": 5}}}, - ] - } - } - docs = store.filter(filter_query) - assert [doc.id for doc in docs] == ['3', '4'] - - -def test_text_search(): - class MyDoc(BaseDocument): - text: str - - store = ElasticDocumentV8Index[MyDoc]() - index_docs = [ - MyDoc(text='hello world'), - MyDoc(text='never gonna give you up'), - MyDoc(text='we are the world'), - ] - store.index(index_docs) - - query = 'world' - docs, scores = store.text_search(query, search_field='text') - - assert len(docs) == 2 - assert len(scores) == 2 - assert docs[0].text.index(query) >= 0 - assert docs[1].text.index(query) >= 0 - - queries = ['world', 'never'] - docs, scores = store.text_search_batched(queries, search_field='text') - for query, da, score in zip(queries, docs, scores): - assert len(da) > 0 - assert len(score) > 0 - for doc in da: - assert doc.text.index(query) >= 0 - - -def test_query_builder(): - class MyDoc(BaseDocument): - tens: NdArray[10] = Field(similarity='l2_norm') - num: int - text: str - - store = ElasticDocumentV8Index[MyDoc]() - index_docs = [ - MyDoc(id=f'{i}', tens=np.ones(10) * i, num=int(i / 2), text=f'text {int(i/2)}') - for i in range(10) - ] - store.index(index_docs) - - # build_query - q = store.build_query() - assert isinstance(q, store.QueryBuilder) - - # filter - q = store.build_query().filter({'term': {'num': 0}}).build() - docs, _ = store.execute_query(q) - assert [doc['id'] for doc in docs] == ['0', '1'] - - # find - q = store.build_query().find(index_docs[-1], search_field='tens', limit=3).build() - docs, _ = store.execute_query(q) - assert [doc['id'] for doc in docs] == ['9', '8', '7'] - - # text_search - q = store.build_query().text_search('0', search_field='text').build() - docs, _ = store.execute_query(q) - assert [doc['id'] for doc in docs] == ['0', '1'] - - # combination - q = ( - store.build_query() - .filter({'range': {'num': {'lte': 3}}}) - .find(index_docs[-1], search_field='tens') - .text_search('0', search_field='text') - .build() - ) - docs, _ = store.execute_query(q) - assert [doc['id'] for doc in docs] == ['1', '0'] - - # direct - query = { - 'knn': { - 'field': 'tens', - 'query_vector': [9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0], - 'k': 10, - 'num_candidates': 10000, - 'filter': { - 'bool': { - 'filter': [ - {'range': {'num': {'gte': 2}}}, - {'range': {'num': {'lte': 3}}}, - ] - } - }, - }, - } - - docs, _ = store.execute_query(query) - assert [doc['id'] for doc in docs] == ['7', '6', '5', '4'] +# import numpy as np +# import pytest +# from pydantic import Field + +# from docarray import BaseDoc +# from docarray.index import ElasticDocIndex +# from docarray.typing import NdArray +# from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 +# from tests.index.elastic.fixture import FlatDoc, SimpleDoc + + +# @pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +# def test_find_simple_schema(similarity): +# class SimpleSchema(BaseDoc): +# tens: NdArray[10] = Field(similarity=similarity) + +# store = ElasticDocIndex[SimpleSchema]() + +# index_docs = [] +# for _ in range(10): +# vec = np.random.rand(10) +# if similarity == 'dot_product': +# vec = vec / np.linalg.norm(vec) +# index_docs.append(SimpleDoc(tens=vec)) +# store.index(index_docs) + +# query = index_docs[-1] +# docs, scores = store.find(query, search_field='tens', limit=5) + +# assert len(docs) == 5 +# assert len(scores) == 5 +# assert docs[0].id == index_docs[-1].id +# assert np.allclose(docs[0].tens, index_docs[-1].tens) + + +# @pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +# def test_find_flat_schema(similarity): +# class FlatSchema(BaseDoc): +# tens_one: NdArray = Field(dims=10, similarity=similarity) +# tens_two: NdArray = Field(dims=50, similarity=similarity) + +# store = ElasticDocIndex[FlatSchema]() + +# index_docs = [] +# for _ in range(10): +# vec_one = np.random.rand(10) +# vec_two = np.random.rand(50) +# if similarity == 'dot_product': +# vec_one = vec_one / np.linalg.norm(vec_one) +# vec_two = vec_two / np.linalg.norm(vec_two) +# index_docs.append(FlatDoc(tens_one=vec_one, tens_two=vec_two)) + +# store.index(index_docs) + +# query = index_docs[-1] + +# # find on tens_one +# docs, scores = store.find(query, search_field='tens_one', limit=5) +# assert len(docs) == 5 +# assert len(scores) == 5 +# assert docs[0].id == index_docs[-1].id +# assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) +# assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) + +# # find on tens_two +# docs, scores = store.find(query, search_field='tens_two', limit=5) +# assert len(docs) == 5 +# assert len(scores) == 5 +# assert docs[0].id == index_docs[-1].id +# assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) +# assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) + + +# @pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +# def test_find_nested_schema(similarity): +# class SimpleDoc(BaseDoc): +# tens: NdArray[10] = Field(similarity=similarity) + +# class NestedDoc(BaseDoc): +# d: SimpleDoc +# tens: NdArray[10] = Field(similarity=similarity) + +# class DeepNestedDoc(BaseDoc): +# d: NestedDoc +# tens: NdArray = Field(similarity=similarity, dims=10) + +# store = ElasticDocIndex[DeepNestedDoc]() + +# index_docs = [] +# for _ in range(10): +# vec_simple = np.random.rand(10) +# vec_nested = np.random.rand(10) +# vec_deep = np.random.rand(10) +# if similarity == 'dot_product': +# vec_simple = vec_simple / np.linalg.norm(vec_simple) +# vec_nested = vec_nested / np.linalg.norm(vec_nested) +# vec_deep = vec_deep / np.linalg.norm(vec_deep) +# index_docs.append( +# DeepNestedDoc( +# d=NestedDoc(d=SimpleDoc(tens=vec_simple), tens=vec_nested), +# tens=vec_deep, +# ) +# ) + +# store.index(index_docs) + +# query = index_docs[-1] + +# # find on root level +# docs, scores = store.find(query, search_field='tens', limit=5) +# assert len(docs) == 5 +# assert len(scores) == 5 +# assert docs[0].id == index_docs[-1].id +# assert np.allclose(docs[0].tens, index_docs[-1].tens) + +# # find on first nesting level +# docs, scores = store.find(query, search_field='d__tens', limit=5) +# assert len(docs) == 5 +# assert len(scores) == 5 +# assert docs[0].id == index_docs[-1].id +# assert np.allclose(docs[0].d.tens, index_docs[-1].d.tens) + +# # find on second nesting level +# docs, scores = store.find(query, search_field='d__d__tens', limit=5) +# assert len(docs) == 5 +# assert len(scores) == 5 +# assert docs[0].id == index_docs[-1].id +# assert np.allclose(docs[0].d.d.tens, index_docs[-1].d.d.tens) + + +# def test_find_batched(): +# store = ElasticDocIndex[SimpleDoc]() + +# index_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(10)] +# store.index(index_docs) + +# queries = index_docs[-2:] +# docs_batched, scores_batched = store.find_batched( +# queries, search_field='tens', limit=5 +# ) + +# for docs, scores, query in zip(docs_batched, scores_batched, queries): +# assert len(docs) == 5 +# assert len(scores) == 5 +# assert docs[0].id == query.id +# assert np.allclose(docs[0].tens, query.tens) + + +# def test_filter(): +# class MyDoc(BaseDoc): +# A: bool +# B: int +# C: float + +# store = ElasticDocIndex[MyDoc]() + +# index_docs = [MyDoc(id=f'{i}', A=(i % 2 == 0), B=i, C=i + 0.5) for i in range(10)] +# store.index(index_docs) + +# filter_query = {'term': {'A': True}} +# docs = store.filter(filter_query) +# assert len(docs) > 0 +# for doc in docs: +# assert doc.A + +# filter_query = { +# "bool": { +# "filter": [ +# {"terms": {"B": [3, 4, 7, 8]}}, +# {"range": {"C": {"gte": 3, "lte": 5}}}, +# ] +# } +# } +# docs = store.filter(filter_query) +# assert [doc.id for doc in docs] == ['3', '4'] + + +# def test_text_search(): +# class MyDoc(BaseDoc): +# text: str + +# store = ElasticDocIndex[MyDoc]() +# index_docs = [ +# MyDoc(text='hello world'), +# MyDoc(text='never gonna give you up'), +# MyDoc(text='we are the world'), +# ] +# store.index(index_docs) + +# query = 'world' +# docs, scores = store.text_search(query, search_field='text') + +# assert len(docs) == 2 +# assert len(scores) == 2 +# assert docs[0].text.index(query) >= 0 +# assert docs[1].text.index(query) >= 0 + +# queries = ['world', 'never'] +# docs, scores = store.text_search_batched(queries, search_field='text') +# for query, da, score in zip(queries, docs, scores): +# assert len(da) > 0 +# assert len(score) > 0 +# for doc in da: +# assert doc.text.index(query) >= 0 + + +# def test_query_builder(): +# class MyDoc(BaseDoc): +# tens: NdArray[10] = Field(similarity='l2_norm') +# num: int +# text: str + +# store = ElasticDocIndex[MyDoc]() +# index_docs = [ +# MyDoc(id=f'{i}', tens=np.ones(10) * i, num=int(i / 2), text=f'text {int(i/2)}') +# for i in range(10) +# ] +# store.index(index_docs) + +# # build_query +# q = store.build_query() +# assert isinstance(q, store.QueryBuilder) + +# # filter +# q = store.build_query().filter({'term': {'num': 0}}).build() +# docs, _ = store.execute_query(q) +# assert [doc['id'] for doc in docs] == ['0', '1'] + +# # find +# q = store.build_query().find(index_docs[-1], search_field='tens', limit=3).build() +# docs, _ = store.execute_query(q) +# assert [doc['id'] for doc in docs] == ['9', '8', '7'] + +# # text_search +# q = store.build_query().text_search('0', search_field='text').build() +# docs, _ = store.execute_query(q) +# assert [doc['id'] for doc in docs] == ['0', '1'] + +# # combination +# q = ( +# store.build_query() +# .filter({'range': {'num': {'lte': 3}}}) +# .find(index_docs[-1], search_field='tens') +# .text_search('0', search_field='text') +# .build() +# ) +# docs, _ = store.execute_query(q) +# assert [doc['id'] for doc in docs] == ['1', '0'] + +# # direct +# query = { +# 'knn': { +# 'field': 'tens', +# 'query_vector': [9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0], +# 'k': 10, +# 'num_candidates': 10000, +# 'filter': { +# 'bool': { +# 'filter': [ +# {'range': {'num': {'gte': 2}}}, +# {'range': {'num': {'lte': 3}}}, +# ] +# } +# }, +# }, +# } + +# docs, _ = store.execute_query(query) +# assert [doc['id'] for doc in docs] == ['7', '6', '5', '4'] diff --git a/tests/index/elastic/v8/test_index_get_del.py b/tests/index/elastic/v8/test_index_get_del.py index 5777a3bc977..b3e8b27c162 100644 --- a/tests/index/elastic/v8/test_index_get_del.py +++ b/tests/index/elastic/v8/test_index_get_del.py @@ -1,232 +1,232 @@ -import numpy as np -import pytest +# import numpy as np +# import pytest -from docarray import DocumentArray -from docarray.index import ElasticDocumentV8Index -from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 -from tests.index.elastic.fixture import DeepNestedDoc, FlatDoc, NestedDoc, SimpleDoc +# from docarray import DocArray +# from docarray.index import ElasticDocIndex +# from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 +# from tests.index.elastic.fixture import DeepNestedDoc, FlatDoc, NestedDoc, SimpleDoc -@pytest.fixture -def ten_simple_docs(): - return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] +# @pytest.fixture +# def ten_simple_docs(): +# return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] -@pytest.fixture -def ten_flat_docs(): - return [ - FlatDoc(tens_one=np.random.randn(10), tens_two=np.random.randn(50)) - for _ in range(10) - ] +# @pytest.fixture +# def ten_flat_docs(): +# return [ +# FlatDoc(tens_one=np.random.randn(10), tens_two=np.random.randn(50)) +# for _ in range(10) +# ] -@pytest.fixture -def ten_nested_docs(): - return [NestedDoc(d=SimpleDoc(tens=np.random.randn(10))) for _ in range(10)] +# @pytest.fixture +# def ten_nested_docs(): +# return [NestedDoc(d=SimpleDoc(tens=np.random.randn(10))) for _ in range(10)] -@pytest.fixture -def ten_deep_nested_docs(): - return [ - DeepNestedDoc(d=NestedDoc(d=SimpleDoc(tens=np.random.randn(10)))) - for _ in range(10) - ] +# @pytest.fixture +# def ten_deep_nested_docs(): +# return [ +# DeepNestedDoc(d=NestedDoc(d=SimpleDoc(tens=np.random.randn(10)))) +# for _ in range(10) +# ] -@pytest.mark.parametrize('use_docarray', [True, False]) -def test_index_simple_schema(ten_simple_docs, use_docarray): - store = ElasticDocumentV8Index[SimpleDoc]() - if use_docarray: - ten_simple_docs = DocumentArray[SimpleDoc](ten_simple_docs) +# @pytest.mark.parametrize('use_docarray', [True, False]) +# def test_index_simple_schema(ten_simple_docs, use_docarray): +# store = ElasticDocIndex[SimpleDoc]() +# if use_docarray: +# ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) - store.index(ten_simple_docs) - assert store.num_docs() == 10 +# store.index(ten_simple_docs) +# assert store.num_docs() == 10 -@pytest.mark.parametrize('use_docarray', [True, False]) -def test_index_flat_schema(ten_flat_docs, use_docarray): - store = ElasticDocumentV8Index[FlatDoc]() - if use_docarray: - ten_flat_docs = DocumentArray[FlatDoc](ten_flat_docs) +# @pytest.mark.parametrize('use_docarray', [True, False]) +# def test_index_flat_schema(ten_flat_docs, use_docarray): +# store = ElasticDocIndex[FlatDoc]() +# if use_docarray: +# ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) - store.index(ten_flat_docs) - assert store.num_docs() == 10 +# store.index(ten_flat_docs) +# assert store.num_docs() == 10 -@pytest.mark.parametrize('use_docarray', [True, False]) -def test_index_nested_schema(ten_nested_docs, use_docarray): - store = ElasticDocumentV8Index[NestedDoc]() - if use_docarray: - ten_nested_docs = DocumentArray[NestedDoc](ten_nested_docs) +# @pytest.mark.parametrize('use_docarray', [True, False]) +# def test_index_nested_schema(ten_nested_docs, use_docarray): +# store = ElasticDocIndex[NestedDoc]() +# if use_docarray: +# ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) - store.index(ten_nested_docs) - assert store.num_docs() == 10 +# store.index(ten_nested_docs) +# assert store.num_docs() == 10 -@pytest.mark.parametrize('use_docarray', [True, False]) -def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): - store = ElasticDocumentV8Index[DeepNestedDoc]() - if use_docarray: - ten_deep_nested_docs = DocumentArray[DeepNestedDoc](ten_deep_nested_docs) - - store.index(ten_deep_nested_docs) - assert store.num_docs() == 10 - - -def test_get_single(ten_simple_docs, ten_flat_docs, ten_nested_docs): - # simple - store = ElasticDocumentV8Index[SimpleDoc]() - store.index(ten_simple_docs) - - assert store.num_docs() == 10 - for d in ten_simple_docs: - id_ = d.id - assert store[id_].id == id_ - assert np.all(store[id_].tens == d.tens) - - # flat - store = ElasticDocumentV8Index[FlatDoc]() - store.index(ten_flat_docs) - - assert store.num_docs() == 10 - for d in ten_flat_docs: - id_ = d.id - assert store[id_].id == id_ - assert np.all(store[id_].tens_one == d.tens_one) - assert np.all(store[id_].tens_two == d.tens_two) - - # nested - store = ElasticDocumentV8Index[NestedDoc]() - store.index(ten_nested_docs) - - assert store.num_docs() == 10 - for d in ten_nested_docs: - id_ = d.id - assert store[id_].id == id_ - assert store[id_].d.id == d.d.id - assert np.all(store[id_].d.tens == d.d.tens) - - -def test_get_multiple(ten_simple_docs, ten_flat_docs, ten_nested_docs): - docs_to_get_idx = [0, 2, 4, 6, 8] - - # simple - store = ElasticDocumentV8Index[SimpleDoc]() - store.index(ten_simple_docs) - - assert store.num_docs() == 10 - docs_to_get = [ten_simple_docs[i] for i in docs_to_get_idx] - ids_to_get = [d.id for d in docs_to_get] - retrieved_docs = store[ids_to_get] - for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): - assert d_out.id == id_ - assert np.all(d_out.tens == d_in.tens) - - # flat - store = ElasticDocumentV8Index[FlatDoc]() - store.index(ten_flat_docs) - - assert store.num_docs() == 10 - docs_to_get = [ten_flat_docs[i] for i in docs_to_get_idx] - ids_to_get = [d.id for d in docs_to_get] - retrieved_docs = store[ids_to_get] - for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): - assert d_out.id == id_ - assert np.all(d_out.tens_one == d_in.tens_one) - assert np.all(d_out.tens_two == d_in.tens_two) - - # nested - store = ElasticDocumentV8Index[NestedDoc]() - store.index(ten_nested_docs) - - assert store.num_docs() == 10 - docs_to_get = [ten_nested_docs[i] for i in docs_to_get_idx] - ids_to_get = [d.id for d in docs_to_get] - retrieved_docs = store[ids_to_get] - for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): - assert d_out.id == id_ - assert d_out.d.id == d_in.d.id - assert np.all(d_out.d.tens == d_in.d.tens) - - -def test_get_key_error(ten_simple_docs): - store = ElasticDocumentV8Index[SimpleDoc]() - store.index(ten_simple_docs) - - with pytest.raises(KeyError): - store['not_a_real_id'] - - -def test_del_single(ten_simple_docs): - store = ElasticDocumentV8Index[SimpleDoc]() - store.index(ten_simple_docs) - # delete once - assert store.num_docs() == 10 - del store[ten_simple_docs[0].id] - assert store.num_docs() == 9 - for i, d in enumerate(ten_simple_docs): - id_ = d.id - if i == 0: # deleted - with pytest.raises(KeyError): - store[id_] - else: - assert store[id_].id == id_ - assert np.all(store[id_].tens == d.tens) - # delete again - del store[ten_simple_docs[3].id] - assert store.num_docs() == 8 - for i, d in enumerate(ten_simple_docs): - id_ = d.id - if i in (0, 3): # deleted - with pytest.raises(KeyError): - store[id_] - else: - assert store[id_].id == id_ - assert np.all(store[id_].tens == d.tens) - - -def test_del_multiple(ten_simple_docs): - docs_to_del_idx = [0, 2, 4, 6, 8] - - store = ElasticDocumentV8Index[SimpleDoc]() - store.index(ten_simple_docs) - - assert store.num_docs() == 10 - docs_to_del = [ten_simple_docs[i] for i in docs_to_del_idx] - ids_to_del = [d.id for d in docs_to_del] - del store[ids_to_del] - for i, doc in enumerate(ten_simple_docs): - if i in docs_to_del_idx: - with pytest.raises(KeyError): - store[doc.id] - else: - assert store[doc.id].id == doc.id - assert np.all(store[doc.id].tens == doc.tens) - - -def test_del_key_error(ten_simple_docs): - store = ElasticDocumentV8Index[SimpleDoc]() - store.index(ten_simple_docs) - - with pytest.warns(UserWarning): - del store['not_a_real_id'] - - -def test_num_docs(ten_simple_docs): - store = ElasticDocumentV8Index[SimpleDoc]() - store.index(ten_simple_docs) - - assert store.num_docs() == 10 - - del store[ten_simple_docs[0].id] - assert store.num_docs() == 9 - - del store[ten_simple_docs[3].id, ten_simple_docs[5].id] - assert store.num_docs() == 7 - - more_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(5)] - store.index(more_docs) - assert store.num_docs() == 12 - - del store[more_docs[2].id, ten_simple_docs[7].id] - assert store.num_docs() == 10 +# @pytest.mark.parametrize('use_docarray', [True, False]) +# def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): +# store = ElasticDocIndex[DeepNestedDoc]() +# if use_docarray: +# ten_deep_nested_docs = DocArray[DeepNestedDoc](ten_deep_nested_docs) + +# store.index(ten_deep_nested_docs) +# assert store.num_docs() == 10 + + +# def test_get_single(ten_simple_docs, ten_flat_docs, ten_nested_docs): +# # simple +# store = ElasticDocIndex[SimpleDoc]() +# store.index(ten_simple_docs) + +# assert store.num_docs() == 10 +# for d in ten_simple_docs: +# id_ = d.id +# assert store[id_].id == id_ +# assert np.all(store[id_].tens == d.tens) + +# # flat +# store = ElasticDocIndex[FlatDoc]() +# store.index(ten_flat_docs) + +# assert store.num_docs() == 10 +# for d in ten_flat_docs: +# id_ = d.id +# assert store[id_].id == id_ +# assert np.all(store[id_].tens_one == d.tens_one) +# assert np.all(store[id_].tens_two == d.tens_two) + +# # nested +# store = ElasticDocIndex[NestedDoc]() +# store.index(ten_nested_docs) + +# assert store.num_docs() == 10 +# for d in ten_nested_docs: +# id_ = d.id +# assert store[id_].id == id_ +# assert store[id_].d.id == d.d.id +# assert np.all(store[id_].d.tens == d.d.tens) + + +# def test_get_multiple(ten_simple_docs, ten_flat_docs, ten_nested_docs): +# docs_to_get_idx = [0, 2, 4, 6, 8] + +# # simple +# store = ElasticDocIndex[SimpleDoc]() +# store.index(ten_simple_docs) + +# assert store.num_docs() == 10 +# docs_to_get = [ten_simple_docs[i] for i in docs_to_get_idx] +# ids_to_get = [d.id for d in docs_to_get] +# retrieved_docs = store[ids_to_get] +# for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): +# assert d_out.id == id_ +# assert np.all(d_out.tens == d_in.tens) + +# # flat +# store = ElasticDocIndex[FlatDoc]() +# store.index(ten_flat_docs) + +# assert store.num_docs() == 10 +# docs_to_get = [ten_flat_docs[i] for i in docs_to_get_idx] +# ids_to_get = [d.id for d in docs_to_get] +# retrieved_docs = store[ids_to_get] +# for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): +# assert d_out.id == id_ +# assert np.all(d_out.tens_one == d_in.tens_one) +# assert np.all(d_out.tens_two == d_in.tens_two) + +# # nested +# store = ElasticDocIndex[NestedDoc]() +# store.index(ten_nested_docs) + +# assert store.num_docs() == 10 +# docs_to_get = [ten_nested_docs[i] for i in docs_to_get_idx] +# ids_to_get = [d.id for d in docs_to_get] +# retrieved_docs = store[ids_to_get] +# for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): +# assert d_out.id == id_ +# assert d_out.d.id == d_in.d.id +# assert np.all(d_out.d.tens == d_in.d.tens) + + +# def test_get_key_error(ten_simple_docs): +# store = ElasticDocIndex[SimpleDoc]() +# store.index(ten_simple_docs) + +# with pytest.raises(KeyError): +# store['not_a_real_id'] + + +# def test_del_single(ten_simple_docs): +# store = ElasticDocIndex[SimpleDoc]() +# store.index(ten_simple_docs) +# # delete once +# assert store.num_docs() == 10 +# del store[ten_simple_docs[0].id] +# assert store.num_docs() == 9 +# for i, d in enumerate(ten_simple_docs): +# id_ = d.id +# if i == 0: # deleted +# with pytest.raises(KeyError): +# store[id_] +# else: +# assert store[id_].id == id_ +# assert np.all(store[id_].tens == d.tens) +# # delete again +# del store[ten_simple_docs[3].id] +# assert store.num_docs() == 8 +# for i, d in enumerate(ten_simple_docs): +# id_ = d.id +# if i in (0, 3): # deleted +# with pytest.raises(KeyError): +# store[id_] +# else: +# assert store[id_].id == id_ +# assert np.all(store[id_].tens == d.tens) + + +# def test_del_multiple(ten_simple_docs): +# docs_to_del_idx = [0, 2, 4, 6, 8] + +# store = ElasticDocIndex[SimpleDoc]() +# store.index(ten_simple_docs) + +# assert store.num_docs() == 10 +# docs_to_del = [ten_simple_docs[i] for i in docs_to_del_idx] +# ids_to_del = [d.id for d in docs_to_del] +# del store[ids_to_del] +# for i, doc in enumerate(ten_simple_docs): +# if i in docs_to_del_idx: +# with pytest.raises(KeyError): +# store[doc.id] +# else: +# assert store[doc.id].id == doc.id +# assert np.all(store[doc.id].tens == doc.tens) + + +# def test_del_key_error(ten_simple_docs): +# store = ElasticDocIndex[SimpleDoc]() +# store.index(ten_simple_docs) + +# with pytest.warns(UserWarning): +# del store['not_a_real_id'] + + +# def test_num_docs(ten_simple_docs): +# store = ElasticDocIndex[SimpleDoc]() +# store.index(ten_simple_docs) + +# assert store.num_docs() == 10 + +# del store[ten_simple_docs[0].id] +# assert store.num_docs() == 9 + +# del store[ten_simple_docs[3].id, ten_simple_docs[5].id] +# assert store.num_docs() == 7 + +# more_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(5)] +# store.index(more_docs) +# assert store.num_docs() == 12 + +# del store[more_docs[2].id, ten_simple_docs[7].id] +# assert store.num_docs() == 10 From da5380554087edf5f90b3e18f2c76b3115456a98 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 31 Mar 2023 11:17:07 +0800 Subject: [PATCH 04/14] feat: add elasticdoc v8 Signed-off-by: AnneY --- docarray/index/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docarray/index/__init__.py b/docarray/index/__init__.py index aefe7fa6bfe..df0d133d29c 100644 --- a/docarray/index/__init__.py +++ b/docarray/index/__init__.py @@ -20,11 +20,11 @@ def __getattr__(name: str): import_library('hnswlib', raise_error=True) import docarray.index.backends.hnswlib as lib elif name == 'ElasticDocIndex': - import_library('elasticsearch==8.6.2', raise_error=True) - import docarray.index.backends.elasticv7 as lib - elif name == 'ElasticV7DocIndex': - import_library('elasticsearch==7.10.1', raise_error=True) + import_library('elasticsearch', raise_error=True) import docarray.index.backends.elastic as lib + elif name == 'ElasticV7DocIndex': + import_library('elasticsearch', raise_error=True) + import docarray.index.backends.elasticv7 as lib else: raise ImportError( f'cannot import name \'{name}\' from \'{_get_path_from_docarray_root_level(__file__)}\'' From c0a3c5d6ee165cb7e965411cf070cfdc61447787 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 31 Mar 2023 14:35:58 +0800 Subject: [PATCH 05/14] fix: update poetry Signed-off-by: AnneY --- poetry.lock | 25 ++++++++++++++++++++++--- pyproject.toml | 1 + 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index a9bc680af7f..dbbe7aeae8f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -803,6 +803,25 @@ six = ">=1.9.0" gmpy = ["gmpy"] gmpy2 = ["gmpy2"] +[[package]] +name = "elastic-transport" +version = "8.4.0" +description = "Transport classes and utilities shared among Python Elastic client libraries" +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, + {file = "elastic_transport-8.4.0-py3-none-any.whl", hash = "sha256:19db271ab79c9f70f8c43f8f5b5111408781a6176b54ab2e54d713b6d9ceb815"}, +] + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.26.2,<2" + +[package.extras] +develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "trustme"] + [[package]] name = "elasticsearch" version = "7.10.1" @@ -4590,9 +4609,9 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [extras] audio = ["pydub"] aws = ["smart-open"] -common = ["protobuf", "lz4"] +common = ["lz4", "protobuf"] elasticsearch = ["elasticsearch"] -full = ["protobuf", "lz4", "pandas", "pillow", "types-pillow", "av", "pydub", "trimesh"] +full = ["av", "lz4", "pandas", "pillow", "protobuf", "pydub", "trimesh", "types-pillow"] hnswlib = ["hnswlib"] image = ["pillow", "types-pillow"] jac = ["jina-hubble-sdk"] @@ -4605,4 +4624,4 @@ web = ["fastapi"] [metadata] lock-version = "2.0" python-versions = ">=3.7,<4.0" -content-hash = "821f6cd00f78c456f6146f39c14f0704e4f2d113c35db00c58462d8cfbe3a538" +content-hash = "49f70eda2036ec961a1ed06e9364c56710c91f152d030ddf566519b443b52f93" diff --git a/pyproject.toml b/pyproject.toml index 3114ff8dc61..229151108d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ pandas = {version = ">=1.1.0", optional = true } elasticsearch = {version = "7.10.1", optional = true } smart-open = {version = ">=6.3.0", extras = ["s3"], optional = true} jina-hubble-sdk = {version = ">=0.34.0", optional = true} +elastic-transport = "^8.4.0" [tool.poetry.extras] common = ["protobuf", "lz4"] From 82a76815b2da6f64fadc5013fdcd8c115efa9e15 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 31 Mar 2023 20:34:15 +0800 Subject: [PATCH 06/14] refactor: adjust folder structure Signed-off-by: AnneY --- tests/{index => integrations/doc_index}/hnswlib/__init__.py | 0 tests/{index => integrations/doc_index}/hnswlib/test_find.py | 0 .../doc_index}/hnswlib/test_index_get_del.py | 0 .../doc_index}/hnswlib/test_persist_data.py | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename tests/{index => integrations/doc_index}/hnswlib/__init__.py (100%) rename tests/{index => integrations/doc_index}/hnswlib/test_find.py (100%) rename tests/{index => integrations/doc_index}/hnswlib/test_index_get_del.py (100%) rename tests/{index => integrations/doc_index}/hnswlib/test_persist_data.py (100%) diff --git a/tests/index/hnswlib/__init__.py b/tests/integrations/doc_index/hnswlib/__init__.py similarity index 100% rename from tests/index/hnswlib/__init__.py rename to tests/integrations/doc_index/hnswlib/__init__.py diff --git a/tests/index/hnswlib/test_find.py b/tests/integrations/doc_index/hnswlib/test_find.py similarity index 100% rename from tests/index/hnswlib/test_find.py rename to tests/integrations/doc_index/hnswlib/test_find.py diff --git a/tests/index/hnswlib/test_index_get_del.py b/tests/integrations/doc_index/hnswlib/test_index_get_del.py similarity index 100% rename from tests/index/hnswlib/test_index_get_del.py rename to tests/integrations/doc_index/hnswlib/test_index_get_del.py diff --git a/tests/index/hnswlib/test_persist_data.py b/tests/integrations/doc_index/hnswlib/test_persist_data.py similarity index 100% rename from tests/index/hnswlib/test_persist_data.py rename to tests/integrations/doc_index/hnswlib/test_persist_data.py From f757a2791e8e9014890cc96fa0efd8ed31970383 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 31 Mar 2023 20:44:51 +0800 Subject: [PATCH 07/14] test: elastic v8 tests Signed-off-by: AnneY --- docarray/index/backends/elastic.py | 3 - tests/index/elastic/v8/test_find.py | 269 -------------- tests/index/elastic/v8/test_index_get_del.py | 232 ------------ .../integrations/doc_index/elastic/fixture.py | 62 +++- .../elastic/v7/test_index_get_del.py | 58 +-- .../doc_index}/elastic/v8/docker-compose.yml | 0 .../elastic/v8/test_column_config.py | 131 +++++++ .../doc_index/elastic/v8/test_find.py | 329 ++++++++++++++++++ .../elastic/v8/test_index_get_del.py | 270 ++++++++++++++ 9 files changed, 792 insertions(+), 562 deletions(-) delete mode 100644 tests/index/elastic/v8/test_find.py delete mode 100644 tests/index/elastic/v8/test_index_get_del.py rename tests/{index => integrations/doc_index}/elastic/v8/docker-compose.yml (100%) create mode 100644 tests/integrations/doc_index/elastic/v8/test_column_config.py create mode 100644 tests/integrations/doc_index/elastic/v8/test_find.py create mode 100644 tests/integrations/doc_index/elastic/v8/test_index_get_del.py diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index 462b3a56591..c003e9d4095 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -59,9 +59,6 @@ ELASTIC_PY_VEC_TYPES.append(TensorFlowTensor) -# toml -# elastic-transport = "^8.4.0" -# elasticsearch = "^8.6.2" class ElasticDocIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, db_config=None, **kwargs): super().__init__(db_config=db_config, **kwargs) diff --git a/tests/index/elastic/v8/test_find.py b/tests/index/elastic/v8/test_find.py deleted file mode 100644 index d61ae643ae0..00000000000 --- a/tests/index/elastic/v8/test_find.py +++ /dev/null @@ -1,269 +0,0 @@ -# import numpy as np -# import pytest -# from pydantic import Field - -# from docarray import BaseDoc -# from docarray.index import ElasticDocIndex -# from docarray.typing import NdArray -# from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 -# from tests.index.elastic.fixture import FlatDoc, SimpleDoc - - -# @pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) -# def test_find_simple_schema(similarity): -# class SimpleSchema(BaseDoc): -# tens: NdArray[10] = Field(similarity=similarity) - -# store = ElasticDocIndex[SimpleSchema]() - -# index_docs = [] -# for _ in range(10): -# vec = np.random.rand(10) -# if similarity == 'dot_product': -# vec = vec / np.linalg.norm(vec) -# index_docs.append(SimpleDoc(tens=vec)) -# store.index(index_docs) - -# query = index_docs[-1] -# docs, scores = store.find(query, search_field='tens', limit=5) - -# assert len(docs) == 5 -# assert len(scores) == 5 -# assert docs[0].id == index_docs[-1].id -# assert np.allclose(docs[0].tens, index_docs[-1].tens) - - -# @pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) -# def test_find_flat_schema(similarity): -# class FlatSchema(BaseDoc): -# tens_one: NdArray = Field(dims=10, similarity=similarity) -# tens_two: NdArray = Field(dims=50, similarity=similarity) - -# store = ElasticDocIndex[FlatSchema]() - -# index_docs = [] -# for _ in range(10): -# vec_one = np.random.rand(10) -# vec_two = np.random.rand(50) -# if similarity == 'dot_product': -# vec_one = vec_one / np.linalg.norm(vec_one) -# vec_two = vec_two / np.linalg.norm(vec_two) -# index_docs.append(FlatDoc(tens_one=vec_one, tens_two=vec_two)) - -# store.index(index_docs) - -# query = index_docs[-1] - -# # find on tens_one -# docs, scores = store.find(query, search_field='tens_one', limit=5) -# assert len(docs) == 5 -# assert len(scores) == 5 -# assert docs[0].id == index_docs[-1].id -# assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) -# assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) - -# # find on tens_two -# docs, scores = store.find(query, search_field='tens_two', limit=5) -# assert len(docs) == 5 -# assert len(scores) == 5 -# assert docs[0].id == index_docs[-1].id -# assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) -# assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) - - -# @pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) -# def test_find_nested_schema(similarity): -# class SimpleDoc(BaseDoc): -# tens: NdArray[10] = Field(similarity=similarity) - -# class NestedDoc(BaseDoc): -# d: SimpleDoc -# tens: NdArray[10] = Field(similarity=similarity) - -# class DeepNestedDoc(BaseDoc): -# d: NestedDoc -# tens: NdArray = Field(similarity=similarity, dims=10) - -# store = ElasticDocIndex[DeepNestedDoc]() - -# index_docs = [] -# for _ in range(10): -# vec_simple = np.random.rand(10) -# vec_nested = np.random.rand(10) -# vec_deep = np.random.rand(10) -# if similarity == 'dot_product': -# vec_simple = vec_simple / np.linalg.norm(vec_simple) -# vec_nested = vec_nested / np.linalg.norm(vec_nested) -# vec_deep = vec_deep / np.linalg.norm(vec_deep) -# index_docs.append( -# DeepNestedDoc( -# d=NestedDoc(d=SimpleDoc(tens=vec_simple), tens=vec_nested), -# tens=vec_deep, -# ) -# ) - -# store.index(index_docs) - -# query = index_docs[-1] - -# # find on root level -# docs, scores = store.find(query, search_field='tens', limit=5) -# assert len(docs) == 5 -# assert len(scores) == 5 -# assert docs[0].id == index_docs[-1].id -# assert np.allclose(docs[0].tens, index_docs[-1].tens) - -# # find on first nesting level -# docs, scores = store.find(query, search_field='d__tens', limit=5) -# assert len(docs) == 5 -# assert len(scores) == 5 -# assert docs[0].id == index_docs[-1].id -# assert np.allclose(docs[0].d.tens, index_docs[-1].d.tens) - -# # find on second nesting level -# docs, scores = store.find(query, search_field='d__d__tens', limit=5) -# assert len(docs) == 5 -# assert len(scores) == 5 -# assert docs[0].id == index_docs[-1].id -# assert np.allclose(docs[0].d.d.tens, index_docs[-1].d.d.tens) - - -# def test_find_batched(): -# store = ElasticDocIndex[SimpleDoc]() - -# index_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(10)] -# store.index(index_docs) - -# queries = index_docs[-2:] -# docs_batched, scores_batched = store.find_batched( -# queries, search_field='tens', limit=5 -# ) - -# for docs, scores, query in zip(docs_batched, scores_batched, queries): -# assert len(docs) == 5 -# assert len(scores) == 5 -# assert docs[0].id == query.id -# assert np.allclose(docs[0].tens, query.tens) - - -# def test_filter(): -# class MyDoc(BaseDoc): -# A: bool -# B: int -# C: float - -# store = ElasticDocIndex[MyDoc]() - -# index_docs = [MyDoc(id=f'{i}', A=(i % 2 == 0), B=i, C=i + 0.5) for i in range(10)] -# store.index(index_docs) - -# filter_query = {'term': {'A': True}} -# docs = store.filter(filter_query) -# assert len(docs) > 0 -# for doc in docs: -# assert doc.A - -# filter_query = { -# "bool": { -# "filter": [ -# {"terms": {"B": [3, 4, 7, 8]}}, -# {"range": {"C": {"gte": 3, "lte": 5}}}, -# ] -# } -# } -# docs = store.filter(filter_query) -# assert [doc.id for doc in docs] == ['3', '4'] - - -# def test_text_search(): -# class MyDoc(BaseDoc): -# text: str - -# store = ElasticDocIndex[MyDoc]() -# index_docs = [ -# MyDoc(text='hello world'), -# MyDoc(text='never gonna give you up'), -# MyDoc(text='we are the world'), -# ] -# store.index(index_docs) - -# query = 'world' -# docs, scores = store.text_search(query, search_field='text') - -# assert len(docs) == 2 -# assert len(scores) == 2 -# assert docs[0].text.index(query) >= 0 -# assert docs[1].text.index(query) >= 0 - -# queries = ['world', 'never'] -# docs, scores = store.text_search_batched(queries, search_field='text') -# for query, da, score in zip(queries, docs, scores): -# assert len(da) > 0 -# assert len(score) > 0 -# for doc in da: -# assert doc.text.index(query) >= 0 - - -# def test_query_builder(): -# class MyDoc(BaseDoc): -# tens: NdArray[10] = Field(similarity='l2_norm') -# num: int -# text: str - -# store = ElasticDocIndex[MyDoc]() -# index_docs = [ -# MyDoc(id=f'{i}', tens=np.ones(10) * i, num=int(i / 2), text=f'text {int(i/2)}') -# for i in range(10) -# ] -# store.index(index_docs) - -# # build_query -# q = store.build_query() -# assert isinstance(q, store.QueryBuilder) - -# # filter -# q = store.build_query().filter({'term': {'num': 0}}).build() -# docs, _ = store.execute_query(q) -# assert [doc['id'] for doc in docs] == ['0', '1'] - -# # find -# q = store.build_query().find(index_docs[-1], search_field='tens', limit=3).build() -# docs, _ = store.execute_query(q) -# assert [doc['id'] for doc in docs] == ['9', '8', '7'] - -# # text_search -# q = store.build_query().text_search('0', search_field='text').build() -# docs, _ = store.execute_query(q) -# assert [doc['id'] for doc in docs] == ['0', '1'] - -# # combination -# q = ( -# store.build_query() -# .filter({'range': {'num': {'lte': 3}}}) -# .find(index_docs[-1], search_field='tens') -# .text_search('0', search_field='text') -# .build() -# ) -# docs, _ = store.execute_query(q) -# assert [doc['id'] for doc in docs] == ['1', '0'] - -# # direct -# query = { -# 'knn': { -# 'field': 'tens', -# 'query_vector': [9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0], -# 'k': 10, -# 'num_candidates': 10000, -# 'filter': { -# 'bool': { -# 'filter': [ -# {'range': {'num': {'gte': 2}}}, -# {'range': {'num': {'lte': 3}}}, -# ] -# } -# }, -# }, -# } - -# docs, _ = store.execute_query(query) -# assert [doc['id'] for doc in docs] == ['7', '6', '5', '4'] diff --git a/tests/index/elastic/v8/test_index_get_del.py b/tests/index/elastic/v8/test_index_get_del.py deleted file mode 100644 index b3e8b27c162..00000000000 --- a/tests/index/elastic/v8/test_index_get_del.py +++ /dev/null @@ -1,232 +0,0 @@ -# import numpy as np -# import pytest - -# from docarray import DocArray -# from docarray.index import ElasticDocIndex -# from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 -# from tests.index.elastic.fixture import DeepNestedDoc, FlatDoc, NestedDoc, SimpleDoc - - -# @pytest.fixture -# def ten_simple_docs(): -# return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] - - -# @pytest.fixture -# def ten_flat_docs(): -# return [ -# FlatDoc(tens_one=np.random.randn(10), tens_two=np.random.randn(50)) -# for _ in range(10) -# ] - - -# @pytest.fixture -# def ten_nested_docs(): -# return [NestedDoc(d=SimpleDoc(tens=np.random.randn(10))) for _ in range(10)] - - -# @pytest.fixture -# def ten_deep_nested_docs(): -# return [ -# DeepNestedDoc(d=NestedDoc(d=SimpleDoc(tens=np.random.randn(10)))) -# for _ in range(10) -# ] - - -# @pytest.mark.parametrize('use_docarray', [True, False]) -# def test_index_simple_schema(ten_simple_docs, use_docarray): -# store = ElasticDocIndex[SimpleDoc]() -# if use_docarray: -# ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) - -# store.index(ten_simple_docs) -# assert store.num_docs() == 10 - - -# @pytest.mark.parametrize('use_docarray', [True, False]) -# def test_index_flat_schema(ten_flat_docs, use_docarray): -# store = ElasticDocIndex[FlatDoc]() -# if use_docarray: -# ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) - -# store.index(ten_flat_docs) -# assert store.num_docs() == 10 - - -# @pytest.mark.parametrize('use_docarray', [True, False]) -# def test_index_nested_schema(ten_nested_docs, use_docarray): -# store = ElasticDocIndex[NestedDoc]() -# if use_docarray: -# ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) - -# store.index(ten_nested_docs) -# assert store.num_docs() == 10 - - -# @pytest.mark.parametrize('use_docarray', [True, False]) -# def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): -# store = ElasticDocIndex[DeepNestedDoc]() -# if use_docarray: -# ten_deep_nested_docs = DocArray[DeepNestedDoc](ten_deep_nested_docs) - -# store.index(ten_deep_nested_docs) -# assert store.num_docs() == 10 - - -# def test_get_single(ten_simple_docs, ten_flat_docs, ten_nested_docs): -# # simple -# store = ElasticDocIndex[SimpleDoc]() -# store.index(ten_simple_docs) - -# assert store.num_docs() == 10 -# for d in ten_simple_docs: -# id_ = d.id -# assert store[id_].id == id_ -# assert np.all(store[id_].tens == d.tens) - -# # flat -# store = ElasticDocIndex[FlatDoc]() -# store.index(ten_flat_docs) - -# assert store.num_docs() == 10 -# for d in ten_flat_docs: -# id_ = d.id -# assert store[id_].id == id_ -# assert np.all(store[id_].tens_one == d.tens_one) -# assert np.all(store[id_].tens_two == d.tens_two) - -# # nested -# store = ElasticDocIndex[NestedDoc]() -# store.index(ten_nested_docs) - -# assert store.num_docs() == 10 -# for d in ten_nested_docs: -# id_ = d.id -# assert store[id_].id == id_ -# assert store[id_].d.id == d.d.id -# assert np.all(store[id_].d.tens == d.d.tens) - - -# def test_get_multiple(ten_simple_docs, ten_flat_docs, ten_nested_docs): -# docs_to_get_idx = [0, 2, 4, 6, 8] - -# # simple -# store = ElasticDocIndex[SimpleDoc]() -# store.index(ten_simple_docs) - -# assert store.num_docs() == 10 -# docs_to_get = [ten_simple_docs[i] for i in docs_to_get_idx] -# ids_to_get = [d.id for d in docs_to_get] -# retrieved_docs = store[ids_to_get] -# for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): -# assert d_out.id == id_ -# assert np.all(d_out.tens == d_in.tens) - -# # flat -# store = ElasticDocIndex[FlatDoc]() -# store.index(ten_flat_docs) - -# assert store.num_docs() == 10 -# docs_to_get = [ten_flat_docs[i] for i in docs_to_get_idx] -# ids_to_get = [d.id for d in docs_to_get] -# retrieved_docs = store[ids_to_get] -# for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): -# assert d_out.id == id_ -# assert np.all(d_out.tens_one == d_in.tens_one) -# assert np.all(d_out.tens_two == d_in.tens_two) - -# # nested -# store = ElasticDocIndex[NestedDoc]() -# store.index(ten_nested_docs) - -# assert store.num_docs() == 10 -# docs_to_get = [ten_nested_docs[i] for i in docs_to_get_idx] -# ids_to_get = [d.id for d in docs_to_get] -# retrieved_docs = store[ids_to_get] -# for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): -# assert d_out.id == id_ -# assert d_out.d.id == d_in.d.id -# assert np.all(d_out.d.tens == d_in.d.tens) - - -# def test_get_key_error(ten_simple_docs): -# store = ElasticDocIndex[SimpleDoc]() -# store.index(ten_simple_docs) - -# with pytest.raises(KeyError): -# store['not_a_real_id'] - - -# def test_del_single(ten_simple_docs): -# store = ElasticDocIndex[SimpleDoc]() -# store.index(ten_simple_docs) -# # delete once -# assert store.num_docs() == 10 -# del store[ten_simple_docs[0].id] -# assert store.num_docs() == 9 -# for i, d in enumerate(ten_simple_docs): -# id_ = d.id -# if i == 0: # deleted -# with pytest.raises(KeyError): -# store[id_] -# else: -# assert store[id_].id == id_ -# assert np.all(store[id_].tens == d.tens) -# # delete again -# del store[ten_simple_docs[3].id] -# assert store.num_docs() == 8 -# for i, d in enumerate(ten_simple_docs): -# id_ = d.id -# if i in (0, 3): # deleted -# with pytest.raises(KeyError): -# store[id_] -# else: -# assert store[id_].id == id_ -# assert np.all(store[id_].tens == d.tens) - - -# def test_del_multiple(ten_simple_docs): -# docs_to_del_idx = [0, 2, 4, 6, 8] - -# store = ElasticDocIndex[SimpleDoc]() -# store.index(ten_simple_docs) - -# assert store.num_docs() == 10 -# docs_to_del = [ten_simple_docs[i] for i in docs_to_del_idx] -# ids_to_del = [d.id for d in docs_to_del] -# del store[ids_to_del] -# for i, doc in enumerate(ten_simple_docs): -# if i in docs_to_del_idx: -# with pytest.raises(KeyError): -# store[doc.id] -# else: -# assert store[doc.id].id == doc.id -# assert np.all(store[doc.id].tens == doc.tens) - - -# def test_del_key_error(ten_simple_docs): -# store = ElasticDocIndex[SimpleDoc]() -# store.index(ten_simple_docs) - -# with pytest.warns(UserWarning): -# del store['not_a_real_id'] - - -# def test_num_docs(ten_simple_docs): -# store = ElasticDocIndex[SimpleDoc]() -# store.index(ten_simple_docs) - -# assert store.num_docs() == 10 - -# del store[ten_simple_docs[0].id] -# assert store.num_docs() == 9 - -# del store[ten_simple_docs[3].id, ten_simple_docs[5].id] -# assert store.num_docs() == 7 - -# more_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(5)] -# store.index(more_docs) -# assert store.num_docs() == 12 - -# del store[more_docs[2].id, ten_simple_docs[7].id] -# assert store.num_docs() == 10 diff --git a/tests/integrations/doc_index/elastic/fixture.py b/tests/integrations/doc_index/elastic/fixture.py index 1caa31da2a6..4f047b63587 100644 --- a/tests/integrations/doc_index/elastic/fixture.py +++ b/tests/integrations/doc_index/elastic/fixture.py @@ -1,6 +1,7 @@ import os import time +import numpy as np import pytest from pydantic import Field @@ -9,24 +10,6 @@ pytestmark = [pytest.mark.slow, pytest.mark.doc_index] - -class SimpleDoc(BaseDoc): - tens: NdArray[10] = Field(dims=1000) - - -class FlatDoc(BaseDoc): - tens_one: NdArray = Field(dims=10) - tens_two: NdArray = Field(dims=50) - - -class NestedDoc(BaseDoc): - d: SimpleDoc - - -class DeepNestedDoc(BaseDoc): - d: NestedDoc - - cur_dir = os.path.dirname(os.path.abspath(__file__)) compose_yml_v7 = os.path.abspath(os.path.join(cur_dir, 'v7/docker-compose.yml')) compose_yml_v8 = os.path.abspath(os.path.join(cur_dir, 'v8/docker-compose.yml')) @@ -56,3 +39,46 @@ def _wait_for_es(): es = Elasticsearch(hosts='http://localhost:9200/') while not es.ping(): time.sleep(0.5) + + +class SimpleDoc(BaseDoc): + tens: NdArray[10] = Field(dims=1000) + + +class FlatDoc(BaseDoc): + tens_one: NdArray = Field(dims=10) + tens_two: NdArray = Field(dims=50) + + +class NestedDoc(BaseDoc): + d: SimpleDoc + + +class DeepNestedDoc(BaseDoc): + d: NestedDoc + + +@pytest.fixture(scope='function') +def ten_simple_docs(): + return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] + + +@pytest.fixture(scope='function') +def ten_flat_docs(): + return [ + FlatDoc(tens_one=np.random.randn(10), tens_two=np.random.randn(50)) + for _ in range(10) + ] + + +@pytest.fixture(scope='function') +def ten_nested_docs(): + return [NestedDoc(d=SimpleDoc(tens=np.random.randn(10))) for _ in range(10)] + + +@pytest.fixture(scope='function') +def ten_deep_nested_docs(): + return [ + DeepNestedDoc(d=NestedDoc(d=SimpleDoc(tens=np.random.randn(10)))) + for _ in range(10) + ] diff --git a/tests/integrations/doc_index/elastic/v7/test_index_get_del.py b/tests/integrations/doc_index/elastic/v7/test_index_get_del.py index d5394a7925b..5c0655b8538 100644 --- a/tests/integrations/doc_index/elastic/v7/test_index_get_del.py +++ b/tests/integrations/doc_index/elastic/v7/test_index_get_del.py @@ -7,45 +7,23 @@ from docarray.documents import ImageDoc, TextDoc from docarray.index import ElasticV7DocIndex from docarray.typing import NdArray -from tests.integrations.doc_index.elastic.fixture import start_storage_v7 # noqa: F401 -from tests.integrations.doc_index.elastic.fixture import ( +from tests.integrations.doc_index.elastic.fixture import ( # noqa: F401 DeepNestedDoc, FlatDoc, NestedDoc, SimpleDoc, + start_storage_v7, + ten_deep_nested_docs, + ten_flat_docs, + ten_nested_docs, + ten_simple_docs, ) pytestmark = [pytest.mark.slow, pytest.mark.index] -@pytest.fixture -def ten_simple_docs(): - return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] - - -@pytest.fixture -def ten_flat_docs(): - return [ - FlatDoc(tens_one=np.random.randn(10), tens_two=np.random.randn(50)) - for _ in range(10) - ] - - -@pytest.fixture -def ten_nested_docs(): - return [NestedDoc(d=SimpleDoc(tens=np.random.randn(10))) for _ in range(10)] - - -@pytest.fixture -def ten_deep_nested_docs(): - return [ - DeepNestedDoc(d=NestedDoc(d=SimpleDoc(tens=np.random.randn(10)))) - for _ in range(10) - ] - - @pytest.mark.parametrize('use_docarray', [True, False]) -def test_index_simple_schema(ten_simple_docs, use_docarray): +def test_index_simple_schema(ten_simple_docs, use_docarray): # noqa: F811 store = ElasticV7DocIndex[SimpleDoc]() if use_docarray: ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) @@ -55,7 +33,7 @@ def test_index_simple_schema(ten_simple_docs, use_docarray): @pytest.mark.parametrize('use_docarray', [True, False]) -def test_index_flat_schema(ten_flat_docs, use_docarray): +def test_index_flat_schema(ten_flat_docs, use_docarray): # noqa: F811 store = ElasticV7DocIndex[FlatDoc]() if use_docarray: ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) @@ -65,7 +43,7 @@ def test_index_flat_schema(ten_flat_docs, use_docarray): @pytest.mark.parametrize('use_docarray', [True, False]) -def test_index_nested_schema(ten_nested_docs, use_docarray): +def test_index_nested_schema(ten_nested_docs, use_docarray): # noqa: F811 store = ElasticV7DocIndex[NestedDoc]() if use_docarray: ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) @@ -75,7 +53,7 @@ def test_index_nested_schema(ten_nested_docs, use_docarray): @pytest.mark.parametrize('use_docarray', [True, False]) -def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): +def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): # noqa: F811 store = ElasticV7DocIndex[DeepNestedDoc]() if use_docarray: ten_deep_nested_docs = DocArray[DeepNestedDoc](ten_deep_nested_docs) @@ -84,7 +62,7 @@ def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): assert store.num_docs() == 10 -def test_get_single(ten_simple_docs, ten_flat_docs, ten_nested_docs): +def test_get_single(ten_simple_docs, ten_flat_docs, ten_nested_docs): # noqa: F811 # simple store = ElasticV7DocIndex[SimpleDoc]() store.index(ten_simple_docs) @@ -118,7 +96,7 @@ def test_get_single(ten_simple_docs, ten_flat_docs, ten_nested_docs): assert np.all(store[id_].d.tens == d.d.tens) -def test_get_multiple(ten_simple_docs, ten_flat_docs, ten_nested_docs): +def test_get_multiple(ten_simple_docs, ten_flat_docs, ten_nested_docs): # noqa: F811 docs_to_get_idx = [0, 2, 4, 6, 8] # simple @@ -160,7 +138,7 @@ def test_get_multiple(ten_simple_docs, ten_flat_docs, ten_nested_docs): assert np.all(d_out.d.tens == d_in.d.tens) -def test_get_key_error(ten_simple_docs): +def test_get_key_error(ten_simple_docs): # noqa: F811 store = ElasticV7DocIndex[SimpleDoc]() store.index(ten_simple_docs) @@ -168,7 +146,7 @@ def test_get_key_error(ten_simple_docs): store['not_a_real_id'] -def test_persisting(ten_simple_docs): +def test_persisting(ten_simple_docs): # noqa: F811 store = ElasticV7DocIndex[SimpleDoc](index_name='test_persisting') store.index(ten_simple_docs) @@ -176,7 +154,7 @@ def test_persisting(ten_simple_docs): assert store2.num_docs() == 10 -def test_del_single(ten_simple_docs): +def test_del_single(ten_simple_docs): # noqa: F811 store = ElasticV7DocIndex[SimpleDoc]() store.index(ten_simple_docs) # delete once @@ -204,7 +182,7 @@ def test_del_single(ten_simple_docs): assert np.all(store[id_].tens == d.tens) -def test_del_multiple(ten_simple_docs): +def test_del_multiple(ten_simple_docs): # noqa: F811 docs_to_del_idx = [0, 2, 4, 6, 8] store = ElasticV7DocIndex[SimpleDoc]() @@ -223,7 +201,7 @@ def test_del_multiple(ten_simple_docs): assert np.all(store[doc.id].tens == doc.tens) -def test_del_key_error(ten_simple_docs): +def test_del_key_error(ten_simple_docs): # noqa: F811 store = ElasticV7DocIndex[SimpleDoc]() store.index(ten_simple_docs) @@ -231,7 +209,7 @@ def test_del_key_error(ten_simple_docs): del store['not_a_real_id'] -def test_num_docs(ten_simple_docs): +def test_num_docs(ten_simple_docs): # noqa: F811 store = ElasticV7DocIndex[SimpleDoc]() store.index(ten_simple_docs) diff --git a/tests/index/elastic/v8/docker-compose.yml b/tests/integrations/doc_index/elastic/v8/docker-compose.yml similarity index 100% rename from tests/index/elastic/v8/docker-compose.yml rename to tests/integrations/doc_index/elastic/v8/docker-compose.yml diff --git a/tests/integrations/doc_index/elastic/v8/test_column_config.py b/tests/integrations/doc_index/elastic/v8/test_column_config.py new file mode 100644 index 00000000000..6e1ad6cf88b --- /dev/null +++ b/tests/integrations/doc_index/elastic/v8/test_column_config.py @@ -0,0 +1,131 @@ +import pytest +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticDocIndex +from tests.integrations.doc_index.elastic.fixture import start_storage_v8 # noqa: F401 + +pytestmark = [pytest.mark.slow, pytest.mark.index, pytest.mark.elasticv8] + + +def test_column_config(): + class MyDoc(BaseDoc): + text: str + color: str = Field(col_type='keyword') + + store = ElasticDocIndex[MyDoc]() + index_docs = [ + MyDoc(id='0', text='hello world', color='red'), + MyDoc(id='1', text='never gonna give you up', color='blue'), + MyDoc(id='2', text='we are the world', color='green'), + ] + store.index(index_docs) + + query = 'world' + docs, _ = store.text_search(query, search_field='text') + assert [doc.id for doc in docs] == ['0', '2'] + + filter_query = {'terms': {'color': ['red', 'blue']}} + docs = store.filter(filter_query) + assert [doc.id for doc in docs] == ['0', '1'] + + +def test_field_object(): + class MyDoc(BaseDoc): + manager: dict = Field( + properties={ + 'age': {'type': 'integer'}, + 'name': { + 'properties': { + 'first': {'type': 'keyword'}, + 'last': {'type': 'keyword'}, + } + }, + } + ) + + store = ElasticDocIndex[MyDoc]() + doc = [ + MyDoc(manager={'age': 25, 'name': {'first': 'Rachel', 'last': 'Green'}}), + MyDoc(manager={'age': 30, 'name': {'first': 'Monica', 'last': 'Geller'}}), + MyDoc(manager={'age': 35, 'name': {'first': 'Phoebe', 'last': 'Buffay'}}), + ] + store.index(doc) + id_ = doc[0].id + assert store[id_].id == id_ + assert store[id_].manager == doc[0].manager + + filter_query = {'range': {'manager.age': {'gte': 30}}} + docs = store.filter(filter_query) + assert [doc.id for doc in docs] == [doc[1].id, doc[2].id] + + +def test_field_geo_point(): + class MyDoc(BaseDoc): + location: dict = Field(col_type='geo_point') + + store = ElasticDocIndex[MyDoc]() + doc = [ + MyDoc(location={'lat': 40.12, 'lon': -72.34}), + MyDoc(location={'lat': 41.12, 'lon': -73.34}), + MyDoc(location={'lat': 42.12, 'lon': -74.34}), + ] + store.index(doc) + + query = { + 'query': { + 'geo_bounding_box': { + 'location': { + 'top_left': {'lat': 42, 'lon': -74}, + 'bottom_right': {'lat': 40, 'lon': -72}, + } + } + }, + } + + docs, _ = store.execute_query(query) + assert [doc['id'] for doc in docs] == [doc[0].id, doc[1].id] + + +def test_field_range(): + class MyDoc(BaseDoc): + expected_attendees: dict = Field(col_type='integer_range') + time_frame: dict = Field(col_type='date_range', format='yyyy-MM-dd') + + store = ElasticDocIndex[MyDoc]() + doc = [ + MyDoc( + expected_attendees={'gte': 10, 'lt': 20}, + time_frame={'gte': '2023-01-01', 'lt': '2023-02-01'}, + ), + MyDoc( + expected_attendees={'gte': 20, 'lt': 30}, + time_frame={'gte': '2023-02-01', 'lt': '2023-03-01'}, + ), + MyDoc( + expected_attendees={'gte': 30, 'lt': 40}, + time_frame={'gte': '2023-03-01', 'lt': '2023-04-01'}, + ), + ] + store.index(doc) + + query = { + 'query': { + 'bool': { + 'should': [ + {'term': {'expected_attendees': {'value': 15}}}, + { + 'range': { + 'time_frame': { + 'gte': '2023-02-05', + 'lt': '2023-02-10', + 'relation': 'contains', + } + } + }, + ] + } + }, + } + docs, _ = store.execute_query(query) + assert [doc['id'] for doc in docs] == [doc[0].id, doc[1].id] diff --git a/tests/integrations/doc_index/elastic/v8/test_find.py b/tests/integrations/doc_index/elastic/v8/test_find.py new file mode 100644 index 00000000000..90292a772bd --- /dev/null +++ b/tests/integrations/doc_index/elastic/v8/test_find.py @@ -0,0 +1,329 @@ +import numpy as np +import pytest +import torch +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticDocIndex +from docarray.typing import NdArray, TorchTensor +from tests.integrations.doc_index.elastic.fixture import start_storage_v8 # noqa: F401 +from tests.integrations.doc_index.elastic.fixture import FlatDoc, SimpleDoc + +pytestmark = [pytest.mark.slow, pytest.mark.index, pytest.mark.elasticv8] + + +@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +def test_find_simple_schema(similarity): + class SimpleSchema(BaseDoc): + tens: NdArray[10] = Field(similarity=similarity) + + store = ElasticDocIndex[SimpleSchema]() + + index_docs = [] + for _ in range(10): + vec = np.random.rand(10) + if similarity == 'dot_product': + vec = vec / np.linalg.norm(vec) + index_docs.append(SimpleDoc(tens=vec)) + store.index(index_docs) + + query = index_docs[-1] + docs, scores = store.find(query, search_field='tens', limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].tens, index_docs[-1].tens) + + +@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +def test_find_flat_schema(similarity): + class FlatSchema(BaseDoc): + tens_one: NdArray = Field(dims=10, similarity=similarity) + tens_two: NdArray = Field(dims=50, similarity=similarity) + + store = ElasticDocIndex[FlatSchema]() + + index_docs = [] + for _ in range(10): + vec_one = np.random.rand(10) + vec_two = np.random.rand(50) + if similarity == 'dot_product': + vec_one = vec_one / np.linalg.norm(vec_one) + vec_two = vec_two / np.linalg.norm(vec_two) + index_docs.append(FlatDoc(tens_one=vec_one, tens_two=vec_two)) + + store.index(index_docs) + + query = index_docs[-1] + + # find on tens_one + docs, scores = store.find(query, search_field='tens_one', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) + assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) + + # find on tens_two + docs, scores = store.find(query, search_field='tens_two', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].tens_one, index_docs[-1].tens_one) + assert np.allclose(docs[0].tens_two, index_docs[-1].tens_two) + + +@pytest.mark.parametrize('similarity', ['cosine', 'l2_norm', 'dot_product']) +def test_find_nested_schema(similarity): + class SimpleDoc(BaseDoc): + tens: NdArray[10] = Field(similarity=similarity) + + class NestedDoc(BaseDoc): + d: SimpleDoc + tens: NdArray[10] = Field(similarity=similarity) + + class DeepNestedDoc(BaseDoc): + d: NestedDoc + tens: NdArray = Field(similarity=similarity, dims=10) + + store = ElasticDocIndex[DeepNestedDoc]() + + index_docs = [] + for _ in range(10): + vec_simple = np.random.rand(10) + vec_nested = np.random.rand(10) + vec_deep = np.random.rand(10) + if similarity == 'dot_product': + vec_simple = vec_simple / np.linalg.norm(vec_simple) + vec_nested = vec_nested / np.linalg.norm(vec_nested) + vec_deep = vec_deep / np.linalg.norm(vec_deep) + index_docs.append( + DeepNestedDoc( + d=NestedDoc(d=SimpleDoc(tens=vec_simple), tens=vec_nested), + tens=vec_deep, + ) + ) + + store.index(index_docs) + + query = index_docs[-1] + + # find on root level + docs, scores = store.find(query, search_field='tens', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].tens, index_docs[-1].tens) + + # find on first nesting level + docs, scores = store.find(query, search_field='d__tens', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].d.tens, index_docs[-1].d.tens) + + # find on second nesting level + docs, scores = store.find(query, search_field='d__d__tens', limit=5) + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == index_docs[-1].id + assert np.allclose(docs[0].d.d.tens, index_docs[-1].d.d.tens) + + +def test_find_torch(): + class TorchDoc(BaseDoc): + tens: TorchTensor[10] + + store = ElasticDocIndex[TorchDoc]() + + # A dense_vector field stores dense vectors of float values. + index_docs = [ + TorchDoc(tens=np.random.rand(10).astype(dtype=np.float32)) for _ in range(10) + ] + store.index(index_docs) + + for doc in index_docs: + assert isinstance(doc.tens, TorchTensor) + + query = index_docs[-1] + docs, scores = store.find(query, search_field='tens', limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + for doc in docs: + assert isinstance(doc.tens, TorchTensor) + + assert docs[0].id == index_docs[-1].id + assert torch.allclose(docs[0].tens, index_docs[-1].tens) + + +def test_find_tensorflow(): + from docarray.typing import TensorFlowTensor + + class TfDoc(BaseDoc): + tens: TensorFlowTensor[10] + + store = ElasticDocIndex[TfDoc]() + + index_docs = [ + TfDoc(tens=np.random.rand(10).astype(dtype=np.float32)) for _ in range(10) + ] + store.index(index_docs) + + for doc in index_docs: + assert isinstance(doc.tens, TensorFlowTensor) + + query = index_docs[-1] + docs, scores = store.find(query, search_field='tens', limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + for doc in docs: + assert isinstance(doc.tens, TensorFlowTensor) + + assert docs[0].id == index_docs[-1].id + assert np.allclose( + docs[0].tens.unwrap().numpy(), index_docs[-1].tens.unwrap().numpy() + ) + + +def test_find_batched(): + store = ElasticDocIndex[SimpleDoc]() + + index_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(10)] + store.index(index_docs) + + queries = index_docs[-2:] + docs_batched, scores_batched = store.find_batched( + queries, search_field='tens', limit=5 + ) + + for docs, scores, query in zip(docs_batched, scores_batched, queries): + assert len(docs) == 5 + assert len(scores) == 5 + assert docs[0].id == query.id + assert np.allclose(docs[0].tens, query.tens) + + +def test_filter(): + class MyDoc(BaseDoc): + A: bool + B: int + C: float + + store = ElasticDocIndex[MyDoc]() + + index_docs = [MyDoc(id=f'{i}', A=(i % 2 == 0), B=i, C=i + 0.5) for i in range(10)] + store.index(index_docs) + + filter_query = {'term': {'A': True}} + docs = store.filter(filter_query) + assert len(docs) > 0 + for doc in docs: + assert doc.A + + filter_query = { + "bool": { + "filter": [ + {"terms": {"B": [3, 4, 7, 8]}}, + {"range": {"C": {"gte": 3, "lte": 5}}}, + ] + } + } + docs = store.filter(filter_query) + assert [doc.id for doc in docs] == ['3', '4'] + + +def test_text_search(): + class MyDoc(BaseDoc): + text: str + + store = ElasticDocIndex[MyDoc]() + index_docs = [ + MyDoc(text='hello world'), + MyDoc(text='never gonna give you up'), + MyDoc(text='we are the world'), + ] + store.index(index_docs) + + query = 'world' + docs, scores = store.text_search(query, search_field='text') + + assert len(docs) == 2 + assert len(scores) == 2 + assert docs[0].text.index(query) >= 0 + assert docs[1].text.index(query) >= 0 + + queries = ['world', 'never'] + docs, scores = store.text_search_batched(queries, search_field='text') + for query, da, score in zip(queries, docs, scores): + assert len(da) > 0 + assert len(score) > 0 + for doc in da: + assert doc.text.index(query) >= 0 + + +def test_query_builder(): + class MyDoc(BaseDoc): + tens: NdArray[10] = Field(similarity='l2_norm') + num: int + text: str + + store = ElasticDocIndex[MyDoc]() + index_docs = [ + MyDoc(id=f'{i}', tens=np.ones(10) * i, num=int(i / 2), text=f'text {int(i/2)}') + for i in range(10) + ] + store.index(index_docs) + + # build_query + q = store.build_query() + assert isinstance(q, store.QueryBuilder) + + # filter + q = store.build_query().filter({'term': {'num': 0}}).build() + docs, _ = store.execute_query(q) + assert [doc['id'] for doc in docs] == ['0', '1'] + + # find + q = store.build_query().find(index_docs[-1], search_field='tens', limit=3).build() + docs, _ = store.execute_query(q) + assert [doc['id'] for doc in docs] == ['9', '8', '7'] + + # text_search + q = store.build_query().text_search('0', search_field='text').build() + docs, _ = store.execute_query(q) + assert [doc['id'] for doc in docs] == ['0', '1'] + + # combination + q = ( + store.build_query() + .filter({'range': {'num': {'lte': 3}}}) + .find(index_docs[-1], search_field='tens') + .text_search('0', search_field='text') + .build() + ) + docs, _ = store.execute_query(q) + assert [doc['id'] for doc in docs] == ['1', '0'] + + # direct + query = { + 'knn': { + 'field': 'tens', + 'query_vector': [9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0], + 'k': 10, + 'num_candidates': 10000, + 'filter': { + 'bool': { + 'filter': [ + {'range': {'num': {'gte': 2}}}, + {'range': {'num': {'lte': 3}}}, + ] + } + }, + }, + } + + docs, _ = store.execute_query(query) + assert [doc['id'] for doc in docs] == ['7', '6', '5', '4'] diff --git a/tests/integrations/doc_index/elastic/v8/test_index_get_del.py b/tests/integrations/doc_index/elastic/v8/test_index_get_del.py new file mode 100644 index 00000000000..e51b683fa16 --- /dev/null +++ b/tests/integrations/doc_index/elastic/v8/test_index_get_del.py @@ -0,0 +1,270 @@ +from typing import Union + +import numpy as np +import pytest + +from docarray import BaseDoc, DocArray +from docarray.documents import ImageDoc, TextDoc +from docarray.index import ElasticDocIndex +from docarray.typing import NdArray +from tests.integrations.doc_index.elastic.fixture import ( # noqa: F401 + DeepNestedDoc, + FlatDoc, + NestedDoc, + SimpleDoc, + start_storage_v8, + ten_deep_nested_docs, + ten_flat_docs, + ten_nested_docs, + ten_simple_docs, +) + +pytestmark = [pytest.mark.slow, pytest.mark.index, pytest.mark.elasticv8] + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_simple_schema(ten_simple_docs, use_docarray): # noqa: F811 + store = ElasticDocIndex[SimpleDoc]() + if use_docarray: + ten_simple_docs = DocArray[SimpleDoc](ten_simple_docs) + + store.index(ten_simple_docs) + assert store.num_docs() == 10 + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_flat_schema(ten_flat_docs, use_docarray): # noqa: F811 + store = ElasticDocIndex[FlatDoc]() + if use_docarray: + ten_flat_docs = DocArray[FlatDoc](ten_flat_docs) + + store.index(ten_flat_docs) + assert store.num_docs() == 10 + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_nested_schema(ten_nested_docs, use_docarray): # noqa: F811 + store = ElasticDocIndex[NestedDoc]() + if use_docarray: + ten_nested_docs = DocArray[NestedDoc](ten_nested_docs) + + store.index(ten_nested_docs) + assert store.num_docs() == 10 + + +@pytest.mark.parametrize('use_docarray', [True, False]) +def test_index_deep_nested_schema(ten_deep_nested_docs, use_docarray): # noqa: F811 + store = ElasticDocIndex[DeepNestedDoc]() + if use_docarray: + ten_deep_nested_docs = DocArray[DeepNestedDoc](ten_deep_nested_docs) + + store.index(ten_deep_nested_docs) + assert store.num_docs() == 10 + + +def test_get_single(ten_simple_docs, ten_flat_docs, ten_nested_docs): # noqa: F811 + # simple + store = ElasticDocIndex[SimpleDoc]() + store.index(ten_simple_docs) + + assert store.num_docs() == 10 + for d in ten_simple_docs: + id_ = d.id + assert store[id_].id == id_ + assert np.all(store[id_].tens == d.tens) + + # flat + store = ElasticDocIndex[FlatDoc]() + store.index(ten_flat_docs) + + assert store.num_docs() == 10 + for d in ten_flat_docs: + id_ = d.id + assert store[id_].id == id_ + assert np.all(store[id_].tens_one == d.tens_one) + assert np.all(store[id_].tens_two == d.tens_two) + + # nested + store = ElasticDocIndex[NestedDoc]() + store.index(ten_nested_docs) + + assert store.num_docs() == 10 + for d in ten_nested_docs: + id_ = d.id + assert store[id_].id == id_ + assert store[id_].d.id == d.d.id + assert np.all(store[id_].d.tens == d.d.tens) + + +def test_get_multiple(ten_simple_docs, ten_flat_docs, ten_nested_docs): # noqa: F811 + docs_to_get_idx = [0, 2, 4, 6, 8] + + # simple + store = ElasticDocIndex[SimpleDoc]() + store.index(ten_simple_docs) + + assert store.num_docs() == 10 + docs_to_get = [ten_simple_docs[i] for i in docs_to_get_idx] + ids_to_get = [d.id for d in docs_to_get] + retrieved_docs = store[ids_to_get] + for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): + assert d_out.id == id_ + assert np.all(d_out.tens == d_in.tens) + + # flat + store = ElasticDocIndex[FlatDoc]() + store.index(ten_flat_docs) + + assert store.num_docs() == 10 + docs_to_get = [ten_flat_docs[i] for i in docs_to_get_idx] + ids_to_get = [d.id for d in docs_to_get] + retrieved_docs = store[ids_to_get] + for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): + assert d_out.id == id_ + assert np.all(d_out.tens_one == d_in.tens_one) + assert np.all(d_out.tens_two == d_in.tens_two) + + # nested + store = ElasticDocIndex[NestedDoc]() + store.index(ten_nested_docs) + + assert store.num_docs() == 10 + docs_to_get = [ten_nested_docs[i] for i in docs_to_get_idx] + ids_to_get = [d.id for d in docs_to_get] + retrieved_docs = store[ids_to_get] + for id_, d_in, d_out in zip(ids_to_get, docs_to_get, retrieved_docs): + assert d_out.id == id_ + assert d_out.d.id == d_in.d.id + assert np.all(d_out.d.tens == d_in.d.tens) + + +def test_get_key_error(ten_simple_docs): # noqa: F811 + store = ElasticDocIndex[SimpleDoc]() + store.index(ten_simple_docs) + + with pytest.raises(KeyError): + store['not_a_real_id'] + + +def test_persisting(ten_simple_docs): # noqa: F811 + store = ElasticDocIndex[SimpleDoc](index_name='test_persisting') + store.index(ten_simple_docs) + + store2 = ElasticDocIndex[SimpleDoc](index_name='test_persisting') + assert store2.num_docs() == 10 + + +def test_del_single(ten_simple_docs): # noqa: F811 + store = ElasticDocIndex[SimpleDoc]() + store.index(ten_simple_docs) + # delete once + assert store.num_docs() == 10 + del store[ten_simple_docs[0].id] + assert store.num_docs() == 9 + for i, d in enumerate(ten_simple_docs): + id_ = d.id + if i == 0: # deleted + with pytest.raises(KeyError): + store[id_] + else: + assert store[id_].id == id_ + assert np.all(store[id_].tens == d.tens) + # delete again + del store[ten_simple_docs[3].id] + assert store.num_docs() == 8 + for i, d in enumerate(ten_simple_docs): + id_ = d.id + if i in (0, 3): # deleted + with pytest.raises(KeyError): + store[id_] + else: + assert store[id_].id == id_ + assert np.all(store[id_].tens == d.tens) + + +def test_del_multiple(ten_simple_docs): # noqa: F811 + docs_to_del_idx = [0, 2, 4, 6, 8] + + store = ElasticDocIndex[SimpleDoc]() + store.index(ten_simple_docs) + + assert store.num_docs() == 10 + docs_to_del = [ten_simple_docs[i] for i in docs_to_del_idx] + ids_to_del = [d.id for d in docs_to_del] + del store[ids_to_del] + for i, doc in enumerate(ten_simple_docs): + if i in docs_to_del_idx: + with pytest.raises(KeyError): + store[doc.id] + else: + assert store[doc.id].id == doc.id + assert np.all(store[doc.id].tens == doc.tens) + + +def test_del_key_error(ten_simple_docs): # noqa: F811 + store = ElasticDocIndex[SimpleDoc]() + store.index(ten_simple_docs) + + with pytest.warns(UserWarning): + del store['not_a_real_id'] + + +def test_num_docs(ten_simple_docs): # noqa: F811 + store = ElasticDocIndex[SimpleDoc]() + store.index(ten_simple_docs) + + assert store.num_docs() == 10 + + del store[ten_simple_docs[0].id] + assert store.num_docs() == 9 + + del store[ten_simple_docs[3].id, ten_simple_docs[5].id] + assert store.num_docs() == 7 + + more_docs = [SimpleDoc(tens=np.random.rand(10)) for _ in range(5)] + store.index(more_docs) + assert store.num_docs() == 12 + + del store[more_docs[2].id, ten_simple_docs[7].id] + assert store.num_docs() == 10 + + +def test_index_union_doc(): # noqa: F811 + class MyDoc(BaseDoc): + tensor: Union[NdArray, str] + + class MySchema(BaseDoc): + tensor: NdArray + + store = ElasticDocIndex[MySchema]() + doc = [MyDoc(tensor=np.random.randn(128))] + store.index(doc) + + id_ = doc[0].id + assert store[id_].id == id_ + assert np.all(store[id_].tensor == doc[0].tensor) + + +def test_index_multi_modal_doc(): + class MyMultiModalDoc(BaseDoc): + image: ImageDoc + text: TextDoc + + store = ElasticDocIndex[MyMultiModalDoc]() + + doc = [ + MyMultiModalDoc( + image=ImageDoc(embedding=np.random.randn(128)), text=TextDoc(text='hello') + ) + ] + store.index(doc) + + id_ = doc[0].id + assert store[id_].id == id_ + assert np.all(store[id_].image.embedding == doc[0].image.embedding) + assert store[id_].text.text == doc[0].text.text + + +def test_elasticv7_version_check(): + with pytest.raises(ImportError): + from docarray.index import ElasticV7DocIndex # noqa: F401 From 8b14182864bf1fa0b06d588ed0ad57fe8fb04f7e Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 31 Mar 2023 20:46:00 +0800 Subject: [PATCH 08/14] fix: elasticversion in init, ci and toml Signed-off-by: AnneY --- .github/workflows/ci.yml | 35 +++++++++++++++++++++++++++++++++-- docarray/index/__init__.py | 8 ++++++++ pyproject.toml | 5 +++-- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 130e72de9dd..cab0d20a625 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -225,7 +225,38 @@ jobs: - name: Test id: test run: | - poetry run pytest -m 'index' tests + poetry run pytest -m 'index and not elasticv8' tests + timeout-minutes: 30 + + + docarray-elastic-v8: + needs: [lint-ruff, check-black, import-test] + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2.5.0 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Prepare environment + run: | + python -m pip install --upgrade pip + python -m pip install poetry + poetry install --all-extras + poetry run pip install protobuf==3.19.0 + poetry run pip install tensorflow==2.11.0 + poetry run pip install elasticsearch==8.6.2 + sudo apt-get update + sudo apt-get install --no-install-recommends ffmpeg + + - name: Test + id: test + run: | + poetry run pytest -m 'index and elasticv8' tests timeout-minutes: 30 docarray-test-tensorflow: @@ -284,7 +315,7 @@ jobs: # just for blocking the merge until all parallel core-test are successful success-all-test: - needs: [docarray-test, docarray-test-proto3, docarray-doc-index, docarray-test-tensorflow, docarray-test-benchmarks, import-test, check-black, check-mypy, lint-ruff] + needs: [docarray-test, docarray-test-proto3, docarray-doc-index, docarray-elastic-v8, docarray-test-tensorflow, docarray-test-benchmarks, import-test, check-black, check-mypy, lint-ruff] if: always() runs-on: ubuntu-latest steps: diff --git a/docarray/index/__init__.py b/docarray/index/__init__.py index df0d133d29c..5147cb8113c 100644 --- a/docarray/index/__init__.py +++ b/docarray/index/__init__.py @@ -24,7 +24,15 @@ def __getattr__(name: str): import docarray.index.backends.elastic as lib elif name == 'ElasticV7DocIndex': import_library('elasticsearch', raise_error=True) + from elasticsearch import __version__ as __es__version__ + import docarray.index.backends.elasticv7 as lib + + if __es__version__[0] > 7: + raise ImportError( + 'ElasticV7DocIndex requires the elasticsearch library to be version 7.10.1' + ) + else: raise ImportError( f'cannot import name \'{name}\' from \'{_get_path_from_docarray_root_level(__file__)}\'' diff --git a/pyproject.toml b/pyproject.toml index 229151108d1..cc0ce99f8b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,10 +24,10 @@ hnswlib = {version = ">=0.6.2", optional = true } lz4 = {version= ">=1.0.0", optional = true} pydub = {version = "^0.25.1", optional = true } pandas = {version = ">=1.1.0", optional = true } -elasticsearch = {version = "7.10.1", optional = true } +elasticsearch = {version = ">=7.10.1", optional = true } smart-open = {version = ">=6.3.0", extras = ["s3"], optional = true} jina-hubble-sdk = {version = ">=0.34.0", optional = true} -elastic-transport = "^8.4.0" +elastic-transport = {version ="^8.4.0", optional = true } [tool.poetry.extras] common = ["protobuf", "lz4"] @@ -115,4 +115,5 @@ markers = [ "tensorflow: marks test using tensorflow and proto 3", "index: marks test using a document index", "benchmark: marks slow benchmarking tests", + "elasticv8: marks test that run with ElasticSearch v8", ] From b66d55011bb271558c1becbd998896be15fac00f Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 31 Mar 2023 20:54:43 +0800 Subject: [PATCH 09/14] fix: update poetry extras Signed-off-by: AnneY --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index dbbe7aeae8f..f9b068861f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -808,7 +808,7 @@ name = "elastic-transport" version = "8.4.0" description = "Transport classes and utilities shared among Python Elastic client libraries" category = "main" -optional = false +optional = true python-versions = ">=3.6" files = [ {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, @@ -4610,7 +4610,7 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" audio = ["pydub"] aws = ["smart-open"] common = ["lz4", "protobuf"] -elasticsearch = ["elasticsearch"] +elasticsearch = ["elastic-transport", "elasticsearch"] full = ["av", "lz4", "pandas", "pillow", "protobuf", "pydub", "trimesh", "types-pillow"] hnswlib = ["hnswlib"] image = ["pillow", "types-pillow"] @@ -4624,4 +4624,4 @@ web = ["fastapi"] [metadata] lock-version = "2.0" python-versions = ">=3.7,<4.0" -content-hash = "49f70eda2036ec961a1ed06e9364c56710c91f152d030ddf566519b443b52f93" +content-hash = "fe116769811f4f45c7b48f72ad5c9dc58e4a31586656f4c2318462cf42492049" diff --git a/pyproject.toml b/pyproject.toml index cc0ce99f8b9..22c8d9f53c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ video = ["av"] audio = ["pydub"] mesh = ["trimesh"] hnswlib = ["hnswlib"] -elasticsearch = ["elasticsearch"] +elasticsearch = ["elasticsearch", "elastic-transport"] jac = ["jina-hubble-sdk"] aws = ["smart-open"] torch = ["torch"] From 4aba76649c156a43cd21f30a8df5a5b98c19c3d0 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 31 Mar 2023 21:50:30 +0800 Subject: [PATCH 10/14] fix: raise error when init ElasticV7DocIndex Signed-off-by: AnneY --- docarray/index/__init__.py | 7 ------- docarray/index/backends/elasticv7.py | 9 +++++++++ .../doc_index/elastic/v8/test_index_get_del.py | 4 +++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docarray/index/__init__.py b/docarray/index/__init__.py index 5147cb8113c..2c724030fe7 100644 --- a/docarray/index/__init__.py +++ b/docarray/index/__init__.py @@ -24,15 +24,8 @@ def __getattr__(name: str): import docarray.index.backends.elastic as lib elif name == 'ElasticV7DocIndex': import_library('elasticsearch', raise_error=True) - from elasticsearch import __version__ as __es__version__ - import docarray.index.backends.elasticv7 as lib - if __es__version__[0] > 7: - raise ImportError( - 'ElasticV7DocIndex requires the elasticsearch library to be version 7.10.1' - ) - else: raise ImportError( f'cannot import name \'{name}\' from \'{_get_path_from_docarray_root_level(__file__)}\'' diff --git a/docarray/index/backends/elasticv7.py b/docarray/index/backends/elasticv7.py index 5f80379f85e..0013a766df6 100644 --- a/docarray/index/backends/elasticv7.py +++ b/docarray/index/backends/elasticv7.py @@ -14,6 +14,15 @@ class ElasticV7DocIndex(ElasticDocIndex): + def __init__(self, db_config=None, **kwargs): + from elasticsearch import __version__ as __es__version__ + + if __es__version__[0] > 7: + raise ImportError( + 'ElasticV7DocIndex requires the elasticsearch library to be version 7.10.1' + ) + + super().__init__(db_config, **kwargs) ############################################### # Inner classes for query builder and configs # diff --git a/tests/integrations/doc_index/elastic/v8/test_index_get_del.py b/tests/integrations/doc_index/elastic/v8/test_index_get_del.py index e51b683fa16..0736ed8ce8d 100644 --- a/tests/integrations/doc_index/elastic/v8/test_index_get_del.py +++ b/tests/integrations/doc_index/elastic/v8/test_index_get_del.py @@ -267,4 +267,6 @@ class MyMultiModalDoc(BaseDoc): def test_elasticv7_version_check(): with pytest.raises(ImportError): - from docarray.index import ElasticV7DocIndex # noqa: F401 + from docarray.index import ElasticV7DocIndex + + _ = ElasticV7DocIndex[SimpleDoc]() From 5f17684e51a370e4ae1f4b2f3fd871113d5e053d Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 11 Apr 2023 22:24:34 +0800 Subject: [PATCH 11/14] fix: minor fix Signed-off-by: AnneY --- docarray/index/backends/elastic.py | 26 +++++++++++++++----------- docarray/index/backends/elasticv7.py | 20 +++++++++++++++----- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index c003e9d4095..1a345195cca 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -85,6 +85,7 @@ def __init__(self, db_config=None, **kwargs): '_source': {'enabled': 'true'}, 'properties': {}, } + mappings.update(self._db_config.index_mappings) for col_name, col in self._column_infos.items(): mappings['properties'][col_name] = self._create_index_mapping(col) @@ -124,24 +125,23 @@ def find( query: Union[AnyTensor, BaseDoc], search_field: str = 'embedding', limit: int = 10, + num_candidates: Optional[int] = None, ): if isinstance(query, BaseDoc): query_vec = BaseDocIndex._get_values_by_column([query], search_field)[0] else: query_vec = query query_vec_np = BaseDocIndex._to_numpy(self._outer_instance, query_vec) - self._query['knn'] = ElasticDocIndex._form_search_body( + self._query['knn'] = self._outer_instance._form_search_body( query_vec_np, limit, search_field, - self._outer_instance._runtime_config.default_column_config[ - 'dense_vector' - ]['num_candidates'], + num_candidates, )['knn'] return self - # filter accrpts Leaf/Compound query clauses + # filter accepts Leaf/Compound query clauses # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html def filter(self, query: Dict[str, Any], limit: int = 10): self._query['size'] = limit @@ -156,8 +156,8 @@ def text_search(self, query: str, search_field: str = 'text', limit: int = 10): return self find_batched = _raise_not_composable('find_batched') - filter_batched = _raise_not_composable('find_batched') - text_search_batched = _raise_not_composable('text_search') + filter_batched = _raise_not_composable('filter_batched') + text_search_batched = _raise_not_composable('text_search_batched') def build_query(self, **kwargs) -> QueryBuilder: """ @@ -173,6 +173,7 @@ class DBConfig(BaseDocIndex.DBConfig): index_name: Optional[str] = None es_config: Dict[str, Any] = field(default_factory=dict) index_settings: Dict[str, Any] = field(default_factory=dict) + index_mappings: Dict[str, Any] = field(default_factory=dict) @dataclass class RuntimeConfig(BaseDocIndex.RuntimeConfig): @@ -483,13 +484,17 @@ def _send_requests( return accumulated_info, warning_info - @staticmethod def _form_search_body( + self, query: np.ndarray, limit: int, search_field: str = '', - num_candidates: int = 10000, + num_candidates: Optional[int] = None, ) -> Dict[str, Any]: + if not num_candidates: + num_candidates = self._runtime_config.default_column_config['dense_vector'][ + 'num_candidates' + ] body = { 'size': limit, 'knn': { @@ -501,9 +506,8 @@ def _form_search_body( } return body - @staticmethod def _form_text_search_body( - query: str, limit: int, search_field: str = '' + self, query: str, limit: int, search_field: str = '' ) -> Dict[str, Any]: body = { 'size': limit, diff --git a/docarray/index/backends/elasticv7.py b/docarray/index/backends/elasticv7.py index 0013a766df6..e77aedfc2b4 100644 --- a/docarray/index/backends/elasticv7.py +++ b/docarray/index/backends/elasticv7.py @@ -1,5 +1,6 @@ +import warnings from dataclasses import dataclass -from typing import Any, Dict, List, Sequence, TypeVar, Union +from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union import numpy as np @@ -48,16 +49,26 @@ def find( query: Union[AnyTensor, BaseDoc], search_field: str = 'embedding', limit: int = 10, + num_candidates: Optional[int] = None, ): + if num_candidates: + warnings.warn('`num_candidates` is not supported in ElasticV7DocIndex') + if isinstance(query, BaseDoc): query_vec = BaseDocIndex._get_values_by_column([query], search_field)[0] else: query_vec = query query_vec_np = BaseDocIndex._to_numpy(self._outer_instance, query_vec) self._query['size'] = limit - self._query['query']['script_score'] = ElasticV7DocIndex._form_search_body( + self._query['query'][ + 'script_score' + ] = self._outer_instance._form_search_body( query_vec_np, limit, search_field - )['query']['script_score'] + )[ + 'query' + ][ + 'script_score' + ] return self @@ -102,8 +113,7 @@ def _create_index_mapping(self, col: '_ColumnInfo') -> Dict[str, Any]: return index - @staticmethod - def _form_search_body(query: np.ndarray, limit: int, search_field: str = '') -> Dict[str, Any]: # type: ignore + def _form_search_body(self, query: np.ndarray, limit: int, search_field: str = '') -> Dict[str, Any]: # type: ignore body = { 'size': limit, 'query': { From ae6c15b041240f291ab712739550046d9b35c794 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 12 Apr 2023 11:23:03 +0800 Subject: [PATCH 12/14] refactor: move index tests Signed-off-by: AnneY --- tests/{integrations/doc_index => index/elastic}/__init__.py | 0 tests/{integrations/doc_index => index}/elastic/fixture.py | 0 .../doc_index/elastic => index/elastic/v7}/__init__.py | 0 .../doc_index => index}/elastic/v7/docker-compose.yml | 0 .../doc_index => index}/elastic/v7/test_column_config.py | 2 +- .../{integrations/doc_index => index}/elastic/v7/test_find.py | 4 ++-- .../doc_index => index}/elastic/v7/test_index_get_del.py | 2 +- .../doc_index => index}/elastic/v8/docker-compose.yml | 0 .../doc_index => index}/elastic/v8/test_column_config.py | 2 +- .../{integrations/doc_index => index}/elastic/v8/test_find.py | 4 ++-- .../doc_index => index}/elastic/v8/test_index_get_del.py | 2 +- .../doc_index/elastic/v7 => index/hnswlib}/__init__.py | 0 tests/{integrations/doc_index => index}/hnswlib/test_find.py | 0 .../doc_index => index}/hnswlib/test_index_get_del.py | 0 .../doc_index => index}/hnswlib/test_persist_data.py | 0 tests/integrations/doc_index/hnswlib/__init__.py | 0 16 files changed, 8 insertions(+), 8 deletions(-) rename tests/{integrations/doc_index => index/elastic}/__init__.py (100%) rename tests/{integrations/doc_index => index}/elastic/fixture.py (100%) rename tests/{integrations/doc_index/elastic => index/elastic/v7}/__init__.py (100%) rename tests/{integrations/doc_index => index}/elastic/v7/docker-compose.yml (100%) rename tests/{integrations/doc_index => index}/elastic/v7/test_column_config.py (97%) rename tests/{integrations/doc_index => index}/elastic/v7/test_find.py (98%) rename tests/{integrations/doc_index => index}/elastic/v7/test_index_get_del.py (99%) rename tests/{integrations/doc_index => index}/elastic/v8/docker-compose.yml (100%) rename tests/{integrations/doc_index => index}/elastic/v8/test_column_config.py (97%) rename tests/{integrations/doc_index => index}/elastic/v8/test_find.py (98%) rename tests/{integrations/doc_index => index}/elastic/v8/test_index_get_del.py (99%) rename tests/{integrations/doc_index/elastic/v7 => index/hnswlib}/__init__.py (100%) rename tests/{integrations/doc_index => index}/hnswlib/test_find.py (100%) rename tests/{integrations/doc_index => index}/hnswlib/test_index_get_del.py (100%) rename tests/{integrations/doc_index => index}/hnswlib/test_persist_data.py (100%) delete mode 100644 tests/integrations/doc_index/hnswlib/__init__.py diff --git a/tests/integrations/doc_index/__init__.py b/tests/index/elastic/__init__.py similarity index 100% rename from tests/integrations/doc_index/__init__.py rename to tests/index/elastic/__init__.py diff --git a/tests/integrations/doc_index/elastic/fixture.py b/tests/index/elastic/fixture.py similarity index 100% rename from tests/integrations/doc_index/elastic/fixture.py rename to tests/index/elastic/fixture.py diff --git a/tests/integrations/doc_index/elastic/__init__.py b/tests/index/elastic/v7/__init__.py similarity index 100% rename from tests/integrations/doc_index/elastic/__init__.py rename to tests/index/elastic/v7/__init__.py diff --git a/tests/integrations/doc_index/elastic/v7/docker-compose.yml b/tests/index/elastic/v7/docker-compose.yml similarity index 100% rename from tests/integrations/doc_index/elastic/v7/docker-compose.yml rename to tests/index/elastic/v7/docker-compose.yml diff --git a/tests/integrations/doc_index/elastic/v7/test_column_config.py b/tests/index/elastic/v7/test_column_config.py similarity index 97% rename from tests/integrations/doc_index/elastic/v7/test_column_config.py rename to tests/index/elastic/v7/test_column_config.py index df927a2c2de..a0d4aa4dec9 100644 --- a/tests/integrations/doc_index/elastic/v7/test_column_config.py +++ b/tests/index/elastic/v7/test_column_config.py @@ -3,7 +3,7 @@ from docarray import BaseDoc from docarray.index import ElasticV7DocIndex -from tests.integrations.doc_index.elastic.fixture import start_storage_v7 # noqa: F401 +from tests.index.elastic.fixture import start_storage_v7 # noqa: F401 pytestmark = [pytest.mark.slow, pytest.mark.index] diff --git a/tests/integrations/doc_index/elastic/v7/test_find.py b/tests/index/elastic/v7/test_find.py similarity index 98% rename from tests/integrations/doc_index/elastic/v7/test_find.py rename to tests/index/elastic/v7/test_find.py index 1a0503711a7..6665c8b2b60 100644 --- a/tests/integrations/doc_index/elastic/v7/test_find.py +++ b/tests/index/elastic/v7/test_find.py @@ -6,8 +6,8 @@ from docarray import BaseDoc from docarray.index import ElasticV7DocIndex from docarray.typing import NdArray, TorchTensor -from tests.integrations.doc_index.elastic.fixture import start_storage_v7 # noqa: F401 -from tests.integrations.doc_index.elastic.fixture import FlatDoc, SimpleDoc +from tests.index.elastic.fixture import start_storage_v7 # noqa: F401 +from tests.index.elastic.fixture import FlatDoc, SimpleDoc pytestmark = [pytest.mark.slow, pytest.mark.index] diff --git a/tests/integrations/doc_index/elastic/v7/test_index_get_del.py b/tests/index/elastic/v7/test_index_get_del.py similarity index 99% rename from tests/integrations/doc_index/elastic/v7/test_index_get_del.py rename to tests/index/elastic/v7/test_index_get_del.py index 7b34a4a7e46..7124d5d61bd 100644 --- a/tests/integrations/doc_index/elastic/v7/test_index_get_del.py +++ b/tests/index/elastic/v7/test_index_get_del.py @@ -7,7 +7,7 @@ from docarray.documents import ImageDoc, TextDoc from docarray.index import ElasticV7DocIndex from docarray.typing import NdArray -from tests.integrations.doc_index.elastic.fixture import ( # noqa: F401 +from tests.index.elastic.fixture import ( # noqa: F401 DeepNestedDoc, FlatDoc, NestedDoc, diff --git a/tests/integrations/doc_index/elastic/v8/docker-compose.yml b/tests/index/elastic/v8/docker-compose.yml similarity index 100% rename from tests/integrations/doc_index/elastic/v8/docker-compose.yml rename to tests/index/elastic/v8/docker-compose.yml diff --git a/tests/integrations/doc_index/elastic/v8/test_column_config.py b/tests/index/elastic/v8/test_column_config.py similarity index 97% rename from tests/integrations/doc_index/elastic/v8/test_column_config.py rename to tests/index/elastic/v8/test_column_config.py index 6e1ad6cf88b..2b3bbcee0f8 100644 --- a/tests/integrations/doc_index/elastic/v8/test_column_config.py +++ b/tests/index/elastic/v8/test_column_config.py @@ -3,7 +3,7 @@ from docarray import BaseDoc from docarray.index import ElasticDocIndex -from tests.integrations.doc_index.elastic.fixture import start_storage_v8 # noqa: F401 +from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 pytestmark = [pytest.mark.slow, pytest.mark.index, pytest.mark.elasticv8] diff --git a/tests/integrations/doc_index/elastic/v8/test_find.py b/tests/index/elastic/v8/test_find.py similarity index 98% rename from tests/integrations/doc_index/elastic/v8/test_find.py rename to tests/index/elastic/v8/test_find.py index 90292a772bd..dcc4097eb7d 100644 --- a/tests/integrations/doc_index/elastic/v8/test_find.py +++ b/tests/index/elastic/v8/test_find.py @@ -6,8 +6,8 @@ from docarray import BaseDoc from docarray.index import ElasticDocIndex from docarray.typing import NdArray, TorchTensor -from tests.integrations.doc_index.elastic.fixture import start_storage_v8 # noqa: F401 -from tests.integrations.doc_index.elastic.fixture import FlatDoc, SimpleDoc +from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 +from tests.index.elastic.fixture import FlatDoc, SimpleDoc pytestmark = [pytest.mark.slow, pytest.mark.index, pytest.mark.elasticv8] diff --git a/tests/integrations/doc_index/elastic/v8/test_index_get_del.py b/tests/index/elastic/v8/test_index_get_del.py similarity index 99% rename from tests/integrations/doc_index/elastic/v8/test_index_get_del.py rename to tests/index/elastic/v8/test_index_get_del.py index c3b87ac705a..db2df925ebb 100644 --- a/tests/integrations/doc_index/elastic/v8/test_index_get_del.py +++ b/tests/index/elastic/v8/test_index_get_del.py @@ -7,7 +7,7 @@ from docarray.documents import ImageDoc, TextDoc from docarray.index import ElasticDocIndex from docarray.typing import NdArray -from tests.integrations.doc_index.elastic.fixture import ( # noqa: F401 +from tests.index.elastic.fixture import ( # noqa: F401 DeepNestedDoc, FlatDoc, NestedDoc, diff --git a/tests/integrations/doc_index/elastic/v7/__init__.py b/tests/index/hnswlib/__init__.py similarity index 100% rename from tests/integrations/doc_index/elastic/v7/__init__.py rename to tests/index/hnswlib/__init__.py diff --git a/tests/integrations/doc_index/hnswlib/test_find.py b/tests/index/hnswlib/test_find.py similarity index 100% rename from tests/integrations/doc_index/hnswlib/test_find.py rename to tests/index/hnswlib/test_find.py diff --git a/tests/integrations/doc_index/hnswlib/test_index_get_del.py b/tests/index/hnswlib/test_index_get_del.py similarity index 100% rename from tests/integrations/doc_index/hnswlib/test_index_get_del.py rename to tests/index/hnswlib/test_index_get_del.py diff --git a/tests/integrations/doc_index/hnswlib/test_persist_data.py b/tests/index/hnswlib/test_persist_data.py similarity index 100% rename from tests/integrations/doc_index/hnswlib/test_persist_data.py rename to tests/index/hnswlib/test_persist_data.py diff --git a/tests/integrations/doc_index/hnswlib/__init__.py b/tests/integrations/doc_index/hnswlib/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 From 3dc76941ed0f905fad6d73d7326da74b2db1c46c Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 12 Apr 2023 16:24:58 +0800 Subject: [PATCH 13/14] refactor: code refactor Signed-off-by: AnneY --- tests/index/elastic/v8/test_find.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/index/elastic/v8/test_find.py b/tests/index/elastic/v8/test_find.py index dcc4097eb7d..5ee0956bb87 100644 --- a/tests/index/elastic/v8/test_find.py +++ b/tests/index/elastic/v8/test_find.py @@ -224,10 +224,10 @@ class MyDoc(BaseDoc): assert doc.A filter_query = { - "bool": { - "filter": [ - {"terms": {"B": [3, 4, 7, 8]}}, - {"range": {"C": {"gte": 3, "lte": 5}}}, + 'bool': { + 'filter': [ + {'terms': {'B': [3, 4, 7, 8]}}, + {'range': {'C': {'gte': 3, 'lte': 5}}}, ] } } From 0cc9e64e6c17ceec1b0685ba3fc00d5f083a5ada Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 12 Apr 2023 20:25:15 +0800 Subject: [PATCH 14/14] feat: add ip_range Signed-off-by: AnneY --- docarray/index/backends/elastic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index b2d13d164af..c2c1c6646a2 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -207,6 +207,7 @@ def __post_init__(self): 'long_range': {}, 'double_range': {}, 'date_range': {}, + 'ip_range': {}, 'ip': {}, 'version': {}, 'histogram': {},