From fc3f93b98d36f37dbfde47b82163587538536ea3 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 25 Jul 2022 23:24:38 +0800 Subject: [PATCH 01/93] feat(redis): add storage backend and unit test --- docarray/array/storage/redis/backend.py | 183 ++++++++++++++++++ .../unit/array/storage/redis/test_backend.py | 140 ++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 docarray/array/storage/redis/backend.py create mode 100644 tests/unit/array/storage/redis/test_backend.py diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py new file mode 100644 index 00000000000..a4d064430ea --- /dev/null +++ b/docarray/array/storage/redis/backend.py @@ -0,0 +1,183 @@ +from dataclasses import dataclass, field +from typing import ( + Iterable, + Dict, + Optional, + TYPE_CHECKING, + Union, + Any, + Tuple, + List, +) + +from .... import Document +import numpy as np +from ..base.backend import BaseBackendMixin, TypeMap +from redis import Redis +from redis.exceptions import ResponseError +from redis.commands.search.field import VectorField, TextField, NumericField +from ....helper import dataclass_from_dict + +if TYPE_CHECKING: + from ....typing import DocumentArraySourceType + from ....typing import DocumentArraySourceType, ArrayType + + +@dataclass +class RedisConfig: + n_dim: int + host: Optional[str] = field(default='localhost') + port: Optional[int] = field(default=6379) + flush: Optional[bool] = field(default=False) + update_schema: Optional[bool] = field(default=True) + distance: Optional[str] = field(default='COSINE') + redis_config: Dict[str, Any] = field(default_factory=dict) + index_text: Optional[bool] = field(default=False) + tag_indices: List[str] = field(default_factory=list) + batch_size: Optional[int] = field(default=64) + method: Optional[str] = field(default='HNSW') + initial_cap: Optional[int] = None + ef_construction: Optional[int] = None + m: Optional[int] = None + ef_runtime: Optional[int] = None + block_size: 
Optional[int] = None + columns: Optional[List[Tuple[str, str]]] = None + + +class BackendMixin(BaseBackendMixin): + """Provide necessary functions to enable this storage backend.""" + + TYPE_MAP = { + 'str': TypeMap(type='text', converter=TextField), + 'bytes': TypeMap(type='text', converter=TextField), + 'int': TypeMap(type='integer', converter=NumericField), + 'float': TypeMap(type='float', converter=NumericField), + 'double': TypeMap(type='double', converter=NumericField), + 'long': TypeMap(type='long', converter=NumericField), + # TODO add bool + } + + def _init_storage( + self, + _docs: Optional['DocumentArraySourceType'] = None, + config: Optional[Union[RedisConfig, Dict]] = None, + **kwargs, + ): + if not config: + raise RedisConfig() + elif isinstance(config, dict): + config = dataclass_from_dict(RedisConfig, config) + + if not config.distance in ['L2', 'IP', 'COSINE']: + raise ValueError(f'Distance metric {config.distance} not supported') + if not config.method in ['HNSW', 'FLAT']: + raise ValueError(f'Method {config.method} not supported') + + self._offset2id_key = 'offset2id' + self._config = config + self.n_dim = self._config.n_dim + self._config.columns = self._normalize_columns(self._config.columns) + + self._client = self._build_client() + super()._init_storage(_docs, config, **kwargs) + + if _docs is None: + return + elif isinstance(_docs, Iterable): + self.extend(_docs) + elif isinstance(_docs, Document): + self.append(_docs) + + def _build_client(self): + client = Redis( + host=self._config.host, + port=self._config.port, + **self._config.redis_config, + ) + + if self._config.flush: + client.flushdb() + + # redis client will throw error if index does not exist + if self._config.update_schema: + try: + client.ft().dropindex('idx') + except ResponseError: + pass + + if self._config.flush or self._config.update_schema: + schema = self._build_schema_from_redis_config(self._config) + client.ft().create_index(schema) + + return client + + def 
_build_schema_from_redis_config(self, redis_config): + index_param = { + 'TYPE': 'FLOAT32', + 'DIM': self.n_dim, + 'DISTANCE_METRIC': self._config.distance, + } + + if self._config.method == 'HNSW' and ( + self._config.m or self._config.ef_construction or self._config.ef_runtime + ): + index_options = { + 'M': self._config.m or 16, + 'EF_CONSTRUCTION': self._config.ef_construction or 200, + 'EF_RUNTIME': self._config.ef_runtime or 10, + } + index_param.update(index_options) + + if self._config.method == 'FLAT' and self._config.block_size: + index_options = {'BLOCK_SIZE': self._config.block_size} + index_param.update(index_options) + + if self._config.initial_cap: + index_param['INITIAL_CAP'] = self._config.initial_cap + schema = [VectorField('embedding', self._config.method, index_param)] + + if redis_config.tag_indices: + for index in redis_config.tag_indices: + # TODO TextField or TagField + schema.append(TextField(index)) + + # TODO whether to add schema to column (elastic does but qdrant doesn't) + for col, coltype in self._config.columns: + schema.append(self._map_column(col, coltype)) + return schema + + def _doc_id_exists(self, doc_id): + return self._client.exists(doc_id) + + def _map_embedding(self, embedding: 'ArrayType') -> bytes: + if embedding is not None: + from ....math.ndarray import to_numpy_array + + embedding = to_numpy_array(embedding) + + if embedding.ndim > 1: + embedding = np.asarray(embedding).squeeze() + else: + embedding = np.zeros(self.n_dim) + return embedding.astype(np.float32).tobytes() + + def _get_offset2ids_meta(self) -> List: + """Return the offset2ids stored in redis + + :return: a list containing ids + + :raises ValueError: error is raised if index _client is not found or no offsets are found + """ + if not self._client: + raise ValueError('Redis client does not exist') + + if not self._client.exists(self._offset2id_key): + return [] + return self._client.lrange(self._offset2id_key, 0, -1) + + def _update_offset2ids_meta(self): + 
"""Update the offset2ids in redis""" + if self._client.exists(self._offset2id_key): + self._client.delete(self._offset2id_key) + if len(self._offset2ids.ids) > 0: + self._client.rpush(self._offset2id_key, *self._offset2ids.ids) diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py new file mode 100644 index 00000000000..ffdc46a030d --- /dev/null +++ b/tests/unit/array/storage/redis/test_backend.py @@ -0,0 +1,140 @@ +from abc import ABC +import pytest +import numpy as np + +from docarray import DocumentArray, Document +from docarray.array.storage.redis.backend import RedisConfig, BackendMixin +from docarray.array.storage.base.helper import Offset2ID +from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin +from docarray.array.storage.memory import GetSetDelMixin, SequenceLikeMixin + + +class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC): + ... + + +class DocumentArrayDummy(StorageMixins, DocumentArray): + def __new__(cls, *args, **kwargs): + return super().__new__(cls) + + def _load_offset2ids(self): + pass + + def _save_offset2ids(self): + pass + + +type_convert = { + 'int': 'NUMERIC', + 'float': 'NUMERIC', + 'double': 'NUMERIC', + 'long': 'NUMERIC', + 'str': 'TEXT', + 'bytes': 'TEXT', +} + + +@pytest.fixture(scope='function') +def da_redis(): + cfg = RedisConfig(n_dim=128, flush=True) + da_redis = DocumentArrayDummy(storage='redis', config=cfg) + return da_redis + + +@pytest.mark.parametrize('distance', ['L2', 'IP', 'COSINE']) +@pytest.mark.parametrize('tag_indices', [['attr3'], ['attr3', 'attr4']]) +@pytest.mark.parametrize( + 'columns', + [ + [('attr1', 'str'), ('attr2', 'bytes')], + [('attr1', 'int'), ('attr2', 'float')], + [('attr1', 'double'), ('attr2', 'long')], + ], +) +def test_init_storage(distance, tag_indices, columns, start_storage): + cfg = RedisConfig( + n_dim=128, + distance=distance, + flush=True, + tag_indices=tag_indices, + columns=columns, + 
redis_config={'decode_responses': True}, + ) + redis_da = DocumentArrayDummy(storage='redis', config=cfg) + + assert redis_da._client.info()['tcp_port'] == redis_da._config.port + assert redis_da._client.ft().info()['attributes'][0][1] == 'embedding' + assert redis_da._client.ft().info()['attributes'][0][5] == 'VECTOR' + + for i in range(len(tag_indices)): + assert ( + redis_da._client.ft().info()['attributes'][i + 1][1] + == redis_da._config.tag_indices[i] + ) + assert redis_da._client.ft().info()['attributes'][i + 1][5] == 'TEXT' + + for i in range(len(columns)): + assert ( + redis_da._client.ft().info()['attributes'][i + len(tag_indices) + 1][1] + == redis_da._config.columns[i][0] + ) + assert ( + redis_da._client.ft().info()['attributes'][i + len(tag_indices) + 1][5] + == type_convert[redis_da._config.columns[i][1]] + ) + + +def test_init_storage_update_schema(start_storage): + cfg = RedisConfig(n_dim=128, tag_indices=['attr1']) + redis_da = DocumentArrayDummy(storage='redis', config=cfg) + assert redis_da._client.ft().info()['attributes'][1][1] == b'attr1' + + cfg = RedisConfig(n_dim=128, tag_indices=['attr2'], update_schema=False) + redis_da = DocumentArrayDummy(storage='redis', config=cfg) + assert redis_da._client.ft().info()['attributes'][1][1] == b'attr1' + + cfg = RedisConfig(n_dim=128, tag_indices=['attr2'], update_schema=True) + redis_da = DocumentArrayDummy(storage='redis', config=cfg) + assert redis_da._client.ft().info()['attributes'][1][1] == b'attr2' + + +@pytest.mark.parametrize( + 'id', + [ + ('abc'), + ('123'), + ], +) +def test_doc_id_exists(id, da_redis, start_storage): + da_redis._client.hset(id, mapping={'attr1': 1}) + assert da_redis._doc_id_exists(id) + + +@pytest.mark.parametrize( + 'array', + [ + ([1, 2, 3, 4, 5]), + ([1.1, 1.2, 1.3]), + ([1, 2.0, 3.0, 4]), + ], +) +def test_map_embedding(array, start_storage): + cfg = RedisConfig(n_dim=len(array)) + redis_da = DocumentArrayDummy(storage='redis', config=cfg) + embedding = 
redis_da._map_embedding(array) + assert type(embedding) == bytes + assert np.allclose(np.frombuffer(embedding, dtype=np.float32), np.array(array)) + + +@pytest.mark.parametrize( + 'ids', + [ + (['1', '2', '3']), + (['a', 'b', 'c']), + ], +) +def test_offset2ids_meta(ids, da_redis, start_storage): + assert da_redis._get_offset2ids_meta() == [] + da_redis._offset2ids = Offset2ID(ids) + da_redis._update_offset2ids_meta() + assert da_redis._get_offset2ids_meta() == [bytes(id, 'utf-8') for id in ids] From 703ea0986640533c812bacb9ffbcc82ab094993c Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 10:09:39 +0800 Subject: [PATCH 02/93] style: isort python imports --- docarray/array/storage/redis/backend.py | 28 +++++++++---------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index a4d064430ea..0fe2d41b307 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -1,26 +1,18 @@ from dataclasses import dataclass, field -from typing import ( - Iterable, - Dict, - Optional, - TYPE_CHECKING, - Union, - Any, - Tuple, - List, -) +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union -from .... import Document import numpy as np -from ..base.backend import BaseBackendMixin, TypeMap + from redis import Redis +from redis.commands.search.field import NumericField, TextField, VectorField from redis.exceptions import ResponseError -from redis.commands.search.field import VectorField, TextField, NumericField + +from .... 
import Document from ....helper import dataclass_from_dict +from ..base.backend import BaseBackendMixin, TypeMap if TYPE_CHECKING: - from ....typing import DocumentArraySourceType - from ....typing import DocumentArraySourceType, ArrayType + from ....typing import ArrayType, DocumentArraySourceType @dataclass @@ -68,9 +60,9 @@ def _init_storage( elif isinstance(config, dict): config = dataclass_from_dict(RedisConfig, config) - if not config.distance in ['L2', 'IP', 'COSINE']: + if config.distance not in ['L2', 'IP', 'COSINE']: raise ValueError(f'Distance metric {config.distance} not supported') - if not config.method in ['HNSW', 'FLAT']: + if config.method not in ['HNSW', 'FLAT']: raise ValueError(f'Method {config.method} not supported') self._offset2id_key = 'offset2id' @@ -161,7 +153,7 @@ def _map_embedding(self, embedding: 'ArrayType') -> bytes: embedding = np.zeros(self.n_dim) return embedding.astype(np.float32).tobytes() - def _get_offset2ids_meta(self) -> List: + def _get_offset2ids_meta(self) -> List[str]: """Return the offset2ids stored in redis :return: a list containing ids From 6f4f8b1faec02733eefd5a7f1a8e61c7194db705 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 10:17:56 +0800 Subject: [PATCH 03/93] style: remove useless parameter in _build_schema --- docarray/array/storage/redis/backend.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 0fe2d41b307..744db02fa02 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -98,12 +98,12 @@ def _build_client(self): pass if self._config.flush or self._config.update_schema: - schema = self._build_schema_from_redis_config(self._config) + schema = self._build_schema_from_redis_config() client.ft().create_index(schema) return client - def _build_schema_from_redis_config(self, redis_config): + def _build_schema_from_redis_config(self): 
index_param = { 'TYPE': 'FLOAT32', 'DIM': self.n_dim, @@ -128,8 +128,8 @@ def _build_schema_from_redis_config(self, redis_config): index_param['INITIAL_CAP'] = self._config.initial_cap schema = [VectorField('embedding', self._config.method, index_param)] - if redis_config.tag_indices: - for index in redis_config.tag_indices: + if self._config.tag_indices: + for index in self._config.tag_indices: # TODO TextField or TagField schema.append(TextField(index)) From 06fc38a26922b8ce72d6d674846aa0ce454e05bc Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 10:26:36 +0800 Subject: [PATCH 04/93] test: test optional param in redis config --- .../unit/array/storage/redis/test_backend.py | 40 ++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index ffdc46a030d..81bf4682227 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -1,12 +1,11 @@ from abc import ABC -import pytest -import numpy as np -from docarray import DocumentArray, Document -from docarray.array.storage.redis.backend import RedisConfig, BackendMixin +import numpy as np +import pytest +from docarray import DocumentArray from docarray.array.storage.base.helper import Offset2ID -from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray.array.storage.memory import GetSetDelMixin, SequenceLikeMixin +from docarray.array.storage.redis.backend import BackendMixin, RedisConfig class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC): @@ -42,6 +41,15 @@ def da_redis(): @pytest.mark.parametrize('distance', ['L2', 'IP', 'COSINE']) +@pytest.mark.parametrize( + 'method,initial_cap,ef_construction,block_size', + [ + ('HNSW', None, None, None), + ('HNSW', 10, 250, None), + ('HNSW', 10, 250, 1000000), + ('FLAT', 10, 250, 1000000), + ], +) @pytest.mark.parametrize('tag_indices', [['attr3'], 
['attr3', 'attr4']]) @pytest.mark.parametrize( 'columns', @@ -51,13 +59,29 @@ def da_redis(): [('attr1', 'double'), ('attr2', 'long')], ], ) -def test_init_storage(distance, tag_indices, columns, start_storage): +@pytest.mark.parametrize('index_text', [True, False]) +def test_init_storage( + distance, + tag_indices, + columns, + method, + initial_cap, + ef_construction, + block_size, + index_text, + start_storage, +): cfg = RedisConfig( n_dim=128, distance=distance, flush=True, tag_indices=tag_indices, columns=columns, + method=method, + initial_cap=initial_cap, + ef_construction=ef_construction, + block_size=block_size, + index_text=index_text, redis_config={'decode_responses': True}, ) redis_da = DocumentArrayDummy(storage='redis', config=cfg) @@ -83,6 +107,10 @@ def test_init_storage(distance, tag_indices, columns, start_storage): == type_convert[redis_da._config.columns[i][1]] ) + if index_text: + assert redis_da._client.ft().info()['attributes'][-1][1] == 'text' + assert redis_da._client.ft().info()['attributes'][-1][5] == 'TEXT' + def test_init_storage_update_schema(start_storage): cfg = RedisConfig(n_dim=128, tag_indices=['attr1']) From f21e2da4b503bcd67c760b8e2f5497bbdd387137 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 10:58:02 +0800 Subject: [PATCH 05/93] fix: remove optional for some params in RedisConfig --- docarray/array/storage/redis/backend.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 744db02fa02..505ede97445 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -18,16 +18,16 @@ @dataclass class RedisConfig: n_dim: int - host: Optional[str] = field(default='localhost') - port: Optional[int] = field(default=6379) - flush: Optional[bool] = field(default=False) - update_schema: Optional[bool] = field(default=True) - distance: Optional[str] = 
field(default='COSINE') + host: str = field(default='localhost') + port: int = field(default=6379) + flush: bool = field(default=False) + update_schema: bool = field(default=True) + distance: str = field(default='COSINE') redis_config: Dict[str, Any] = field(default_factory=dict) - index_text: Optional[bool] = field(default=False) + index_text: bool = field(default=False) tag_indices: List[str] = field(default_factory=list) - batch_size: Optional[int] = field(default=64) - method: Optional[str] = field(default='HNSW') + batch_size: int = field(default=64) + method: str = field(default='HNSW') initial_cap: Optional[int] = None ef_construction: Optional[int] = None m: Optional[int] = None From 3a3e78b60ffe4326f395700c1ee7052a1d318cb9 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 10:59:37 +0800 Subject: [PATCH 06/93] feat: add build schema for 'text' --- docarray/array/storage/redis/backend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 505ede97445..020327bd02d 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -136,6 +136,9 @@ def _build_schema_from_redis_config(self): # TODO whether to add schema to column (elastic does but qdrant doesn't) for col, coltype in self._config.columns: schema.append(self._map_column(col, coltype)) + + if self._config.index_text: + schema.append(TextField('text')) return schema def _doc_id_exists(self, doc_id): From 5ae1409f7a5011cb24412f2898af11406cbe73a4 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 11:09:20 +0800 Subject: [PATCH 07/93] fix: raise ValueError when RedisConfig is None and add unit test --- docarray/array/storage/redis/backend.py | 2 +- tests/unit/array/storage/redis/test_backend.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 
020327bd02d..80ea14caebf 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -56,7 +56,7 @@ def _init_storage( **kwargs, ): if not config: - raise RedisConfig() + raise ValueError('Empty config is not allowed for Redis storage') elif isinstance(config, dict): config = dataclass_from_dict(RedisConfig, config) diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index 81bf4682227..818ccdfff09 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -126,6 +126,11 @@ def test_init_storage_update_schema(start_storage): assert redis_da._client.ft().info()['attributes'][1][1] == b'attr2' +def test_init_storage_empty_config(start_storage): + with pytest.raises(ValueError): + redis_da = DocumentArrayDummy(storage='redis') + + @pytest.mark.parametrize( 'id', [ From 9e716d1542f81003652e5f8900b3a4f86f277376 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 11:37:26 +0800 Subject: [PATCH 08/93] refactor: improve the logic of update_schema --- docarray/array/storage/redis/backend.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 80ea14caebf..49cec99c159 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -90,12 +90,9 @@ def _build_client(self): if self._config.flush: client.flushdb() - # redis client will throw error if index does not exist if self._config.update_schema: - try: + if len(client.execute_command('FT._LIST')) > 0: client.ft().dropindex('idx') - except ResponseError: - pass if self._config.flush or self._config.update_schema: schema = self._build_schema_from_redis_config() From 90e5ae9bf49c3371bf4138f01dd968ab93d507f2 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 12:34:15 +0800 Subject: [PATCH 09/93] test: add None value 
of embedding in test_map_embedding --- tests/unit/array/storage/redis/test_backend.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index 818ccdfff09..194aa0e5753 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -149,14 +149,23 @@ def test_doc_id_exists(id, da_redis, start_storage): ([1, 2, 3, 4, 5]), ([1.1, 1.2, 1.3]), ([1, 2.0, 3.0, 4]), + None, ], ) def test_map_embedding(array, start_storage): - cfg = RedisConfig(n_dim=len(array)) + if array is None: + cfg = RedisConfig(n_dim=3) + else: + cfg = RedisConfig(n_dim=len(array)) + redis_da = DocumentArrayDummy(storage='redis', config=cfg) embedding = redis_da._map_embedding(array) assert type(embedding) == bytes - assert np.allclose(np.frombuffer(embedding, dtype=np.float32), np.array(array)) + + if array is None: + assert np.allclose(np.frombuffer(embedding, dtype=np.float32), np.zeros((3))) + else: + assert np.allclose(np.frombuffer(embedding, dtype=np.float32), np.array(array)) @pytest.mark.parametrize( From 5300e2380dc9327fc750ce530f4ed884c981e51a Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 19:26:26 +0800 Subject: [PATCH 10/93] fix: remove useless code --- docarray/array/storage/redis/backend.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 49cec99c159..4eca9ddfac0 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -154,15 +154,6 @@ def _map_embedding(self, embedding: 'ArrayType') -> bytes: return embedding.astype(np.float32).tobytes() def _get_offset2ids_meta(self) -> List[str]: - """Return the offset2ids stored in redis - - :return: a list containing ids - - :raises ValueError: error is raised if index _client is not found or no offsets are found - """ 
- if not self._client: - raise ValueError('Redis client does not exist') - if not self._client.exists(self._offset2id_key): return [] return self._client.lrange(self._offset2id_key, 0, -1) From a6e19681398a56648707be03a0df978b3e7630b0 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 19:39:36 +0800 Subject: [PATCH 11/93] fix: set decode_responses in redis_config to False --- docarray/array/storage/redis/backend.py | 6 +++ .../unit/array/storage/redis/test_backend.py | 47 +++++++++++-------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 4eca9ddfac0..2e28580811a 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -65,6 +65,12 @@ def _init_storage( if config.method not in ['HNSW', 'FLAT']: raise ValueError(f'Method {config.method} not supported') + if ( + 'decode_responses' in config.redis_config + and config.redis_config['decode_responses'] + ): + config.redis_config['decode_responses'] = False + self._offset2id_key = 'offset2id' self._config = config self.n_dim = self._config.n_dim diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index 194aa0e5753..626606efda6 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -24,12 +24,12 @@ def _save_offset2ids(self): type_convert = { - 'int': 'NUMERIC', - 'float': 'NUMERIC', - 'double': 'NUMERIC', - 'long': 'NUMERIC', - 'str': 'TEXT', - 'bytes': 'TEXT', + 'int': b'NUMERIC', + 'float': b'NUMERIC', + 'double': b'NUMERIC', + 'long': b'NUMERIC', + 'str': b'TEXT', + 'bytes': b'TEXT', } @@ -60,6 +60,16 @@ def da_redis(): ], ) @pytest.mark.parametrize('index_text', [True, False]) +@pytest.mark.parametrize( + 'redis_config', + [ + {'decode_responses': True}, + {'decode_responses': False}, + {'retry_on_timeout': True}, + {'decode_responses': True, 
'retry_on_timeout': True}, + {}, + ], +) def test_init_storage( distance, tag_indices, @@ -69,6 +79,7 @@ def test_init_storage( ef_construction, block_size, index_text, + redis_config, start_storage, ): cfg = RedisConfig( @@ -82,34 +93,32 @@ def test_init_storage( ef_construction=ef_construction, block_size=block_size, index_text=index_text, - redis_config={'decode_responses': True}, + redis_config=redis_config, ) redis_da = DocumentArrayDummy(storage='redis', config=cfg) assert redis_da._client.info()['tcp_port'] == redis_da._config.port - assert redis_da._client.ft().info()['attributes'][0][1] == 'embedding' - assert redis_da._client.ft().info()['attributes'][0][5] == 'VECTOR' + assert redis_da._client.ft().info()['attributes'][0][1] == b'embedding' + assert redis_da._client.ft().info()['attributes'][0][5] == b'VECTOR' for i in range(len(tag_indices)): - assert ( - redis_da._client.ft().info()['attributes'][i + 1][1] - == redis_da._config.tag_indices[i] + assert redis_da._client.ft().info()['attributes'][i + 1][1] == bytes( + redis_da._config.tag_indices[i], 'utf-8' ) - assert redis_da._client.ft().info()['attributes'][i + 1][5] == 'TEXT' + assert redis_da._client.ft().info()['attributes'][i + 1][5] == b'TEXT' for i in range(len(columns)): - assert ( - redis_da._client.ft().info()['attributes'][i + len(tag_indices) + 1][1] - == redis_da._config.columns[i][0] - ) + assert redis_da._client.ft().info()['attributes'][i + len(tag_indices) + 1][ + 1 + ] == bytes(redis_da._config.columns[i][0], 'utf-8') assert ( redis_da._client.ft().info()['attributes'][i + len(tag_indices) + 1][5] == type_convert[redis_da._config.columns[i][1]] ) if index_text: - assert redis_da._client.ft().info()['attributes'][-1][1] == 'text' - assert redis_da._client.ft().info()['attributes'][-1][5] == 'TEXT' + assert redis_da._client.ft().info()['attributes'][-1][1] == b'text' + assert redis_da._client.ft().info()['attributes'][-1][5] == b'TEXT' def test_init_storage_update_schema(start_storage): 
From ca8efb93158f2421a4b6910124274de4f52c7d04 Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 26 Jul 2022 20:53:18 +0800 Subject: [PATCH 12/93] fix: _get_offset2ids_meta return List[str] --- docarray/array/storage/redis/backend.py | 3 ++- tests/unit/array/storage/redis/test_backend.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 2e28580811a..5b941119ac3 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -162,7 +162,8 @@ def _map_embedding(self, embedding: 'ArrayType') -> bytes: def _get_offset2ids_meta(self) -> List[str]: if not self._client.exists(self._offset2id_key): return [] - return self._client.lrange(self._offset2id_key, 0, -1) + ids = self._client.lrange(self._offset2id_key, 0, -1) + return [id.decode() for id in ids] def _update_offset2ids_meta(self): """Update the offset2ids in redis""" diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index 626606efda6..2a8ff5abe82 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -188,4 +188,4 @@ def test_offset2ids_meta(ids, da_redis, start_storage): assert da_redis._get_offset2ids_meta() == [] da_redis._offset2ids = Offset2ID(ids) da_redis._update_offset2ids_meta() - assert da_redis._get_offset2ids_meta() == [bytes(id, 'utf-8') for id in ids] + assert da_redis._get_offset2ids_meta() == ids From 0dab0c495c9504d2776029dad8447c5965f99a6e Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 27 Jul 2022 09:33:27 +0800 Subject: [PATCH 13/93] fix: redis dropindex bug --- docarray/array/storage/redis/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 5b941119ac3..80e61f4e47f 100644 --- a/docarray/array/storage/redis/backend.py +++ 
b/docarray/array/storage/redis/backend.py @@ -98,7 +98,7 @@ def _build_client(self): if self._config.update_schema: if len(client.execute_command('FT._LIST')) > 0: - client.ft().dropindex('idx') + client.ft().dropindex() if self._config.flush or self._config.update_schema: schema = self._build_schema_from_redis_config() From 02760732f94acf9cb3ab7ae7167130d5327f04ed Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 27 Jul 2022 22:04:03 +0800 Subject: [PATCH 14/93] feat: add redis dependecies --- setup.py | 5 +++++ tests/unit/array/docker-compose.yml | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f32a510008b..78d9be10844 100644 --- a/setup.py +++ b/setup.py @@ -67,6 +67,7 @@ 'annlite>=0.3.2', 'qdrant-client~=0.7.3', 'elasticsearch>=8.2.0', + 'redis>=4.3.4', ], 'qdrant': [ 'qdrant-client~=0.7.3', @@ -80,6 +81,9 @@ 'elasticsearch': [ 'elasticsearch>=8.2.0', ], + 'redis': [ + 'redis>=4.3.4', + ], 'test': [ 'pytest', 'pytest-timeout', @@ -102,6 +106,7 @@ 'weaviate-client~=3.3.0', 'annlite>=0.3.2', 'elasticsearch>=8.2.0', + 'redis>=4.3.4', 'jina', ], }, diff --git a/tests/unit/array/docker-compose.yml b/tests/unit/array/docker-compose.yml index b3b1a844256..1fda3e4cf20 100644 --- a/tests/unit/array/docker-compose.yml +++ b/tests/unit/array/docker-compose.yml @@ -26,7 +26,11 @@ services: - "9200:9200" networks: - elastic - + redis: + image: redis/redis-stack:latest + ports: + - "6379:6379" + networks: elastic: name: elastic \ No newline at end of file From f64ee39fa439b957b54330d34e959b2631b59da5 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 1 Aug 2022 22:27:43 +0800 Subject: [PATCH 15/93] fix: switch to absolute import --- docarray/array/storage/redis/backend.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 80e61f4e47f..f4cd770fb7f 100644 --- a/docarray/array/storage/redis/backend.py +++ 
b/docarray/array/storage/redis/backend.py @@ -2,14 +2,12 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np +from docarray import Document +from docarray.array.storage.base.backend import BaseBackendMixin, TypeMap +from docarray.helper import dataclass_from_dict from redis import Redis from redis.commands.search.field import NumericField, TextField, VectorField -from redis.exceptions import ResponseError - -from .... import Document -from ....helper import dataclass_from_dict -from ..base.backend import BaseBackendMixin, TypeMap if TYPE_CHECKING: from ....typing import ArrayType, DocumentArraySourceType From 94f7e73e291c2f5e413c6c797a2df5f74233c66f Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 1 Aug 2022 22:33:35 +0800 Subject: [PATCH 16/93] test: remove unnecessary tests --- .../unit/array/storage/redis/test_backend.py | 51 ------------------- 1 file changed, 51 deletions(-) diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index 2a8ff5abe82..fef4136b07e 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -138,54 +138,3 @@ def test_init_storage_update_schema(start_storage): def test_init_storage_empty_config(start_storage): with pytest.raises(ValueError): redis_da = DocumentArrayDummy(storage='redis') - - -@pytest.mark.parametrize( - 'id', - [ - ('abc'), - ('123'), - ], -) -def test_doc_id_exists(id, da_redis, start_storage): - da_redis._client.hset(id, mapping={'attr1': 1}) - assert da_redis._doc_id_exists(id) - - -@pytest.mark.parametrize( - 'array', - [ - ([1, 2, 3, 4, 5]), - ([1.1, 1.2, 1.3]), - ([1, 2.0, 3.0, 4]), - None, - ], -) -def test_map_embedding(array, start_storage): - if array is None: - cfg = RedisConfig(n_dim=3) - else: - cfg = RedisConfig(n_dim=len(array)) - - redis_da = DocumentArrayDummy(storage='redis', config=cfg) - embedding = redis_da._map_embedding(array) - 
class GetSetDelMixin(BaseGetSetDelMixin):
    """Provide concrete implementation for ``__getitem__``, ``__setitem__``,
    and ``__delitem__`` for ``DocumentArrayRedis``"""

    def _get_doc_by_id(self, _id: str) -> 'Document':
        """Concrete implementation of base class' ``_get_doc_by_id``

        :param _id: the id of the document
        :return: the retrieved document from redis
        :raises KeyError: if no document is stored under ``_id``
        """
        try:
            result = self._client.hgetall(_id.encode())
            # the serialized Document lives in the hash's 'blob' field
            doc = Document.from_base64(result[b'blob'])
            return doc
        except Exception as ex:
            # surface any client/deserialization failure uniformly as KeyError
            raise KeyError(_id) from ex

    def _set_doc_by_id(self, _id: str, value: 'Document'):
        """Concrete implementation of base class' ``_set_doc_by_id``

        :param _id: the id of doc to update
        :param value: the document to update to
        """
        if _id != value.id:
            # the document was assigned a new id: remove the stale key first
            self._del_doc_by_id(_id)

        payload = self._document_to_redis(value)
        # BUGFIX: store under the document's own id, not the (possibly stale)
        # lookup id, otherwise a renamed doc stays retrievable only under the
        # old key
        self._client.hset(value.id, mapping=payload)

    def _del_doc_by_id(self, _id: str):
        """Concrete implementation of base class' ``_del_doc_by_id``

        :param _id: the id of the document to delete
        """
        if self._doc_id_exists(_id):
            self._client.delete(_id)

    def _document_to_redis(self, doc: 'Document') -> Dict:
        """Serialize ``doc`` into the flat field mapping stored in a redis hash.

        The mapping always contains the binary ``embedding`` and the base64
        ``blob``; configured columns, tag indices and ``text`` are added only
        when present on the document.

        :param doc: the document to serialize
        :return: a dict suitable for ``HSET`` ``mapping=``
        """
        extra_columns = {
            col: doc.tags.get(col)
            for col, _ in self._config.columns
            if doc.tags.get(col) is not None
        }
        payload = {
            'embedding': self._map_embedding(doc.embedding),
            'blob': doc.to_base64(),
            **extra_columns,
        }

        if self._config.tag_indices:
            for index in self._config.tag_indices:
                if doc.tags.get(index) is not None:
                    payload[index] = doc.tags.get(index)

        if doc.text:
            payload['text'] = doc.text
        return payload

    def _load_offset2ids(self):
        """Restore the offset-to-id mapping from its redis meta key."""
        ids = self._get_offset2ids_meta()
        self._offset2ids = Offset2ID(ids)

    def _save_offset2ids(self):
        """Persist the offset-to-id mapping to its redis meta key."""
        self._update_offset2ids_meta()

    def _clear_storage(self):
        """Drop every key in the current redis database.

        NOTE(review): ``flushdb`` wipes the whole db, not only this
        DocumentArray's keys — confirm the db is dedicated to this array.
        """
        self._client.flushdb()
+ + +class DocumentArrayDummy(StorageMixins, DocumentArray): + def __new__(cls, *args, **kwargs): + return super().__new__(cls) + + def _load_offset2ids(self): + pass + + def _save_offset2ids(self): + pass + + +@pytest.fixture(scope='function') +def tag_indices(): + tag_indices = ['tag_1', 'tag_2'] + return tag_indices + + +@pytest.fixture(scope='function') +def columns(): + columns = [ + ('col_str', 'str'), + ('col_bytes', 'bytes'), + ('col_int', 'int'), + ('col_float', 'float'), + ('col_long', 'long'), + ('col_double', 'double'), + ] + return columns + + +@pytest.fixture(scope='function') +def da_redis(tag_indices, columns): + cfg = RedisConfig(n_dim=3, flush=True, tag_indices=tag_indices, columns=columns) + da_redis = DocumentArrayDummy(storage='redis', config=cfg) + return da_redis + + +@pytest.mark.parametrize( + 'embedding', [[1, 2, 3], [1.0, 2.0, 3.0], [1, 2, 3, 4, 5], None] +) +@pytest.mark.parametrize('text', ['test_text', None]) +@pytest.mark.parametrize( + 'tag', + [ + {'tag_1': 'tag1'}, + {'tag_1': 'tag1', 'tag_2': 'tag2'}, + {'tag_1': 'tag1', 'tag_2': 'tag2', 'tag_3': 'tag3'}, + None, + ], +) +@pytest.mark.parametrize( + 'col', + [ + {'col_str': 'hello', 'col_bytes': b'world'}, + {'col_int': 1, 'col_float': 1.0}, + {'col_long': 123, 'col_double': 1.1}, + None, + ], +) +def test_document_to_embedding( + embedding, text, tag, col, da_redis, columns, tag_indices, start_storage +): + tags = {} + if tag is not None: + tags.update(tag) + if col is not None: + tags.update(col) + doc = Document(embedding=embedding, text=text, tags=tags) + payload = da_redis._document_to_redis(doc) + + if embedding is None: + assert np.allclose( + np.frombuffer(payload['embedding'], dtype=np.float32), np.zeros((3)) + ) + else: + assert np.allclose( + np.frombuffer(payload['embedding'], dtype=np.float32), np.array(embedding) + ) + + if text is None: + with pytest.raises(KeyError): + payload['text'] + else: + assert payload['text'] == text + + for col, _ in columns: + if col in 
tags: + assert payload[col] == tags[col] + else: + with pytest.raises(KeyError): + payload[col] + + for tag in tag_indices: + if tag in tags: + assert payload[tag] == tags[tag] + else: + with pytest.raises(KeyError): + payload[tag] + + for key in tags: + if (key not in tag_indices) and (key not in (col[0] for col in columns)): + assert key not in payload + + +@pytest.mark.parametrize( + 'doc', + [ + Document(id='0'), + Document(id='1', text='hello world'), + Document(id='2', embedding=[1, 2, 3], tags={'tag_1': 'tag1', 'tag_2': 'tag2'}), + Document( + text='hello world', + embedding=[1, 2, 3], + tags={'tag_1': 'tag1', 'tag_2': 'tag2'}, + chunks=[Document(text='token1'), Document(text='token2')], + ), + ], +) +def test_setgetdel_doc_by_id(doc, da_redis, start_storage): + da_redis._set_doc_by_id(doc.id, doc) + doc_get = da_redis._get_doc_by_id(doc.id) + assert doc == doc_get + + da_redis._del_doc_by_id(doc.id) + with pytest.raises(KeyError): + da_redis._get_doc_by_id(doc.id) + + +def test_clear_storage(da_redis, start_storage): + for i in range(3): + doc = Document(id=str(i)) + da_redis._set_doc_by_id(str(i), doc) + + da_redis._clear_storage() + + for i in range(3): + with pytest.raises(KeyError): + da_redis._get_doc_by_id(i) + + +def test_offset2ids(da_redis, start_storage): + ids = [str(i) for i in range(3)] + for id in ids: + doc = Document(id=id) + da_redis._set_doc_by_id(id, doc) + da_redis._offset2ids = Offset2ID(ids) + da_redis._save_offset2ids() + da_redis._load_offset2ids() + assert da_redis._offset2ids.ids == ids From f23b73a4f708dde7571017f25dd37979b9f6dfc1 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 6 Aug 2022 18:39:33 +0800 Subject: [PATCH 18/93] fix: backend call super init --- docarray/array/storage/redis/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index f4cd770fb7f..0965c10e915 100644 --- a/docarray/array/storage/redis/backend.py +++ 
b/docarray/array/storage/redis/backend.py @@ -75,7 +75,7 @@ def _init_storage( self._config.columns = self._normalize_columns(self._config.columns) self._client = self._build_client() - super()._init_storage(_docs, config, **kwargs) + super()._init_storage() if _docs is None: return From 15d5ff65d7a7073ad0b01511eaeac02c6c8f96a3 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 6 Aug 2022 18:41:07 +0800 Subject: [PATCH 19/93] fix: redis _set_doc_by_id --- docarray/array/storage/redis/getsetdel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index 666f3dc88e2..d58152aed5f 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -32,7 +32,7 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): self._del_doc_by_id(_id) payload = self._document_to_redis(value) - self._client.hset(_id, mapping=payload) + self._client.hset(value.id, mapping=payload) def _del_doc_by_id(self, _id: str): """Concrete implementation of base class' ``_del_doc_by_id`` From 5369de0add61361ab856f6356b0d890609b7f463 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 6 Aug 2022 19:28:16 +0800 Subject: [PATCH 20/93] feat: add redis seqlike --- docarray/array/storage/redis/seqlike.py | 80 +++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 docarray/array/storage/redis/seqlike.py diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py new file mode 100644 index 00000000000..1a216710981 --- /dev/null +++ b/docarray/array/storage/redis/seqlike.py @@ -0,0 +1,80 @@ +from typing import Iterable, Union + +from .... 
class SequenceLikeMixin(BaseSequenceLikeMixin):
    """Implement sequence-like methods for DocumentArray with Redis as storage"""

    def __eq__(self, other):
        """Compare this object to the other, returns True if and only if other
        as the same type as self and other has the same meta information

        :param other: the other object to check for equality
        :return: ``True`` if other is equal to self
        """
        # two DA are considered as the same if they have the same client meta data
        return (
            type(self) is type(other)
            and self._client.client_info() == other._client.client_info()
            and self._config == other._config
        )

    def __len__(self):
        """Return the length of :class:`DocumentArray` that uses Redis as storage

        :return: the length of this :class:`DocumentArrayRedis` object
        """
        try:
            keys = self._client.keys()
            # the offset2id bookkeeping key is not a document
            if b'offset2id' in keys:
                return len(keys) - 1
            else:
                return len(keys)
        # narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; any client failure is treated as "empty"
        except Exception:
            return 0

    def __contains__(self, x: Union[str, 'Document']):
        """Check if ``x`` is contained in this :class:`DocumentArray` with Redis storage

        :param x: the id of the document to check or the document object itself
        :return: True if ``x`` is contained in self
        """
        if isinstance(x, str):
            return self._doc_id_exists(x)
        elif isinstance(x, Document):
            return self._doc_id_exists(x.id)
        else:
            return False

    # TODO this del is unreachable, del will call __del__ in base/getsetdel
    # def __del__(self):
    #     """Delete this :class:`DocumentArrayRedis` object"""
    #     self._offset2ids.clear()

    def __repr__(self):
        """Return the string representation of :class:`DocumentArrayRedis` object
        :return: string representation of this object
        """
        # NOTE(review): the repr text appears to have been lost (empty
        # f-string, likely stripped angle-bracket markup) — restore a
        # meaningful representation; TODO confirm intended format
        return f''

    def _upload_batch(self, docs: Iterable['Document']):
        """Write ``docs`` to redis through a pipeline, flushing every
        ``batch_size`` documents to bound pipeline memory.

        :param docs: the documents to store
        """
        # BUGFIX: removed leftover debug ``print(...)`` of the upload timestamp
        pipe = self._client.pipeline()
        batch = 0
        for doc in docs:
            payload = self._document_to_redis(doc)
            pipe.hset(doc.id, mapping=payload)
            batch += 1
            if batch >= self._config.batch_size:
                pipe.execute()
                batch = 0
        if batch > 0:
            pipe.execute()

    def extend(self, docs: Iterable['Document']):
        """Append all ``docs`` to this array and record their ids in order.

        :param docs: the documents to append
        """
        # materialize once: the iterable is consumed twice below
        docs = list(docs)
        self._upload_batch(docs)
        self._offset2ids.extend([doc.id for doc in docs])
ElasticConfig(n_dim=1)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=1, flush=True)), ], ) def test_insert(da_cls, config, start_storage): @@ -48,6 +49,7 @@ def test_insert(da_cls, config, start_storage): (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=1)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=1)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=1)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=1, flush=True)), ], ) def test_append_extend(da_cls, config, start_storage): @@ -81,6 +83,7 @@ def update_config_inplace(config, tmpdir, tmpfile): ('weaviate', {'n_dim': 3, 'name': 'Weaviate'}), ('qdrant', {'n_dim': 3, 'collection_name': 'qdrant'}), ('elasticsearch', {'n_dim': 3, 'index_name': 'elasticsearch'}), + ('redis', {'n_dim': 3, 'flush': True}), ], ) def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfile): @@ -96,6 +99,8 @@ def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfi assert len(da) == 2 assert len(da._offset2ids.ids) == 2 + if storage == 'redis': + config['flush'] = False da2 = DocumentArray(storage=storage, config=config) assert len(da2) == 2 From 56aab5d8ed169403d9edfb8b8417d2e1a11eb97e Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:12:05 +0800 Subject: [PATCH 22/93] feat: add redis storage subclass and entrypoint --- docarray/array/document.py | 16 ++++++++++++++++ docarray/array/redis.py | 19 +++++++++++++++++++ docarray/array/storage/redis/__init__.py | 12 ++++++++++++ 3 files changed, 47 insertions(+) create mode 100644 docarray/array/redis.py create mode 100644 docarray/array/storage/redis/__init__.py diff --git a/docarray/array/document.py b/docarray/array/document.py index 100172b770f..a5cdf25f98b 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -10,10 +10,12 @@ from .annlite import DocumentArrayAnnlite from .weaviate import DocumentArrayWeaviate from .elastic import DocumentArrayElastic + from .redis import 
class DocumentArrayRedis(StorageMixins, DocumentArray):
    """A :class:`DocumentArray` that stores documents in Redis and uses it
    as the vector search engine.
    """

    def __new__(cls, *args, **kwargs):
        """``__new__`` method for :class:`DocumentArrayRedis`

        :param *args: list of args to instantiate the object
        :param **kwargs: dict of args to instantiate the object
        :return: the instantiated :class:`DocumentArrayRedis` object
        """
        # allocation only; configuration happens later in __init__
        instance = super().__new__(cls)
        return instance
class FindMixin(BaseFindMixin):
    """Provide vector-similarity and filter search on top of redis FT."""

    def _find_similar_vectors(
        self, query: 'RedisArrayType', filter: Optional[Dict] = None, limit=10
    ):
        """Run a KNN search against the ``embedding`` field.

        :param query: the query vector
        :param filter: optional numeric filter narrowing the candidates
        :param limit: maximum number of results
        :return: a :class:`DocumentArray` of matches with ``scores['score']`` set
        """
        q = (
            Query("*=>[KNN " + str(limit) + " @embedding $vec AS vector_score]")
            .sort_by('vector_score')
            .dialect(2)
        )

        query_params = {"vec": to_numpy_array(query).astype(np.float32).tobytes()}
        if filter:
            f = self._build_filter(filter)
            q.add_filter(f)
        results = self._client.ft().search(q, query_params).docs

        da = DocumentArray()
        for res in results:
            doc = Document.from_base64(res.blob.encode())
            doc.scores['score'] = NamedScore(value=res.vector_score)
            da.append(doc)
        return da

    def _find(
        self,
        query: 'RedisArrayType',
        limit: int = 10,
        filter: Optional[Dict] = None,
        **kwargs
    ) -> List['DocumentArray']:
        """Search with one or more query vectors.

        :param query: a single vector or a batch of vectors
        :param limit: maximum number of results per query
        :param filter: optional numeric filter
        :return: one :class:`DocumentArray` per query vector
        """
        query = np.array(query)
        num_rows, n_dim = ndarray.get_array_rows(query)
        if n_dim != 2:
            # normalize a single vector into a batch of one
            query = query.reshape((num_rows, -1))

        return [
            self._find_similar_vectors(q, filter=filter, limit=limit) for q in query
        ]

    def _find_with_filter(self, filter: Dict, limit: Optional[Union[int, float]] = 20):
        """Return documents matching ``filter`` only (no vector search).

        NOTE(review): implicitly returns ``None`` when ``filter`` is falsy —
        confirm callers always pass a non-empty filter.

        :param filter: the filter spec (``key``/``operator``/``value``)
        :param limit: maximum number of results
        :return: a :class:`DocumentArray` of matches
        """
        if filter:
            s = self._build_query_str(filter)
            q = Query(s)
            q.paging(0, limit)

            results = self._client.ft().search(q).docs

            da = DocumentArray()
            for res in results:
                doc = Document.from_base64(res.blob.encode())
                da.append(doc)
            return da

    def _filter(
        self, filter: Dict, limit: Optional[Union[int, float]] = 20
    ) -> 'DocumentArray':
        """Public filter entry point; delegates to :meth:`_find_with_filter`."""
        return self._find_with_filter(filter, limit=limit)

    # TODO return NumericFilter or List[NumericFilter]
    def _build_filter(self, filter: Dict) -> NumericFilter:
        """Translate a comparison filter spec into a redis ``NumericFilter``.

        :param filter: dict with ``key``, ``operator`` (gt/gte/lt/lte), ``value``
        :return: the equivalent :class:`NumericFilter`
        :raises ValueError: if ``operator`` is not one of gt/gte/lt/lte
        """
        INF = "+inf"
        NEG_INF = "-inf"

        if filter['operator'] == 'gt':
            f = NumericFilter(filter['key'], filter['value'], INF, minExclusive=True)
        elif filter['operator'] == 'gte':
            f = NumericFilter(filter['key'], filter['value'], INF)
        elif filter['operator'] == 'lt':
            f = NumericFilter(
                filter['key'], NEG_INF, filter['value'], maxExclusive=True
            )
        elif filter['operator'] == 'lte':
            f = NumericFilter(filter['key'], NEG_INF, filter['value'])
        else:
            # BUGFIX: previously fell through and raised UnboundLocalError
            raise ValueError(f"operator {filter['operator']!r} not supported")

        return f

    # backward-compatible alias for the original (misspelled) helper name
    _build_fiter = _build_filter

    def _build_query_str(self, filter: Dict) -> str:
        """Translate a comparison filter spec into FT query-string syntax.

        :param filter: dict with ``key``, ``operator`` (gt/gte/lt/lte), ``value``
        :return: an ``@field:[min max]`` range clause
        :raises ValueError: if ``operator`` is not one of gt/gte/lt/lte
        """
        INF = "+inf"
        NEG_INF = "-inf"

        if filter['operator'] == 'gt':
            s = "@{}:[({} {}]".format(filter['key'], filter['value'], INF)
        elif filter['operator'] == 'gte':
            s = "@{}:[{} {}]".format(filter['key'], filter['value'], INF)
        elif filter['operator'] == 'lt':
            s = "@{}:[{} ({}]".format(filter['key'], NEG_INF, filter['value'])
        elif filter['operator'] == 'lte':
            s = "@{}:[{} {}]".format(filter['key'], NEG_INF, filter['value'])
        else:
            # BUGFIX: previously fell through and raised UnboundLocalError
            raise ValueError(f"operator {filter['operator']!r} not supported")

        return s
flush=True)), ], ) @pytest.mark.parametrize('is_copy', [True, False]) @@ -97,6 +100,7 @@ def test_docarray_copy_singleton(da_cls, config, is_copy, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) @pytest.mark.parametrize('is_copy', [True, False]) @@ -125,6 +129,7 @@ def test_docarray_copy_da(da_cls, config, is_copy, start_storage): (DocumentArrayAnnlite, AnnliteConfig(n_dim=1)), (DocumentArrayQdrant, QdrantConfig(n_dim=1)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) @pytest.mark.parametrize('is_copy', [True, False]) From f918fcbc742bb13a52dc0a596242cbc4e1da68f6 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:33:29 +0800 Subject: [PATCH 25/93] test: add redis to test_pull_out --- tests/unit/array/test_pull_out.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/array/test_pull_out.py b/tests/unit/array/test_pull_out.py index fc3a194a7db..76a3b5b1c69 100644 --- a/tests/unit/array/test_pull_out.py +++ b/tests/unit/array/test_pull_out.py @@ -22,6 +22,7 @@ def docs(): ('annlite', {'n_dim': 2}), ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), + ('redis', {'n_dim': 2, 'flush': True}), ], ) def test_update_embedding(docs, storage, config, start_storage): @@ -56,6 +57,7 @@ def test_update_embedding(docs, storage, config, start_storage): ('annlite', {'n_dim': 2}), ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), + ('redis', {'n_dim': 2, 'flush': True}), ], ) def test_update_doc_embedding(docs, storage, config, start_storage): @@ -90,6 +92,7 @@ def test_update_doc_embedding(docs, storage, config, start_storage): ('annlite', {'n_dim': 2}), ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), + ('redis', {'n_dim': 2, 'flush': True}), ], ) def 
test_batch_update_embedding(docs, storage, config, start_storage): @@ -122,6 +125,7 @@ def test_batch_update_embedding(docs, storage, config, start_storage): ('annlite', {'n_dim': 2}), ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), + ('redis', {'n_dim': 2, 'flush': True}), ], ) def test_batch_update_doc_embedding(docs, storage, config, start_storage): @@ -156,6 +160,7 @@ def test_batch_update_doc_embedding(docs, storage, config, start_storage): ('annlite', {'n_dim': 2}), ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), + ('redis', {'n_dim': 2, 'flush': True}), ], ) def test_update_id(docs, storage, config, start_storage): @@ -177,6 +182,7 @@ def test_update_id(docs, storage, config, start_storage): ('annlite', {'n_dim': 2}), ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), + ('redis', {'n_dim': 2, 'flush': True}), ], ) def test_update_doc_id(docs, storage, config, start_storage): @@ -197,6 +203,7 @@ def test_update_doc_id(docs, storage, config, start_storage): ('annlite', {'n_dim': 2}), ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), + ('redis', {'n_dim': 2, 'flush': True}), ], ) def test_batch_update_id(docs, storage, config, start_storage): @@ -220,6 +227,7 @@ def test_batch_update_id(docs, storage, config, start_storage): ('annlite', {'n_dim': 2}), ('qdrant', {'n_dim': 2}), ('elasticsearch', {'n_dim': 2}), + ('redis', {'n_dim': 2, 'flush': True}), ], ) def test_batch_update_doc_id(docs, storage, config, start_storage): From e29061db62c2f7871d0d6273325d7ad3cd454aff Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:37:50 +0800 Subject: [PATCH 26/93] test: add redis to test_content --- tests/unit/array/mixins/test_content.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/array/mixins/test_content.py b/tests/unit/array/mixins/test_content.py index 1a590211dc2..cf8e78a46fe 100644 --- a/tests/unit/array/mixins/test_content.py +++ b/tests/unit/array/mixins/test_content.py @@ -9,6 +9,7 @@ from 
docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig @pytest.mark.parametrize( @@ -20,6 +21,7 @@ DocumentArrayWeaviate, DocumentArrayQdrant, DocumentArrayElastic, + DocumentArrayRedis, ], ) @pytest.mark.parametrize( @@ -31,6 +33,7 @@ def test_content_empty_getter_return_none(cls, content_attr, start_storage): DocumentArrayWeaviate, DocumentArrayQdrant, DocumentArrayElastic, + DocumentArrayRedis, ]: da = cls(config={'n_dim': 3}) else: @@ -47,6 +50,7 @@ def test_content_empty_getter_return_none(cls, content_attr, start_storage): DocumentArrayWeaviate, DocumentArrayQdrant, DocumentArrayElastic, + DocumentArrayRedis, ], ) @pytest.mark.parametrize( @@ -65,6 +69,7 @@ def test_content_empty_setter(cls, content_attr, start_storage): DocumentArrayWeaviate, DocumentArrayQdrant, DocumentArrayElastic, + DocumentArrayRedis, ]: da = cls(config={'n_dim': 3}) else: @@ -82,6 +87,7 @@ def test_content_empty_setter(cls, content_attr, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) @pytest.mark.parametrize( @@ -116,6 +122,7 @@ def test_content_getter_setter(cls, content_attr, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_content_empty(da_len, da_cls, config, start_storage): @@ -153,6 +160,7 @@ def test_content_empty(da_len, da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=5)), (DocumentArrayQdrant, QdrantConfig(n_dim=5)), (DocumentArrayElastic, ElasticConfig(n_dim=5)), + 
(DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_embeddings_setter(da_len, da_cls, config, start_storage): From bcc6ae37fdee3d1b5de6c7f243726d5fab91813e Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:40:14 +0800 Subject: [PATCH 27/93] test: add redis to test_embed --- tests/unit/array/mixins/test_embed.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/array/mixins/test_embed.py b/tests/unit/array/mixins/test_embed.py index 31dfef5bd34..dc1a20c5d98 100644 --- a/tests/unit/array/mixins/test_embed.py +++ b/tests/unit/array/mixins/test_embed.py @@ -22,6 +22,7 @@ from docarray.array.sqlite import DocumentArraySqlite from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic +from docarray.array.redis import DocumentArrayRedis random_embed_models = { 'keras': lambda: tf.keras.Sequential( @@ -74,6 +75,7 @@ DocumentArrayQdrant, # DocumentArrayWeaviate, TODO: enable this DocumentArrayElastic, + DocumentArrayRedis, ], ) @pytest.mark.parametrize('N', [2, 10]) @@ -96,6 +98,10 @@ def test_embedding_on_random_network( DocumentArrayElastic, ]: da = da_cls.empty(N, config={'n_dim': embedding_shape}) + elif da_cls in [ + DocumentArrayRedis, + ]: + da = da_cls.empty(N, config={'n_dim': embedding_shape, 'flush': True}) else: da = da_cls.empty(N, config=None) da.tensors = np.random.random([N, *input_shape]).astype(np.float32) From bf84282c65e93a0003bd6f53631b0879380b5fb2 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:41:01 +0800 Subject: [PATCH 28/93] test: add redis to test_empty --- tests/unit/array/mixins/test_empty.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/array/mixins/test_empty.py b/tests/unit/array/mixins/test_empty.py index 3301bb59485..0ba3da06e93 100644 --- a/tests/unit/array/mixins/test_empty.py +++ b/tests/unit/array/mixins/test_empty.py @@ -8,6 +8,7 @@ from docarray.array.storage.weaviate import WeaviateConfig from 
docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig @pytest.mark.parametrize( @@ -19,6 +20,7 @@ (DocumentArrayWeaviate, WeaviateConfig(n_dim=5)), (DocumentArrayQdrant, QdrantConfig(n_dim=5)), (DocumentArrayElastic, ElasticConfig(n_dim=5)), + (DocumentArrayRedis, RedisConfig(n_dim=5)), ], ) def test_empty_non_zero(da_cls, config, start_storage): From 03580834b839c2a6449b6cff5fc56d21951503c3 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:42:42 +0800 Subject: [PATCH 29/93] test: add redis test_eval_class --- tests/unit/array/mixins/test_eval_class.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/unit/array/mixins/test_eval_class.py b/tests/unit/array/mixins/test_eval_class.py index a8f9255e678..8467f6d4ede 100644 --- a/tests/unit/array/mixins/test_eval_class.py +++ b/tests/unit/array/mixins/test_eval_class.py @@ -15,6 +15,7 @@ ('annlite', {'n_dim': 256}), ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), + ('redis', {'n_dim': 256, 'flush': True}), ], ) @pytest.mark.parametrize( @@ -51,6 +52,7 @@ def test_eval_mixin_perfect_match(metric_fn, kwargs, storage, config, start_stor ('annlite', {'n_dim': 256}), ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), + ('redis', {'n_dim': 256, 'flush': True}), ], ) @pytest.mark.parametrize( @@ -94,6 +96,7 @@ def test_eval_mixin_zero_match(storage, config, metric_fn, start_storage, kwargs ('annlite', {'n_dim': 256}), ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), + ('redis', {'n_dim': 256, 'flush': True}), ], ) def test_diff_len_should_raise(storage, config, start_storage): @@ -112,6 +115,7 @@ def test_diff_len_should_raise(storage, config, start_storage): ('annlite', {'n_dim': 256}), ('qdrant', {'n_dim': 256}), ('elasticsearch', {'n_dim': 256}), + ('redis', {'n_dim': 256, 'flush': True}), ], ) def 
test_diff_hash_fun_should_raise(storage, config, start_storage): @@ -130,6 +134,7 @@ def test_diff_hash_fun_should_raise(storage, config, start_storage): ('annlite', {'n_dim': 3}), ('qdrant', {'n_dim': 3}), ('elasticsearch', {'n_dim': 3}), + ('redis', {'n_dim': 3, 'flush': True}), ], ) def test_same_hash_same_len_fun_should_work(storage, config, start_storage): @@ -158,6 +163,7 @@ def test_same_hash_same_len_fun_should_work(storage, config, start_storage): ('annlite', {'n_dim': 3}), ('qdrant', {'n_dim': 3}), ('elasticsearch', {'n_dim': 3}), + ('redis', {'n_dim': 3, 'flush': True}), ], ) def test_adding_noise(storage, config, start_storage): @@ -188,6 +194,7 @@ def test_adding_noise(storage, config, start_storage): ('annlite', {'n_dim': 128}), ('qdrant', {'n_dim': 128}), ('elasticsearch', {'n_dim': 128}), + ('redis', {'n_dim': 128, 'flush': True}), ], ) @pytest.mark.parametrize( From 926ecc56bd3045b3da7af8593e13878e7ea35e37 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:45:35 +0800 Subject: [PATCH 30/93] test: add redis to test_getset --- tests/unit/array/mixins/test_getset.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index 871fa84114a..da0c143f0dd 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -13,6 +13,7 @@ from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig from tests import random_docs rand_array = np.random.random([10, 3]) @@ -42,6 +43,7 @@ def nested_docs(): ('weaviate', {'n_dim': 3}), ('qdrant', {'n_dim': 3}), ('elasticsearch', {'n_dim': 3}), + ('redis', {'n_dim': 3, 'flush': True}), ], ) @pytest.mark.parametrize( @@ -67,6 +69,7 @@ def test_set_embeddings_multi_kind(array, storage, config, 
start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_da_get_embeddings(docs, config, da_cls, start_storage): @@ -88,6 +91,7 @@ def test_da_get_embeddings(docs, config, da_cls, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_embeddings_setter_da(docs, config, da_cls, start_storage): @@ -118,6 +122,7 @@ def test_embeddings_setter_da(docs, config, da_cls, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_embeddings_wrong_len(docs, config, da_cls, start_storage): @@ -141,6 +146,7 @@ def test_embeddings_wrong_len(docs, config, da_cls, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_tensors_getter_da(docs, config, da_cls, start_storage): @@ -167,6 +173,7 @@ def test_tensors_getter_da(docs, config, da_cls, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_texts_getter_da(docs, config, da_cls, start_storage): @@ -202,6 +209,7 @@ def test_texts_getter_da(docs, config, da_cls, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, 
RedisConfig(n_dim=10, flush=True)), ], ) def test_setter_by_sequences_in_selected_docs_da(docs, config, da_cls, start_storage): @@ -239,6 +247,7 @@ def test_setter_by_sequences_in_selected_docs_da(docs, config, da_cls, start_sto (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_texts_wrong_len(docs, config, da_cls, start_storage): @@ -262,6 +271,7 @@ def test_texts_wrong_len(docs, config, da_cls, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_tensors_wrong_len(docs, config, da_cls, start_storage): @@ -285,6 +295,7 @@ def test_tensors_wrong_len(docs, config, da_cls, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_blobs_getter_setter(docs, da_cls, config, start_storage): @@ -317,6 +328,7 @@ def test_blobs_getter_setter(docs, da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_ellipsis_getter(nested_docs, da_cls, config, start_storage): @@ -340,6 +352,7 @@ def test_ellipsis_getter(nested_docs, da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_ellipsis_attribute_setter(nested_docs, da_cls, config, start_storage): @@ -360,6 +373,7 @@ def 
test_ellipsis_attribute_setter(nested_docs, da_cls, config, start_storage): (DocumentArrayAnnlite, AnnliteConfig(n_dim=6)), (DocumentArrayWeaviate, WeaviateConfig(n_dim=6)), (DocumentArrayElastic, ElasticConfig(n_dim=6)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_zero_embeddings(da_cls, config, start_storage): From 897465baaf80eaaccbc9ba999a43f5f37e230bd8 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:48:05 +0800 Subject: [PATCH 31/93] test: add redis to test_magic --- tests/unit/array/mixins/test_magic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unit/array/mixins/test_magic.py b/tests/unit/array/mixins/test_magic.py index 70a11979839..0b7f21b6ba4 100644 --- a/tests/unit/array/mixins/test_magic.py +++ b/tests/unit/array/mixins/test_magic.py @@ -8,6 +8,7 @@ from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig N = 100 @@ -32,6 +33,7 @@ def docs(): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=1, flush=True)), ], ) def test_iter_len_bool(da_cls, config, start_storage): @@ -58,6 +60,7 @@ def test_iter_len_bool(da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_repr(da_cls, config, start_storage): @@ -77,6 +80,7 @@ def test_repr(da_cls, config, start_storage): ('weaviate', WeaviateConfig(n_dim=128)), ('qdrant', QdrantConfig(n_dim=128)), ('elasticsearch', ElasticConfig(n_dim=128)), + ('redis', RedisConfig(n_dim=128)), ], ) def test_repr_str(docs, storage, config, 
start_storage): @@ -100,6 +104,7 @@ def test_repr_str(docs, storage, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_iadd(da_cls, config, start_storage): From 15c1c1716fc3a45134a3e73cbbd00b222e520920 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:50:45 +0800 Subject: [PATCH 32/93] test: add redis to test_parallel --- tests/unit/array/mixins/test_parallel.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/array/mixins/test_parallel.py b/tests/unit/array/mixins/test_parallel.py index e929dbb9479..52901c977d7 100644 --- a/tests/unit/array/mixins/test_parallel.py +++ b/tests/unit/array/mixins/test_parallel.py @@ -12,6 +12,7 @@ from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig def foo(d: Document): @@ -52,6 +53,7 @@ def test_parallel_map_apply_external_pool(pytestconfig, pool): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) @pytest.mark.parametrize('backend', ['process', 'thread']) @@ -108,6 +110,7 @@ def test_parallel_map( (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) @pytest.mark.parametrize('backend', ['thread']) @@ -179,6 +182,7 @@ def test_parallel_map_batch( (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + 
(DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_map_lambda(pytestconfig, da_cls, config, start_storage): @@ -207,6 +211,7 @@ def test_map_lambda(pytestconfig, da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, QdrantConfig(n_dim=10)), (DocumentArrayElastic, ElasticConfig(n_dim=10)), + (DocumentArrayRedis, RedisConfig(n_dim=10, flush=True)), ], ) def test_apply_partial(pytestconfig, da_cls, config, start_storage): @@ -236,6 +241,7 @@ def test_apply_partial(pytestconfig, da_cls, config, start_storage): ('weaviate', WeaviateConfig(n_dim=256)), ('qdrant', QdrantConfig(n_dim=256)), ('elasticsearch', ElasticConfig(n_dim=256)), + ('redis', RedisConfig(n_dim=256, flush=True)), ], ) @pytest.mark.parametrize('backend', ['thread', 'process']) From bdf552908667d56010eb0b6bfaec7c4e4c272a20 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:52:26 +0800 Subject: [PATCH 33/93] test: add redis to test_sample --- tests/unit/array/mixins/test_sample.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unit/array/mixins/test_sample.py b/tests/unit/array/mixins/test_sample.py index 2a01c1bfd01..5844db56afc 100644 --- a/tests/unit/array/mixins/test_sample.py +++ b/tests/unit/array/mixins/test_sample.py @@ -8,6 +8,7 @@ from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig @pytest.mark.parametrize( @@ -19,6 +20,7 @@ (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_sample(da_cls, config, start_storage): @@ -44,6 +46,7 @@ def test_sample(da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), 
(DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_sample_with_seed(da_cls, config, start_storage): @@ -68,6 +71,7 @@ def test_sample_with_seed(da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_shuffle(da_cls, config, start_storage): @@ -93,6 +97,7 @@ def test_shuffle(da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_shuffle_with_seed(da_cls, config, start_storage): From f827c2beb0183706f418bd437cdd592498c19b86 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:53:25 +0800 Subject: [PATCH 34/93] test: add redis to test_text --- tests/unit/array/mixins/test_text.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/unit/array/mixins/test_text.py b/tests/unit/array/mixins/test_text.py index 0f5ace32655..d047181410b 100644 --- a/tests/unit/array/mixins/test_text.py +++ b/tests/unit/array/mixins/test_text.py @@ -9,6 +9,7 @@ from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig @pytest.fixture(scope='function') @@ -30,6 +31,7 @@ def docs(): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_da_vocabulary(da_cls, config, docs, 
min_freq, start_storage): @@ -58,6 +60,7 @@ def test_da_vocabulary(da_cls, config, docs, min_freq, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_da_text_to_tensor_non_max_len(docs, da_cls, config, start_storage): @@ -86,6 +89,7 @@ def test_da_text_to_tensor_non_max_len(docs, da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_da_text_to_tensor_max_len_3(docs, da_cls, config, start_storage): @@ -116,6 +120,7 @@ def test_da_text_to_tensor_max_len_3(docs, da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_da_text_to_tensor_max_len_1(docs, da_cls, config, start_storage): @@ -140,12 +145,13 @@ def test_da_text_to_tensor_max_len_1(docs, da_cls, config, start_storage): @pytest.mark.parametrize( 'da_cls,config', [ - (DocumentArray, None), - (DocumentArraySqlite, None), - (DocumentArrayAnnlite, AnnliteConfig(n_dim=128)), - (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), - (DocumentArrayQdrant, QdrantConfig(n_dim=128)), - (DocumentArrayElastic, ElasticConfig(n_dim=128)), + # (DocumentArray, None), + # (DocumentArraySqlite, None), + # (DocumentArrayAnnlite, AnnliteConfig(n_dim=128)), + # (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), + # (DocumentArrayQdrant, QdrantConfig(n_dim=128)), + # (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_convert_text_tensor_random_text(da_cls, docs, config, 
start_storage): From 4626ffd19f73228efe0e2e08fd0612733bae6692 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 21:56:56 +0800 Subject: [PATCH 35/93] test: add redis to test_traverse --- tests/unit/array/mixins/test_traverse.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/unit/array/mixins/test_traverse.py b/tests/unit/array/mixins/test_traverse.py index 0a128551de2..af8a75c296e 100644 --- a/tests/unit/array/mixins/test_traverse.py +++ b/tests/unit/array/mixins/test_traverse.py @@ -9,6 +9,7 @@ from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.annlite import DocumentArrayAnnlite from docarray.array.elastic import DocumentArrayElastic +from docarray.array.redis import DocumentArrayRedis from tests import random_docs # some random prime number for sanity check @@ -42,6 +43,7 @@ def doc_req(): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_type(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -61,6 +63,7 @@ def test_traverse_type(doc_req, filter_fn, da_cls, kwargs, start_storage): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_root(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -80,6 +83,7 @@ def test_traverse_root(doc_req, filter_fn, da_cls, kwargs, start_storage): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -99,6 +103,7 @@ def 
test_traverse_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_root_plus_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -119,6 +124,7 @@ def test_traverse_root_plus_chunk(doc_req, filter_fn, da_cls, kwargs, start_stor (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_chunk_plus_root(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -139,6 +145,7 @@ def test_traverse_chunk_plus_root(doc_req, filter_fn, da_cls, kwargs, start_stor (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_match(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -158,6 +165,7 @@ def test_traverse_match(doc_req, filter_fn, da_cls, kwargs, start_storage): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -177,6 +185,7 @@ def test_traverse_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage) (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_root_match_chunk(doc_req, filter_fn, da_cls, kwargs, 
start_storage): @@ -195,6 +204,7 @@ def test_traverse_root_match_chunk(doc_req, filter_fn, da_cls, kwargs, start_sto (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flatten_embedding(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -214,6 +224,7 @@ def test_traverse_flatten_embedding(doc_req, filter_fn, da_cls, kwargs, start_st (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flatten_root(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -232,6 +243,7 @@ def test_traverse_flatten_root(doc_req, filter_fn, da_cls, kwargs, start_storage (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flatten_chunk(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -250,6 +262,7 @@ def test_traverse_flatten_chunk(doc_req, filter_fn, da_cls, kwargs, start_storag (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flatten_root_plus_chunk( @@ -270,6 +283,7 @@ def test_traverse_flatten_root_plus_chunk( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flatten_match(doc_req, filter_fn, da_cls, kwargs, start_storage): @@ -288,6 
+302,7 @@ def test_traverse_flatten_match(doc_req, filter_fn, da_cls, kwargs, start_storag (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flatten_match_chunk( @@ -308,6 +323,7 @@ def test_traverse_flatten_match_chunk( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flatten_root_match_chunk( @@ -334,6 +350,7 @@ def test_traverse_flatten_root_match_chunk( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flattened_per_path_embedding( @@ -358,6 +375,7 @@ def test_traverse_flattened_per_path_embedding( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flattened_per_path_root( @@ -378,6 +396,7 @@ def test_traverse_flattened_per_path_root( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flattened_per_path_chunk( @@ -398,6 +417,7 @@ def test_traverse_flattened_per_path_chunk( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def 
test_traverse_flattened_per_path_root_plus_chunk( @@ -419,6 +439,7 @@ def test_traverse_flattened_per_path_root_plus_chunk( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flattened_per_path_match( @@ -439,6 +460,7 @@ def test_traverse_flattened_per_path_match( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flattened_per_path_root_match_chunk( @@ -462,6 +484,7 @@ def test_traverse_flattened_per_path_root_match_chunk( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_docuset_traverse_over_iterator_HACKY(da_cls, kwargs, filter_fn): @@ -489,6 +512,7 @@ def test_docuset_traverse_over_iterator_HACKY(da_cls, kwargs, filter_fn): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_docuset_traverse_over_iterator_CAVEAT(da_cls, kwargs, filter_fn): @@ -555,6 +579,7 @@ def test_traverse_chunkarray(filter_fn): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) @pytest.mark.parametrize( @@ -600,6 +625,7 @@ def test_filter_fn_traverse_flat( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, 
{'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) @pytest.mark.parametrize( @@ -651,6 +677,7 @@ def test_filter_fn_traverse_flat_per_path( (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traversal_path(da_cls, kwargs): @@ -669,6 +696,7 @@ def test_traversal_path(da_cls, kwargs): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_traverse_flat_root_itself(da_cls, kwargs): @@ -691,6 +719,7 @@ def da_and_dam(N): (DocumentArrayWeaviate, {'config': {'n_dim': 10}}), (DocumentArrayQdrant, {'config': {'n_dim': 10}}), (DocumentArrayElastic, {'config': {'n_dim': 10}}), + (DocumentArrayRedis, {'config': {'n_dim': 10, 'flush': True}}), ], ) def test_flatten(da_cls, kwargs): From be1413d2d631d252f213b55e8493f7b984e34ca8 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 22:00:10 +0800 Subject: [PATCH 36/93] test: add redis test_plot --- tests/unit/array/mixins/test_plot.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index 7678126d49a..7fc38169c21 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -15,6 +15,7 @@ from docarray.array.annlite import DocumentArrayAnnlite from docarray.array.storage.annlite import AnnliteConfig from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig @pytest.mark.parametrize('keep_aspect_ratio', [True, False]) @@ -28,6 +29,7 @@ # (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, 
QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_sprite_fail_tensor_success_uri( @@ -66,6 +68,7 @@ def test_sprite_fail_tensor_success_uri( (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=128, scroll_batch_size=8)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=128)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=128, flush=True)), ], ) def test_sprite_image_generator( @@ -101,6 +104,7 @@ def da_and_dam(start_storage): (DocumentArrayWeaviate, {'config': {'n_dim': 3}}), (DocumentArrayAnnlite, {'config': {'n_dim': 3}}), (DocumentArrayQdrant, {'config': {'n_dim': 3}}), + (DocumentArrayRedis, {'config': {'n_dim': 3, 'flush': True}}), ] ] @@ -158,6 +162,7 @@ def test_plot_embeddings_same_path(tmpdir, da_cls, config_gen, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_summary_homo_hetero(da_cls, config, start_storage): @@ -181,6 +186,7 @@ def test_summary_homo_hetero(da_cls, config, start_storage): (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), (DocumentArrayQdrant, QdrantConfig(n_dim=128)), (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) def test_empty_get_attributes(da_cls, config, start_storage): From 096b303bdecc8ed2d014469a750ec0019b9da93e Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 22:06:18 +0800 Subject: [PATCH 37/93] test: add redis to test_find --- tests/unit/array/mixins/test_find.py | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index c8ad6030cda..4cad5e0fdc4 100644 --- 
a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -33,6 +33,7 @@ def inv_cosine(*args): ('annlite', {'n_dim': 32}), ('qdrant', {'n_dim': 32}), ('elasticsearch', {'n_dim': 32}), + ('redis', {'n_dim': 32, 'flush': True}), ], ) @pytest.mark.parametrize('limit', [1, 5, 10]) @@ -251,6 +252,14 @@ def test_find_by_tag(storage, config, start_storage): 'eq': operator.eq, } +numeric_operators_redis = { + 'gte': operator.ge, + 'gt': operator.gt, + 'lte': operator.le, + 'lt': operator.lt, + 'eq': operator.eq, +} + @pytest.mark.parametrize( 'storage,filter_gen,numeric_operators,operator', @@ -331,6 +340,21 @@ def test_find_by_tag(storage, config, start_storage): ) for operator in ['gt', 'gte', 'lt', 'lte'] ], + *[ + tuple( + [ + 'redis', + lambda operator, threshold: { + 'key': 'price', + 'operator': operator, + 'value': threshold, + }, + numeric_operators_redis, + operator, + ] + ) + for operator in ['gt', 'gte', 'lt', 'lte'] + ], ], ) def test_search_pre_filtering( @@ -420,6 +444,21 @@ def test_search_pre_filtering( ) for operator in numeric_operators_annlite.keys() ], + *[ + tuple( + [ + 'redis', + lambda operator, threshold: { + 'key': 'price', + 'operator': operator, + 'value': threshold, + }, + numeric_operators_elasticsearch, + operator, + ] + ) + for operator in ['gt', 'gte', 'lt', 'lte'] + ], ], ) def test_filtering(storage, filter_gen, operator, numeric_operators, start_storage): From 3d4f4116a4d9361da6ab30134d782dce712d725d Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 13 Aug 2022 22:11:48 +0800 Subject: [PATCH 38/93] test: add redis to test_match --- tests/unit/array/mixins/test_match.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/unit/array/mixins/test_match.py b/tests/unit/array/mixins/test_match.py index 24b5f360240..ce014a490e1 100644 --- a/tests/unit/array/mixins/test_match.py +++ b/tests/unit/array/mixins/test_match.py @@ -75,6 +75,7 @@ def doc_lists_to_doc_arrays(doc_lists, *args, 
**kwargs): ('annlite', {'n_dim': 3}), ('qdrant', {'n_dim': 3}), ('weaviate', {'n_dim': 3}), + ('redis', {'n_dim': 3, 'flush': True}), ], ) @pytest.mark.parametrize('limit', [1, 2, 3]) @@ -607,6 +608,14 @@ def test_match_ensure_scores_unique(): 'neq': operator.ne, } +numeric_operators_redis = { + 'gte': operator.ge, + 'gt': operator.gt, + 'lte': operator.le, + 'lt': operator.lt, + 'eq': operator.eq, +} + @pytest.mark.parametrize( 'storage,filter_gen,numeric_operators,operator', @@ -670,6 +679,21 @@ def test_match_ensure_scores_unique(): ) for operator in numeric_operators_annlite.keys() ], + *[ + tuple( + [ + 'redis', + lambda operator, threshold: { + 'key': 'price', + 'operator': operator, + 'value': threshold, + }, + numeric_operators_redis, + operator, + ] + ) + for operator in ['gt', 'gte', 'lt', 'lte'] + ], ], ) def test_match_pre_filtering( From a3144cb97be4d48183aa0dde1194971469b3e3fe Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 15 Aug 2022 18:39:45 +0800 Subject: [PATCH 39/93] fix: fix conflicts with main --- docarray/array/document.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docarray/array/document.py b/docarray/array/document.py index a5cdf25f98b..9188a0026d1 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -4,18 +4,18 @@ from .mixins import AllMixins if TYPE_CHECKING: - from ..typing import DocumentArraySourceType - from .memory import DocumentArrayInMemory - from .sqlite import DocumentArraySqlite - from .annlite import DocumentArrayAnnlite - from .weaviate import DocumentArrayWeaviate - from .elastic import DocumentArrayElastic - from .redis import DocumentArrayRedis - from .storage.sqlite import SqliteConfig - from .storage.annlite import AnnliteConfig - from .storage.weaviate import WeaviateConfig - from .storage.elastic import ElasticConfig - from .storage.redis import RedisConfig + from docarray.typing import DocumentArraySourceType + from docarray.array.memory import 
DocumentArrayInMemory + from docarray.array.sqlite import DocumentArraySqlite + from docarray.array.annlite import DocumentArrayAnnlite + from docarray.array.weaviate import DocumentArrayWeaviate + from docarray.array.elastic import DocumentArrayElastic + from docarray.array.redis import DocumentArrayRedis + from docarray.array.storage.sqlite import SqliteConfig + from docarray.array.storage.annlite import AnnliteConfig + from docarray.array.storage.weaviate import WeaviateConfig + from docarray.array.storage.elastic import ElasticConfig + from docarray.array.storage.redis import RedisConfig class DocumentArray(AllMixins, BaseDocumentArray): From 6949fe2db7491939dca9fd39867d48f7c6b02425 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 15 Aug 2022 23:36:00 +0800 Subject: [PATCH 40/93] test: add redis to test_advance_indexing --- tests/unit/array/test_advance_indexing.py | 236 +++++++++++++--------- 1 file changed, 138 insertions(+), 98 deletions(-) diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 87b37a7ee8d..5aa3736a0a7 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -2,10 +2,13 @@ import pytest from docarray import DocumentArray, Document -from docarray.array.storage.weaviate import WeaviateConfig -from docarray.array.annlite import AnnliteConfig -from docarray.array.qdrant import QdrantConfig -from docarray.array.elastic import ElasticConfig + +# from docarray.array.storage.weaviate import WeaviateConfig +# from docarray.array.annlite import AnnliteConfig +# from docarray.array.qdrant import QdrantConfig +# from docarray.array.elastic import ElasticConfig +from docarray.array.redis import RedisConfig +import gc @pytest.fixture @@ -21,12 +24,13 @@ def indices(): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', 
QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_getter_int_str(docs, storage, config, start_storage): @@ -54,11 +58,12 @@ def test_getter_int_str(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_setter_int_str(docs, storage, config, start_storage): @@ -82,12 +87,13 @@ def test_setter_int_str(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_del_int_str(docs, storage, config, start_storage, indices): @@ -116,12 +122,13 @@ def test_del_int_str(docs, storage, config, start_storage, indices): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), - ('elasticsearch', 
ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_slice(docs, storage, config, start_storage): @@ -154,12 +161,13 @@ def test_slice(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_sequence_bool_index(docs, storage, config, start_storage): @@ -200,12 +208,13 @@ def test_sequence_bool_index(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_sequence_int(docs, nparray, storage, config, start_storage): @@ -236,12 +245,13 @@ def test_sequence_int(docs, nparray, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', 
QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_sequence_str(docs, storage, config, start_storage): @@ -270,12 +280,13 @@ def test_sequence_str(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_docarray_list_tuple(docs, storage, config, start_storage): @@ -290,12 +301,13 @@ def test_docarray_list_tuple(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_path_syntax_indexing(storage, config, start_storage): @@ -329,12 +341,13 @@ def test_path_syntax_indexing(storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - ('memory', None), - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - 
('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_path_syntax_indexing_set(storage, config, start_storage): @@ -414,12 +427,13 @@ def test_path_syntax_indexing_set(storage, config, start_storage): @pytest.mark.parametrize( 'storage,config_gen', [ - ('memory', None), - ('sqlite', None), - ('weaviate', lambda: WeaviateConfig(n_dim=123)), - ('annlite', lambda: AnnliteConfig(n_dim=123)), - ('qdrant', lambda: QdrantConfig(n_dim=123)), - ('elasticsearch', lambda: ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', lambda: WeaviateConfig(n_dim=123)), + # ('annlite', lambda: AnnliteConfig(n_dim=123)), + # ('qdrant', lambda: QdrantConfig(n_dim=123)), + # ('elasticsearch', lambda: ElasticConfig(n_dim=123)), + ('redis', lambda: RedisConfig(n_dim=123, flush=True)), ], ) def test_attribute_indexing(storage, config_gen, start_storage, size): @@ -450,7 +464,11 @@ def test_attribute_indexing(storage, config_gen, start_storage, size): @pytest.mark.parametrize( - 'storage', ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch'] + 'storage', + [ + # \'memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', + 'redis' + ], ) def test_tensor_attribute_selector(storage, start_storage): import scipy.sparse @@ -461,6 +479,8 @@ def test_tensor_attribute_selector(storage, start_storage): if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch'): da = DocumentArray(storage=storage, config={'n_dim': 10}) + elif storage == 'redis': + da = DocumentArray(storage=storage, config={'n_dim': 10, 'flush': True}) else: da = DocumentArray(storage=storage) @@ -482,30 +502,34 @@ 
def test_tensor_attribute_selector(storage, start_storage): assert isinstance(v1, list) -# TODO: since match function is not implemented, this test will -# not work with weaviate storage atm, will be addressed in -# next version -@pytest.mark.parametrize('storage', ['memory', 'sqlite', 'annlite']) -def test_advance_selector_mixed(storage): - if storage == 'annlite': - da = DocumentArray(storage=storage, config={'n_dim': 3}) - else: - da = DocumentArray(storage=storage) +# # TODO: since match function is not implemented, this test will +# # not work with weaviate storage atm, will be addressed in +# # next version +# @pytest.mark.parametrize('storage', ['memory', 'sqlite', 'annlite']) +# def test_advance_selector_mixed(storage): +# if storage == 'annlite': +# da = DocumentArray(storage=storage, config={'n_dim': 3}) +# else: +# da = DocumentArray(storage=storage) - da.extend(DocumentArray.empty(10)) - da.embeddings = np.random.random([10, 3]) +# da.extend(DocumentArray.empty(10)) +# da.embeddings = np.random.random([10, 3]) - da.match(da, exclude_self=True) +# da.match(da, exclude_self=True) - assert len(da[:, ('id', 'embedding', 'matches')]) == 3 - assert len(da[:, ('id', 'embedding', 'matches')][0]) == 10 +# assert len(da[:, ('id', 'embedding', 'matches')]) == 3 +# assert len(da[:, ('id', 'embedding', 'matches')][0]) == 10 @pytest.mark.parametrize( - 'storage', ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch'] + 'storage', + [ + # 'memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', + 'redis' + ], ) def test_single_boolean_and_padding(storage, start_storage): - if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch'): + if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis'): da = DocumentArray(storage=storage, config={'n_dim': 10}) else: da = DocumentArray(storage=storage) @@ -528,12 +552,13 @@ def test_single_boolean_and_padding(storage, start_storage): @pytest.mark.parametrize( 'storage,config_gen', 
[ - ('memory', None), - ('sqlite', None), - ('weaviate', lambda: WeaviateConfig(n_dim=123)), - ('annlite', lambda: AnnliteConfig(n_dim=123)), - ('qdrant', lambda: QdrantConfig(n_dim=123)), - ('elasticsearch', lambda: ElasticConfig(n_dim=123)), + # ('memory', None), + # ('sqlite', None), + # ('weaviate', lambda: WeaviateConfig(n_dim=123)), + # ('annlite', lambda: AnnliteConfig(n_dim=123)), + # ('qdrant', lambda: QdrantConfig(n_dim=123)), + # ('elasticsearch', lambda: ElasticConfig(n_dim=123)), + ('redis', lambda: RedisConfig(n_dim=123, flush=True)), ], ) def test_edge_case_two_strings(storage, config_gen, start_storage): @@ -602,19 +627,29 @@ def test_edge_case_two_strings(storage, config_gen, start_storage): with pytest.raises(IndexError): da['1', 'hellohello'] = 'hello' + if storage == 'redis': + gc.collect() + +# TODO: since redis has flush, this test should be rewrite for redis @pytest.mark.parametrize( 'storage,config', [ - ('sqlite', None), - ('weaviate', WeaviateConfig(n_dim=123)), - ('annlite', AnnliteConfig(n_dim=123)), - ('qdrant', QdrantConfig(n_dim=123)), - ('elasticsearch', ElasticConfig(n_dim=123)), + # ('sqlite', None), + # ('weaviate', WeaviateConfig(n_dim=123)), + # ('annlite', AnnliteConfig(n_dim=123)), + # ('qdrant', QdrantConfig(n_dim=123)), + # ('elasticsearch', ElasticConfig(n_dim=123)), + ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_offset2ids_persistence(storage, config, start_storage): - da = DocumentArray(storage=storage, config=config) + # config = RedisConfig(n_dim=123, flush=True) + # storage = 'redis' + if storage == 'redis': + da = DocumentArray(storage=storage, config=config) + else: + da = DocumentArray(storage=storage, config=config) da.extend( [ @@ -626,12 +661,17 @@ def test_offset2ids_persistence(storage, config, start_storage): da.insert(1, Document(id='1')) da.insert(3, Document(id='3')) + # print('aaa') + # print(da._offset2ids.ids) + config = da._config da_ids = da[:, 'id'] assert da_ids == [str(i) for i in 
range(5)] da._persist = True da.__del__() + if storage == 'redis': + config.flush = False da = DocumentArray(storage=storage, config=config) assert da[:, 'id'] == da_ids From 08d6644abd96a7e1043128bd44328ccd740b5c07 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 15 Aug 2022 23:36:29 +0800 Subject: [PATCH 41/93] feat: add _ensure_unique_config to redis --- docarray/array/storage/redis/backend.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 0965c10e915..1cad2c6f6e3 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -104,6 +104,15 @@ def _build_client(self): return client + def _ensure_unique_config( + self, + config_root: dict, + config_subindex: dict, + config_joined: dict, + subindex_name: str, + ) -> dict: + return config_joined + def _build_schema_from_redis_config(self): index_param = { 'TYPE': 'FLOAT32', From a8f6eebc77431466512ea57f6a828332f5feda64 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 17 Aug 2022 22:38:22 +0800 Subject: [PATCH 42/93] fix: remove comments test_text --- tests/unit/array/mixins/test_text.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/array/mixins/test_text.py b/tests/unit/array/mixins/test_text.py index d047181410b..5c1eb163cb7 100644 --- a/tests/unit/array/mixins/test_text.py +++ b/tests/unit/array/mixins/test_text.py @@ -145,12 +145,12 @@ def test_da_text_to_tensor_max_len_1(docs, da_cls, config, start_storage): @pytest.mark.parametrize( 'da_cls,config', [ - # (DocumentArray, None), - # (DocumentArraySqlite, None), - # (DocumentArrayAnnlite, AnnliteConfig(n_dim=128)), - # (DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), - # (DocumentArrayQdrant, QdrantConfig(n_dim=128)), - # (DocumentArrayElastic, ElasticConfig(n_dim=128)), + (DocumentArray, None), + (DocumentArraySqlite, None), + (DocumentArrayAnnlite, AnnliteConfig(n_dim=128)), + 
(DocumentArrayWeaviate, WeaviateConfig(n_dim=128)), + (DocumentArrayQdrant, QdrantConfig(n_dim=128)), + (DocumentArrayElastic, ElasticConfig(n_dim=128)), (DocumentArrayRedis, RedisConfig(n_dim=128, flush=True)), ], ) From efd052d3c66a7c575cc4d236d787bfa3b55cf1cd Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 17 Aug 2022 22:40:15 +0800 Subject: [PATCH 43/93] fix: remove useless debug output --- docarray/array/storage/redis/seqlike.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py index 1a216710981..9cde317b003 100644 --- a/docarray/array/storage/redis/seqlike.py +++ b/docarray/array/storage/redis/seqlike.py @@ -61,7 +61,6 @@ def __repr__(self): return f'' def _upload_batch(self, docs: Iterable['Document']): - print('uploading batch @', __import__('time').time_ns()) pipe = self._client.pipeline() batch = 0 for doc in docs: From 60b95260b00b2e8438a421f490301c3d612d5eb1 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 17 Aug 2022 22:48:53 +0800 Subject: [PATCH 44/93] refactor: simplify redis search command --- docarray/array/storage/redis/find.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index d07e78133c7..148980a1e58 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -28,12 +28,15 @@ def _find_similar_vectors( self, query: 'RedisArrayType', filter: Optional[Dict] = None, limit=10 ): q = ( - Query("*=>[KNN " + str(limit) + " @embedding $vec AS vector_score]") + Query('*=>[KNN $limit @embedding $vec AS vector_score]') .sort_by('vector_score') .dialect(2) ) - query_params = {"vec": to_numpy_array(query).astype(np.float32).tobytes()} + query_params = { + 'vec': to_numpy_array(query).astype(np.float32).tobytes(), + 'limit': str(limit), + } if filter: f = self._build_fiter(filter) q.add_filter(f) From 5da0d892cf34c20b9967f953497fb35c2b85cf0a 
Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 17 Aug 2022 22:57:37 +0800 Subject: [PATCH 45/93] refractor: avoid using redis keys command --- docarray/array/storage/redis/seqlike.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py index 9cde317b003..9532bae67af 100644 --- a/docarray/array/storage/redis/seqlike.py +++ b/docarray/array/storage/redis/seqlike.py @@ -27,11 +27,11 @@ def __len__(self): :return: the length of this :class:`DocumentArrayRedis` object """ try: - keys = self._client.keys() - if b'offset2id' in keys: - return len(keys) - 1 + dbsize = self._client.dbsize() + if self._client.exists(b'offset2id'): + return dbsize - 1 else: - return len(keys) + return dbsize except: return 0 From 0d32d68077bb260ebe4f5493073855a55d286188 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 17 Aug 2022 23:00:16 +0800 Subject: [PATCH 46/93] refractor: remove useless check --- docarray/array/storage/redis/find.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index 148980a1e58..69834a95451 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -67,11 +67,9 @@ def _find( ] def _find_with_filter(self, filter: Dict, limit: Optional[Union[int, float]] = 20): - - if filter: - s = self._build_query_str(filter) - q = Query(s) - q.paging(0, limit) + s = self._build_query_str(filter) + q = Query(s) + q.paging(0, limit) results = self._client.ft().search(q).docs @@ -89,7 +87,6 @@ def _filter( # TODO return NumericFilter or List[NumericFilter] def _build_fiter(self, filter: Dict) -> NumericFilter: - INF = "+inf" NEG_INF = "-inf" From 6faa2ab89c9fb786f6a5bb2beedd501b2fbf9f48 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 18 Aug 2022 10:23:59 +0800 Subject: [PATCH 47/93] refractor: clarify exception message --- 
docarray/array/storage/redis/backend.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 1cad2c6f6e3..60809aced86 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -59,14 +59,15 @@ def _init_storage( config = dataclass_from_dict(RedisConfig, config) if config.distance not in ['L2', 'IP', 'COSINE']: - raise ValueError(f'Distance metric {config.distance} not supported') + raise ValueError( + f'Expecting distance metric one of COSINE, L2 OR IP, got {config.distance} instead' + ) if config.method not in ['HNSW', 'FLAT']: - raise ValueError(f'Method {config.method} not supported') + raise ValueError( + f'Expecting search method one of HNSW OR FLAT, got {config.method} instead' + ) - if ( - 'decode_responses' in config.redis_config - and config.redis_config['decode_responses'] - ): + if config.redis_config.get('decode_responses'): config.redis_config['decode_responses'] = False self._offset2id_key = 'offset2id' From cfbb54378baf9f5cc00bc43ed316b30fe230bf13 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 18 Aug 2022 10:26:06 +0800 Subject: [PATCH 48/93] feat: add index_name to redis config --- docarray/array/storage/redis/backend.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 60809aced86..4fbe9a2066e 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -20,6 +20,7 @@ class RedisConfig: port: int = field(default=6379) flush: bool = field(default=False) update_schema: bool = field(default=True) + index_name: str = field(default='idx') distance: str = field(default='COSINE') redis_config: Dict[str, Any] = field(default_factory=dict) index_text: bool = field(default=False) @@ -97,11 +98,11 @@ def _build_client(self): if self._config.update_schema: 
if len(client.execute_command('FT._LIST')) > 0: - client.ft().dropindex() + client.ft(index_name=self._config.index_name).dropindex() if self._config.flush or self._config.update_schema: schema = self._build_schema_from_redis_config() - client.ft().create_index(schema) + client.ft(index_name=self._config.index_name).create_index(schema) return client From a91d2c53a3122af65148d122915ebc0ce3c52fa7 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 18 Aug 2022 10:57:48 +0800 Subject: [PATCH 49/93] fix: change redis find to mongo style --- docarray/array/storage/redis/find.py | 77 +++++++++++++++++----------- tests/unit/array/mixins/test_find.py | 29 ++++------- 2 files changed, 57 insertions(+), 49 deletions(-) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index 69834a95451..b6ab824a145 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -25,21 +25,23 @@ class FindMixin(BaseFindMixin): def _find_similar_vectors( - self, query: 'RedisArrayType', filter: Optional[Dict] = None, limit=10 + self, + query: 'RedisArrayType', + filter: Optional[Dict] = None, + limit: Optional[Union[int, float]] = 20, ): q = ( - Query('*=>[KNN $limit @embedding $vec AS vector_score]') + Query(f'*=>[KNN {limit} @embedding $vec AS vector_score]') .sort_by('vector_score') + .paging(0, limit) .dialect(2) ) - query_params = { - 'vec': to_numpy_array(query).astype(np.float32).tobytes(), - 'limit': str(limit), - } + query_params = {'vec': to_numpy_array(query).astype(np.float32).tobytes()} if filter: - f = self._build_fiter(filter) - q.add_filter(f) + filters = self._build_fiter(filter) + for f in filters: + q.add_filter(f) results = self._client.ft().search(q, query_params).docs da = DocumentArray() @@ -52,9 +54,9 @@ def _find_similar_vectors( def _find( self, query: 'RedisArrayType', - limit: int = 10, + limit: Optional[Union[int, float]] = 20, filter: Optional[Dict] = None, - **kwargs + **kwargs, ) -> 
List['DocumentArray']: query = np.array(query) @@ -86,34 +88,47 @@ def _filter( return self._find_with_filter(filter, limit=limit) # TODO return NumericFilter or List[NumericFilter] - def _build_fiter(self, filter: Dict) -> NumericFilter: + def _build_fiter(self, filter: Dict) -> List[NumericFilter]: INF = "+inf" NEG_INF = "-inf" - - if filter['operator'] == 'gt': - f = NumericFilter(filter['key'], filter['value'], INF, minExclusive=True) - elif filter['operator'] == 'gte': - f = NumericFilter(filter['key'], filter['value'], INF) - elif filter['operator'] == 'lt': - f = NumericFilter( - filter['key'], NEG_INF, filter['value'], maxExclusive=True - ) - elif filter['operator'] == 'lte': - f = NumericFilter(filter['key'], NEG_INF, filter['value']) + f = [] + + for key in filter: + operator = list(filter[key].keys())[0] + threshold = filter[key][operator] + if operator == '$gt': + f.append(NumericFilter(key, threshold, INF, minExclusive=True)) + elif operator == '$gte': + f.append(NumericFilter(key, threshold, INF)) + elif operator == '$lt': + f.append(NumericFilter(key, NEG_INF, threshold, maxExclusive=True)) + elif operator == '$lte': + f.append(NumericFilter(key, NEG_INF, threshold)) + elif operator == '$eq': + f.append(NumericFilter(key, threshold, threshold)) + # TODO add $neq if possible return f def _build_query_str(self, filter: Dict) -> str: INF = "+inf" NEG_INF = "-inf" - - if filter['operator'] == 'gt': - s = "@{}:[({} {}]".format(filter['key'], filter['value'], INF) - elif filter['operator'] == 'gte': - s = "@{}:[{} {}]".format(filter['key'], filter['value'], INF) - elif filter['operator'] == 'lt': - s = "@{}:[{} ({}]".format(filter['key'], NEG_INF, filter['value']) - elif filter['operator'] == 'lte': - s = "@{}:[{} {}]".format(filter['key'], NEG_INF, filter['value']) + s = "" + + for key in filter: + operator = list(filter[key].keys())[0] + threshold = filter[key][operator] + if operator == '$gt': + s += f"@{key}:[({threshold} {INF}] " + elif operator == 
'$gte': + s += f"@{key}:[{threshold} {INF}] " + elif operator == '$lt': + s += f"@{key}:[{NEG_INF} ({threshold}] " + elif operator == '$lte': + s += f"@{key}:[{NEG_INF} {threshold}] " + elif operator == '$eq': + s += f"@{key}:[{threshold} {threshold}] " + elif operator == '$neq': + s += f"-@{key}:[{threshold} {threshold}] " return s diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index caafd12d7a0..42ac832f081 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -253,11 +253,12 @@ def test_find_by_tag(storage, config, start_storage): } numeric_operators_redis = { - 'gte': operator.ge, - 'gt': operator.gt, - 'lte': operator.le, - 'lt': operator.lt, - 'eq': operator.eq, + '$gte': operator.ge, + '$gt': operator.gt, + '$lte': operator.le, + '$lt': operator.lt, + '$eq': operator.eq, + '$neq': operator.ne, } @@ -344,16 +345,12 @@ def test_find_by_tag(storage, config, start_storage): tuple( [ 'redis', - lambda operator, threshold: { - 'key': 'price', - 'operator': operator, - 'value': threshold, - }, + lambda operator, threshold: {'price': {operator: threshold}}, numeric_operators_redis, operator, ] ) - for operator in ['gt', 'gte', 'lt', 'lte'] + for operator in ['$gte', '$gt', '$lte', '$lt', '$eq'] ], ], ) @@ -448,16 +445,12 @@ def test_search_pre_filtering( tuple( [ 'redis', - lambda operator, threshold: { - 'key': 'price', - 'operator': operator, - 'value': threshold, - }, - numeric_operators_elasticsearch, + lambda operator, threshold: {'price': {operator: threshold}}, + numeric_operators_redis, operator, ] ) - for operator in ['gt', 'gte', 'lt', 'lte'] + for operator in numeric_operators_redis.keys() ], ], ) From 8bc105af63375d139c08f14f83ad8eb916e9acc4 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 18 Aug 2022 11:00:23 +0800 Subject: [PATCH 50/93] refractor: config sequence --- docarray/array/storage/redis/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 4fbe9a2066e..dcb4a54fd79 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -18,9 +18,9 @@ class RedisConfig: n_dim: int host: str = field(default='localhost') port: int = field(default=6379) + index_name: str = field(default='idx') flush: bool = field(default=False) update_schema: bool = field(default=True) - index_name: str = field(default='idx') distance: str = field(default='COSINE') redis_config: Dict[str, Any] = field(default_factory=dict) index_text: bool = field(default=False) From 8e89d146bbb8811f093452362c05560f7fdf205a Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 18 Aug 2022 11:26:54 +0800 Subject: [PATCH 51/93] test: remove comments --- tests/unit/array/test_advance_indexing.py | 219 ++++++++++------------ 1 file changed, 104 insertions(+), 115 deletions(-) diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 5aa3736a0a7..38a096349cd 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -2,11 +2,10 @@ import pytest from docarray import DocumentArray, Document - -# from docarray.array.storage.weaviate import WeaviateConfig -# from docarray.array.annlite import AnnliteConfig -# from docarray.array.qdrant import QdrantConfig -# from docarray.array.elastic import ElasticConfig +from docarray.array.storage.weaviate import WeaviateConfig +from docarray.array.annlite import AnnliteConfig +from docarray.array.qdrant import QdrantConfig +from docarray.array.elastic import ElasticConfig from docarray.array.redis import RedisConfig import gc @@ -24,12 +23,12 @@ def indices(): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', 
ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -58,11 +57,11 @@ def test_getter_int_str(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -87,12 +86,12 @@ def test_setter_int_str(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -122,12 +121,12 @@ def test_del_int_str(docs, storage, config, start_storage, indices): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', 
RedisConfig(n_dim=123, flush=True)), ], ) @@ -161,12 +160,12 @@ def test_slice(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -208,12 +207,12 @@ def test_sequence_bool_index(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -245,12 +244,12 @@ def test_sequence_int(docs, nparray, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -280,12 +279,12 @@ def test_sequence_str(docs, storage, config, start_storage): @pytest.mark.parametrize( 
'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -301,12 +300,12 @@ def test_docarray_list_tuple(docs, storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -341,12 +340,12 @@ def test_path_syntax_indexing(storage, config, start_storage): @pytest.mark.parametrize( 'storage,config', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) @@ -427,12 +426,12 @@ def test_path_syntax_indexing_set(storage, config, start_storage): @pytest.mark.parametrize( 'storage,config_gen', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', lambda: WeaviateConfig(n_dim=123)), - # ('annlite', lambda: 
AnnliteConfig(n_dim=123)), - # ('qdrant', lambda: QdrantConfig(n_dim=123)), - # ('elasticsearch', lambda: ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', lambda: WeaviateConfig(n_dim=123)), + ('annlite', lambda: AnnliteConfig(n_dim=123)), + ('qdrant', lambda: QdrantConfig(n_dim=123)), + ('elasticsearch', lambda: ElasticConfig(n_dim=123)), ('redis', lambda: RedisConfig(n_dim=123, flush=True)), ], ) @@ -465,10 +464,7 @@ def test_attribute_indexing(storage, config_gen, start_storage, size): @pytest.mark.parametrize( 'storage', - [ - # \'memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', - 'redis' - ], + ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', 'redis'], ) def test_tensor_attribute_selector(storage, start_storage): import scipy.sparse @@ -502,31 +498,28 @@ def test_tensor_attribute_selector(storage, start_storage): assert isinstance(v1, list) -# # TODO: since match function is not implemented, this test will -# # not work with weaviate storage atm, will be addressed in -# # next version -# @pytest.mark.parametrize('storage', ['memory', 'sqlite', 'annlite']) -# def test_advance_selector_mixed(storage): -# if storage == 'annlite': -# da = DocumentArray(storage=storage, config={'n_dim': 3}) -# else: -# da = DocumentArray(storage=storage) +# TODO: since match function is not implemented, this test will +# not work with weaviate storage atm, will be addressed in +# next version +@pytest.mark.parametrize('storage', ['memory', 'sqlite', 'annlite']) +def test_advance_selector_mixed(storage): + if storage == 'annlite': + da = DocumentArray(storage=storage, config={'n_dim': 3}) + else: + da = DocumentArray(storage=storage) -# da.extend(DocumentArray.empty(10)) -# da.embeddings = np.random.random([10, 3]) + da.extend(DocumentArray.empty(10)) + da.embeddings = np.random.random([10, 3]) -# da.match(da, exclude_self=True) + da.match(da, exclude_self=True) -# assert len(da[:, ('id', 'embedding', 
'matches')]) == 3 -# assert len(da[:, ('id', 'embedding', 'matches')][0]) == 10 + assert len(da[:, ('id', 'embedding', 'matches')]) == 3 + assert len(da[:, ('id', 'embedding', 'matches')][0]) == 10 @pytest.mark.parametrize( 'storage', - [ - # 'memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', - 'redis' - ], + ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', 'redis'], ) def test_single_boolean_and_padding(storage, start_storage): if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis'): @@ -549,19 +542,24 @@ def test_single_boolean_and_padding(storage, start_storage): assert len(da[True, False, False]) == 1 +@pytest.fixture() +def ensure_gc(): + gc.collect() + + @pytest.mark.parametrize( 'storage,config_gen', [ - # ('memory', None), - # ('sqlite', None), - # ('weaviate', lambda: WeaviateConfig(n_dim=123)), - # ('annlite', lambda: AnnliteConfig(n_dim=123)), - # ('qdrant', lambda: QdrantConfig(n_dim=123)), - # ('elasticsearch', lambda: ElasticConfig(n_dim=123)), + ('memory', None), + ('sqlite', None), + ('weaviate', lambda: WeaviateConfig(n_dim=123)), + ('annlite', lambda: AnnliteConfig(n_dim=123)), + ('qdrant', lambda: QdrantConfig(n_dim=123)), + ('elasticsearch', lambda: ElasticConfig(n_dim=123)), ('redis', lambda: RedisConfig(n_dim=123, flush=True)), ], ) -def test_edge_case_two_strings(storage, config_gen, start_storage): +def test_edge_case_two_strings(storage, config_gen, ensure_gc, start_storage): # getitem if config_gen: da = DocumentArray(storage=storage, config=config_gen()) @@ -628,28 +626,22 @@ def test_edge_case_two_strings(storage, config_gen, start_storage): da['1', 'hellohello'] = 'hello' if storage == 'redis': - gc.collect() + ensure_gc -# TODO: since redis has flush, this test should be rewrite for redis @pytest.mark.parametrize( 'storage,config', [ - # ('sqlite', None), - # ('weaviate', WeaviateConfig(n_dim=123)), - # ('annlite', AnnliteConfig(n_dim=123)), - # ('qdrant', 
QdrantConfig(n_dim=123)), - # ('elasticsearch', ElasticConfig(n_dim=123)), + ('sqlite', None), + ('weaviate', WeaviateConfig(n_dim=123)), + ('annlite', AnnliteConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123)), + ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123, flush=True)), ], ) def test_offset2ids_persistence(storage, config, start_storage): - # config = RedisConfig(n_dim=123, flush=True) - # storage = 'redis' - if storage == 'redis': - da = DocumentArray(storage=storage, config=config) - else: - da = DocumentArray(storage=storage, config=config) + da = DocumentArray(storage=storage, config=config) da.extend( [ @@ -661,9 +653,6 @@ def test_offset2ids_persistence(storage, config, start_storage): da.insert(1, Document(id='1')) da.insert(3, Document(id='3')) - # print('aaa') - # print(da._offset2ids.ids) - config = da._config da_ids = da[:, 'id'] assert da_ids == [str(i) for i in range(5)] From e36ae3904a974868d03e7ce335208227062f4c51 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 18 Aug 2022 15:28:51 +0800 Subject: [PATCH 52/93] fix: redis config index_name update --- docarray/array/storage/redis/backend.py | 2 +- tests/unit/array/storage/redis/test_backend.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index dcb4a54fd79..bfada762d25 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -97,7 +97,7 @@ def _build_client(self): client.flushdb() if self._config.update_schema: - if len(client.execute_command('FT._LIST')) > 0: + if self._config.index_name in client.execute_command('FT._LIST'): client.ft(index_name=self._config.index_name).dropindex() if self._config.flush or self._config.update_schema: diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index fef4136b07e..0204a01226c 100644 --- 
a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -122,17 +122,22 @@ def test_init_storage( def test_init_storage_update_schema(start_storage): - cfg = RedisConfig(n_dim=128, tag_indices=['attr1']) + index = 'aaa' + cfg = RedisConfig(n_dim=128, tag_indices=['attr1'], index_name=index, flush=True) redis_da = DocumentArrayDummy(storage='redis', config=cfg) - assert redis_da._client.ft().info()['attributes'][1][1] == b'attr1' + assert redis_da._client.ft(index).info()['attributes'][1][1] == b'attr1' cfg = RedisConfig(n_dim=128, tag_indices=['attr2'], update_schema=False) redis_da = DocumentArrayDummy(storage='redis', config=cfg) - assert redis_da._client.ft().info()['attributes'][1][1] == b'attr1' + assert redis_da._client.ft(index).info()['attributes'][1][1] == b'attr1' - cfg = RedisConfig(n_dim=128, tag_indices=['attr2'], update_schema=True) + index2 = 'bbb' + cfg = RedisConfig( + n_dim=128, tag_indices=['attr2'], index_name=index2, update_schema=True + ) redis_da = DocumentArrayDummy(storage='redis', config=cfg) - assert redis_da._client.ft().info()['attributes'][1][1] == b'attr2' + assert redis_da._client.ft(index).info()['attributes'][1][1] == b'attr1' + assert redis_da._client.ft(index2).info()['attributes'][1][1] == b'attr2' def test_init_storage_empty_config(start_storage): From f4b19b70d18acd84a0849110475e04e72a0591ff Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 19 Aug 2022 21:50:01 +0800 Subject: [PATCH 53/93] test: fix redis in test_advance_indexing --- tests/unit/array/test_advance_indexing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 38a096349cd..d72eaf857fb 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -522,8 +522,10 @@ def test_advance_selector_mixed(storage): ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 
'elasticsearch', 'redis'], ) def test_single_boolean_and_padding(storage, start_storage): - if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis'): + if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch'): da = DocumentArray(storage=storage, config={'n_dim': 10}) + elif storage == 'redis': + da = DocumentArray(storage=storage, config={'n_dim': 10, 'flush': True}) else: da = DocumentArray(storage=storage) da.extend(DocumentArray.empty(3)) @@ -661,6 +663,7 @@ def test_offset2ids_persistence(storage, config, start_storage): if storage == 'redis': config.flush = False + config.update_schema = False da = DocumentArray(storage=storage, config=config) assert da[:, 'id'] == da_ids From aa2f8b2f53c622775c2e6ec1e8f5a1d4456fc35b Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 19 Aug 2022 21:52:17 +0800 Subject: [PATCH 54/93] test: fix redis in test_content --- tests/unit/array/mixins/test_content.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/unit/array/mixins/test_content.py b/tests/unit/array/mixins/test_content.py index cf8e78a46fe..6d2d5896e1c 100644 --- a/tests/unit/array/mixins/test_content.py +++ b/tests/unit/array/mixins/test_content.py @@ -33,9 +33,10 @@ def test_content_empty_getter_return_none(cls, content_attr, start_storage): DocumentArrayWeaviate, DocumentArrayQdrant, DocumentArrayElastic, - DocumentArrayRedis, ]: da = cls(config={'n_dim': 3}) + elif cls == DocumentArrayRedis: + da = cls(config={'n_dim': 3, 'flush': True}) else: da = cls() assert getattr(da, content_attr) is None @@ -69,9 +70,10 @@ def test_content_empty_setter(cls, content_attr, start_storage): DocumentArrayWeaviate, DocumentArrayQdrant, DocumentArrayElastic, - DocumentArrayRedis, ]: da = cls(config={'n_dim': 3}) + elif cls == DocumentArrayRedis: + da = cls(config={'n_dim': 3, 'flush': True}) else: da = cls() setattr(da, content_attr[0], content_attr[1]) From e46174c798e28975a694ead48c49a7e2983f9673 Mon Sep 17 00:00:00 2001 From: 
AnneY Date: Fri, 19 Aug 2022 21:52:38 +0800 Subject: [PATCH 55/93] test: fix redis in test_empty --- tests/unit/array/mixins/test_empty.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_empty.py b/tests/unit/array/mixins/test_empty.py index 0ba3da06e93..7de86e9a5a8 100644 --- a/tests/unit/array/mixins/test_empty.py +++ b/tests/unit/array/mixins/test_empty.py @@ -20,7 +20,7 @@ (DocumentArrayWeaviate, WeaviateConfig(n_dim=5)), (DocumentArrayQdrant, QdrantConfig(n_dim=5)), (DocumentArrayElastic, ElasticConfig(n_dim=5)), - (DocumentArrayRedis, RedisConfig(n_dim=5)), + (DocumentArrayRedis, RedisConfig(n_dim=5, flush=True)), ], ) def test_empty_non_zero(da_cls, config, start_storage): From 634b62ddbfd0653b9d0610af7a39fd27e333722a Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 19 Aug 2022 21:54:51 +0800 Subject: [PATCH 56/93] test: fix redis in test_find --- tests/unit/array/mixins/test_find.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 42ac832f081..a1825f84c88 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -359,9 +359,16 @@ def test_search_pre_filtering( ): np.random.seed(0) n_dim = 128 - da = DocumentArray( - storage=storage, config={'n_dim': n_dim, 'columns': [('price', 'int')]} - ) + + if storage == 'redis': + da = DocumentArray( + storage=storage, + config={'n_dim': n_dim, 'columns': [('price', 'int')], 'flush': True}, + ) + else: + da = DocumentArray( + storage=storage, config={'n_dim': n_dim, 'columns': [('price', 'int')]} + ) da.extend( [ @@ -456,9 +463,16 @@ def test_search_pre_filtering( ) def test_filtering(storage, filter_gen, operator, numeric_operators, start_storage): n_dim = 128 - da = DocumentArray( - storage=storage, config={'n_dim': n_dim, 'columns': [('price', 'float')]} - ) + + if storage == 'redis': + da = DocumentArray( + 
storage=storage, + config={'n_dim': n_dim, 'columns': [('price', 'float')], 'flush': True}, + ) + else: + da = DocumentArray( + storage=storage, config={'n_dim': n_dim, 'columns': [('price', 'float')]} + ) da.extend([Document(id=f'r{i}', tags={'price': i}) for i in range(50)]) thresholds = [10, 20, 30] From 5ddaeb70ace6fa9562262c892c9a89b6a8af02f7 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 19 Aug 2022 21:55:10 +0800 Subject: [PATCH 57/93] test: fix redis in test_magic --- tests/unit/array/mixins/test_magic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_magic.py b/tests/unit/array/mixins/test_magic.py index 0b7f21b6ba4..66edce5b152 100644 --- a/tests/unit/array/mixins/test_magic.py +++ b/tests/unit/array/mixins/test_magic.py @@ -80,7 +80,7 @@ def test_repr(da_cls, config, start_storage): ('weaviate', WeaviateConfig(n_dim=128)), ('qdrant', QdrantConfig(n_dim=128)), ('elasticsearch', ElasticConfig(n_dim=128)), - ('redis', RedisConfig(n_dim=128)), + ('redis', RedisConfig(n_dim=128, flush=True)), ], ) def test_repr_str(docs, storage, config, start_storage): From 57feb9bcef570e5fd4a76221e45f3bd27588683a Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 19 Aug 2022 21:57:50 +0800 Subject: [PATCH 58/93] test: fix redis in test_match --- tests/unit/array/mixins/test_match.py | 32 +++++++++++++++------------ 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/tests/unit/array/mixins/test_match.py b/tests/unit/array/mixins/test_match.py index fd627bd5c9a..5a5f5772856 100644 --- a/tests/unit/array/mixins/test_match.py +++ b/tests/unit/array/mixins/test_match.py @@ -609,11 +609,12 @@ def test_match_ensure_scores_unique(): } numeric_operators_redis = { - 'gte': operator.ge, - 'gt': operator.gt, - 'lte': operator.le, - 'lt': operator.lt, - 'eq': operator.eq, + '$gte': operator.ge, + '$gt': operator.gt, + '$lte': operator.le, + '$lt': operator.lt, + '$eq': operator.eq, + # '$neq': operator.ne, } @@ -683,16 +684,12 @@ 
def test_match_ensure_scores_unique(): tuple( [ 'redis', - lambda operator, threshold: { - 'key': 'price', - 'operator': operator, - 'value': threshold, - }, + lambda operator, threshold: {'price': {operator: threshold}}, numeric_operators_redis, operator, ] ) - for operator in ['gt', 'gte', 'lt', 'lte'] + for operator in numeric_operators_redis.keys() ], ], ) @@ -700,9 +697,16 @@ def test_match_pre_filtering( storage, filter_gen, operator, numeric_operators, start_storage ): n_dim = 128 - da = DocumentArray( - storage=storage, config={'n_dim': n_dim, 'columns': [('price', 'int')]} - ) + + if storage == 'redis': + da = DocumentArray( + storage=storage, + config={'n_dim': n_dim, 'columns': [('price', 'int')], 'flush': True}, + ) + else: + da = DocumentArray( + storage=storage, config={'n_dim': n_dim, 'columns': [('price', 'int')]} + ) da.extend( [ From cad75f76be64ce52e5ca89abca4e3ddac5cb8b5c Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 20 Aug 2022 01:24:07 +0800 Subject: [PATCH 59/93] feat: redis add subindex support --- docarray/array/storage/redis/backend.py | 16 +++++++++++--- docarray/array/storage/redis/find.py | 8 +++++-- docarray/array/storage/redis/getsetdel.py | 6 +++--- docarray/array/storage/redis/seqlike.py | 26 +++++++++++++++++------ 4 files changed, 41 insertions(+), 15 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index bfada762d25..cdfed639f33 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -8,6 +8,7 @@ from redis import Redis from redis.commands.search.field import NumericField, TextField, VectorField +from redis.commands.search.indexDefinition import IndexDefinition if TYPE_CHECKING: from ....typing import ArrayType, DocumentArraySourceType @@ -71,9 +72,10 @@ def _init_storage( if config.redis_config.get('decode_responses'): config.redis_config['decode_responses'] = False - self._offset2id_key = 'offset2id' + self._offset2id_key 
= 'offset2id__' + config.index_name self._config = config self.n_dim = self._config.n_dim + self._doc_prefix = "doc__" + config.index_name + ":" self._config.columns = self._normalize_columns(self._config.columns) self._client = self._build_client() @@ -102,7 +104,10 @@ def _build_client(self): if self._config.flush or self._config.update_schema: schema = self._build_schema_from_redis_config() - client.ft(index_name=self._config.index_name).create_index(schema) + idef = IndexDefinition(prefix=[self._doc_prefix]) + client.ft(index_name=self._config.index_name).create_index( + schema, definition=idef + ) return client @@ -113,6 +118,11 @@ def _ensure_unique_config( config_joined: dict, subindex_name: str, ) -> dict: + if 'index_name' not in config_subindex: + config_joined['index_name'] = ( + config_joined['index_name'] + '_subindex_' + subindex_name + ) + config_joined['flush'] = False return config_joined def _build_schema_from_redis_config(self): @@ -154,7 +164,7 @@ def _build_schema_from_redis_config(self): return schema def _doc_id_exists(self, doc_id): - return self._client.exists(doc_id) + return self._client.exists(self._doc_prefix + doc_id) def _map_embedding(self, embedding: 'ArrayType') -> bytes: if embedding is not None: diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index b6ab824a145..bd2b0b79474 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -42,7 +42,11 @@ def _find_similar_vectors( filters = self._build_fiter(filter) for f in filters: q.add_filter(f) - results = self._client.ft().search(q, query_params).docs + results = ( + self._client.ft(index_name=self._config.index_name) + .search(q, query_params) + .docs + ) da = DocumentArray() for res in results: @@ -73,7 +77,7 @@ def _find_with_filter(self, filter: Dict, limit: Optional[Union[int, float]] = 2 q = Query(s) q.paging(0, limit) - results = self._client.ft().search(q).docs + results = 
self._client.ft(index_name=self._config.index_name).search(q).docs da = DocumentArray() for res in results: diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index d58152aed5f..96e81aa07fe 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -16,7 +16,7 @@ def _get_doc_by_id(self, _id: str) -> 'Document': :return: the retrieved document from redis """ try: - result = self._client.hgetall(_id.encode()) + result = self._client.hgetall(self._doc_prefix + _id) doc = Document.from_base64(result[b'blob']) return doc except Exception as ex: @@ -32,7 +32,7 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): self._del_doc_by_id(_id) payload = self._document_to_redis(value) - self._client.hset(value.id, mapping=payload) + self._client.hset(self._doc_prefix + value.id, mapping=payload) def _del_doc_by_id(self, _id: str): """Concrete implementation of base class' ``_del_doc_by_id`` @@ -40,7 +40,7 @@ def _del_doc_by_id(self, _id: str): :param _id: the id of the document to delete """ if self._doc_id_exists(_id): - self._client.delete(_id) + self._client.delete(self._doc_prefix + _id) def _document_to_redis(self, doc: 'Document') -> Dict: extra_columns = { diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py index 9532bae67af..2eab480976f 100644 --- a/docarray/array/storage/redis/seqlike.py +++ b/docarray/array/storage/redis/seqlike.py @@ -27,12 +27,24 @@ def __len__(self): :return: the length of this :class:`DocumentArrayRedis` object """ try: - dbsize = self._client.dbsize() - if self._client.exists(b'offset2id'): - return dbsize - 1 - else: - return dbsize + # TODO + # method 1 + # keys = self._client.keys(pattern) and add same prefix to all docs in one docarray + # if self._offset2id_key.encode() in keys: + # return len(keys) - 1 + # else: + # return len(keys) + # method 2 + # this way, extend(), insert() funcs have to call 
self._save_offset2ids() + # if self._client.exists(self._offset2id_key.encode()): + # print('offset2id exists') + # return self._client.llen(self._offset2id_key.encode()) + # else: + # return 0 + + # method 3 + return len(self._offset2ids) except: return 0 @@ -65,7 +77,7 @@ def _upload_batch(self, docs: Iterable['Document']): batch = 0 for doc in docs: payload = self._document_to_redis(doc) - pipe.hset(doc.id, mapping=payload) + pipe.hset(self._doc_prefix + doc.id, mapping=payload) batch += 1 if batch >= self._config.batch_size: pipe.execute() @@ -73,7 +85,7 @@ def _upload_batch(self, docs: Iterable['Document']): if batch > 0: pipe.execute() - def extend(self, docs: Iterable['Document']): + def _extend(self, docs: Iterable['Document']): docs = list(docs) self._upload_batch(docs) self._offset2ids.extend([doc.id for doc in docs]) From 75e7a902159dae0f163e3a4a5242a490754ff5e7 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 20 Aug 2022 01:26:25 +0800 Subject: [PATCH 60/93] test: add redis to sub_index related tests --- tests/unit/array/mixins/test_del.py | 1 + tests/unit/array/mixins/test_find.py | 4 +++- tests/unit/array/mixins/test_getset.py | 3 +++ tests/unit/array/mixins/test_match.py | 1 + tests/unit/array/test_sequence.py | 5 +++++ 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_del.py b/tests/unit/array/mixins/test_del.py index 7656679bd81..dffd573e85d 100644 --- a/tests/unit/array/mixins/test_del.py +++ b/tests/unit/array/mixins/test_del.py @@ -116,6 +116,7 @@ def test_del_da_attribute(): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_del_subindex(storage, config): diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index a1825f84c88..f1136ae2f37 100644 --- a/tests/unit/array/mixins/test_find.py +++ 
b/tests/unit/array/mixins/test_find.py @@ -561,6 +561,7 @@ def test_elastic_id_filter(storage, config, limit): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_find_subindex(storage, config): @@ -568,7 +569,7 @@ def test_find_subindex(storage, config): subindex_configs = {'@c': None} if storage == 'sqlite': subindex_configs['@c'] = dict() - elif storage in ['weaviate', 'annlite', 'qdrant', 'elasticsearch']: + elif storage in ['weaviate', 'annlite', 'qdrant', 'elasticsearch', 'redis']: subindex_configs['@c'] = {'n_dim': 2} da = DocumentArray( @@ -615,6 +616,7 @@ def test_find_subindex(storage, config): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_find_subindex_multimodal(storage, config): diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index 8551b1519e1..5eea9761a6c 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -425,6 +425,7 @@ def embeddings_eq(emb1, emb2): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_getset_subindex(storage, config): @@ -507,6 +508,7 @@ def test_getset_subindex(storage, config): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_init_subindex(storage, config): @@ -546,6 +548,7 @@ def test_init_subindex(storage, config): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + 
('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_set_on_subindex(storage, config): diff --git a/tests/unit/array/mixins/test_match.py b/tests/unit/array/mixins/test_match.py index 5a5f5772856..e1babf1571a 100644 --- a/tests/unit/array/mixins/test_match.py +++ b/tests/unit/array/mixins/test_match.py @@ -750,6 +750,7 @@ def embeddings_eq(emb1, emb2): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_match_subindex(storage, config): diff --git a/tests/unit/array/test_sequence.py b/tests/unit/array/test_sequence.py index 76d614c7205..084ea44ac6a 100644 --- a/tests/unit/array/test_sequence.py +++ b/tests/unit/array/test_sequence.py @@ -101,6 +101,7 @@ def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfi if storage == 'redis': config['flush'] = False + config['update_schema'] = False da2 = DocumentArray(storage=storage, config=config) assert len(da2) == 2 @@ -119,6 +120,7 @@ def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfi ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_extend_subindex(storage, config): @@ -164,6 +166,7 @@ def test_extend_subindex(storage, config): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) def test_append_subindex(storage, config): @@ -213,6 +216,7 @@ def embeddings_eq(emb1, emb2): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) @pytest.mark.parametrize( @@ -239,6 +243,7 @@ def 
test_del_and_append(index, storage, config): ('qdrant', {'n_dim': 3, 'distance': 'euclidean'}), ('elasticsearch', {'n_dim': 3, 'distance': 'l2_norm'}), ('sqlite', dict()), + ('redis', {'n_dim': 3, 'distance': 'L2', 'flush': True}), ], ) @pytest.mark.parametrize( From 61360d0f4d098a72d623efde89fcec1dddb3f86d Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 23 Aug 2022 14:03:46 +0800 Subject: [PATCH 61/93] feat: change redis find to pre-filtering --- docarray/array/storage/redis/find.py | 55 +++++++++------------------ tests/unit/array/mixins/test_find.py | 2 +- tests/unit/array/mixins/test_match.py | 2 +- 3 files changed, 21 insertions(+), 38 deletions(-) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index bd2b0b79474..b65c544ab75 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -30,18 +30,17 @@ def _find_similar_vectors( filter: Optional[Dict] = None, limit: Optional[Union[int, float]] = 20, ): + + query_str = self._build_query_str(filter) if filter else "*" + q = ( - Query(f'*=>[KNN {limit} @embedding $vec AS vector_score]') + Query(f'{query_str}=>[KNN {limit} @embedding $vec AS vector_score]') .sort_by('vector_score') .paging(0, limit) .dialect(2) ) query_params = {'vec': to_numpy_array(query).astype(np.float32).tobytes()} - if filter: - filters = self._build_fiter(filter) - for f in filters: - q.add_filter(f) results = ( self._client.ft(index_name=self._config.index_name) .search(q, query_params) @@ -91,48 +90,32 @@ def _filter( return self._find_with_filter(filter, limit=limit) - # TODO return NumericFilter or List[NumericFilter] - def _build_fiter(self, filter: Dict) -> List[NumericFilter]: - INF = "+inf" - NEG_INF = "-inf" - f = [] - - for key in filter: - operator = list(filter[key].keys())[0] - threshold = filter[key][operator] - if operator == '$gt': - f.append(NumericFilter(key, threshold, INF, minExclusive=True)) - elif operator == '$gte': - 
f.append(NumericFilter(key, threshold, INF)) - elif operator == '$lt': - f.append(NumericFilter(key, NEG_INF, threshold, maxExclusive=True)) - elif operator == '$lte': - f.append(NumericFilter(key, NEG_INF, threshold)) - elif operator == '$eq': - f.append(NumericFilter(key, threshold, threshold)) - # TODO add $neq if possible - - return f - def _build_query_str(self, filter: Dict) -> str: INF = "+inf" NEG_INF = "-inf" - s = "" + s = "(" for key in filter: operator = list(filter[key].keys())[0] - threshold = filter[key][operator] + value = filter[key][operator] if operator == '$gt': - s += f"@{key}:[({threshold} {INF}] " + s += f"@{key}:[({value} {INF}] " elif operator == '$gte': - s += f"@{key}:[{threshold} {INF}] " + s += f"@{key}:[{value} {INF}] " elif operator == '$lt': - s += f"@{key}:[{NEG_INF} ({threshold}] " + s += f"@{key}:[{NEG_INF} ({value}] " elif operator == '$lte': - s += f"@{key}:[{NEG_INF} {threshold}] " + s += f"@{key}:[{NEG_INF} {value}] " elif operator == '$eq': - s += f"@{key}:[{threshold} {threshold}] " + if type(value) is int: + s += f"@{key}:[{value} {value}] " + else: + s += f"@{key}:{value} " elif operator == '$neq': - s += f"-@{key}:[{threshold} {threshold}] " + if type(value) is int: + s += f"-@{key}:[{value} {value}] " + else: + s += f"-@{key}:{value} " + s += ")" return s diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index f1136ae2f37..6532f20fa78 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -350,7 +350,7 @@ def test_find_by_tag(storage, config, start_storage): operator, ] ) - for operator in ['$gte', '$gt', '$lte', '$lt', '$eq'] + for operator in numeric_operators_redis.keys() ], ], ) diff --git a/tests/unit/array/mixins/test_match.py b/tests/unit/array/mixins/test_match.py index e1babf1571a..005203d9ef1 100644 --- a/tests/unit/array/mixins/test_match.py +++ b/tests/unit/array/mixins/test_match.py @@ -614,7 +614,7 @@ def 
test_match_ensure_scores_unique(): '$lte': operator.le, '$lt': operator.lt, '$eq': operator.eq, - # '$neq': operator.ne, + '$neq': operator.ne, } From 8a6168d498d2a2b45ce712831b33666743f845fc Mon Sep 17 00:00:00 2001 From: AnneY Date: Tue, 23 Aug 2022 21:45:29 +0800 Subject: [PATCH 62/93] test: add category filter test for redis find --- tests/unit/array/mixins/test_find.py | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 6532f20fa78..625a5fa70ae 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -511,6 +511,47 @@ def test_weaviate_filter_query(start_storage): assert isinstance(da._filter(filter={}), type(da)) +def test_redis_category_filter(start_storage): + n_dim = 128 + da = DocumentArray( + storage='redis', + config={'n_dim': n_dim, 'columns': [('color', 'str')], 'flush': True}, + ) + + da.extend( + [ + Document(id=f'r{i}', embedding=np.random.rand(n_dim), tags={'color': 'red'}) + for i in range(10) + ] + ) + + da.extend( + [ + Document( + id=f'r{i}', embedding=np.random.rand(n_dim), tags={'color': 'blue'} + ) + for i in range(10, 20) + ] + ) + + da.extend( + [ + Document( + id=f'r{i}', embedding=np.random.rand(n_dim), tags={'color': 'green'} + ) + for i in range(20, 30) + ] + ) + + results = da.find(np.random.rand(n_dim), filter={'color': {'$eq': 'red'}}) + assert len(results) > 0 + assert all([(r.tags['color'] == 'red') for r in results]) + + results = da.find(np.random.rand(n_dim), filter={'color': {'$neq': 'red'}}) + assert len(results) > 0 + assert all([(r.tags['color'] != 'red') for r in results]) + + @pytest.mark.parametrize('storage', ['memory']) def test_unsupported_pre_filtering(storage, start_storage): From 4d0505fe7e43ed31a8e7fdb9d96ee15afbab0ac6 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 00:11:11 +0800 Subject: [PATCH 63/93] refactor: using batch_docs in redis extend --- 
docarray/array/storage/redis/seqlike.py | 28 ++++++++----------------- tests/unit/array/test_sequence.py | 9 ++++++++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py index 2eab480976f..33fb9e23845 100644 --- a/docarray/array/storage/redis/seqlike.py +++ b/docarray/array/storage/redis/seqlike.py @@ -1,7 +1,7 @@ from typing import Iterable, Union -from .... import Document -from ..base.seqlike import BaseSequenceLikeMixin +from docarray import Document, DocumentArray +from docarray.array.storage.base.seqlike import BaseSequenceLikeMixin class SequenceLikeMixin(BaseSequenceLikeMixin): @@ -61,31 +61,21 @@ def __contains__(self, x: Union[str, 'Document']): else: return False - # TODO this del is unreachable, del will call __del__ in base/getsetdel - # def __del__(self): - # """Delete this :class:`DocumentArrayRedis` object""" - # self._offset2ids.clear() - def __repr__(self): """Return the string representation of :class:`DocumentArrayRedis` object :return: string representation of this object """ return f'' - def _upload_batch(self, docs: Iterable['Document']): + def _upload_batch(self, batch_of_docs: DocumentArray): pipe = self._client.pipeline() - batch = 0 - for doc in docs: + for doc in batch_of_docs: payload = self._document_to_redis(doc) pipe.hset(self._doc_prefix + doc.id, mapping=payload) - batch += 1 - if batch >= self._config.batch_size: - pipe.execute() - batch = 0 - if batch > 0: - pipe.execute() + pipe.execute() def _extend(self, docs: Iterable['Document']): - docs = list(docs) - self._upload_batch(docs) - self._offset2ids.extend([doc.id for doc in docs]) + da = DocumentArray(docs) + for batch_of_docs in da.batch(self._config.batch_size): + self._upload_batch(batch_of_docs) + self._offset2ids.extend(batch_of_docs[:, 'id']) diff --git a/tests/unit/array/test_sequence.py b/tests/unit/array/test_sequence.py index 084ea44ac6a..4f474965927 100644 --- 
a/tests/unit/array/test_sequence.py +++ b/tests/unit/array/test_sequence.py @@ -1,3 +1,4 @@ +import gc import tempfile import uuid @@ -18,6 +19,11 @@ from tests.conftest import tmpfile +@pytest.fixture() +def ensure_gc(): + gc.collect() + + @pytest.mark.parametrize( 'da_cls,config', [ @@ -90,6 +96,9 @@ def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfi config = config update_config_inplace(config, tmpdir, tmpfile) + if storage == 'redis': + ensure_gc + da = DocumentArray(storage=storage, config=config) with da as da_open: From c0a1f35aa1026d7da53dc61a7b42eb729967ae37 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 21:11:52 +0800 Subject: [PATCH 64/93] feat: add redis bool type support and tests --- docarray/array/storage/redis/backend.py | 4 +--- docarray/array/storage/redis/find.py | 4 ++++ docarray/array/storage/redis/getsetdel.py | 13 +++++++---- tests/unit/array/mixins/test_find.py | 28 +++++++++++++++++++---- tests/unit/array/mixins/test_getset.py | 10 ++++++++ 5 files changed, 47 insertions(+), 12 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index cdfed639f33..15fce7d41e1 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -46,7 +46,7 @@ class BackendMixin(BaseBackendMixin): 'float': TypeMap(type='float', converter=NumericField), 'double': TypeMap(type='double', converter=NumericField), 'long': TypeMap(type='long', converter=NumericField), - # TODO add bool + 'bool': TypeMap(type='long', converter=NumericField), } def _init_storage( @@ -152,10 +152,8 @@ def _build_schema_from_redis_config(self): if self._config.tag_indices: for index in self._config.tag_indices: - # TODO TextField or TagField schema.append(TextField(index)) - # TODO whether to add schema to column (elastic does but qdrant doesn't) for col, coltype in self._config.columns: schema.append(self._map_column(col, coltype)) diff --git 
a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index b65c544ab75..bb7e7239d29 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -109,11 +109,15 @@ def _build_query_str(self, filter: Dict) -> str: elif operator == '$eq': if type(value) is int: s += f"@{key}:[{value} {value}] " + elif type(value) is bool: + s += f"@{key}:[{int(value)} {int(value)}] " else: s += f"@{key}:{value} " elif operator == '$neq': if type(value) is int: s += f"-@{key}:[{value} {value}] " + elif type(value) is bool: + s += f"-@{key}:[{int(value)} {int(value)}] " else: s += f"-@{key}:{value} " s += ")" diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index 96e81aa07fe..4f6c4ba903c 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -1,3 +1,4 @@ +from codecs import unicode_escape_decode from typing import Dict from docarray import Document @@ -43,11 +44,13 @@ def _del_doc_by_id(self, _id: str): self._client.delete(self._doc_prefix + _id) def _document_to_redis(self, doc: 'Document') -> Dict: - extra_columns = { - col: doc.tags.get(col) - for col, _ in self._config.columns - if doc.tags.get(col) is not None - } + extra_columns = {} + + for col, _ in self._config.columns: + tag = doc.tags.get(col) + if tag is not None: + extra_columns[col] = int(tag) if type(tag) is bool else tag + payload = { 'embedding': self._map_embedding(doc.embedding), 'blob': doc.to_base64(), diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 625a5fa70ae..6a7b1bb88dd 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -515,12 +515,20 @@ def test_redis_category_filter(start_storage): n_dim = 128 da = DocumentArray( storage='redis', - config={'n_dim': n_dim, 'columns': [('color', 'str')], 'flush': True}, + config={ + 'n_dim': n_dim, + 'columns': [('color', 
'str'), ('isfake', 'bool')], + 'flush': True, + }, ) da.extend( [ - Document(id=f'r{i}', embedding=np.random.rand(n_dim), tags={'color': 'red'}) + Document( + id=f'r{i}', + embedding=np.random.rand(n_dim), + tags={'color': 'red', 'isfake': True}, + ) for i in range(10) ] ) @@ -528,7 +536,9 @@ def test_redis_category_filter(start_storage): da.extend( [ Document( - id=f'r{i}', embedding=np.random.rand(n_dim), tags={'color': 'blue'} + id=f'r{i}', + embedding=np.random.rand(n_dim), + tags={'color': 'blue', 'isfake': False}, ) for i in range(10, 20) ] @@ -537,7 +547,9 @@ def test_redis_category_filter(start_storage): da.extend( [ Document( - id=f'r{i}', embedding=np.random.rand(n_dim), tags={'color': 'green'} + id=f'r{i}', + embedding=np.random.rand(n_dim), + tags={'color': 'green', 'isfake': False}, ) for i in range(20, 30) ] @@ -551,6 +563,14 @@ def test_redis_category_filter(start_storage): assert len(results) > 0 assert all([(r.tags['color'] != 'red') for r in results]) + results = da.find(np.random.rand(n_dim), filter={'isfake': {'$eq': True}}) + assert len(results) > 0 + assert all([(r.tags['isfake'] == True) for r in results]) + + results = da.find(np.random.rand(n_dim), filter={'isfake': {'$neq': True}}) + assert len(results) > 0 + assert all([(r.tags['isfake'] == False) for r in results]) + @pytest.mark.parametrize('storage', ['memory']) def test_unsupported_pre_filtering(storage, start_storage): diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index 3bba3d6fc2d..59d0791415c 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -1,3 +1,5 @@ +import gc + import numpy as np import pytest import scipy.sparse @@ -499,6 +501,11 @@ def test_getset_subindex(storage, config): assert embeddings_eq(da._subindices['@c']['c_11'].embedding, [-2, -2]) +@pytest.fixture() +def ensure_gc(): + gc.collect() + + @pytest.mark.parametrize( 'storage, config', [ @@ -512,6 +519,9 @@ def 
test_getset_subindex(storage, config): ], ) def test_init_subindex(storage, config): + if storage == 'redis': + ensure_gc + num_top_level_docs = 5 num_chunks_per_doc = 3 subindex_configs = ( From ddd1ebab69e3dbc2a1ad59ce8ac45e7434c9373f Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 21:24:37 +0800 Subject: [PATCH 65/93] refractor: add default values to redis config --- docarray/array/storage/redis/backend.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 15fce7d41e1..c453938d8e8 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -28,10 +28,10 @@ class RedisConfig: tag_indices: List[str] = field(default_factory=list) batch_size: int = field(default=64) method: str = field(default='HNSW') + ef_construction: int = field(default=200) + m: int = field(default=16) + ef_runtime: int = field(default=10) initial_cap: Optional[int] = None - ef_construction: Optional[int] = None - m: Optional[int] = None - ef_runtime: Optional[int] = None block_size: Optional[int] = None columns: Optional[List[Tuple[str, str]]] = None @@ -132,13 +132,11 @@ def _build_schema_from_redis_config(self): 'DISTANCE_METRIC': self._config.distance, } - if self._config.method == 'HNSW' and ( - self._config.m or self._config.ef_construction or self._config.ef_runtime - ): + if self._config.method == 'HNSW': index_options = { - 'M': self._config.m or 16, - 'EF_CONSTRUCTION': self._config.ef_construction or 200, - 'EF_RUNTIME': self._config.ef_runtime or 10, + 'M': self._config.m, + 'EF_CONSTRUCTION': self._config.ef_construction, + 'EF_RUNTIME': self._config.ef_runtime, } index_param.update(index_options) From e85f1f7d55441234ca58bd1f6514cd0f6cde5bd3 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 21:32:24 +0800 Subject: [PATCH 66/93] refractor: remove useless comments --- 
docarray/array/storage/redis/seqlike.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py index 33fb9e23845..b06a7ef726c 100644 --- a/docarray/array/storage/redis/seqlike.py +++ b/docarray/array/storage/redis/seqlike.py @@ -27,23 +27,6 @@ def __len__(self): :return: the length of this :class:`DocumentArrayRedis` object """ try: - # TODO - # method 1 - # keys = self._client.keys(pattern) and add same prefix to all docs in one docarray - # if self._offset2id_key.encode() in keys: - # return len(keys) - 1 - # else: - # return len(keys) - - # method 2 - # this way, extend(), insert() funcs have to call self._save_offset2ids() - # if self._client.exists(self._offset2id_key.encode()): - # print('offset2id exists') - # return self._client.llen(self._offset2id_key.encode()) - # else: - # return 0 - - # method 3 return len(self._offset2ids) except: return 0 From 97bc99ea9814004bf2b560c8455b1d0eb5f6a15a Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 22:30:27 +0800 Subject: [PATCH 67/93] test: fix test_backend for redis --- tests/unit/array/storage/redis/test_backend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index 0204a01226c..0db8d07eb92 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -44,8 +44,6 @@ def da_redis(): @pytest.mark.parametrize( 'method,initial_cap,ef_construction,block_size', [ - ('HNSW', None, None, None), - ('HNSW', 10, 250, None), ('HNSW', 10, 250, 1000000), ('FLAT', 10, 250, 1000000), ], From 3bba4b010b69e9368870c50ea8c9dd72f1dec9d3 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 23:07:51 +0800 Subject: [PATCH 68/93] refractor: simplify key prefix --- docarray/array/storage/redis/backend.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index c453938d8e8..9891e71f3e9 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -11,7 +11,7 @@ from redis.commands.search.indexDefinition import IndexDefinition if TYPE_CHECKING: - from ....typing import ArrayType, DocumentArraySourceType + from docarray.typing import ArrayType, DocumentArraySourceType @dataclass @@ -72,10 +72,10 @@ def _init_storage( if config.redis_config.get('decode_responses'): config.redis_config['decode_responses'] = False - self._offset2id_key = 'offset2id__' + config.index_name + self._offset2id_key = config.index_name + '__offset2id' self._config = config self.n_dim = self._config.n_dim - self._doc_prefix = "doc__" + config.index_name + ":" + self._doc_prefix = config.index_name + ':' self._config.columns = self._normalize_columns(self._config.columns) self._client = self._build_client() @@ -164,7 +164,7 @@ def _doc_id_exists(self, doc_id): def _map_embedding(self, embedding: 'ArrayType') -> bytes: if embedding is not None: - from ....math.ndarray import to_numpy_array + from docarray.math.ndarray import to_numpy_array embedding = to_numpy_array(embedding) From c0f083f9821769633edfc7ceb66d8b3793585529 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 23:11:00 +0800 Subject: [PATCH 69/93] test: add bool type test for redis --- tests/unit/array/storage/redis/test_backend.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index 0db8d07eb92..cfafbb71e77 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -30,6 +30,7 @@ def _save_offset2ids(self): 'long': b'NUMERIC', 'str': b'TEXT', 'bytes': b'TEXT', + 'bool': b'NUMERIC', } @@ -48,15 +49,15 @@ def da_redis(): ('FLAT', 10, 250, 1000000), ], ) 
-@pytest.mark.parametrize('tag_indices', [['attr3'], ['attr3', 'attr4']]) @pytest.mark.parametrize( 'columns', [ [('attr1', 'str'), ('attr2', 'bytes')], [('attr1', 'int'), ('attr2', 'float')], - [('attr1', 'double'), ('attr2', 'long')], + [('attr1', 'double'), ('attr2', 'long'), ('attr3', 'bool')], ], ) +@pytest.mark.parametrize('tag_indices', [['attr4'], ['attr4', 'attr5']]) @pytest.mark.parametrize('index_text', [True, False]) @pytest.mark.parametrize( 'redis_config', From 582b56b655dec3320b3f9b6a8fa1147cc7bf2403 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 23:20:45 +0800 Subject: [PATCH 70/93] refeactor: add default value in redisconfig --- docarray/array/storage/redis/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 9891e71f3e9..9e4f6ee2831 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -31,8 +31,8 @@ class RedisConfig: ef_construction: int = field(default=200) m: int = field(default=16) ef_runtime: int = field(default=10) + block_size: int = field(default=1048576) initial_cap: Optional[int] = None - block_size: Optional[int] = None columns: Optional[List[Tuple[str, str]]] = None @@ -140,7 +140,7 @@ def _build_schema_from_redis_config(self): } index_param.update(index_options) - if self._config.method == 'FLAT' and self._config.block_size: + if self._config.method == 'FLAT': index_options = {'BLOCK_SIZE': self._config.block_size} index_param.update(index_options) From 1e2682b6bfbace0a1a1d1c61c0f2c65180c406b0 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 23:22:37 +0800 Subject: [PATCH 71/93] fix: keep kwargs for future potential use --- docarray/array/storage/redis/find.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index bb7e7239d29..10afdcedbae 100644 --- 
a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -29,6 +29,7 @@ def _find_similar_vectors( query: 'RedisArrayType', filter: Optional[Dict] = None, limit: Optional[Union[int, float]] = 20, + **kwargs, ): query_str = self._build_query_str(filter) if filter else "*" @@ -68,7 +69,8 @@ def _find( query = query.reshape((num_rows, -1)) return [ - self._find_similar_vectors(q, filter=filter, limit=limit) for q in query + self._find_similar_vectors(q, filter=filter, limit=limit, **kwargs) + for q in query ] def _find_with_filter(self, filter: Dict, limit: Optional[Union[int, float]] = 20): From 80cff7ac5cd7dffb0d2afcba554d9319eafa944b Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 23:33:12 +0800 Subject: [PATCH 72/93] refractor: change bool type check --- docarray/array/storage/redis/getsetdel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index 4f6c4ba903c..e6aff837666 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -49,7 +49,7 @@ def _document_to_redis(self, doc: 'Document') -> Dict: for col, _ in self._config.columns: tag = doc.tags.get(col) if tag is not None: - extra_columns[col] = int(tag) if type(tag) is bool else tag + extra_columns[col] = int(tag) if isinstance(tag, bool) else tag payload = { 'embedding': self._map_embedding(doc.embedding), From 549939aa77b2a23709bbed1b499d480e94db4935 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 24 Aug 2022 23:39:02 +0800 Subject: [PATCH 73/93] refractor: specify redis version --- setup.py | 6 +++--- tests/unit/array/docker-compose.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 6957e10c2e9..41a9b4268eb 100644 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ 'annlite>=0.3.2', 'qdrant-client~=0.7.3', 'elasticsearch>=8.2.0', - 'redis>=4.3.4', + 'redis>=4.3.0', ], 
'qdrant': [ 'qdrant-client~=0.7.3', @@ -84,7 +84,7 @@ 'elasticsearch>=8.2.0', ], 'redis': [ - 'redis>=4.3.4', + 'redis>=4.3.0', ], 'test': [ 'pytest', @@ -108,7 +108,7 @@ 'weaviate-client~=3.3.0', 'annlite>=0.3.2', 'elasticsearch>=8.2.0', - 'redis>=4.3.4', + 'redis>=4.3.0', 'jina', ], }, diff --git a/tests/unit/array/docker-compose.yml b/tests/unit/array/docker-compose.yml index 1fda3e4cf20..e6fb1601709 100644 --- a/tests/unit/array/docker-compose.yml +++ b/tests/unit/array/docker-compose.yml @@ -27,7 +27,7 @@ services: networks: - elastic redis: - image: redis/redis-stack:latest + image: redis/redis-stack:6.2.2-v5 ports: - "6379:6379" From 1fce045ad2c5c51a95d89c9e5c98c05793fc5f54 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 00:05:25 +0800 Subject: [PATCH 74/93] feat: add _set_docs_by_ids for redis --- docarray/array/storage/redis/getsetdel.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index e6aff837666..ce209cea245 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -4,6 +4,7 @@ from docarray import Document from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray.array.storage.base.helper import Offset2ID +from typing import Iterable class GetSetDelMixin(BaseGetSetDelMixin): @@ -35,6 +36,21 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): payload = self._document_to_redis(value) self._client.hset(self._doc_prefix + value.id, mapping=payload) + def _set_docs_by_ids(self, ids, docs: Iterable['Document'], mismatch_ids: Dict): + """Overridden implementation of _set_docs_by_ids in order to add docs in batches and flush at the end + + :param ids: the ids used for indexing + """ + pipe = self._client.pipeline() + + for _id, doc in zip(ids, docs): + if _id != doc.id: + self._del_doc_by_id(_id) + payload = self._document_to_redis(doc) + 
pipe.hset(self._doc_prefix + doc.id, mapping=payload) + + pipe.execute() + def _del_doc_by_id(self, _id: str): """Concrete implementation of base class' ``_del_doc_by_id`` From cfd9aa863f4b4eca6ae44dcd99895534cdd6a0f1 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 00:25:10 +0800 Subject: [PATCH 75/93] fix: fix redis set_doc_by_id(s) --- docarray/array/storage/redis/getsetdel.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index ce209cea245..2e179badd9a 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -30,8 +30,9 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): :param _id: the id of doc to update :param value: the document to update to """ + self._del_doc_by_id(_id) if _id != value.id: - self._del_doc_by_id(_id) + self._del_doc_by_id(value.id) payload = self._document_to_redis(value) self._client.hset(self._doc_prefix + value.id, mapping=payload) @@ -41,15 +42,12 @@ def _set_docs_by_ids(self, ids, docs: Iterable['Document'], mismatch_ids: Dict): :param ids: the ids used for indexing """ - pipe = self._client.pipeline() - for _id, doc in zip(ids, docs): + self._del_doc_by_id(_id) if _id != doc.id: - self._del_doc_by_id(_id) - payload = self._document_to_redis(doc) - pipe.hset(self._doc_prefix + doc.id, mapping=payload) + self._del_doc_by_id(doc.id) - pipe.execute() + self._upload_batch(docs) def _del_doc_by_id(self, _id: str): """Concrete implementation of base class' ``_del_doc_by_id`` From b7180871c4d6805e67ab870b9956955c34a681fa Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 00:30:51 +0800 Subject: [PATCH 76/93] refractor: refract gc collect for redis --- tests/unit/array/mixins/test_getset.py | 7 +------ tests/unit/array/test_advance_indexing.py | 9 ++------- tests/unit/array/test_sequence.py | 7 +------ 3 files changed, 4 insertions(+), 19 deletions(-) 
diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index 59d0791415c..3a32ddbc8b4 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -501,11 +501,6 @@ def test_getset_subindex(storage, config): assert embeddings_eq(da._subindices['@c']['c_11'].embedding, [-2, -2]) -@pytest.fixture() -def ensure_gc(): - gc.collect() - - @pytest.mark.parametrize( 'storage, config', [ @@ -520,7 +515,7 @@ def ensure_gc(): ) def test_init_subindex(storage, config): if storage == 'redis': - ensure_gc + gc.collect() num_top_level_docs = 5 num_chunks_per_doc = 3 diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 7bb456c59b5..0e1ff6884e1 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -600,11 +600,6 @@ def test_single_boolean_and_padding(storage, start_storage): assert len(da[True, False, False]) == 1 -@pytest.fixture() -def ensure_gc(): - gc.collect() - - @pytest.mark.parametrize( 'storage,config_gen', [ @@ -617,7 +612,7 @@ def ensure_gc(): ('redis', lambda: RedisConfig(n_dim=123, flush=True)), ], ) -def test_edge_case_two_strings(storage, config_gen, ensure_gc, start_storage): +def test_edge_case_two_strings(storage, config_gen, start_storage): # getitem if config_gen: da = DocumentArray(storage=storage, config=config_gen()) @@ -684,7 +679,7 @@ def test_edge_case_two_strings(storage, config_gen, ensure_gc, start_storage): da['1', 'hellohello'] = 'hello' if storage == 'redis': - ensure_gc + gc.collect() @pytest.mark.parametrize( diff --git a/tests/unit/array/test_sequence.py b/tests/unit/array/test_sequence.py index 4f474965927..33b25b8db4d 100644 --- a/tests/unit/array/test_sequence.py +++ b/tests/unit/array/test_sequence.py @@ -19,11 +19,6 @@ from tests.conftest import tmpfile -@pytest.fixture() -def ensure_gc(): - gc.collect() - - @pytest.mark.parametrize( 'da_cls,config', [ @@ -97,7 
+92,7 @@ def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfi update_config_inplace(config, tmpdir, tmpfile) if storage == 'redis': - ensure_gc + gc.collect() da = DocumentArray(storage=storage, config=config) From c23f630c050c6c3f6d40f2bd489d7ef4e8c5a965 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 08:43:20 +0800 Subject: [PATCH 77/93] feat: add _get_docs_by_ids for redis --- docarray/array/storage/redis/getsetdel.py | 32 ++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index 2e179badd9a..70347b9ddd2 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -4,7 +4,7 @@ from docarray import Document from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray.array.storage.base.helper import Offset2ID -from typing import Iterable +from typing import Sequence, Iterable class GetSetDelMixin(BaseGetSetDelMixin): @@ -24,6 +24,36 @@ def _get_doc_by_id(self, _id: str) -> 'Document': except Exception as ex: raise KeyError(_id) from ex + def _get_docs_by_ids(self, ids: Sequence[str]) -> Iterable['Document']: + """Concrete implementation of base class' ``_get_docs_by_ids`` + + :param ids: ids of the document + :return: Iterable[Document] + """ + + accumulated_docs = [] + accumulated_docs_id_not_found = [] + + if not ids: + return accumulated_docs + + pipe = self._client.pipeline() + for id in ids: + pipe.hgetall(self._doc_prefix + id) + + results = pipe.execute() + + for i, result in enumerate(results): + if result: + accumulated_docs.append(Document.from_base64(result[b'blob'])) + else: + accumulated_docs_id_not_found.append(ids[i]) + + if accumulated_docs_id_not_found: + raise KeyError(accumulated_docs_id_not_found, accumulated_docs) + + return accumulated_docs + def _set_doc_by_id(self, _id: str, value: 'Document'): """Concrete 
implementation of base class' ``_set_doc_by_id`` From 8dbc5ee2947f189f75d8ededbef10818dda6bba9 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 15:47:09 +0800 Subject: [PATCH 78/93] refractor: support find_by_text in future --- docarray/array/storage/redis/backend.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 9e4f6ee2831..53d75676193 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -148,15 +148,9 @@ def _build_schema_from_redis_config(self): index_param['INITIAL_CAP'] = self._config.initial_cap schema = [VectorField('embedding', self._config.method, index_param)] - if self._config.tag_indices: - for index in self._config.tag_indices: - schema.append(TextField(index)) - for col, coltype in self._config.columns: schema.append(self._map_column(col, coltype)) - if self._config.index_text: - schema.append(TextField('text')) return schema def _doc_id_exists(self, doc_id): From 55a6c1c520286486746fe4475c2c492c717584bc Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 17:33:26 +0800 Subject: [PATCH 79/93] feat: add doc.id to redis payload --- docarray/array/storage/redis/getsetdel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index 70347b9ddd2..30d9f86e066 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -96,6 +96,7 @@ def _document_to_redis(self, doc: 'Document') -> Dict: extra_columns[col] = int(tag) if isinstance(tag, bool) else tag payload = { + 'id': doc.id, 'embedding': self._map_embedding(doc.embedding), 'blob': doc.to_base64(), **extra_columns, From 173066ad637825c0d873d27e868f3dc5220bfd92 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 17:35:19 +0800 Subject: [PATCH 80/93] fix: remove find_text related --- 
docarray/array/storage/redis/backend.py | 3 +- .../unit/array/storage/redis/test_backend.py | 47 ++++--------------- 2 files changed, 11 insertions(+), 39 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 53d75676193..05e006e776a 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -24,7 +24,6 @@ class RedisConfig: update_schema: bool = field(default=True) distance: str = field(default='COSINE') redis_config: Dict[str, Any] = field(default_factory=dict) - index_text: bool = field(default=False) tag_indices: List[str] = field(default_factory=list) batch_size: int = field(default=64) method: str = field(default='HNSW') @@ -99,7 +98,7 @@ def _build_client(self): client.flushdb() if self._config.update_schema: - if self._config.index_name in client.execute_command('FT._LIST'): + if self._config.index_name.encode() in client.execute_command('FT._LIST'): client.ft(index_name=self._config.index_name).dropindex() if self._config.flush or self._config.update_schema: diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index cfafbb71e77..04de55febba 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -1,9 +1,7 @@ from abc import ABC -import numpy as np import pytest from docarray import DocumentArray -from docarray.array.storage.base.helper import Offset2ID from docarray.array.storage.memory import GetSetDelMixin, SequenceLikeMixin from docarray.array.storage.redis.backend import BackendMixin, RedisConfig @@ -57,8 +55,6 @@ def da_redis(): [('attr1', 'double'), ('attr2', 'long'), ('attr3', 'bool')], ], ) -@pytest.mark.parametrize('tag_indices', [['attr4'], ['attr4', 'attr5']]) -@pytest.mark.parametrize('index_text', [True, False]) @pytest.mark.parametrize( 'redis_config', [ @@ -71,13 +67,11 @@ def da_redis(): ) def test_init_storage( distance, - 
tag_indices, columns, method, initial_cap, ef_construction, block_size, - index_text, redis_config, start_storage, ): @@ -85,13 +79,11 @@ def test_init_storage( n_dim=128, distance=distance, flush=True, - tag_indices=tag_indices, columns=columns, method=method, initial_cap=initial_cap, ef_construction=ef_construction, block_size=block_size, - index_text=index_text, redis_config=redis_config, ) redis_da = DocumentArrayDummy(storage='redis', config=cfg) @@ -100,45 +92,26 @@ def test_init_storage( assert redis_da._client.ft().info()['attributes'][0][1] == b'embedding' assert redis_da._client.ft().info()['attributes'][0][5] == b'VECTOR' - for i in range(len(tag_indices)): + for i in range(len(columns)): assert redis_da._client.ft().info()['attributes'][i + 1][1] == bytes( - redis_da._config.tag_indices[i], 'utf-8' + redis_da._config.columns[i][0], 'utf-8' ) - assert redis_da._client.ft().info()['attributes'][i + 1][5] == b'TEXT' - - for i in range(len(columns)): - assert redis_da._client.ft().info()['attributes'][i + len(tag_indices) + 1][ - 1 - ] == bytes(redis_da._config.columns[i][0], 'utf-8') assert ( - redis_da._client.ft().info()['attributes'][i + len(tag_indices) + 1][5] + redis_da._client.ft().info()['attributes'][i + 1][5] == type_convert[redis_da._config.columns[i][1]] ) - if index_text: - assert redis_da._client.ft().info()['attributes'][-1][1] == b'text' - assert redis_da._client.ft().info()['attributes'][-1][5] == b'TEXT' - def test_init_storage_update_schema(start_storage): - index = 'aaa' - cfg = RedisConfig(n_dim=128, tag_indices=['attr1'], index_name=index, flush=True) - redis_da = DocumentArrayDummy(storage='redis', config=cfg) - assert redis_da._client.ft(index).info()['attributes'][1][1] == b'attr1' - cfg = RedisConfig(n_dim=128, tag_indices=['attr2'], update_schema=False) + cfg = RedisConfig(n_dim=128, columns=[('attr1', 'str')], flush=True) redis_da = DocumentArrayDummy(storage='redis', config=cfg) - assert 
redis_da._client.ft(index).info()['attributes'][1][1] == b'attr1' + assert redis_da._client.ft().info()['attributes'][1][1] == b'attr1' - index2 = 'bbb' - cfg = RedisConfig( - n_dim=128, tag_indices=['attr2'], index_name=index2, update_schema=True - ) + cfg = RedisConfig(n_dim=128, columns=[('attr2', 'str')], update_schema=False) redis_da = DocumentArrayDummy(storage='redis', config=cfg) - assert redis_da._client.ft(index).info()['attributes'][1][1] == b'attr1' - assert redis_da._client.ft(index2).info()['attributes'][1][1] == b'attr2' + assert redis_da._client.ft().info()['attributes'][1][1] == b'attr1' - -def test_init_storage_empty_config(start_storage): - with pytest.raises(ValueError): - redis_da = DocumentArrayDummy(storage='redis') + cfg = RedisConfig(n_dim=128, columns=[('attr2', 'str')], update_schema=True) + redis_da = DocumentArrayDummy(storage='redis', config=cfg) + assert redis_da._client.ft().info()['attributes'][1][1] == b'attr2' From 59dc17fc10a3369fc86b2f70d5a4922a7b2ec053 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 19:21:40 +0800 Subject: [PATCH 81/93] doc: add doc for redis storage backend --- docs/advanced/document-store/redis.md | 113 ++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 docs/advanced/document-store/redis.md diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md new file mode 100644 index 00000000000..2d243ba3ae2 --- /dev/null +++ b/docs/advanced/document-store/redis.md @@ -0,0 +1,113 @@ +(redis)= + +# Redis + +One can use [Redis](https://redis.io) as the document store for DocumentArray. It is useful when one wants to have faster Document retrieval on embeddings, i.e. `.match()`, `.find()`. + +````{tip} +This feature requires `redis`. You can install it via `pip install "docarray[redis]".` +```` + +## Usage + +### Start Redis service + +To use Redis as the storage backend, it is required to have the Redis service started. 
Create `docker-compose.yml` as follows: + +```yaml +version: "3.3" +services: + redis: + image: redis/redis-stack:6.2.2-v5 + ports: + - "6379:6379" +``` + +Then + +```bash +docker-compose up +``` + +### Create DocumentArray with Redis backend + +Assuming service is started using the default configuration (i.e. server address is `http://localhost:6379`), one can instantiate a DocumentArray with Redis storage as such: + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='redis', config={'n_dim': 128}) +``` + +The usage would be the same as the ordinary DocumentArray, but the dimension of an embedding for a Document must be provided at creation time. + +**Currently, one Redis server instance is only supoorted to store one DocumentArray.** To access a DocumentArray formerly persisted, one can specify `host` and `port`. + +The following example will build a DocumentArray with previously stored data on `http://localhost:6379`: + +```python +from docarray import DocumentArray, Document + +da = DocumentArray( + storage='redis', + config={'n_dim': 128}, +) + +da.extend([Document() for _ in range(1000)]) + +da2 = DocumentArray( + storage='redis', + config={'n_dim': 128}, +) + +da2.summary() +``` + +```text + Documents Summary + + Length 1000 + Homogenous Documents True + Common Attributes ('id',) + Multimodal dataclass False + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ───────────────────────────────────────────────────────────── + id ('str',) 1000 False +``` + +To store a new DocumentArray in current Redis server, one can set `flush` to `True` so that previous DocumentArray will be cleared: + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='redis', config={'n_dim': 128, 'flush': True}) +``` + +Other functions behave the same as in-memory DocumentArray. 
+ + +## Config + +The following configs can be set: + +| Name | Description | Default | +|-------------------|---------------------------------------------------------------------------------------------------|---------------------------------------------------------| +| `host` | Host address of the Redis server | | +| `port` | Port of the Redis Server | | +| `redis_config` | Other Redis configs in a Dict and pass to `Redis` client constructor, e.g. `socket_timeout`, `ssl`| | +| `index_name` | Redis index name; the name of RedisSearch index to set this DocumentArray | | +| `n_dim` | Dimensionality of the embeddings | | +| `flush` | Boolean flag indicating whether to clear previous DocumentArray in Redis | | +| `update_schema` | Boolean flag indicating whether to update Redis Search schema | | +| `distance` | Similarity distance metric in Redis | | +| `batch_size` | Batch size used to handle storage updates | | +| `method` | Vector similarity index algorithm in Redis | +| `ef_construction` | Optional parameter for Redis HNSW algorithm | | +| `m` | Optional parameter for Redis HNSW algorithm | | +| `ef_runtime` | Optional parameter for Redis HNSW algorithm | | +| `block_size` | Optional parameter for Redis FLAT algorithm | | +| `initial_cap` | Optional parameter for Redis HNSW and FLAT algorithm | | +| `columns` | Other fields to stora in Document and build schema | | \ No newline at end of file From 1dce1400d775143d1caa031410c68f2718c69b07 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 20:26:16 +0800 Subject: [PATCH 82/93] fix: support find_text related in future --- docarray/array/storage/redis/getsetdel.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index 30d9f86e066..709d404d45e 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -102,11 +102,6 @@ def _document_to_redis(self, doc: 'Document') -> Dict: 
**extra_columns, } - if self._config.tag_indices: - for index in self._config.tag_indices: - if doc.tags.get(index) is not None: - payload[index] = doc.tags.get(index) - if doc.text: payload['text'] = doc.text return payload From 4f5ff4ad5c84ed4ec2819079aefbde3d8475b773 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 21:13:54 +0800 Subject: [PATCH 83/93] refractor: change to $ne according to mongo --- docarray/array/storage/redis/find.py | 2 +- tests/unit/array/mixins/test_find.py | 6 +++--- tests/unit/array/mixins/test_match.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index 10afdcedbae..2459333c1ed 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -115,7 +115,7 @@ def _build_query_str(self, filter: Dict) -> str: s += f"@{key}:[{int(value)} {int(value)}] " else: s += f"@{key}:{value} " - elif operator == '$neq': + elif operator == '$ne': if type(value) is int: s += f"-@{key}:[{value} {value}] " elif type(value) is bool: diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 6a7b1bb88dd..974252c76ce 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -258,7 +258,7 @@ def test_find_by_tag(storage, config, start_storage): '$lte': operator.le, '$lt': operator.lt, '$eq': operator.eq, - '$neq': operator.ne, + '$ne': operator.ne, } @@ -559,7 +559,7 @@ def test_redis_category_filter(start_storage): assert len(results) > 0 assert all([(r.tags['color'] == 'red') for r in results]) - results = da.find(np.random.rand(n_dim), filter={'color': {'$neq': 'red'}}) + results = da.find(np.random.rand(n_dim), filter={'color': {'$ne': 'red'}}) assert len(results) > 0 assert all([(r.tags['color'] != 'red') for r in results]) @@ -567,7 +567,7 @@ def test_redis_category_filter(start_storage): assert len(results) > 0 assert all([(r.tags['isfake'] 
== True) for r in results]) - results = da.find(np.random.rand(n_dim), filter={'isfake': {'$neq': True}}) + results = da.find(np.random.rand(n_dim), filter={'isfake': {'$ne': True}}) assert len(results) > 0 assert all([(r.tags['isfake'] == False) for r in results]) diff --git a/tests/unit/array/mixins/test_match.py b/tests/unit/array/mixins/test_match.py index 005203d9ef1..becb0f8a633 100644 --- a/tests/unit/array/mixins/test_match.py +++ b/tests/unit/array/mixins/test_match.py @@ -614,7 +614,7 @@ def test_match_ensure_scores_unique(): '$lte': operator.le, '$lt': operator.lt, '$eq': operator.eq, - '$neq': operator.ne, + '$ne': operator.ne, } From e23e7efe8e804f49a795f0652cd9633333ddc980 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 23:24:51 +0800 Subject: [PATCH 84/93] fix: remove _find_by_text related --- docarray/array/storage/redis/backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 05e006e776a..41728834714 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -24,7 +24,6 @@ class RedisConfig: update_schema: bool = field(default=True) distance: str = field(default='COSINE') redis_config: Dict[str, Any] = field(default_factory=dict) - tag_indices: List[str] = field(default_factory=list) batch_size: int = field(default=64) method: str = field(default='HNSW') ef_construction: int = field(default=200) From ac6352ca65a44358f49fff467839dc4b9f41c1f0 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 23:26:40 +0800 Subject: [PATCH 85/93] refractor: update _upload_batch --- docarray/array/storage/redis/seqlike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/redis/seqlike.py b/docarray/array/storage/redis/seqlike.py index b06a7ef726c..3db561daef3 100644 --- a/docarray/array/storage/redis/seqlike.py +++ b/docarray/array/storage/redis/seqlike.py @@ -50,7 +50,7 @@ def 
__repr__(self): """ return f'' - def _upload_batch(self, batch_of_docs: DocumentArray): + def _upload_batch(self, batch_of_docs: Iterable['Document']): pipe = self._client.pipeline() for doc in batch_of_docs: payload = self._document_to_redis(doc) From 5f8ff6fa86599da2cba3f138f5083a5844eb062a Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 23:28:40 +0800 Subject: [PATCH 86/93] test: update redis related tests --- tests/unit/array/docker-compose.yml | 2 +- tests/unit/array/mixins/test_find.py | 7 +++++++ tests/unit/array/mixins/test_match.py | 11 ++++++++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/unit/array/docker-compose.yml b/tests/unit/array/docker-compose.yml index e6fb1601709..07de4842154 100644 --- a/tests/unit/array/docker-compose.yml +++ b/tests/unit/array/docker-compose.yml @@ -27,7 +27,7 @@ services: networks: - elastic redis: - image: redis/redis-stack:6.2.2-v5 + image: redislabs/redisearch:2.6.0 ports: - "6379:6379" diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 974252c76ce..6ad66f22a5b 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -74,6 +74,9 @@ def test_find(storage, config, limit, query, start_storage): t['cosine_similarity'].value for t in result[:, 'scores'] ] assert sorted(cosine_similarities, reverse=True) == cosine_similarities + if storage == 'redis': + cosine_distances = [t['score'].value for t in da[:, 'scores']] + assert sorted(cosine_distances, reverse=False) == cosine_distances elif storage in ['memory', 'annlite', 'elasticsearch']: cosine_distances = [t['cosine'].value for t in da[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances @@ -84,6 +87,10 @@ def test_find(storage, config, limit, query, start_storage): t['cosine_similarity'].value for t in da[:, 'scores'] ] assert sorted(cosine_similarities, reverse=True) == cosine_similarities + if storage == 'redis': + for da in 
result: + cosine_distances = [t['score'].value for t in da[:, 'scores']] + assert sorted(cosine_distances, reverse=False) == cosine_distances elif storage in ['memory', 'annlite', 'elasticsearch']: for da in result: cosine_distances = [t['cosine'].value for t in da[:, 'scores']] diff --git a/tests/unit/array/mixins/test_match.py b/tests/unit/array/mixins/test_match.py index becb0f8a633..f246a928e04 100644 --- a/tests/unit/array/mixins/test_match.py +++ b/tests/unit/array/mixins/test_match.py @@ -92,9 +92,14 @@ def test_match(storage, config, doc_lists, limit, exclude_self, start_storage): for m in D1[:, 'matches']: assert len(m) == limit - expected_sorted_values = [ - D1[0].matches[i].scores['cosine'].value for i in range(limit) - ] + if storage == 'redis': + expected_sorted_values = [ + D1[0].matches[i].scores['score'].value for i in range(limit) + ] + else: + expected_sorted_values = [ + D1[0].matches[i].scores['cosine'].value for i in range(limit) + ] assert expected_sorted_values == sorted(expected_sorted_values) From e74bca80df8a0b2b45ede8833824367dd4a063e8 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 23:29:08 +0800 Subject: [PATCH 87/93] docs: update redis doc --- docs/advanced/document-store/index.md | 1 + docs/advanced/document-store/redis.md | 303 ++++++++++++++++++++++---- 2 files changed, 263 insertions(+), 41 deletions(-) diff --git a/docs/advanced/document-store/index.md b/docs/advanced/document-store/index.md index 93739a044c8..0c79d816807 100644 --- a/docs/advanced/document-store/index.md +++ b/docs/advanced/document-store/index.md @@ -9,6 +9,7 @@ annlite qdrant elasticsearch weaviate +redis extend benchmark ``` diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 2d243ba3ae2..94c7e6f6813 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -1,5 +1,4 @@ (redis)= - # Redis One can use [Redis](https://redis.io) as the document store for 
DocumentArray. It is useful when one wants to have faster Document retrieval on embeddings, i.e. `.match()`, `.find()`. @@ -18,7 +17,7 @@ To use Redis as the storage backend, it is required to have the Redis service st version: "3.3" services: redis: - image: redis/redis-stack:6.2.2-v5 + image: redislabs/redisearch:2.6.0 ports: - "6379:6379" ``` @@ -31,7 +30,7 @@ docker-compose up ### Create DocumentArray with Redis backend -Assuming service is started using the default configuration (i.e. server address is `http://localhost:6379`), one can instantiate a DocumentArray with Redis storage as such: +Assuming service is started using the default configuration (i.e. server address is `localhost:6379`), one can instantiate a DocumentArray with Redis storage as such: ```python from docarray import DocumentArray @@ -41,19 +40,30 @@ da = DocumentArray(storage='redis', config={'n_dim': 128}) The usage would be the same as the ordinary DocumentArray, but the dimension of an embedding for a Document must be provided at creation time. -**Currently, one Redis server instance is only supoorted to store one DocumentArray.** To access a DocumentArray formerly persisted, one can specify `host` and `port`. +```{caution} +Currently, one Redis server instance can only support to store a single DocumentArray. +``` + +To store a new DocumentArray in current Redis server, one can set `flush` to `True` so that previous DocumentArray will be cleared: + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='redis', config={'n_dim': 128, 'flush': True}) +``` + +To access a previously stored DocumentArray, one can specify `host` and `port`. 
-The following example will build a DocumentArray with previously stored data on `http://localhost:6379`: +The following example will build a DocumentArray with previously stored data on `localhost:6379`: ```python from docarray import DocumentArray, Document -da = DocumentArray( +with DocumentArray( storage='redis', - config={'n_dim': 128}, -) - -da.extend([Document() for _ in range(1000)]) + config={'n_dim': 128, 'flush': True}, +) as da: + da.extend([Document() for _ in range(1000)]) da2 = DocumentArray( storage='redis', @@ -64,50 +74,261 @@ da2.summary() ``` ```text - Documents Summary +╭────────────── Documents Summary ──────────────╮ +│ │ +│ Type DocumentArrayRedis │ +│ Length 1000 │ +│ Homogenous Documents True │ +│ Common Attributes ('id',) │ +│ Multimodal dataclass False │ +│ │ +╰───────────────────────────────────────────────╯ +╭───────────────────── Attributes Summary ─────────────────────╮ +│ │ +│ Attribute Data type #Unique values Has empty value │ +│ ────────────────────────────────────────────────────────── │ +│ id ('str',) 1000 False │ +│ │ +╰──────────────────────────────────────────────────────────────╯ +╭─── DocumentArrayRedis Config ───╮ +│ │ +│ n_dim 128 │ +│ host localhost │ +│ port 6379 │ +│ index_name idx │ +│ flush False │ +│ update_schema True │ +│ distance COSINE │ +│ redis_config {} │ +│ batch_size 64 │ +│ method HNSW │ +│ ef_construction 200 │ +│ m 16 │ +│ ef_runtime 10 │ +│ block_size 1048576 │ +│ initial_cap None │ +│ columns [] │ +│ │ +╰─────────────────────────────────╯ +``` + + + +Other functions behave the same as in-memory DocumentArray. + + +### Vector search with filter query - Length 1000 - Homogenous Documents True - Common Attributes ('id',) - Multimodal dataclass False +One can perform Vector Similarity Search based on FLAT or HNSW algorithm and pre-filter results using a filter query that is based on [MongoDB's Query](https://www.mongodb.com/docs/manual/reference/operator/query/). 
We currently support a subset of those selectors: - Attributes Summary +- `$eq` - Equal to (number, string) +- `$ne` - Not equal to (number, string) +- `$gt` - Greater than (number) +- `$gte` - Greater than or equal to (number) +- `$lt` - Less than (number) +- `$lte` - Less than or equal to (number) - Attribute Data type #Unique values Has empty value - ───────────────────────────────────────────────────────────── - id ('str',) 1000 False + +Consider Documents with embeddings `[0,0,0]` up to `[9,9,9]` where the document with embedding `[i,i,i]` +has tag `price` with number value and tag `color` with string value. We can create such example with the following code: + +```python +import numpy as np +from docarray import Document, DocumentArray + +n_dim = 3 + +da = DocumentArray( + storage='redis', + config={ + 'n_dim': n_dim, + 'columns': [('price', 'int'), ('color', 'str')], + 'flush': True, + }, +) + +da.extend( + [ + Document( + id=f'{i}', embedding=i * np.ones(n_dim), tags={'price': i, 'color': 'red'} + ) + for i in range(10) + ] +) +da.extend( + [ + Document( + id=f'{i+10}', + embedding=i * np.ones(n_dim), + tags={'price': i, 'color': 'blue'}, + ) + for i in range(10) + ] +) + +print('\nIndexed prices and colors:\n') +for embedding, price, color in zip( + da.embeddings, da[:, 'tags__price'], da[:, 'tags__color'] +): + print(f'\tembedding={embedding},\t price={price},\t color={color}') ``` -To store a new DocumentArray in current Redis server, one can set `flush` to `True` so that previous DocumentArray will be cleared: +Consider we want the nearest vectors to the embedding `[8. 8. 8.]`, with the restriction that +prices and color must follow a filter. For example, let's consider that retrieved documents must have `price` value lower then or equal to `max_price` and have `color` equal to `color`. We can encode this information in Redis using `{'price': {'$lte': max_price}, 'color': {'$eq': color}}`. 
+Then the search with the proposed filter can be implemented and used with the following code: ```python -from docarray import DocumentArray +max_price = 7 +color = 'red' +n_limit = 5 -da = DocumentArray(storage='redis', config={'n_dim': 128, 'flush': True}) +np_query = np.ones(n_dim) * 8 +print(f'\nQuery vector: \t{np_query}') + +filter = {'price': {'$lte': max_price}, 'color': {'$eq': color}} +results = da.find(np_query, filter=filter, limit=n_limit) + +print( + '\nEmbeddings Approximate Nearest Neighbours with "price" at most 7 and "color" red:\n' +) +for embedding, price, color, score in zip( + results.embeddings, + results[:, 'tags__price'], + results[:, 'tags__color'], + results[:, 'scores'], +): + print( + f' score={score["score"].value},\t embedding={embedding},\t price={price},\t color={color}' + ) ``` -Other functions behave the same as in-memory DocumentArray. +This would print: + +```text +Embeddings Approximate Nearest Neighbours with "price" at most 7 and "color" red: + + embedding=[3. 3. 3.], price=3, color=red, score=0 + embedding=[6. 6. 6.], price=6, color=red, score=0 + embedding=[1. 1. 1.], price=1, color=red, score=5.96046447754e-08 + embedding=[2. 2. 2.], price=2, color=red, score=5.96046447754e-08 + embedding=[4. 4. 4.], price=4, color=red, score=5.96046447754e-08 +``` + +### Update Vector Search Indexing Schema + +Redis vector similarity supports two indexing methods: + +- FLAT - Brute-force index. + +- HNSW - Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs. + +Both methods have some mandatory parameters and optional parameters. + +```{tip} +You can read more about HNSW or FLAT parameters and their default values [here](https://redis.io/docs/stack/search/reference/vectors/#querying-vector-fields). +``` + +You can update the search indexing schema on existing DocumentArray by simply set `update_schema` to `True` and change your config parameters. 
+ +Consider you store Documents with default indexing method `'HNSW'` and distance `'COSINE'`, and find the nearest vectors to the embedding `[8. 8. 8.]`. + +```python +import numpy as np +from docarray import Document, DocumentArray + +n_dim = 3 + +da = DocumentArray( + storage='redis', + config={ + 'n_dim': n_dim, + 'flush': True, + }, +) + +da.extend([Document(id=f'{i}', embedding=i * np.ones(n_dim)) for i in range(10)]) + +np_query = np.ones(n_dim) * 8 +n_limit = 5 + +results = da.find(np_query, limit=n_limit) + +print('\nEmbeddings Approximate Nearest Neighbours:\n') +for embedding, score in zip( + results.embeddings, + results[:, 'scores'], +): + print(f' embedding={embedding},\t score={score["score"].value}') +``` + +This would print: + +```text +Embeddings Approximate Nearest Neighbours: + + embedding=[3. 3. 3.], score=0 + embedding=[6. 6. 6.], score=0 + embedding=[1. 1. 1.], score=5.96046447754e-08 + embedding=[2. 2. 2.], score=5.96046447754e-08 + embedding=[4. 4. 4.], score=5.96046447754e-08 +``` + +Then you can use a different search indexing schema on current DocumentArray as follows: +```python +da2 = DocumentArray( + storage='redis', + config={'n_dim': n_dim, 'update_schema': True, 'distance': 'L2'}, +) + +results = da.find(np_query, limit=n_limit) + +print('\nEmbeddings Approximate Nearest Neighbours:\n') +for embedding, score in zip( + results.embeddings, + results[:, 'scores'], +): + print(f' embedding={embedding},\t score={score["score"].value}') +``` + +This would print: + +```text +Embeddings Approximate Nearest Neighbours: + + embedding=[8. 8. 8.], score=0 + embedding=[9. 9. 9.], score=3 + embedding=[7. 7. 7.], score=3 + embedding=[6. 6. 6.], score=12 + embedding=[5. 5. 
5.], score=27 +``` ## Config The following configs can be set: -| Name | Description | Default | -|-------------------|---------------------------------------------------------------------------------------------------|---------------------------------------------------------| -| `host` | Host address of the Redis server | | -| `port` | Port of the Redis Server | | -| `redis_config` | Other Redis configs in a Dict and pass to `Redis` client constructor, e.g. `socket_timeout`, `ssl`| | -| `index_name` | Redis index name; the name of RedisSearch index to set this DocumentArray | | -| `n_dim` | Dimensionality of the embeddings | | -| `flush` | Boolean flag indicating whether to clear previous DocumentArray in Redis | | -| `update_schema` | Boolean flag indicating whether to update Redis Search schema | | -| `distance` | Similarity distance metric in Redis | | -| `batch_size` | Batch size used to handle storage updates | | -| `method` | Vector similarity index algorithm in Redis | -| `ef_construction` | Optional parameter for Redis HNSW algorithm | | -| `m` | Optional parameter for Redis HNSW algorithm | | -| `ef_runtime` | Optional parameter for Redis HNSW algorithm | | -| `block_size` | Optional parameter for Redis FLAT algorithm | | -| `initial_cap` | Optional parameter for Redis HNSW and FLAT algorithm | | -| `columns` | Other fields to stora in Document and build schema | | \ No newline at end of file +| Name | Description | Default | +|-------------------|---------------------------------------------------------------------------------------------------|-------------------------------------------------- | +| `host` | Host address of the Redis server | `'localhost'` | +| `port` | Port of the Redis Server | `6379` | +| `redis_config` | Other Redis configs in a Dict and pass to `Redis` client constructor, e.g. 
`socket_timeout`, `ssl`| `{}` | +| `index_name` | Redis index name; the name of RedisSearch index to set this DocumentArray | `'idx'` | +| `n_dim` | Dimensionality of the embeddings | `None` | +| `flush` | Boolean flag indicating whether to clear previous DocumentArray in Redis | `False` | +| `update_schema` | Boolean flag indicating whether to update Redis Search schema | `True` | +| `distance` | Similarity distance metric in Redis | `'COSINE'` | +| `batch_size` | Batch size used to handle storage updates | `64` | +| `method` | Vector similarity index algorithm in Redis | `'HNSW'` | +| `ef_construction` | Optional parameter for Redis HNSW algorithm | `200` | +| `m` | Optional parameter for Redis HNSW algorithm | `16` | +| `ef_runtime` | Optional parameter for Redis HNSW algorithm | `10` | +| `block_size` | Optional parameter for Redis FLAT algorithm | `1048576` | +| `initial_cap` | Optional parameter for Redis HNSW and FLAT algorithm | `None`, defaults to the default value in Redis | +| `columns` | Other fields to store in Document and build schema | `None` | + +You can check the default values in [the docarray source code](https://github.com/jina-ai/docarray/blob/main/docarray/array/storage/redis/backend.py) + + +```{note} +This Document Store will support storing multiple DocumentArrays and full-text search soon. 
+``` From 73b6dbe057705dab021fdfa33aa71593061bde10 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 25 Aug 2022 23:48:21 +0800 Subject: [PATCH 88/93] test: update test after removing text related --- .../array/storage/redis/test_getsetdel.py | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/tests/unit/array/storage/redis/test_getsetdel.py b/tests/unit/array/storage/redis/test_getsetdel.py index 54ec280c469..cd2b3f3d43c 100644 --- a/tests/unit/array/storage/redis/test_getsetdel.py +++ b/tests/unit/array/storage/redis/test_getsetdel.py @@ -24,12 +24,6 @@ def _save_offset2ids(self): pass -@pytest.fixture(scope='function') -def tag_indices(): - tag_indices = ['tag_1', 'tag_2'] - return tag_indices - - @pytest.fixture(scope='function') def columns(): columns = [ @@ -44,8 +38,8 @@ def columns(): @pytest.fixture(scope='function') -def da_redis(tag_indices, columns): - cfg = RedisConfig(n_dim=3, flush=True, tag_indices=tag_indices, columns=columns) +def da_redis(columns): + cfg = RedisConfig(n_dim=3, flush=True, columns=columns) da_redis = DocumentArrayDummy(storage='redis', config=cfg) return da_redis @@ -73,7 +67,7 @@ def da_redis(tag_indices, columns): ], ) def test_document_to_embedding( - embedding, text, tag, col, da_redis, columns, tag_indices, start_storage + embedding, text, tag, col, da_redis, columns, start_storage ): tags = {} if tag is not None: @@ -105,15 +99,8 @@ def test_document_to_embedding( with pytest.raises(KeyError): payload[col] - for tag in tag_indices: - if tag in tags: - assert payload[tag] == tags[tag] - else: - with pytest.raises(KeyError): - payload[tag] - for key in tags: - if (key not in tag_indices) and (key not in (col[0] for col in columns)): + if key not in (col[0] for col in columns): assert key not in payload From be728514599bf459d3d840f47d1484caccc65c9b Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 26 Aug 2022 08:39:27 +0800 Subject: [PATCH 89/93] docs: minor changes for redis doc --- 
docs/advanced/document-store/redis.md | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 94c7e6f6813..31ddf83d736 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -73,7 +73,7 @@ da2 = DocumentArray( da2.summary() ``` -```text +```console ╭────────────── Documents Summary ──────────────╮ │ │ │ Type DocumentArrayRedis │ @@ -174,9 +174,9 @@ for embedding, price, color in zip( ``` Consider we want the nearest vectors to the embedding `[8. 8. 8.]`, with the restriction that -prices and color must follow a filter. For example, let's consider that retrieved documents must have `price` value lower then or equal to `max_price` and have `color` equal to `color`. We can encode this information in Redis using `{'price': {'$lte': max_price}, 'color': {'$eq': color}}`. +prices and color must follow a filter. For example, let's consider that retrieved documents must have a `price` value lower than or equal to `max_price` and have `color` equal to `color`. We can encode this information in Redis using `{'price': {'$lte': max_price}, 'color': {'$eq': color}}`. -Then the search with the proposed filter can be implemented and used with the following code: +Then the search with the proposed filter can be used as follows: ```python max_price = 7 color = 'red' @@ -204,7 +204,7 @@ for embedding, price, color, score in zip( This would print: -```text +```console Embeddings Approximate Nearest Neighbours with "price" at most 7 and "color" red: embedding=[3. 3. 3.], price=3, color=red, score=0 @@ -218,14 +218,14 @@ Embeddings Approximate Nearest Neighbours with "price" at most 7 and "color" red Redis vector similarity supports two indexing methods: -- FLAT - Brute-force index. +- FLAT - Brute-force search. 
- HNSW - Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs. Both methods have some mandatory parameters and optional parameters. ```{tip} -You can read more about HNSW or FLAT parameters and their default values [here](https://redis.io/docs/stack/search/reference/vectors/#querying-vector-fields). +Read more about HNSW or FLAT parameters and their default values [here](https://redis.io/docs/stack/search/reference/vectors/#querying-vector-fields). ``` You can update the search indexing schema on existing DocumentArray by simply set `update_schema` to `True` and change your config parameters. @@ -263,7 +263,7 @@ for embedding, score in zip( This would print: -```text +```console Embeddings Approximate Nearest Neighbours: embedding=[3. 3. 3.], score=0 @@ -277,7 +277,11 @@ Then you can use a different search indexing schema on current DocumentArray as ```python da2 = DocumentArray( storage='redis', - config={'n_dim': n_dim, 'update_schema': True, 'distance': 'L2'}, + config={ + 'n_dim': n_dim, + 'update_schema': True, + 'distance': 'L2', + }, ) results = da.find(np_query, limit=n_limit) @@ -292,7 +296,7 @@ for embedding, score in zip( This would print: -```text +```console Embeddings Approximate Nearest Neighbours: embedding=[8. 8. 
8.], score=0 @@ -318,7 +322,7 @@ The following configs can be set: | `update_schema` | Boolean flag indicating whether to update Redis Search schema | `True` | | `distance` | Similarity distance metric in Redis | `'COSINE'` | | `batch_size` | Batch size used to handle storage updates | `64` | -| `method` | Vector similarity index algorithm in Redis | `'HNSW'` | +| `method` | Vector similarity index algorithm in Redis, either `FLAT` or `HNSW` | `'HNSW'` | | `ef_construction` | Optional parameter for Redis HNSW algorithm | `200` | | `m` | Optional parameter for Redis HNSW algorithm | `16` | | `ef_runtime` | Optional parameter for Redis HNSW algorithm | `10` | @@ -330,5 +334,6 @@ You can check the default values in [the docarray source code](https://github.co ```{note} -This Document Store will support storing multiple DocumentArrays and full-text search soon. +The Redis storage backend will support storing multiple DocumentArrays, full-text search, more query conitions and geo-filtering soon. +The benchmark test is on the way. ``` From fdd9462933e8ba66d127b4304e2cf93fde51e7ea Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 26 Aug 2022 09:20:00 +0800 Subject: [PATCH 90/93] docs: minor changes for redis doc --- docs/advanced/document-store/redis.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 31ddf83d736..74ddc5dae82 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -334,6 +334,6 @@ You can check the default values in [the docarray source code](https://github.co ```{note} -The Redis storage backend will support storing multiple DocumentArrays, full-text search, more query conitions and geo-filtering soon. +The DocumentArray Redis storage backend will support storing multiple DocumentArrays, full-text search, more query conitions and geo-filtering soon. The benchmark test is on the way. 
``` From 8d0f251b0f2723feceb6682a717ee5bb9087c581 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 26 Aug 2022 20:04:49 +0800 Subject: [PATCH 91/93] docs: update redis doc --- docs/advanced/document-store/redis.md | 69 ++++++++++++++------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 74ddc5dae82..a4ce1dd155e 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -1,7 +1,7 @@ (redis)= # Redis -One can use [Redis](https://redis.io) as the document store for DocumentArray. It is useful when one wants to have faster Document retrieval on embeddings, i.e. `.match()`, `.find()`. +You can use [Redis](https://redis.io) as the document store for DocumentArray. It is useful when you want to have faster Document retrieval on embeddings, i.e. `.match()`, `.find()`. ````{tip} This feature requires `redis`. You can install it via `pip install "docarray[redis]".` @@ -30,7 +30,7 @@ docker-compose up ### Create DocumentArray with Redis backend -Assuming service is started using the default configuration (i.e. server address is `localhost:6379`), one can instantiate a DocumentArray with Redis storage as such: +Assuming the service is started using the default configuration (i.e. server address is `localhost:6379`), you can instantiate a DocumentArray with Redis storage as such: ```python from docarray import DocumentArray @@ -41,10 +41,10 @@ da = DocumentArray(storage='redis', config={'n_dim': 128}) The usage would be the same as the ordinary DocumentArray, but the dimension of an embedding for a Document must be provided at creation time. ```{caution} -Currently, one Redis server instance can only support to store a single DocumentArray. +Currently, one Redis server instance can only store a single DocumentArray. 
``` -To store a new DocumentArray in current Redis server, one can set `flush` to `True` so that previous DocumentArray will be cleared: +To store a new DocumentArray on the current Redis server, you can set `flush` to `True` so that the previous DocumentArray will be cleared: ```python from docarray import DocumentArray @@ -52,9 +52,9 @@ from docarray import DocumentArray da = DocumentArray(storage='redis', config={'n_dim': 128, 'flush': True}) ``` -To access a previously stored DocumentArray, one can specify `host` and `port`. +To access a previously stored DocumentArray, you can set `host` and `port` to match with the previuosly stored DocumentArray and make sure `flush` is `False`. -The following example will build a DocumentArray with previously stored data on `localhost:6379`: +The following example builds a DocumentArray from previously stored data on `localhost:6379`: ```python from docarray import DocumentArray, Document @@ -67,12 +67,13 @@ with DocumentArray( da2 = DocumentArray( storage='redis', - config={'n_dim': 128}, + config={'n_dim': 128, 'flush': False}, ) da2.summary() ``` +```{dropdown} Output ```console ╭────────────── Documents Summary ──────────────╮ │ │ @@ -119,7 +120,7 @@ Other functions behave the same as in-memory DocumentArray. ### Vector search with filter query -One can perform Vector Similarity Search based on FLAT or HNSW algorithm and pre-filter results using a filter query that is based on [MongoDB's Query](https://www.mongodb.com/docs/manual/reference/operator/query/). We currently support a subset of those selectors: +You can perform Vector Similarity Search based on [FLAT or HNSW algorithm](vector-search-index) and pre-filter results using a filter query that is based on [MongoDB's Query](https://www.mongodb.com/docs/manual/reference/operator/query/). 
We currently support a subset of those selectors: - `$eq` - Equal to (number, string) - `$ne` - Not equal to (number, string) @@ -129,8 +130,8 @@ One can perform Vector Similarity Search based on FLAT or HNSW algorithm and pre - `$lte` - Less than or equal to (number) -Consider Documents with embeddings `[0,0,0]` up to `[9,9,9]` where the document with embedding `[i,i,i]` -has tag `price` with number value and tag `color` with string value. We can create such example with the following code: +Consider Documents with embeddings `[0, 0, 0]` up to `[9, 9, 9]` where the Document with embedding `[i, i, i]` +has tag `price` with a number value, and tag `color` with a string value. You can create such example with the following code: ```python import numpy as np @@ -144,6 +145,7 @@ da = DocumentArray( 'n_dim': n_dim, 'columns': [('price', 'int'), ('color', 'str')], 'flush': True, + 'distance': 'L2', }, ) @@ -173,8 +175,8 @@ for embedding, price, color in zip( print(f'\tembedding={embedding},\t price={price},\t color={color}') ``` -Consider we want the nearest vectors to the embedding `[8. 8. 8.]`, with the restriction that -prices and color must follow a filter. For example, let's consider that retrieved documents must have a `price` value lower than or equal to `max_price` and have `color` equal to `color`. We can encode this information in Redis using `{'price': {'$lte': max_price}, 'color': {'$eq': color}}`. +Consider the case where you want the nearest vectors to the embedding `[8., 8., 8.]`, with the restriction that +prices and colors must pass a filter. For example, let's consider that retrieved Documents must have a `price` value lower than or equal to `max_price` and have `color` equal to `color`. We can encode this information in Redis using `{'price': {'$lte': max_price}, 'color': {'$eq': color}}`. 
Then the search with the proposed filter can be used as follows: ```python @@ -207,20 +209,20 @@ This would print: ```console Embeddings Approximate Nearest Neighbours with "price" at most 7 and "color" red: - embedding=[3. 3. 3.], price=3, color=red, score=0 - embedding=[6. 6. 6.], price=6, color=red, score=0 - embedding=[1. 1. 1.], price=1, color=red, score=5.96046447754e-08 - embedding=[2. 2. 2.], price=2, color=red, score=5.96046447754e-08 - embedding=[4. 4. 4.], price=4, color=red, score=5.96046447754e-08 + score=3, embedding=[7. 7. 7.], price=7, color=red + score=12, embedding=[6. 6. 6.], price=6, color=red + score=27, embedding=[5. 5. 5.], price=5, color=red + score=48, embedding=[4. 4. 4.], price=4, color=red + score=75, embedding=[3. 3. 3.], price=3, color=red ``` +(vector-search-index)= ### Update Vector Search Indexing Schema Redis vector similarity supports two indexing methods: -- FLAT - Brute-force search. - -- HNSW - Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs. +- **FLAT**: Brute-force search. +- **HNSW**: Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs. Both methods have some mandatory parameters and optional parameters. @@ -228,9 +230,9 @@ Both methods have some mandatory parameters and optional parameters. Read more about HNSW or FLAT parameters and their default values [here](https://redis.io/docs/stack/search/reference/vectors/#querying-vector-fields). ``` -You can update the search indexing schema on existing DocumentArray by simply set `update_schema` to `True` and change your config parameters. +You can update the search indexing schema on an existing DocumentArray by setting `update_schema` to `True` and changing your configuration parameters. -Consider you store Documents with default indexing method `'HNSW'` and distance `'COSINE'`, and find the nearest vectors to the embedding `[8. 8. 8.]`. 
+Consider you store Documents with default indexing method `'HNSW'` and distance `'L2'`, and want to find the nearest vectors to the embedding `[8. 8. 8.]`. ```python import numpy as np @@ -243,6 +245,7 @@ da = DocumentArray( config={ 'n_dim': n_dim, 'flush': True, + 'distance': 'L2', }, ) @@ -266,21 +269,21 @@ This would print: ```console Embeddings Approximate Nearest Neighbours: - embedding=[3. 3. 3.], score=0 - embedding=[6. 6. 6.], score=0 - embedding=[1. 1. 1.], score=5.96046447754e-08 - embedding=[2. 2. 2.], score=5.96046447754e-08 - embedding=[4. 4. 4.], score=5.96046447754e-08 + embedding=[8. 8. 8.], score=0 + embedding=[7. 7. 7.], score=3 + embedding=[9. 9. 9.], score=3 + embedding=[6. 6. 6.], score=12 + embedding=[5. 5. 5.], score=27 ``` -Then you can use a different search indexing schema on current DocumentArray as follows: +Then you can use a different search indexing schema on the current DocumentArray as follows: ```python da2 = DocumentArray( storage='redis', config={ 'n_dim': n_dim, 'update_schema': True, - 'distance': 'L2', + 'distance': 'COSINE', }, ) @@ -294,7 +297,7 @@ for embedding, score in zip( print(f' embedding={embedding},\t score={score["score"].value}') ``` -This would print: +This will print: ```console Embeddings Approximate Nearest Neighbours: @@ -307,7 +310,7 @@ Embeddings Approximate Nearest Neighbours: ``` -## Config +## Configuration The following configs can be set: @@ -320,7 +323,7 @@ The following configs can be set: | `n_dim` | Dimensionality of the embeddings | `None` | | `flush` | Boolean flag indicating whether to clear previous DocumentArray in Redis | `False` | | `update_schema` | Boolean flag indicating whether to update Redis Search schema | `True` | -| `distance` | Similarity distance metric in Redis | `'COSINE'` | +| `distance` | Similarity distance metric in Redis, one of {`'L2'`, `'IP'`, `'COSINE'`} | `'COSINE'` | | `batch_size` | Batch size used to handle storage updates | `64` | | `method` | Vector similarity 
index algorithm in Redis, either `FLAT` or `HNSW` | `'HNSW'` | | `ef_construction` | Optional parameter for Redis HNSW algorithm | `200` | @@ -334,6 +337,6 @@ You can check the default values in [the docarray source code](https://github.co ```{note} -The DocumentArray Redis storage backend will support storing multiple DocumentArrays, full-text search, more query conitions and geo-filtering soon. +Only 1 DocumentArray is allowed per Redis instance (db0). We will support storing multiple DocumentArrays in one Redis instance, full-text search, more query conditions and geo-filtering soon. The benchmark test is on the way. ``` From 9063420d5fd1cad263126bcb8dc4076345256af9 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 26 Aug 2022 20:06:45 +0800 Subject: [PATCH 92/93] docs: fix redis doc --- docs/advanced/document-store/redis.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index a4ce1dd155e..9dc953b17d0 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -302,11 +302,11 @@ This will print: ```console Embeddings Approximate Nearest Neighbours: - embedding=[8. 8. 8.], score=0 - embedding=[9. 9. 9.], score=3 - embedding=[7. 7. 7.], score=3 - embedding=[6. 6. 6.], score=12 - embedding=[5. 5. 5.], score=27 + embedding=[3. 3. 3.], score=0 + embedding=[6. 6. 6.], score=0 + embedding=[9. 9. 9.], score=5.96046447754e-08 + embedding=[8. 8. 8.], score=5.96046447754e-08 + embedding=[5. 5. 
5.], score=5.96046447754e-08 ``` From fe800d80f32a152a32764408fca3a6b90e535ebd Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 26 Aug 2022 20:15:57 +0800 Subject: [PATCH 93/93] docs: add host and port in example --- docs/advanced/document-store/redis.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 9dc953b17d0..b3abb03771a 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -35,7 +35,9 @@ Assuming the service is started using the default configuration (i.e. server add ```python from docarray import DocumentArray -da = DocumentArray(storage='redis', config={'n_dim': 128}) +da = DocumentArray( + storage='redis', config={'host': 'localhost', 'port': 6379, 'n_dim': 128} +) ``` The usage would be the same as the ordinary DocumentArray, but the dimension of an embedding for a Document must be provided at creation time.