Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
107 commits
Select commit Hold shift + click to select a range
fc3f93b
feat(redis): add storage backend and unit test
AnneYang720 Jul 25, 2022
703ea09
style: isort python imports
AnneYang720 Jul 26, 2022
6f4f8b1
style: remove useless parameter in _build_schema
AnneYang720 Jul 26, 2022
06fc38a
test: test optional param in redis config
AnneYang720 Jul 26, 2022
f21e2da
fix: remove optional for some params in RedisConfig
AnneYang720 Jul 26, 2022
3a3e78b
feat: add build schema for 'text'
AnneYang720 Jul 26, 2022
5ae1409
fix: raise ValueError when RedisConfig is None and add unit test
AnneYang720 Jul 26, 2022
9e716d1
refactor: improve the logic of update_schema
AnneYang720 Jul 26, 2022
90e5ae9
test: add None value of embedding in test_map_embedding
AnneYang720 Jul 26, 2022
5300e23
fix: remove useless code
AnneYang720 Jul 26, 2022
a6e1968
fix: set decode_responses in redis_config to False
AnneYang720 Jul 26, 2022
ca8efb9
fix: _get_offset2ids_meta return List[str]
AnneYang720 Jul 26, 2022
0dab0c4
fix: redis dropindex bug
AnneYang720 Jul 27, 2022
0276073
feat: add redis dependecies
AnneYang720 Jul 27, 2022
f64ee39
fix: switch to absolute import
AnneYang720 Aug 1, 2022
94f7e73
test: remove unnecessary tests
AnneYang720 Aug 1, 2022
86323c2
feat: add redis storage getsetdel and unit tests
AnneYang720 Aug 1, 2022
f23b73a
fix: backend call super init
AnneYang720 Aug 6, 2022
15d5ff6
fix: redis _set_doc_by_id
AnneYang720 Aug 6, 2022
5369de0
feat: add redis seqlike
AnneYang720 Aug 6, 2022
58a365b
test: add redis to test_seq
AnneYang720 Aug 6, 2022
56aab5d
feat: add redis storage subclass and entrypoint
AnneYang720 Aug 13, 2022
0cb15df
feat: add redis storage find.py
AnneYang720 Aug 13, 2022
902fede
test: test_construct add redis
AnneYang720 Aug 13, 2022
f918fcb
test: add redis to test_pull_out
AnneYang720 Aug 13, 2022
e29061d
test: add redis to test_content
AnneYang720 Aug 13, 2022
bcc6ae3
test: add redis to test_embed
AnneYang720 Aug 13, 2022
bf84282
test: add redis to test_empty
AnneYang720 Aug 13, 2022
0358083
test: add redis test_eval_class
AnneYang720 Aug 13, 2022
926ecc5
test: add redis to test_getset
AnneYang720 Aug 13, 2022
897465b
test: add redis to test_magic
AnneYang720 Aug 13, 2022
15c1c17
test: add redis to test_parallel
AnneYang720 Aug 13, 2022
bdf5529
test: add redis to test_sample
AnneYang720 Aug 13, 2022
f827c2b
test: add redis to test_text
AnneYang720 Aug 13, 2022
4626ffd
test: add redis to test_traverse
AnneYang720 Aug 13, 2022
be1413d
test: add redis test_plot
AnneYang720 Aug 13, 2022
096b303
test: add redis to test_find
AnneYang720 Aug 13, 2022
3d4f411
test: add redis to test_match
AnneYang720 Aug 13, 2022
a3144cb
fix: fix conflicts with main
AnneYang720 Aug 15, 2022
13e6b6c
Merge remote-tracking branch 'jina/main' into feat-redis-backend
AnneYang720 Aug 15, 2022
6949fe2
test: add redis to test_advance_indexing
AnneYang720 Aug 15, 2022
08d6644
feat: add _ensure_unique_config to redis
AnneYang720 Aug 15, 2022
a8f6eeb
fix: remove comments test_text
AnneYang720 Aug 17, 2022
efd052d
fix: remove useless debug output
AnneYang720 Aug 17, 2022
60b9526
refactor: simplify redis search command
AnneYang720 Aug 17, 2022
5da0d89
refractor: avoid using redis keys command
AnneYang720 Aug 17, 2022
0d32d68
refractor: remove useless check
AnneYang720 Aug 17, 2022
6faa2ab
refractor: clarify exception message
AnneYang720 Aug 18, 2022
cfbb543
feat: add index_name to redis config
AnneYang720 Aug 18, 2022
a91d2c5
fix: change redis find to mongo style
AnneYang720 Aug 18, 2022
8bc105a
refractor: config sequence
AnneYang720 Aug 18, 2022
8e89d14
test: remove comments
AnneYang720 Aug 18, 2022
e36ae39
fix: redis config index_name update
AnneYang720 Aug 18, 2022
f4b19b7
test: fix redis in test_advance_indexing
AnneYang720 Aug 19, 2022
aa2f8b2
test: fix redis in test_content
AnneYang720 Aug 19, 2022
e46174c
test: fix redis in test_empty
AnneYang720 Aug 19, 2022
634b62d
test: fix redis in test_find
AnneYang720 Aug 19, 2022
5ddaeb7
test: fix redis in test_magic
AnneYang720 Aug 19, 2022
57feb9b
test: fix redis in test_match
AnneYang720 Aug 19, 2022
cad75f7
feat: redis add subindex support
AnneYang720 Aug 19, 2022
75e7a90
test: add redis to sub_index related tests
AnneYang720 Aug 19, 2022
12a9985
Merge branch 'main' into feat-redis-backend
hanxiao Aug 21, 2022
61360d0
feat: change redis find to pre-filtering
AnneYang720 Aug 23, 2022
d4a82d1
Merge branch 'main' into feat-redis-backend
Aug 23, 2022
8a6168d
test: add category filter test for redis find
AnneYang720 Aug 23, 2022
7d3de6c
Merge branch 'main' into feat-redis-backend
Aug 23, 2022
4d0505f
refactor: using batch_docs in redis extend
AnneYang720 Aug 23, 2022
911b648
Merge branch 'feat-redis-backend' of github.com:AnneYang720/docarray …
AnneYang720 Aug 23, 2022
baeccd2
Merge branch 'main' into feat-redis-backend
Aug 24, 2022
c0a1f35
feat: add redis bool type support and tests
AnneYang720 Aug 24, 2022
b2f3a77
Merge branch 'feat-redis-backend' of github.com:AnneYang720/docarray …
AnneYang720 Aug 24, 2022
ddd1eba
refractor: add default values to redis config
AnneYang720 Aug 24, 2022
2d7c75c
Merge branch 'main' into feat-redis-backend
Aug 24, 2022
e85f1f7
refractor: remove useless comments
AnneYang720 Aug 24, 2022
53cbaa9
Merge branch 'feat-redis-backend' of github.com:AnneYang720/docarray …
AnneYang720 Aug 24, 2022
97bc99e
test: fix test_backend for redis
AnneYang720 Aug 24, 2022
3bba4b0
refractor: simplify key prefix
AnneYang720 Aug 24, 2022
c0f083f
test: add bool type test for redis
AnneYang720 Aug 24, 2022
582b56b
refeactor: add default value in redisconfig
AnneYang720 Aug 24, 2022
1e2682b
fix: keep kwargs for future potential use
AnneYang720 Aug 24, 2022
80cff7a
refractor: change bool type check
AnneYang720 Aug 24, 2022
549939a
refractor: specify redis version
AnneYang720 Aug 24, 2022
1fce045
feat: add _set_docs_by_ids for redis
AnneYang720 Aug 24, 2022
cfd9aa8
fix: fix redis set_doc_by_id(s)
AnneYang720 Aug 24, 2022
b718087
refractor: refract gc collect for redis
AnneYang720 Aug 24, 2022
c23f630
feat: add _get_docs_by_ids for redis
AnneYang720 Aug 25, 2022
4e1e933
Merge branch 'main' into feat-redis-backend
Aug 25, 2022
8dbc5ee
refractor: support find_by_text in future
AnneYang720 Aug 25, 2022
be3f14a
Merge branch 'feat-redis-backend' of github.com:AnneYang720/docarray …
AnneYang720 Aug 25, 2022
55a6c1c
feat: add doc.id to redis payload
AnneYang720 Aug 25, 2022
173066a
fix: remove find_text related
AnneYang720 Aug 25, 2022
59dc17f
doc: add doc for redis storage backend
AnneYang720 Aug 25, 2022
1dce140
fix: support find_text related in future
AnneYang720 Aug 25, 2022
4f5ff4a
refractor: change to $ne according to mongo
AnneYang720 Aug 25, 2022
e23e7ef
fix: remove _find_by_text related
AnneYang720 Aug 25, 2022
ac6352c
refractor: update _upload_batch
AnneYang720 Aug 25, 2022
5f8ff6f
test: update redis related tests
AnneYang720 Aug 25, 2022
e74bca8
docs: update redis doc
AnneYang720 Aug 25, 2022
73b6dbe
test: update test after removing text related
AnneYang720 Aug 25, 2022
be72851
docs: minor changes for redis doc
AnneYang720 Aug 26, 2022
fdd9462
docs: minor changes for redis doc
AnneYang720 Aug 26, 2022
0cf9618
Merge branch 'main' into feat-redis-backend
hanxiao Aug 26, 2022
8d0f251
docs: update redis doc
AnneYang720 Aug 26, 2022
fec9cbf
Merge branch 'feat-redis-backend' of github.com:AnneYang720/docarray …
AnneYang720 Aug 26, 2022
9063420
docs: fix redis doc
AnneYang720 Aug 26, 2022
fe800d8
docs: add host and port in example
AnneYang720 Aug 26, 2022
5f94757
Merge branch 'main' into feat-redis-backend
hanxiao Aug 26, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
from docarray.array.annlite import DocumentArrayAnnlite
from docarray.array.weaviate import DocumentArrayWeaviate
from docarray.array.elastic import DocumentArrayElastic
from docarray.array.redis import DocumentArrayRedis
from docarray.array.storage.sqlite import SqliteConfig
from docarray.array.storage.annlite import AnnliteConfig
from docarray.array.storage.weaviate import WeaviateConfig
from docarray.array.storage.elastic import ElasticConfig
from docarray.array.storage.redis import RedisConfig


class DocumentArray(AllMixins, BaseDocumentArray):
Expand Down Expand Up @@ -127,6 +129,16 @@ def __new__(
"""Create a Elastic-powered DocumentArray object."""
...

@overload
def __new__(
cls,
_docs: Optional['DocumentArraySourceType'] = None,
storage: str = 'redis',
config: Optional[Union['RedisConfig', Dict]] = None,
) -> 'DocumentArrayRedis':
"""Create a Redis-powered DocumentArray object."""
...

def __enter__(self):
return self

Expand Down Expand Up @@ -163,6 +175,10 @@ def __new__(cls, *args, storage: str = 'memory', **kwargs):
from docarray.array.elastic import DocumentArrayElastic

instance = super().__new__(DocumentArrayElastic)
elif storage == 'redis':
from .redis import DocumentArrayRedis

instance = super().__new__(DocumentArrayRedis)

else:
raise ValueError(f'storage=`{storage}` is not supported.')
Expand Down
19 changes: 19 additions & 0 deletions docarray/array/redis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from .document import DocumentArray
from .storage.redis import RedisConfig, StorageMixins

__all__ = ['DocumentArrayRedis', 'RedisConfig']


class DocumentArrayRedis(StorageMixins, DocumentArray):
"""This is a :class:`DocumentArray` that uses Redis as
vector search engine and storage.
"""

def __new__(cls, *args, **kwargs):
"""``__new__`` method for :class:`DocumentArrayRedis`

:param *args: list of args to instantiate the object
:param **kwargs: dict of args to instantiate the object
:return: the instantiated :class:`DocumentArrayRedis` object
"""
return super().__new__(cls)
12 changes: 12 additions & 0 deletions docarray/array/storage/redis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from abc import ABC

from .backend import BackendMixin, RedisConfig
from .find import FindMixin
from .getsetdel import GetSetDelMixin
from .seqlike import SequenceLikeMixin

__all__ = ['StorageMixins', 'RedisConfig']


class StorageMixins(FindMixin, BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC):
...
180 changes: 180 additions & 0 deletions docarray/array/storage/redis/backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np
from docarray import Document
from docarray.array.storage.base.backend import BaseBackendMixin, TypeMap
from docarray.helper import dataclass_from_dict

from redis import Redis
from redis.commands.search.field import NumericField, TextField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition

if TYPE_CHECKING:
from docarray.typing import ArrayType, DocumentArraySourceType


@dataclass
class RedisConfig:
n_dim: int
host: str = field(default='localhost')
port: int = field(default=6379)
index_name: str = field(default='idx')
flush: bool = field(default=False)
update_schema: bool = field(default=True)
distance: str = field(default='COSINE')
redis_config: Dict[str, Any] = field(default_factory=dict)
batch_size: int = field(default=64)
method: str = field(default='HNSW')
ef_construction: int = field(default=200)
m: int = field(default=16)
ef_runtime: int = field(default=10)
block_size: int = field(default=1048576)
initial_cap: Optional[int] = None
columns: Optional[List[Tuple[str, str]]] = None


class BackendMixin(BaseBackendMixin):
"""Provide necessary functions to enable this storage backend."""

TYPE_MAP = {
'str': TypeMap(type='text', converter=TextField),
'bytes': TypeMap(type='text', converter=TextField),
'int': TypeMap(type='integer', converter=NumericField),
'float': TypeMap(type='float', converter=NumericField),
'double': TypeMap(type='double', converter=NumericField),
'long': TypeMap(type='long', converter=NumericField),
'bool': TypeMap(type='long', converter=NumericField),
}

def _init_storage(
self,
_docs: Optional['DocumentArraySourceType'] = None,
config: Optional[Union[RedisConfig, Dict]] = None,
**kwargs,
):
if not config:
raise ValueError('Empty config is not allowed for Redis storage')
elif isinstance(config, dict):
config = dataclass_from_dict(RedisConfig, config)

if config.distance not in ['L2', 'IP', 'COSINE']:
raise ValueError(
f'Expecting distance metric one of COSINE, L2 OR IP, got {config.distance} instead'
)
if config.method not in ['HNSW', 'FLAT']:
raise ValueError(
f'Expecting search method one of HNSW OR FLAT, got {config.method} instead'
)

if config.redis_config.get('decode_responses'):
config.redis_config['decode_responses'] = False

self._offset2id_key = config.index_name + '__offset2id'
self._config = config
self.n_dim = self._config.n_dim
self._doc_prefix = config.index_name + ':'
self._config.columns = self._normalize_columns(self._config.columns)

self._client = self._build_client()
super()._init_storage()

if _docs is None:
return
elif isinstance(_docs, Iterable):
self.extend(_docs)
elif isinstance(_docs, Document):
self.append(_docs)

def _build_client(self):
client = Redis(
host=self._config.host,
port=self._config.port,
**self._config.redis_config,
)

if self._config.flush:
client.flushdb()

if self._config.update_schema:
if self._config.index_name.encode() in client.execute_command('FT._LIST'):
client.ft(index_name=self._config.index_name).dropindex()

if self._config.flush or self._config.update_schema:
schema = self._build_schema_from_redis_config()
idef = IndexDefinition(prefix=[self._doc_prefix])
client.ft(index_name=self._config.index_name).create_index(
schema, definition=idef
)

return client

def _ensure_unique_config(
self,
config_root: dict,
config_subindex: dict,
config_joined: dict,
subindex_name: str,
) -> dict:
if 'index_name' not in config_subindex:
config_joined['index_name'] = (
config_joined['index_name'] + '_subindex_' + subindex_name
)
config_joined['flush'] = False
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this do? Why do we need it here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parameter flush only exists in RedisConfig, flush=True will clear the redis database. We don't want to clear database when there is subindex docarray, so I make sure flush is set to False here.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure about this, why would a subindex behave differently from the main index? Shouldn't wee let the user decide what they want to do? @alaeddine-13 @bwanglzu ?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

didn't get it, i think we already have a flush option which user can control in the config?

return config_joined

def _build_schema_from_redis_config(self):
index_param = {
'TYPE': 'FLOAT32',
'DIM': self.n_dim,
'DISTANCE_METRIC': self._config.distance,
}

if self._config.method == 'HNSW':
index_options = {
'M': self._config.m,
'EF_CONSTRUCTION': self._config.ef_construction,
'EF_RUNTIME': self._config.ef_runtime,
}
index_param.update(index_options)

if self._config.method == 'FLAT':
index_options = {'BLOCK_SIZE': self._config.block_size}
index_param.update(index_options)

if self._config.initial_cap:
index_param['INITIAL_CAP'] = self._config.initial_cap
schema = [VectorField('embedding', self._config.method, index_param)]

for col, coltype in self._config.columns:
schema.append(self._map_column(col, coltype))

return schema

def _doc_id_exists(self, doc_id):
return self._client.exists(self._doc_prefix + doc_id)

def _map_embedding(self, embedding: 'ArrayType') -> bytes:
if embedding is not None:
from docarray.math.ndarray import to_numpy_array

embedding = to_numpy_array(embedding)

if embedding.ndim > 1:
embedding = np.asarray(embedding).squeeze()
else:
embedding = np.zeros(self.n_dim)
return embedding.astype(np.float32).tobytes()

def _get_offset2ids_meta(self) -> List[str]:
if not self._client.exists(self._offset2id_key):
return []
ids = self._client.lrange(self._offset2id_key, 0, -1)
return [id.decode() for id in ids]

def _update_offset2ids_meta(self):
"""Update the offset2ids in redis"""
if self._client.exists(self._offset2id_key):
self._client.delete(self._offset2id_key)
if len(self._offset2ids.ids) > 0:
self._client.rpush(self._offset2id_key, *self._offset2ids.ids)
127 changes: 127 additions & 0 deletions docarray/array/storage/redis/find.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, TypeVar, Union

import numpy as np
from docarray import Document, DocumentArray
from docarray.array.mixins.find import FindMixin as BaseFindMixin
from docarray.math import ndarray
from docarray.math.ndarray import to_numpy_array
from docarray.score import NamedScore

from redis.commands.search.query import NumericFilter, Query

if TYPE_CHECKING:
import tensorflow
import torch

RedisArrayType = TypeVar(
'RedisArrayType',
np.ndarray,
tensorflow.Tensor,
torch.Tensor,
Sequence[float],
Dict,
)


class FindMixin(BaseFindMixin):
def _find_similar_vectors(
self,
query: 'RedisArrayType',
filter: Optional[Dict] = None,
limit: Optional[Union[int, float]] = 20,
**kwargs,
):

query_str = self._build_query_str(filter) if filter else "*"

q = (
Query(f'{query_str}=>[KNN {limit} @embedding $vec AS vector_score]')
.sort_by('vector_score')
.paging(0, limit)
.dialect(2)
)

query_params = {'vec': to_numpy_array(query).astype(np.float32).tobytes()}
results = (
self._client.ft(index_name=self._config.index_name)
.search(q, query_params)
.docs
)

da = DocumentArray()
for res in results:
doc = Document.from_base64(res.blob.encode())
doc.scores['score'] = NamedScore(value=res.vector_score)
da.append(doc)
return da

def _find(
self,
query: 'RedisArrayType',
limit: Optional[Union[int, float]] = 20,
filter: Optional[Dict] = None,
**kwargs,
) -> List['DocumentArray']:

query = np.array(query)
num_rows, n_dim = ndarray.get_array_rows(query)
if n_dim != 2:
query = query.reshape((num_rows, -1))

return [
self._find_similar_vectors(q, filter=filter, limit=limit, **kwargs)
for q in query
]

def _find_with_filter(self, filter: Dict, limit: Optional[Union[int, float]] = 20):
s = self._build_query_str(filter)
q = Query(s)
q.paging(0, limit)

results = self._client.ft(index_name=self._config.index_name).search(q).docs

da = DocumentArray()
for res in results:
doc = Document.from_base64(res.blob.encode())
da.append(doc)
return da

def _filter(
self, filter: Dict, limit: Optional[Union[int, float]] = 20
) -> 'DocumentArray':

return self._find_with_filter(filter, limit=limit)

def _build_query_str(self, filter: Dict) -> str:
INF = "+inf"
NEG_INF = "-inf"
s = "("

for key in filter:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here, lt's check for $and and $or in order to allow logical operations (if allowed in redis)
you can find more info about the syntax here:
https://github.com/jina-ai/annlite#annlite

operator = list(filter[key].keys())[0]
value = filter[key][operator]
if operator == '$gt':
s += f"@{key}:[({value} {INF}] "
elif operator == '$gte':
s += f"@{key}:[{value} {INF}] "
elif operator == '$lt':
s += f"@{key}:[{NEG_INF} ({value}] "
elif operator == '$lte':
s += f"@{key}:[{NEG_INF} {value}] "
elif operator == '$eq':
if type(value) is int:
s += f"@{key}:[{value} {value}] "
elif type(value) is bool:
s += f"@{key}:[{int(value)} {int(value)}] "
else:
s += f"@{key}:{value} "
elif operator == '$ne':
if type(value) is int:
s += f"-@{key}:[{value} {value}] "
elif type(value) is bool:
s += f"-@{key}:[{int(value)} {int(value)}] "
else:
s += f"-@{key}:{value} "
s += ")"

return s
Loading