From b6866dcb0c3c3d00f97fe801e0ef4b0cf52f4451 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 9 Sep 2022 21:00:22 +0800 Subject: [PATCH 01/11] feat: redis support full-text search --- docarray/array/storage/redis/backend.py | 9 ++++++ docarray/array/storage/redis/find.py | 39 +++++++++++++++++++++++++ tests/unit/array/mixins/test_find.py | 5 ++-- 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index b9f54a821a3..797c5b2d8af 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -24,6 +24,8 @@ class RedisConfig: update_schema: bool = field(default=True) distance: str = field(default='COSINE') redis_config: Dict[str, Any] = field(default_factory=dict) + index_text: bool = field(default=False) + tag_indices: List[str] = field(default_factory=list) batch_size: int = field(default=64) method: str = field(default='HNSW') ef_construction: int = field(default=200) @@ -146,6 +148,13 @@ def _build_schema_from_redis_config(self): index_param['INITIAL_CAP'] = self._config.initial_cap schema = [VectorField('embedding', self._config.method, index_param)] + if self._config.index_text: + schema.append(TextField('text')) + + if self._config.tag_indices: + for index in self._config.tag_indices: + schema.append(TextField(index)) + for col, coltype in self._config.columns.items(): schema.append(self._map_column(col, coltype)) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index cd38fd98fb1..4934727206a 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -106,6 +106,40 @@ def _filter(self, filter: Dict, limit: int = 20) -> 'DocumentArray': return self._find_with_filter(filter, limit=limit) + def _find_by_text( + self, query: Union[str, List[str]], index: str = 'text', limit: Optional[Union[int, float]] = 20 + ): + if isinstance(query, str): + query = [query] + 
+ return [ + self._find_similar_documents_from_text( + q, + index=index, + limit=limit, + ) + for q in query + ] + + + def _find_similar_documents_from_text( + self, query: str, index: str = 'text', limit: Optional[Union[int, float]] = 20 + ): + query_str = _build_query_str(query) + q = ( + Query(f'@{index}:{query_str}') + .scorer('BM25') + .paging(0, limit) + ) + + results = self._client.ft(index_name=self._config.index_name).search(q).docs + + da = DocumentArray() + for res in results: + doc = Document.from_base64(res.blob.encode()) + da.append(doc) + return da + def _build_query_node(key, condition): operator = list(condition.keys())[0] @@ -154,3 +188,8 @@ def _build_query_nodes(filter): nodes.append(child) return nodes + + +def _build_query_str(query): + query_str = "|".join(query.split(" ")) + return query_str diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 38d4e500603..3de4cb90082 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -99,6 +99,7 @@ def test_find(storage, config, limit, query, start_storage): 'storage, config', [ ('elasticsearch', {'n_dim': 32, 'index_text': True}), + ('redis', {'n_dim': 32, 'flush': True, 'index_text': True}), ], ) def test_find_by_text(storage, config, start_storage): @@ -140,6 +141,7 @@ def test_find_by_text(storage, config, start_storage): 'storage, config', [ ('elasticsearch', {'n_dim': 32, 'tag_indices': ['attr1', 'attr2', 'attr3']}), + ('redis', {'n_dim': 32, 'flush': True, 'tag_indices': ['attr1', 'attr2', 'attr3']}), ], ) def test_find_by_tag(storage, config, start_storage): @@ -193,8 +195,7 @@ def test_find_by_tag(storage, config, start_storage): results = da.find('token6', index='attr3') assert len(results) == 2 - assert results[0].id == '2' - assert results[1].id == '1' + assert set(results[:, 'id']) == {'1', '2'} results = da.find('token6', index='attr3', limit=1) assert len(results) == 1 From 
e8cd30cccbb726715ce435071069bb782ff82816 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 9 Sep 2022 21:10:23 +0800 Subject: [PATCH 02/11] docs: add redis full-text search doc --- docs/advanced/document-store/redis.md | 90 +++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 9a67404d734..a51e5ce9942 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -281,6 +281,96 @@ More example filter expresses } ``` +### Search by `.text` field + +You can perform Text search in a `DocumentArray` with `storage='redis'`. +To do this, text needs to be indexed using the boolean flag `'index_text'` which is set when the `DocumentArray` is created with `config={'index_text': True, ...}`. +The following example builds a `DocumentArray` with several documents containing text and searches for those that have `pizza` in their text description. + +```python +from docarray import Document, DocumentArray + +da = DocumentArray( + storage='redis', config={'n_dim': 2, 'index_text': True, 'flush': True} +) +da.extend( + [ + Document(text='Person eating'), + Document(text='Person eating pizza'), + Document(text='Pizza restaurant'), + ] +) + +pizza_docs = da.find('pizza') +print(pizza_docs[:, 'text']) +``` + +This will print: + +```console +['Person eating pizza', 'Pizza restaurant'] +``` + +### Search by `.tags` field + +Text can also be indexed when it is part of `tags`. +This is mostly useful in applications where text data can be split into groups and applications might require retrieving items based on a text search in a specific tag. 
+ +For example: + +```python +from docarray import Document, DocumentArray + +da = DocumentArray( + storage='redis', + config={'n_dim': 32, 'flush': True, 'tag_indices': ['food_type', 'price']}, +) +da.extend( + [ + Document( + tags={ + 'food_type': 'Italian and Spanish food', + 'price': 'cheap but not that cheap', + }, + ), + Document( + tags={ + 'food_type': 'French and Italian food', + 'price': 'on the expensive side', + }, + ), + Document( + tags={ + 'food_type': 'Chinese noodles', + 'price': 'quite cheap for what you get!', + }, + ), + ] +) + +results_cheap = da.find('cheap', index='price') +print('searching "cheap" in :\n\t', results_cheap[:, 'tags__price']) + +results_italian = da.find('italian', index='food_type') +print('searching "italian" in :\n\t', results_italian[:, 'tags__food_type']) +``` + +This will print: + +```console +searching "cheap" in : + ['cheap but not that cheap', 'quite cheap for what you get!'] +searching "italian" in : + ['French and Italian food', 'Italian and Spanish food'] +``` + +```{note} +By default, if you don't specify the parameter `index` in the `find` method, the Document attribute `text` will be used for search. 
If you want to use a specific tags field, make sure to specify it with parameter `index`: +```python +results = da.find('cheap', index='price') +``` + + (vector-search-index)= ### Update Vector Search Indexing Schema From 6fce3a25b500a4fb74d5d7fbb3a15f5325040956 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 9 Sep 2022 21:13:06 +0800 Subject: [PATCH 03/11] feat: add tag_indices to redis getsetdel --- docarray/array/storage/redis/getsetdel.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docarray/array/storage/redis/getsetdel.py b/docarray/array/storage/redis/getsetdel.py index d201a164c8e..53c9ef543aa 100644 --- a/docarray/array/storage/redis/getsetdel.py +++ b/docarray/array/storage/redis/getsetdel.py @@ -95,6 +95,12 @@ def _document_to_redis(self, doc: 'Document') -> Dict: if tag is not None: extra_columns[col] = int(tag) if isinstance(tag, bool) else tag + if self._config.tag_indices: + for index in self._config.tag_indices: + text = doc.tags.get(index) + if text is not None: + extra_columns[index] = text + payload = { 'id': doc.id, 'embedding': self._map_embedding(doc.embedding), From 8f9123cf7342b3f251fe724a57e6b58da918ef11 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 9 Sep 2022 21:16:28 +0800 Subject: [PATCH 04/11] refactor: black redis files --- docarray/array/storage/redis/backend.py | 2 +- docarray/array/storage/redis/find.py | 14 ++++++-------- tests/unit/array/mixins/test_find.py | 5 ++++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 797c5b2d8af..2e20eb0bc00 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -150,7 +150,7 @@ def _build_schema_from_redis_config(self): if self._config.index_text: schema.append(TextField('text')) - + if self._config.tag_indices: for index in self._config.tag_indices: schema.append(TextField(index)) diff --git a/docarray/array/storage/redis/find.py 
b/docarray/array/storage/redis/find.py index 4934727206a..f6b6780cfd0 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -107,7 +107,10 @@ def _filter(self, filter: Dict, limit: int = 20) -> 'DocumentArray': return self._find_with_filter(filter, limit=limit) def _find_by_text( - self, query: Union[str, List[str]], index: str = 'text', limit: Optional[Union[int, float]] = 20 + self, + query: Union[str, List[str]], + index: str = 'text', + limit: Optional[Union[int, float]] = 20, ): if isinstance(query, str): query = [query] @@ -121,16 +124,11 @@ def _find_by_text( for q in query ] - def _find_similar_documents_from_text( self, query: str, index: str = 'text', limit: Optional[Union[int, float]] = 20 ): query_str = _build_query_str(query) - q = ( - Query(f'@{index}:{query_str}') - .scorer('BM25') - .paging(0, limit) - ) + q = Query(f'@{index}:{query_str}').scorer('BM25').paging(0, limit) results = self._client.ft(index_name=self._config.index_name).search(q).docs @@ -139,7 +137,7 @@ def _find_similar_documents_from_text( doc = Document.from_base64(res.blob.encode()) da.append(doc) return da - + def _build_query_node(key, condition): operator = list(condition.keys())[0] diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 3de4cb90082..fd5efb80bc0 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -141,7 +141,10 @@ def test_find_by_text(storage, config, start_storage): 'storage, config', [ ('elasticsearch', {'n_dim': 32, 'tag_indices': ['attr1', 'attr2', 'attr3']}), - ('redis', {'n_dim': 32, 'flush': True, 'tag_indices': ['attr1', 'attr2', 'attr3']}), + ( + 'redis', + {'n_dim': 32, 'flush': True, 'tag_indices': ['attr1', 'attr2', 'attr3']}, + ), ], ) def test_find_by_tag(storage, config, start_storage): From a7d8da5dfed4d4aeb4ec6e816f51558a7461b0f9 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 9 Sep 2022 21:26:20 +0800 Subject: [PATCH 
05/11] feat: redis supports io --- docarray/array/storage/redis/backend.py | 13 +++++++++++++ tests/unit/array/mixins/test_io.py | 15 +++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 2e20eb0bc00..38e508a3b1b 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -187,3 +187,16 @@ def _update_offset2ids_meta(self): self._client.delete(self._offset2id_key) if len(self._offset2ids.ids) > 0: self._client.rpush(self._offset2id_key, *self._offset2ids.ids) + + def __getstate__(self): + d = dict(self.__dict__) + del d['_client'] + return d + + def __setstate__(self, state): + self.__dict__ = state + self._client = Redis( + host=self._config.host, + port=self._config.port, + **self._config.redis_config, + ) diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index a32fdce103e..5e9db8a5edc 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -13,9 +13,12 @@ from docarray.array.storage.weaviate import WeaviateConfig from docarray.array.weaviate import DocumentArrayWeaviate from docarray.array.elastic import DocumentArrayElastic, ElasticConfig +from docarray.array.redis import DocumentArrayRedis, RedisConfig from docarray.helper import random_identity from tests import random_docs +import gc + @pytest.fixture def docs(): @@ -34,6 +37,7 @@ def docs(): (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=10)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=10)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=10, flush=True)), ], ) def test_document_save_load( @@ -67,6 +71,7 @@ def test_document_save_load( (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=10)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=10)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=10)), + (DocumentArrayRedis, 
lambda: RedisConfig(n_dim=10, flush=True)), ], ) def test_da_csv_write(docs, flatten_tags, tmp_path, da_cls, config, start_storage): @@ -86,6 +91,7 @@ def test_da_csv_write(docs, flatten_tags, tmp_path, da_cls, config, start_storag (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=256)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=256, flush=True)), ], ) def test_from_ndarray(da_cls, config, start_storage): @@ -103,6 +109,7 @@ def test_from_ndarray(da_cls, config, start_storage): (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=256)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=256, flush=True)), ], ) def test_from_files(da_cls, config, start_storage): @@ -143,6 +150,7 @@ def test_from_files_exclude(): (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=256)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=256, flush=True)), ], ) def test_from_ndjson(da_cls, config, start_storage): @@ -160,9 +168,13 @@ def test_from_ndjson(da_cls, config, start_storage): (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=3)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=3)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=3)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=3, flush=True)), ], ) def test_from_to_pd_dataframe(da_cls, config, start_storage): + if da_cls == DocumentArrayRedis: + gc.collect() + df = da_cls.empty(2, config=config()).to_dataframe() assert len(da_cls.from_dataframe(df, config=config())) == 2 @@ -188,6 +200,7 @@ def test_from_to_pd_dataframe(da_cls, config, start_storage): (DocumentArrayAnnlite, AnnliteConfig(n_dim=3)), (DocumentArrayQdrant, QdrantConfig(n_dim=3)), (DocumentArrayElastic, 
ElasticConfig(n_dim=3)), + (DocumentArrayRedis, RedisConfig(n_dim=3, flush=True)), ], ) def test_from_to_bytes(da_cls, config, start_storage): @@ -219,6 +232,7 @@ def test_from_to_bytes(da_cls, config, start_storage): (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=256)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=256)), ], ) def test_push_pull_io(da_cls, config, show_progress, start_storage): @@ -251,6 +265,7 @@ def test_push_pull_io(da_cls, config, show_progress, start_storage): # (DocumentArrayAnnlite, PqliteConfig(n_dim=3)), # TODO: enable this # (DocumentArrayQdrant, QdrantConfig(n_dim=3)), # (DocumentArrayElastic, ElasticConfig(n_dim=3)), # Elastic needs config + # (DocumentArrayRedis, RedisConfig(n_dim=3, flush=True)), # Redis needs config ], ) def test_from_to_base64(protocol, compress, da_cls, config): From a14fada738e0b199c77821effa10a521ae2ee9e8 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sat, 10 Sep 2022 00:38:35 +0800 Subject: [PATCH 06/11] fix: fix test_push_pull_io for redis --- tests/unit/array/mixins/test_io.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index 5e9db8a5edc..56dcf746de3 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -232,7 +232,7 @@ def test_from_to_bytes(da_cls, config, start_storage): (DocumentArrayWeaviate, lambda: WeaviateConfig(n_dim=256)), (DocumentArrayQdrant, lambda: QdrantConfig(n_dim=256)), (DocumentArrayElastic, lambda: ElasticConfig(n_dim=256)), - (DocumentArrayRedis, lambda: RedisConfig(n_dim=256)), + (DocumentArrayRedis, lambda: RedisConfig(n_dim=256, flush=True)), ], ) def test_push_pull_io(da_cls, config, show_progress, start_storage): @@ -246,7 +246,12 @@ def test_push_pull_io(da_cls, config, show_progress, start_storage): da1.push(name, 
show_progress=show_progress) - da2 = da_cls.pull(name, show_progress=show_progress, config=config()) + if da_cls == DocumentArrayRedis: + config = config() + config.flush = False + da2 = da_cls.pull(name, show_progress=show_progress, config=config) + else: + da2 = da_cls.pull(name, show_progress=show_progress, config=config()) assert len(da1) == len(da2) == 10 assert da1.texts == da2.texts == random_texts From 0c163b628757e99257026e51771ebe770b598bad Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 12 Sep 2022 22:34:03 +0800 Subject: [PATCH 07/11] refactor: code minor adjustments --- docarray/array/storage/redis/backend.py | 5 ++--- docarray/array/storage/redis/find.py | 25 ++++++++++++++++++------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 38e508a3b1b..8bf8d560949 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -151,9 +151,8 @@ def _build_schema_from_redis_config(self): if self._config.index_text: schema.append(TextField('text')) - if self._config.tag_indices: - for index in self._config.tag_indices: - schema.append(TextField(index)) + for index in self._config.tag_indices: + schema.append(TextField(index)) for col, coltype in self._config.columns.items(): schema.append(self._map_column(col, coltype)) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index f6b6780cfd0..2dabbc79cbb 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -39,7 +39,7 @@ def _find_similar_vectors( self, query: 'RedisArrayType', filter: Optional[Dict] = None, - limit: int = 20, + limit: Union[int, float] = 20, **kwargs, ): @@ -73,7 +73,7 @@ def _find_similar_vectors( def _find( self, query: 'RedisArrayType', - limit: int = 20, + limit: Union[int, float] = 20, filter: Optional[Dict] = None, **kwargs, ) -> List['DocumentArray']: @@ -88,7 +88,11 @@ 
def _find( for q in query ] - def _find_with_filter(self, filter: Dict, limit: int = 20): + def _find_with_filter( + self, + filter: Dict, + limit: Union[int, float] = 20, + ): nodes = _build_query_nodes(filter) query_str = intersect(*nodes).to_string() q = Query(query_str) @@ -102,7 +106,11 @@ def _find_with_filter(self, filter: Dict, limit: int = 20): da.append(doc) return da - def _filter(self, filter: Dict, limit: int = 20) -> 'DocumentArray': + def _filter( + self, + filter: Dict, + limit: Union[int, float] = 20, + ) -> 'DocumentArray': return self._find_with_filter(filter, limit=limit) @@ -110,7 +118,7 @@ def _find_by_text( self, query: Union[str, List[str]], index: str = 'text', - limit: Optional[Union[int, float]] = 20, + limit: Union[int, float] = 20, ): if isinstance(query, str): query = [query] @@ -125,7 +133,10 @@ def _find_by_text( ] def _find_similar_documents_from_text( - self, query: str, index: str = 'text', limit: Optional[Union[int, float]] = 20 + self, + query: str, + index: str = 'text', + limit: Union[int, float] = 20, ): query_str = _build_query_str(query) q = Query(f'@{index}:{query_str}').scorer('BM25').paging(0, limit) @@ -189,5 +200,5 @@ def _build_query_nodes(filter): def _build_query_str(query): - query_str = "|".join(query.split(" ")) + query_str = '|'.join(query.split(' ')) return query_str From 8a45a86316f0356844893dd22941e90b8369d75b Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 12 Sep 2022 22:41:23 +0800 Subject: [PATCH 08/11] docs: default scorer function in redis text search --- docs/advanced/document-store/redis.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index a51e5ce9942..9fa8f61857e 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -157,7 +157,7 @@ da.extend( Document( id=f'{i}', embedding=i * np.ones(n_dim), - tags={'price': i, 'color': 'blue', 'stock': i%2==0}, + 
tags={'price': i, 'color': 'blue', 'stock': i % 2 == 0}, ) for i in range(10) ] @@ -167,7 +167,7 @@ da.extend( Document( id=f'{i+10}', embedding=i * np.ones(n_dim), - tags={'price': i, 'color': 'red', 'stock': i%2==0}, + tags={'price': i, 'color': 'red', 'stock': i % 2 == 0}, ) for i in range(10) ] @@ -283,7 +283,7 @@ More example filter expresses ### Search by `.text` field -You can perform Text search in a `DocumentArray` with `storage='redis'`. +You can perform Text search in a `DocumentArray` with `storage='redis'`. The default similarity ranking algorithm is `BM25`. To do this, text needs to be indexed using the boolean flag `'index_text'` which is set when the `DocumentArray` is created with `config={'index_text': True, ...}`. The following example builds a `DocumentArray` with several documents containing text and searches for those that have `pizza` in their text description. From 9e6a76d2b14c20a6749f76162d833444a6797c6b Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 12 Sep 2022 23:13:30 +0800 Subject: [PATCH 09/11] feat: make redis scoer parameter configurable --- docarray/array/storage/redis/find.py | 18 +++++++++++++- docs/advanced/document-store/redis.md | 35 +++++++++++++++++++++------ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index 2dabbc79cbb..001247cf577 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -119,6 +119,7 @@ def _find_by_text( query: Union[str, List[str]], index: str = 'text', limit: Union[int, float] = 20, + **kwargs, ): if isinstance(query, str): query = [query] @@ -128,6 +129,7 @@ def _find_by_text( q, index=index, limit=limit, + **kwargs, ) for q in query ] @@ -137,9 +139,23 @@ def _find_similar_documents_from_text( query: str, index: str = 'text', limit: Union[int, float] = 20, + **kwargs, ): query_str = _build_query_str(query) - q = Query(f'@{index}:{query_str}').scorer('BM25').paging(0, 
limit) + scorer = kwargs.get('scorer', 'BM25') + if scorer not in [ + 'BM25', + 'TFIDF', + 'TFIDF.DOCNORM', + 'DISMAX', + 'DOCSCORE', + 'HAMMING', + ]: + raise ValueError( + f'Expecting a valid text similarity ranking algorithm, got {scorer} instead' + ) + + q = Query(f'@{index}:{query_str}').scorer(scorer).paging(0, limit) results = self._client.ft(index_name=self._config.index_name).search(q).docs diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 9fa8f61857e..84b41d03f54 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -283,7 +283,7 @@ More example filter expresses ### Search by `.text` field -You can perform Text search in a `DocumentArray` with `storage='redis'`. The default similarity ranking algorithm is `BM25`. +You can perform Text search in a `DocumentArray` with `storage='redis'`. To do this, text needs to be indexed using the boolean flag `'index_text'` which is set when the `DocumentArray` is created with `config={'index_text': True, ...}`. The following example builds a `DocumentArray` with several documents containing text and searches for those that have `pizza` in their text description. @@ -295,20 +295,41 @@ da = DocumentArray( ) da.extend( [ - Document(text='Person eating'), - Document(text='Person eating pizza'), - Document(text='Pizza restaurant'), + Document(id='1', text='token1 token2 token3'), + Document(id='2', text='token1 token2'), + Document(id='3', text='token2 token3 token4'), ] ) -pizza_docs = da.find('pizza') -print(pizza_docs[:, 'text']) +results = da.find('token1') +print(results[:, 'text']) ``` This will print: ```console -['Person eating pizza', 'Pizza restaurant'] +['token1 token2 token3', 'token1 token2'] +``` + +The default similarity ranking algorithm is `BM25`. Besides, `TFIDF`, `TFIDF.DOCNORM`, `DISMAX`, `DOCSCORE` and `HAMMING` are also supported by [RediSearch](https://redis.io/docs/stack/search/reference/scoring/). 
You can change it by specifying `scorer` in function `find`: + +```python +results = da.find('token1 token3', scorer='TFIDF.DOCNORM') +print('scorer=TFIDF.DOCNORM:') +print(results[:, 'text']) + +results = da.find('token1 token3') +print('scorer=BM25:') +print(results[:, 'text']) +``` + +This will print: + +```console +scorer=TFIDF.DOCNORM: +['token1 token2', 'token1 token2 token3', 'token2 token3 token4'] +scorer=BM25: +['token1 token2 token3', 'token1 token2', 'token2 token3 token4'] ``` ### Search by `.tags` field From 4d9f5daf2875c95ef4857faf21b2d8108fb2b135 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 14 Sep 2022 15:01:58 +0800 Subject: [PATCH 10/11] docs: redis doc error fix --- docs/advanced/document-store/redis.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 84b41d03f54..190255a4ea9 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -283,9 +283,9 @@ More example filter expresses ### Search by `.text` field -You can perform Text search in a `DocumentArray` with `storage='redis'`. +You can perform full-text search in a `DocumentArray` with `storage='redis'`. To do this, text needs to be indexed using the boolean flag `'index_text'` which is set when the `DocumentArray` is created with `config={'index_text': True, ...}`. -The following example builds a `DocumentArray` with several documents containing text and searches for those that have `pizza` in their text description. +The following example builds a `DocumentArray` with several documents containing text and searches for those that have `token1` in their text description. 
```python from docarray import Document, DocumentArray From 1c0779340eb71b2ca1e19c28e9ba67045ca5ca5a Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 14 Sep 2022 21:42:07 +0800 Subject: [PATCH 11/11] test: add test for scorer of redis text search --- tests/unit/array/mixins/test_find.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index fd5efb80bc0..3bfec1b988c 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -112,7 +112,10 @@ def test_find_by_text(storage, config, start_storage): ] ) - results = da.find('token1') + if storage == 'redis': + results = da.find('token1', scorer='TFIDF') + else: + results = da.find('token1') assert isinstance(results, DocumentArray) assert len(results) == 2 assert set(results[:, 'id']) == {'1', '2'}