From 1b0829b9d10ac28d1a97711e1ed143fa12e11a84 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 29 Sep 2022 16:23:37 +0800 Subject: [PATCH 1/8] feat: redis add geo filter --- docarray/array/storage/redis/backend.py | 5 +++-- docarray/array/storage/redis/find.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index 78d9c1ed8c2..55c7ac4afc5 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -8,7 +8,7 @@ from docarray.helper import dataclass_from_dict, random_identity, filter_dict from redis import Redis -from redis.commands.search.field import NumericField, TextField, VectorField +from redis.commands.search.field import NumericField, TextField, VectorField, GeoField from redis.commands.search.indexDefinition import IndexDefinition if TYPE_CHECKING: @@ -46,7 +46,8 @@ class BackendMixin(BaseBackendMixin): 'float': TypeMap(type='float', converter=NumericField), 'double': TypeMap(type='double', converter=NumericField), 'long': TypeMap(type='long', converter=NumericField), - 'bool': TypeMap(type='long', converter=NumericField), + 'bool': TypeMap(type='bool', converter=NumericField), + 'geo': TypeMap(type='geo', converter=GeoField), } def _init_storage( diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index 001247cf577..6298ca10d44 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -17,6 +17,7 @@ intersect, le, lt, + geo, union, ) @@ -187,6 +188,17 @@ def _build_query_node(key, condition): query_dict[key] = lt(value) elif operator == '$lte': query_dict[key] = le(value) + elif operator == '$geo': + if value.get('unit') is None: + value['unit'] = "km" + elif value['unit'] not in ['m', 'km', 'mi', 'ft']: + unit = value['unit'] + raise ValueError( + f'Expecting geo unit one of m, km, mi OR ft, got {unit} instead' + ) + 
query_dict[key] = geo( + value['lat'], value['lon'], value['radius'], value['unit'] + ) else: raise ValueError( f'Expecting filter operator one of $gt, $gte, $lt, $lte, $eq, $ne, $and OR $or, got {operator} instead' From d411d7d53e8dafc3f39d4b55063cf146f6c25763 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 19 Oct 2022 14:40:00 +0800 Subject: [PATCH 2/8] feat: add geo to redis typemap --- docarray/array/storage/redis/backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docarray/array/storage/redis/backend.py b/docarray/array/storage/redis/backend.py index d0aeabff48d..adf7e7d4bed 100644 --- a/docarray/array/storage/redis/backend.py +++ b/docarray/array/storage/redis/backend.py @@ -46,6 +46,7 @@ class BackendMixin(BaseBackendMixin): 'float': TypeMap(type='float', converter=NumericField), 'double': TypeMap(type='double', converter=NumericField), 'long': TypeMap(type='long', converter=NumericField), + 'geo': TypeMap(type='geo', converter=GeoField), } def _init_storage( From 5884bdb21e4b619aa34f0ac174dd391e1ae26fbe Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 19 Oct 2022 14:40:19 +0800 Subject: [PATCH 3/8] fix: remove geo operator --- docarray/array/storage/redis/find.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/docarray/array/storage/redis/find.py b/docarray/array/storage/redis/find.py index d2a78c68b59..54ceb7ab853 100644 --- a/docarray/array/storage/redis/find.py +++ b/docarray/array/storage/redis/find.py @@ -18,7 +18,6 @@ intersect, le, lt, - geo, union, ) @@ -75,7 +74,7 @@ def _find( self, query: 'RedisArrayType', limit: Union[int, float] = 20, - filter: Optional[Dict] = None, + filter: Optional[Union[str, Dict]] = None, **kwargs, ) -> List['DocumentArray']: @@ -108,7 +107,7 @@ def _find_with_filter( def _filter( self, - filter: Dict, + filter: Union[str, Dict], limit: Union[int, float] = 20, ) -> 'DocumentArray': @@ -187,17 +186,6 @@ def _build_query_node(key, condition): query_dict[key] = lt(value) elif 
operator == '$lte': query_dict[key] = le(value) - elif operator == '$geo': - if value.get('unit') is None: - value['unit'] = "km" - elif value['unit'] not in ['m', 'km', 'mi', 'ft']: - unit = value['unit'] - raise ValueError( - f'Expecting geo unit one of m, km, mi OR ft, got {unit} instead' - ) - query_dict[key] = geo( - value['lat'], value['lon'], value['radius'], value['unit'] - ) else: raise ValueError( f'Expecting filter operator one of $gt, $gte, $lt, $lte, $eq, $ne, $and OR $or, got {operator} instead' From fb7d044e1210d78d0f18b85b18fdc42772b43e31 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 19 Oct 2022 15:30:51 +0800 Subject: [PATCH 4/8] test: update redis test_backend for geo --- tests/unit/array/storage/redis/test_backend.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/tests/unit/array/storage/redis/test_backend.py b/tests/unit/array/storage/redis/test_backend.py index 6e73e4042ca..75096e4e40f 100644 --- a/tests/unit/array/storage/redis/test_backend.py +++ b/tests/unit/array/storage/redis/test_backend.py @@ -21,17 +21,6 @@ def _save_offset2ids(self): pass -type_convert = { - 'int': b'NUMERIC', - 'float': b'NUMERIC', - 'double': b'NUMERIC', - 'long': b'NUMERIC', - 'str': b'TEXT', - 'bytes': b'TEXT', - 'bool': b'NUMERIC', -} - - @pytest.mark.parametrize('distance', ['L2', 'IP', 'COSINE']) @pytest.mark.parametrize( 'method,initial_cap,ef_construction,block_size', @@ -43,12 +32,9 @@ def _save_offset2ids(self): @pytest.mark.parametrize( 'columns', [ - [('attr1', 'str'), ('attr2', 'bytes')], - [('attr1', 'int'), ('attr2', 'float')], - [('attr1', 'double'), ('attr2', 'long'), ('attr3', 'int')], {'attr1': 'str', 'attr2': 'bytes'}, {'attr1': 'int', 'attr2': 'float'}, - {'attr1': 'double', 'attr2': 'long', 'attr3': 'int'}, + {'attr1': 'double', 'attr2': 'long', 'attr3': 'geo'}, ], ) @pytest.mark.parametrize( From 47655342dbd248264b293c7d1be3c262203d7c2a Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 19 Oct 2022 15:33:13 
+0800 Subject: [PATCH 5/8] test: add test for redis geo --- tests/unit/array/mixins/test_find.py | 57 ++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 49ea3325887..5778fb57290 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -632,6 +632,63 @@ def test_redis_category_filter(filter, checker, columns, start_storage): assert all([checker(r) for r in results]) +from math import radians, cos, sin, asin, sqrt + + +def haversine(lon1, lat1, lon2, lat2): + """ + Calculate the great circle distance between two points + on the earth (specified in decimal degrees) + """ + lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2 + c = 2 * asin(sqrt(a)) + r = 6371 # average radius of earth, unit km + return c * r # unit km + + +@pytest.mark.parametrize( + 'filter,checker', + [ + ( + '@location:[-98.71 38.71 200 km] ', + lambda r: haversine( + -98.71, + 38.71, + float(r.tags['location'].split(',')[0]), + float(r.tags['location'].split(',')[1]), + ) + < 200, + ), + ], +) +def test_redis_geo_filter(filter, checker, start_storage): + n_dim = 128 + da = DocumentArray( + storage='redis', + config={ + 'n_dim': n_dim, + 'columns': {'location': 'geo'}, + }, + ) + + da.extend( + [ + Document( + embedding=np.random.rand(n_dim), + tags={'location': f"{-98.17+i},{38.71+i}"}, + ) + for i in range(10) + ] + ) + + results = da.find(np.random.rand(n_dim), filter=filter) + assert len(results) > 0 + assert all([checker(r) for r in results]) + + @pytest.mark.parametrize('storage', ['memory']) @pytest.mark.parametrize('columns', [[('price', 'int')], {'price': 'int'}]) def test_unsupported_pre_filtering(storage, start_storage, columns): From 0d8be410029abe1ed157d6be846710e6d2bf4032 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 19 
Oct 2022 15:54:22 +0800 Subject: [PATCH 6/8] docs: add geo example to redis doc --- docs/advanced/document-store/redis.md | 46 +++++++++++++++------------ 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index efa5f98be57..a95e5f80609 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -141,7 +141,6 @@ You can check the default values in [the docarray source code](https://github.co For vector search configurations, default values are those of the database backend, which you can find in the [Redis documentation](https://redis.io/docs/stack/search/reference/vectors/). ```{note} -We will support geo-filtering soon. The benchmark test is on the way. ``` @@ -247,8 +246,8 @@ integer in `columns` configuration (`'field': 'int'`) and use a filter query tha One can search with user-defined query filters using the `.find` method. Such queries follow the [Redis Search Query Syntax](https://redis.io/docs/stack/search/reference/query_syntax/). -Consider a case where you store Documents with a tag of `price` into Redis and you want to retrieve all Documents -with `price` less than or equal to some `max_price` value. +Consider a case where you store Documents with a tag of `location` into Redis and you want to retrieve all Documents +with `location` within some `max_distance` value. 
You can index such Documents as follows: @@ -260,42 +259,49 @@ da = DocumentArray( storage='redis', config={ 'n_dim': n_dim, - 'columns': {'price': 'float'}, + 'columns': {'location': 'geo'}, }, ) with da: - da.extend([Document(id=f'r{i}', tags={'price': i}) for i in range(10)]) + da.extend( + [ + Document(id=f'r{i}', tags={'location': f"{-98.17+i},{38.71+i}"}) + for i in range(10) + ] + ) print('\nIndexed Prices:\n') -for price in da[:, 'tags__price']: +for price in da[:, 'tags__location']: print(f'\t price={price}') ``` -Then you can retrieve all documents whose price is less than or equal to `max_price` by applying the following -filter: +Then you can retrieve all documents whose location is within `max_distance` from earth coordinates `-98.71,38.71` by applying the following filter: ```python -max_price = 3 -n_limit = 4 +max_distance = 1000 +n_limit = 5 -filter = f'@price:[-inf {max_price}] ' -results = da.find(filter=filter) +filter = f'@location:[-98.71 38.71 {max_distance} km] ' +results = da.find(filter=filter, limit=n_limit) -print('\n Returned examples that verify filter "price at most 3":\n') -for price in results[:, 'tags__price']: - print(f'\t price={price}') +print( + '\n Returned examples that verify filter "distance from -98.71,38.71 at most 1000 km":\n' +) +for location in results[:, 'tags__location']: + print(f'\t location={location}') ``` This would print ``` - Returned examples that satisfy condition "price at most 3": +Returned examples that verify filter "distance from -98.71,38.71 at most 1000 km": - price=0 - price=1 - price=2 - price=3 + location=-98.17,38.71 + location=-97.17,39.71 + location=-96.17,40.71 + location=-95.17,41.71 + location=-94.17,42.71 ``` (vector-search-index)= From 773ae1781a2f278a42c198738447bd4a52c6562a Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 19 Oct 2022 20:06:46 +0800 Subject: [PATCH 7/8] test: use sklearn functions instead --- tests/unit/array/mixins/test_find.py | 58 ++++++++++------------------ 1 file 
changed, 21 insertions(+), 37 deletions(-) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 5778fb57290..775394f9068 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -1,9 +1,11 @@ +import operator +from math import radians + import numpy as np import pytest - -from docarray import DocumentArray, Document +from docarray import Document, DocumentArray from docarray.math import ndarray -import operator +from sklearn.metrics.pairwise import haversine_distances def test_customize_metric_fn(): @@ -632,39 +634,7 @@ def test_redis_category_filter(filter, checker, columns, start_storage): assert all([checker(r) for r in results]) -from math import radians, cos, sin, asin, sqrt - - -def haversine(lon1, lat1, lon2, lat2): - """ - Calculate the great circle distance between two points - on the earth (specified in decimal degrees) - """ - lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) - dlon = lon2 - lon1 - dlat = lat2 - lat1 - a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2 - c = 2 * asin(sqrt(a)) - r = 6371 # average radius of earth, unit km - return c * r # unit km - - -@pytest.mark.parametrize( - 'filter,checker', - [ - ( - '@location:[-98.71 38.71 200 km] ', - lambda r: haversine( - -98.71, - 38.71, - float(r.tags['location'].split(',')[0]), - float(r.tags['location'].split(',')[1]), - ) - < 200, - ), - ], -) -def test_redis_geo_filter(filter, checker, start_storage): +def test_redis_geo_filter(start_storage): n_dim = 128 da = DocumentArray( storage='redis', @@ -684,9 +654,23 @@ def test_redis_geo_filter(filter, checker, start_storage): ] ) + filter = '@location:[-98.71 38.71 800 km] ' + results = da.find(np.random.rand(n_dim), filter=filter) assert len(results) > 0 - assert all([checker(r) for r in results]) + + for r in results: + lon1, lat1, lon2, lat2 = map( + radians, + [ + -98.71, + 38.71, + float(r.tags['location'].split(',')[0]), + 
float(r.tags['location'].split(',')[1]), + ], + ) + distance = haversine_distances([[lon1, lat1], [lon2, lat2]]) * 6371 + assert distance[0][1] < 800 @pytest.mark.parametrize('storage', ['memory']) From ef6d053647e9cb95a19cb3599ff251e03bb1d9a1 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 19 Oct 2022 20:25:26 +0800 Subject: [PATCH 8/8] docs: add extra redis geo example --- docs/advanced/document-store/redis.md | 71 +++++++++++++++++---------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index a95e5f80609..523882e9db8 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -246,8 +246,7 @@ integer in `columns` configuration (`'field': 'int'`) and use a filter query tha One can search with user-defined query filters using the `.find` method. Such queries follow the [Redis Search Query Syntax](https://redis.io/docs/stack/search/reference/query_syntax/). -Consider a case where you store Documents with a tag of `location` into Redis and you want to retrieve all Documents -with `location` within some `max_distance` value. +Consider a case where you store Documents with a tag of `price` into Redis and you want to retrieve all Documents with `price` less than or equal to some `max_price` value. 
You can index such Documents as follows: @@ -259,51 +258,71 @@ da = DocumentArray( storage='redis', config={ 'n_dim': n_dim, - 'columns': {'location': 'geo'}, + 'columns': {'price': 'float'}, }, ) with da: - da.extend( - [ - Document(id=f'r{i}', tags={'location': f"{-98.17+i},{38.71+i}"}) - for i in range(10) - ] - ) + da.extend([Document(id=f'r{i}', tags={'price': i}) for i in range(10)]) print('\nIndexed Prices:\n') -for price in da[:, 'tags__location']: +for price in da[:, 'tags__price']: print(f'\t price={price}') ``` -Then you can retrieve all documents whose location is within `max_distance` from earth coordinates `-98.71,38.71` by applying the following filter: +Then you can retrieve all documents whose price is less than or equal to `max_price` by applying the following filter: ```python -max_distance = 1000 -n_limit = 5 +max_price = 3 +n_limit = 4 -filter = f'@location:[-98.71 38.71 {max_distance} km] ' -results = da.find(filter=filter, limit=n_limit) +filter = f'@price:[-inf {max_price}] ' +results = da.find(filter=filter) -print( - '\n Returned examples that verify filter "distance from -98.71,38.71 at most 1000 km":\n' -) -for location in results[:, 'tags__location']: - print(f'\t location={location}') +print('\n Returned examples that verify filter "price at most 3":\n') +for price in results[:, 'tags__price']: + print(f'\t price={price}') ``` This would print ``` -Returned examples that verify filter "distance from -98.71,38.71 at most 1000 km": + Returned examples that satisfy condition "price at most 3": - location=-98.17,38.71 - location=-97.17,39.71 - location=-96.17,40.71 - location=-95.17,41.71 - location=-94.17,42.71 + price=0 + price=1 + price=2 + price=3 ``` +With Redis as storage backend, you can also do geospatial searches. 
You can index Documents with a tag of `geo` type and retrieve all Documents that are within some `max_distance` of a given earth coordinate (longitude, latitude) as follows: + +```python +from docarray import Document, DocumentArray + +n_dim = 3 +da = DocumentArray( + storage='redis', + config={ + 'n_dim': n_dim, + 'columns': {'location': 'geo'}, + }, +) + +with da: + da.extend( + [ + Document(id=f'r{i}', tags={'location': f"{-98.17+i},{38.71+i}"}) + for i in range(10) + ] + ) + +max_distance = 1000 +filter = f'@location:[-98.71 38.71 {max_distance} km] ' +results = da.find(filter=filter, limit=n_limit) +``` + + (vector-search-index)= ### Update Vector Search Indexing Schema