From 3525e219f027b62fffa493538c6e8f96f2de72c5 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 23 Nov 2022 11:54:16 +0100 Subject: [PATCH 1/7] fix: remove cosine similarity field with false assignment Signed-off-by: anna-charlotte --- docarray/array/storage/weaviate/find.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docarray/array/storage/weaviate/find.py b/docarray/array/storage/weaviate/find.py index 68fbef9446e..2b46b2a1d91 100644 --- a/docarray/array/storage/weaviate/find.py +++ b/docarray/array/storage/weaviate/find.py @@ -94,8 +94,7 @@ def _find_similar_vectors( doc = Document.from_base64(result['_serialized'], **self._serialize_config) distance = result['_additional']['distance'] - doc.scores['cosine_similarity'] = NamedScore(value=distance) - doc.scores['distance'] = NamedScore(value=distance) + doc.scores['score'] = NamedScore(value=distance) certainty = result['_additional'].get('certainty', None) if certainty is not None: From 10e2eb59dbbca16d373f41baaf8ab4c9bf4b6e22 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 23 Nov 2022 12:06:24 +0100 Subject: [PATCH 2/7] fix: change key back to distance Signed-off-by: anna-charlotte --- docarray/array/storage/weaviate/find.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/weaviate/find.py b/docarray/array/storage/weaviate/find.py index 2b46b2a1d91..1024de706d3 100644 --- a/docarray/array/storage/weaviate/find.py +++ b/docarray/array/storage/weaviate/find.py @@ -94,7 +94,7 @@ def _find_similar_vectors( doc = Document.from_base64(result['_serialized'], **self._serialize_config) distance = result['_additional']['distance'] - doc.scores['score'] = NamedScore(value=distance) + doc.scores['distance'] = NamedScore(value=distance) certainty = result['_additional'].get('certainty', None) if certainty is not None: From b4a7ac57fcc1cc4127146440db805cde1bace6cc Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 23 Nov 2022 14:38:02 +0100 
Subject: [PATCH 3/7] docs: update documentation Signed-off-by: anna-charlotte --- docs/advanced/document-store/weaviate.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index 3563a9232cd..5f5dfc4b87e 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -465,7 +465,7 @@ This should return something similar to: ```bash Only results that have a 'weaviate_certainty' of higher than 0.9 should show: text=['Persist Documents with Weaviate.'] - scores=[{'weaviate_certainty': {'value': 1.0000001}, 'cosine_similarity': {'value': 1.0000002000000001}}] + scores=[{'distance': {'value': -3.5762787e-07}}] ``` ## Include additional properties in the return From 7fafa586a62f3cce825e07a587a618a4c15c19ae Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 23 Nov 2022 16:29:08 +0100 Subject: [PATCH 4/7] docs: add documentation for distance score retrieval for doc storage Signed-off-by: anna-charlotte --- docs/advanced/document-store/elasticsearch.md | 2 +- docs/advanced/document-store/qdrant.md | 39 ++++++++++--------- docs/advanced/document-store/redis.md | 2 +- docs/advanced/document-store/weaviate.md | 6 ++- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/docs/advanced/document-store/elasticsearch.md b/docs/advanced/document-store/elasticsearch.md index b55e3ba3172..dbbd44b3104 100644 --- a/docs/advanced/document-store/elasticsearch.md +++ b/docs/advanced/document-store/elasticsearch.md @@ -193,7 +193,7 @@ Consider we want the nearest vectors to the embedding `[8. 8. 8.]`, with the res prices must follow a filter. As an example, let's consider that retrieved documents must have `price` value lower or equal than `max_price`. We can encode this information in ElasticSearch using `filter = {'range': {'price': {'lte': max_price}}}`. 
-Then the search with the proposed filter can be implemented and used with the following code: +Then the search with the proposed filter can be implemented and used with the following code. Also, it is noted, that for Elasticsearch, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. ```python max_price = 7 diff --git a/docs/advanced/document-store/qdrant.md b/docs/advanced/document-store/qdrant.md index b7438f5499d..30388588350 100644 --- a/docs/advanced/document-store/qdrant.md +++ b/docs/advanced/document-store/qdrant.md @@ -76,24 +76,24 @@ Other functions behave the same as in-memory DocumentArray. The following configs can be set: -| Name | Description | Default | -|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------| -| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | -| `collection_name` | Qdrant collection name client | **Random collection name generated** | -| `distance` | Distance metric to use during search. Can be 'cosine', 'dot' or 'euclidean' | `'cosine'` | -| `host` | Hostname of the Qdrant server | `'localhost'` | -| `port` | Port of the Qdrant server | `6333` | -| `grpc_port` | Port of the Qdrant gRPC interface | `6334` | -| `prefer_grpc` | Set `True` to use gPRC interface whenever possible in custom methods | `False` | -| `api_key` | API key for authentication in Qdrant Cloud | `None` | -| `https` | Set `True` to use HTTPS(SSL) protocol | `None` | -| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | `None` | -| `scroll_batch_size` | Batch size used when scrolling over the storage | `64` | -| `ef_construct` | Number of neighbours to consider during the index building. 
Larger = more accurate search, more time to build index | `None`, defaults to the default value in Qdrant* | -| `full_scan_threshold` | Minimal size (in KiloBytes) of vectors for additional payload-based indexing | `None`, defaults to the default value in Qdrant* | -| `m` | Number of edges per node in the index graph. Larger = more accurate search, more space required | `None`, defaults to the default value in Qdrant* | -| `columns` | Other fields to store in Document | `None` | -| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | +| Name | Description | Default | +|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------| +| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | +| `collection_name` | Qdrant collection name client | **Random collection name generated** | +| `distance` | Distance metric to use during search. Can be 'cosine' (similarity), 'dot' or 'euclidean' | `'cosine'` | +| `host` | Hostname of the Qdrant server | `'localhost'` | +| `port` | Port of the Qdrant server | `6333` | +| `grpc_port` | Port of the Qdrant gRPC interface | `6334` | +| `prefer_grpc` | Set `True` to use gRPC interface whenever possible in custom methods | `False` | +| `api_key` | API key for authentication in Qdrant Cloud | `None` | +| `https` | Set `True` to use HTTPS(SSL) protocol | `None` | +| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | `None` | +| `scroll_batch_size` | Batch size used when scrolling over the storage | `64` | +| `ef_construct` | Number of neighbours to consider during the index building. 
Larger = more accurate search, more time to build index | `None`, defaults to the default value in Qdrant* | +| `full_scan_threshold` | Minimal size (in KiloBytes) of vectors for additional payload-based indexing | `None`, defaults to the default value in Qdrant* | +| `m` | Number of edges per node in the index graph. Larger = more accurate search, more space required | `None`, defaults to the default value in Qdrant* | +| `columns` | Other fields to store in Document | `None` | +| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. | True | *You can read more about the HNSW parameters and their default values [here](https://qdrant.tech/documentation/indexing/#vector-index) @@ -215,6 +215,9 @@ Embeddings Nearest Neighbours with "price" at most 7: embedding=[5. 5. 5.], price=5 embedding=[4. 4. 4.], price=4 ``` + +For Qdrant, the distance scores can be accessed in the Document's `.scores` dictionary by the key `f'{distance_metric}_similarity'`. For example, for `distance = 'euclidean'` the key would be `'euclidean_similarity'`. + ### Example of `.filter` with a filter The following example shows how to use DocArray with Qdrant Document Store in order to filter text documents. Consider Documents have the tag `price` with a value of `i`. We can create these with the following code: diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index aee5d43560b..538ee42142a 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -203,7 +203,7 @@ Consider the case where you want the nearest vectors to the embedding `[8., 8., @price:[-inf {max_price}] @color:{color} @stock:[1 1] ``` -Then the search with the proposed filter can be used as follows: +Then the search with the proposed filter can be used as follows. 
Also, it is noted, that for Redis, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. ```python max_price = 7 color = "blue" diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index 5f5dfc4b87e..2ac8d74366c 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -451,10 +451,10 @@ results = da.find( model, collate_fn=collate_fn, ), - query_params={"certainty": 0.9}, + query_params={"certainty": 0.995}, ) -print("Only results that have a 'weaviate_certainty' of higher than 0.9 should show:") +print("Only results that have a 'weaviate_certainty' of higher than 0.995 should show:") for res in results: print(f"\t text={res[:, 'text']}") print(f"\t scores={res[:, 'scores']}") @@ -468,6 +468,8 @@ Only results that have a 'weaviate_certainty' of higher than 0.9 should show: scores=[{'distance': {'value': -3.5762787e-07}}] ``` +It is noted, that for Weaviate, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'distance'`. 
+ ## Include additional properties in the return GraphQL additional properties can be used on data objects in Get{} Queries to get additional information about the From 626ab7afa52e714a9cd89249c460a63f82d613c6 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 24 Nov 2022 11:48:58 +0100 Subject: [PATCH 5/7] test: fix test find for different storages Signed-off-by: anna-charlotte --- tests/unit/array/mixins/test_find.py | 40 ++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 47a66b44d76..04f041c48bc 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -74,31 +74,55 @@ def test_find(storage, config, limit, query, start_storage): # annlite uses cosine distance by default if n_dim == 1: if storage == 'weaviate': + distances = [t['distance'].value for t in result[:, 'scores']] + assert sorted(distances, reverse=False) == distances + assert len(distances) == limit + elif storage == 'qdrant': cosine_similarities = [ t['cosine_similarity'].value for t in result[:, 'scores'] ] - assert sorted(cosine_similarities, reverse=False) == cosine_similarities - if storage == 'redis': - cosine_distances = [t['score'].value for t in da[:, 'scores']] + assert sorted(cosine_similarities, reverse=True) + assert len(cosine_similarities) == limit + elif storage == 'elasticsearch': + cosine_similarities = [t['score'].value for t in result[:, 'scores']] + assert sorted(cosine_similarities, reverse=True) == cosine_similarities + assert len(cosine_similarities) == limit + elif storage == 'redis': + cosine_distances = [t['score'].value for t in result[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances - elif storage in ['memory', 'annlite', 'elasticsearch']: - cosine_distances = [t['cosine'].value for t in da[:, 'scores']] + assert len(cosine_distances) == limit + elif storage in ['memory', 
'annlite']: + cosine_distances = [t['cosine'].value for t in result[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances + assert len(cosine_distances) == limit else: if storage == 'weaviate': + for da in result: + distances = [t['distance'].value for t in da[:, 'scores']] + assert sorted(distances, reverse=False) == distances + assert len(distances) == limit + elif storage == 'qdrant': for da in result: cosine_similarities = [ t['cosine_similarity'].value for t in da[:, 'scores'] ] - assert sorted(cosine_similarities, reverse=False) == cosine_similarities - if storage == 'redis': + assert sorted(cosine_similarities, reverse=True) + assert len(cosine_similarities) == limit + elif storage == 'elasticsearch': + for da in result: + cosine_similarities = [t['score'].value for t in da[:, 'scores']] + assert sorted(cosine_similarities, reverse=True) == cosine_similarities + assert len(cosine_similarities) == limit + elif storage == 'redis': for da in result: cosine_distances = [t['score'].value for t in da[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances - elif storage in ['memory', 'annlite', 'elasticsearch']: + assert len(cosine_distances) == limit + elif storage in ['memory', 'annlite']: for da in result: cosine_distances = [t['cosine'].value for t in da[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances + assert len(cosine_distances) == limit @pytest.mark.parametrize( From 17198af098ab6a5fb88d2462e355bab918887075 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 28 Nov 2022 10:43:50 +0100 Subject: [PATCH 6/7] docs: apply suggestions from code review Co-authored-by: Alex Cureton-Griffiths Signed-off-by: anna-charlotte --- docs/advanced/document-store/elasticsearch.md | 2 +- docs/advanced/document-store/redis.md | 2 +- docs/advanced/document-store/weaviate.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/advanced/document-store/elasticsearch.md 
b/docs/advanced/document-store/elasticsearch.md index dbbd44b3104..c227c75b40b 100644 --- a/docs/advanced/document-store/elasticsearch.md +++ b/docs/advanced/document-store/elasticsearch.md @@ -193,7 +193,7 @@ Consider we want the nearest vectors to the embedding `[8. 8. 8.]`, with the res prices must follow a filter. As an example, let's consider that retrieved documents must have `price` value lower or equal than `max_price`. We can encode this information in ElasticSearch using `filter = {'range': {'price': {'lte': max_price}}}`. -Then the search with the proposed filter can be implemented and used with the following code. Also, it is noted, that for Elasticsearch, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. +Then the search with the proposed filter can be implemented and used with the following code. Note: For Elasticsearch, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. ```python max_price = 7 diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 538ee42142a..0de6e30558f 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -203,7 +203,7 @@ Consider the case where you want the nearest vectors to the embedding `[8., 8., @price:[-inf {max_price}] @color:{color} @stock:[1 1] ``` -Then the search with the proposed filter can be used as follows. Also, it is noted, that for Redis, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. +Then the search with the proposed filter can be used as follows. Note: For Redis, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. 
```python max_price = 7 color = "blue" diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index 2ac8d74366c..2eb245eda83 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -468,7 +468,7 @@ Only results that have a 'weaviate_certainty' of higher than 0.9 should show: scores=[{'distance': {'value': -3.5762787e-07}}] ``` -It is noted, that for Weaviate, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'distance'`. +Note: For Weaviate, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'distance'`. ## Include additional properties in the return From 3ea1e09fa9c2a8397e0061b075239ec687c75b76 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 28 Nov 2022 13:30:40 +0100 Subject: [PATCH 7/7] docs: add note sections Signed-off-by: anna-charlotte --- docs/advanced/document-store/elasticsearch.md | 7 ++++++- docs/advanced/document-store/qdrant.md | 3 +++ docs/advanced/document-store/redis.md | 8 +++++++- docs/advanced/document-store/weaviate.md | 5 ++++- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/docs/advanced/document-store/elasticsearch.md b/docs/advanced/document-store/elasticsearch.md index c227c75b40b..c6fb31c5afa 100644 --- a/docs/advanced/document-store/elasticsearch.md +++ b/docs/advanced/document-store/elasticsearch.md @@ -193,7 +193,12 @@ Consider we want the nearest vectors to the embedding `[8. 8. 8.]`, with the res prices must follow a filter. As an example, let's consider that retrieved documents must have `price` value lower or equal than `max_price`. We can encode this information in ElasticSearch using `filter = {'range': {'price': {'lte': max_price}}}`. -Then the search with the proposed filter can be implemented and used with the following code. 
Note: For Elasticsearch, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. +Then the search with the proposed filter can be implemented and used with the following code. + +````{admonition} Note +:class: note +For Elasticsearch, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. +```` ```python max_price = 7 diff --git a/docs/advanced/document-store/qdrant.md b/docs/advanced/document-store/qdrant.md index 30388588350..ffe34ba37d0 100644 --- a/docs/advanced/document-store/qdrant.md +++ b/docs/advanced/document-store/qdrant.md @@ -216,7 +216,10 @@ Embeddings Nearest Neighbours with "price" at most 7: embedding=[4. 4. 4.], price=4 ``` +````{admonition} Note +:class: note For Qdrant, the distance scores can be accessed in the Document's `.scores` dictionary by the key `f'{distance_metric}_similarity'`. For example, for `distance = 'euclidean'` the key would be `'euclidean_similarity'`. +```` ### Example of `.filter` with a filter The following example shows how to use DocArray with Qdrant Document Store in order to filter text documents. diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index 0de6e30558f..abb4d0db389 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -203,7 +203,13 @@ Consider the case where you want the nearest vectors to the embedding `[8., 8., @price:[-inf {max_price}] @color:{color} @stock:[1 1] ``` -Then the search with the proposed filter can be used as follows. Note: For Redis, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. +Then the search with the proposed filter can be used as follows. + +````{admonition} Note +:class: note +For Redis, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. 
+```` + ```python max_price = 7 color = "blue" diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index 2eb245eda83..f4e272f55ed 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -468,7 +468,10 @@ Only results that have a 'weaviate_certainty' of higher than 0.9 should show: scores=[{'distance': {'value': -3.5762787e-07}}] ``` -Note: For Weaviate, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'distance'`. +````{admonition} Note +:class: note +For Weaviate, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'distance'`. +```` ## Include additional properties in the return