diff --git a/docarray/array/storage/weaviate/find.py b/docarray/array/storage/weaviate/find.py index 68fbef9446e..1024de706d3 100644 --- a/docarray/array/storage/weaviate/find.py +++ b/docarray/array/storage/weaviate/find.py @@ -94,7 +94,6 @@ def _find_similar_vectors( doc = Document.from_base64(result['_serialized'], **self._serialize_config) distance = result['_additional']['distance'] - doc.scores['cosine_similarity'] = NamedScore(value=distance) doc.scores['distance'] = NamedScore(value=distance) certainty = result['_additional'].get('certainty', None) diff --git a/docs/advanced/document-store/elasticsearch.md b/docs/advanced/document-store/elasticsearch.md index 20af11fd6a7..cc713ca4b3a 100644 --- a/docs/advanced/document-store/elasticsearch.md +++ b/docs/advanced/document-store/elasticsearch.md @@ -193,7 +193,12 @@ Consider we want the nearest vectors to the embedding `[8. 8. 8.]`, with the res prices must follow a filter. As an example, let's consider that retrieved documents must have `price` value lower or equal than `max_price`. We can encode this information in ElasticSearch using `filter = {'range': {'price': {'lte': max_price}}}`. -Then the search with the proposed filter can be implemented and used with the following code: +Then the search with the proposed filter can be implemented and used with the following code. + +````{admonition} Note +:class: note +For Elasticsearch, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. +```` ```python max_price = 7 diff --git a/docs/advanced/document-store/qdrant.md b/docs/advanced/document-store/qdrant.md index 179fde41596..a4da3c6fccc 100644 --- a/docs/advanced/document-store/qdrant.md +++ b/docs/advanced/document-store/qdrant.md @@ -76,25 +76,26 @@ Other functions behave the same as in-memory DocumentArray. 
The following configs can be set: -| Name | Description | Default | -|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------| -| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | -| `collection_name` | Qdrant collection name client | **Random collection name generated** | -| `distance` | Distance metric to use during search. Can be 'cosine', 'dot' or 'euclidean' | `'cosine'` | -| `host` | Hostname of the Qdrant server | `'localhost'` | -| `port` | Port of the Qdrant server | `6333` | -| `grpc_port` | Port of the Qdrant gRPC interface | `6334` | -| `prefer_grpc` | Set `True` to use gPRC interface whenever possible in custom methods | `False` | -| `api_key` | API key for authentication in Qdrant Cloud | `None` | -| `https` | Set `True` to use HTTPS(SSL) protocol | `None` | -| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | `None` | -| `scroll_batch_size` | Batch size used when scrolling over the storage | `64` | -| `ef_construct` | Number of neighbours to consider during the index building. Larger = more accurate search, more time to build index | `None`, defaults to the default value in Qdrant* | -| `full_scan_threshold` | Minimal size (in KiloBytes) of vectors for additional payload-based indexing | `None`, defaults to the default value in Qdrant* | -| `m` | Number of edges per node in the index graph. Larger = more accurate search, more space required | `None`, defaults to the default value in Qdrant* | -| `columns` | Other fields to store in Document | `None` | -| `list_like` | Controls if ordering of Documents is persisted in the Database. Disabling this breaks list-like features, but can improve performance. 
| True | -| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | True | +| Name | Description | Default | +|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------| +| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | +| `collection_name` | Qdrant collection name client | **Random collection name generated** | +| `distance` | Distance metric to use during search. Can be 'cosine' (similarity), 'dot' or 'euclidean' | `'cosine'` | +| `host` | Hostname of the Qdrant server | `'localhost'` | +| `port` | Port of the Qdrant server | `6333` | +| `grpc_port` | Port of the Qdrant gRPC interface | `6334` | +| `prefer_grpc` | Set `True` to use gRPC interface whenever possible in custom methods | `False` | +| `api_key` | API key for authentication in Qdrant Cloud | `None` | +| `https` | Set `True` to use HTTPS(SSL) protocol | `None` | +| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | `None` | +| `scroll_batch_size` | Batch size used when scrolling over the storage | `64` | +| `ef_construct` | Number of neighbours to consider during the index building. Larger = more accurate search, more time to build index | `None`, defaults to the default value in Qdrant* | +| `full_scan_threshold` | Minimal size (in KiloBytes) of vectors for additional payload-based indexing | `None`, defaults to the default value in Qdrant* | +| `m` | Number of edges per node in the index graph. Larger = more accurate search, more space required | `None`, defaults to the default value in Qdrant* | +| `columns` | Other fields to store in Document | `None` | +| `list_like` | Controls if ordering of Documents is persisted in the Database.
Disabling this breaks list-like features, but can improve performance. | True | +| `root_id` | Boolean flag indicating whether to store `root_id` in the tags of chunk level Documents | True | + *You can read more about the HNSW parameters and their default values [here](https://qdrant.tech/documentation/indexing/#vector-index) @@ -217,6 +218,12 @@ Embeddings Nearest Neighbours with "price" at most 7: embedding=[5. 5. 5.], price=5 embedding=[4. 4. 4.], price=4 ``` + +````{admonition} Note +:class: note +For Qdrant, the distance scores can be accessed in the Document's `.scores` dictionary by the key `f'{distance_metric}_similarity'`. For example, for `distance = 'euclidean'` the key would be `'euclidean_similarity'`. +```` + ### Example of `.filter` with a filter The following example shows how to use DocArray with Qdrant Document Store in order to filter text documents. Consider Documents have the tag `price` with a value of `i`. We can create these with the following code: diff --git a/docs/advanced/document-store/redis.md b/docs/advanced/document-store/redis.md index c5267b4e61c..94d23413b01 100644 --- a/docs/advanced/document-store/redis.md +++ b/docs/advanced/document-store/redis.md @@ -204,7 +204,13 @@ Consider the case where you want the nearest vectors to the embedding `[8., 8., @price:[-inf {max_price}] @color:{color} @stock:[1 1] ``` -Then the search with the proposed filter can be used as follows: +Then the search with the proposed filter can be used as follows. + +````{admonition} Note +:class: note +For Redis, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'score'`. 
+```` + ```python max_price = 7 color = "blue" diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index a60c2268ed2..faf9faef689 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -453,10 +453,10 @@ results = da.find( model, collate_fn=collate_fn, ), - query_params={"certainty": 0.9}, + query_params={"certainty": 0.995}, ) -print("Only results that have a 'weaviate_certainty' of higher than 0.9 should show:") +print("Only results that have a 'weaviate_certainty' of higher than 0.995 should show:") for res in results: print(f"\t text={res[:, 'text']}") print(f"\t scores={res[:, 'scores']}") @@ -467,9 +467,14 @@ This should return something similar to: ```bash Only results that have a 'weaviate_certainty' of higher than 0.9 should show: text=['Persist Documents with Weaviate.'] - scores=[{'weaviate_certainty': {'value': 1.0000001}, 'cosine_similarity': {'value': 1.0000002000000001}}] + scores=[{'distance': {'value': -3.5762787e-07}}] ``` +````{admonition} Note +:class: note +For Weaviate, the distance scores can be accessed in the Document's `.scores` dictionary under the key `'distance'`. 
+```` + ## Include additional properties in the return GraphQL additional properties can be used on data objects in Get{} Queries to get additional information about the diff --git a/tests/unit/array/mixins/test_find.py b/tests/unit/array/mixins/test_find.py index 0c87ae34858..e8ea1e225ac 100644 --- a/tests/unit/array/mixins/test_find.py +++ b/tests/unit/array/mixins/test_find.py @@ -74,31 +74,55 @@ def test_find(storage, config, limit, query, start_storage): # annlite uses cosine distance by default if n_dim == 1: if storage == 'weaviate': + distances = [t['distance'].value for t in result[:, 'scores']] + assert sorted(distances, reverse=False) == distances + assert len(distances) == limit + elif storage == 'qdrant': cosine_similarities = [ t['cosine_similarity'].value for t in result[:, 'scores'] ] - assert sorted(cosine_similarities, reverse=False) == cosine_similarities - if storage == 'redis': - cosine_distances = [t['score'].value for t in da[:, 'scores']] + assert sorted(cosine_similarities, reverse=True) + assert len(cosine_similarities) == limit + elif storage == 'elasticsearch': + cosine_similarities = [t['score'].value for t in result[:, 'scores']] + assert sorted(cosine_similarities, reverse=True) == cosine_similarities + assert len(cosine_similarities) == limit + elif storage == 'redis': + cosine_distances = [t['score'].value for t in result[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances - elif storage in ['memory', 'annlite', 'elasticsearch']: - cosine_distances = [t['cosine'].value for t in da[:, 'scores']] + assert len(cosine_distances) == limit + elif storage in ['memory', 'annlite']: + cosine_distances = [t['cosine'].value for t in result[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances + assert len(cosine_distances) == limit else: if storage == 'weaviate': + for da in result: + distances = [t['distance'].value for t in da[:, 'scores']] + assert sorted(distances, reverse=False) == 
distances + assert len(distances) == limit + elif storage == 'qdrant': for da in result: cosine_similarities = [ t['cosine_similarity'].value for t in da[:, 'scores'] ] - assert sorted(cosine_similarities, reverse=False) == cosine_similarities - if storage == 'redis': + assert sorted(cosine_similarities, reverse=True) + assert len(cosine_similarities) == limit + elif storage == 'elasticsearch': + for da in result: + cosine_similarities = [t['score'].value for t in da[:, 'scores']] + assert sorted(cosine_similarities, reverse=True) == cosine_similarities + assert len(cosine_similarities) == limit + elif storage == 'redis': for da in result: cosine_distances = [t['score'].value for t in da[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances - elif storage in ['memory', 'annlite', 'elasticsearch']: + assert len(cosine_distances) == limit + elif storage in ['memory', 'annlite']: for da in result: cosine_distances = [t['cosine'].value for t in da[:, 'scores']] assert sorted(cosine_distances, reverse=False) == cosine_distances + assert len(cosine_distances) == limit @pytest.mark.parametrize(