From 81177b769a2b01e9203f357b39a9de0adad2ef88 Mon Sep 17 00:00:00 2001
From: guenthermi <guenthermi50@gmail.com>
Date: Mon, 21 Nov 2022 22:52:31 +0100
Subject: [PATCH 1/7] feat: add max_rel_per_label to support recall for labeled
 data

Signed-off-by: Michael Guenther <guenthermi50@gmail.com>
---
 docarray/array/mixins/evaluation.py           | 30 +++++++++++++---
 .../array/mixins/oldproto/test_eval_class.py  | 34 ++++++++++++++++++-
 2 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/docarray/array/mixins/evaluation.py b/docarray/array/mixins/evaluation.py
index db9c01c6dad..d5eab4c350c 100644
--- a/docarray/array/mixins/evaluation.py
+++ b/docarray/array/mixins/evaluation.py
@@ -1,10 +1,10 @@
 import warnings
-from typing import Optional, Union, TYPE_CHECKING, Callable, List, Dict, Tuple
+from typing import Optional, Union, TYPE_CHECKING, Callable, List, Dict, Tuple, Any
 
 from functools import wraps
 
 import numpy as np
-from collections import defaultdict
+from collections import defaultdict, Counter
 
 from docarray.score import NamedScore
 
@@ -79,6 +79,7 @@ def evaluate(
         metric_names: Optional[List[str]] = None,
         strict: bool = True,
         label_tag: str = 'label',
+        max_rel_per_label: Optional[Dict[Any, int]] = None,
         **kwargs,
     ) -> Dict[str, float]:
         """
@@ -109,6 +110,10 @@ def evaluate(
             aligned: on the length, and on the semantic of length. These are preventing
             you to evaluate on irrelevant matches accidentally.
         :param label_tag: Specifies the tag which contains the labels.
+        :param max_rel_per_label: Some metrics, e.g., recall@k, require the
+            number of relevant documents. To apply those to a labeled dataset, one can
+            provide a dictionary which maps labels to the total number of documents
+            with this label.
         :param kwargs: Additional keyword arguments to be passed to the metric
             functions.
         :return: A dictionary which stores for each metric name the average evaluation
@@ -161,7 +166,16 @@ def evaluate(
         results = defaultdict(list)
         caller_max_rel = kwargs.pop('max_rel', None)
         for d, gd in zip(self, ground_truth):
-            max_rel = caller_max_rel or len(gd.matches)
+            if caller_max_rel:
+                max_rel = caller_max_rel
+            elif max_rel_per_label and ground_truth_type == 'labels':
+                max_rel = max_rel_per_label.get(d.tags[label_tag], None)
+                if max_rel is None:
+                    raise ValueError(
+                        '`max_rel_per_label` misses the label ' + str(d.tags[label_tag])
+                    )
+            else:
+                max_rel = len(gd.matches)
             if strict and hash_fn(d) != hash_fn(gd):
                 raise ValueError(
                     f'Document {d} from the left-hand side and '
@@ -174,7 +188,7 @@ def evaluate(
                     f'Document {d!r} or {gd!r} has no matches, please check your Document'
                 )
 
-            targets = gd.matches[:max_rel]
+            targets = gd.matches
 
             if ground_truth_type == 'matches':
                 desired = {hash_fn(m) for m in targets}
@@ -438,12 +452,20 @@ def fuse_matches(global_matches: DocumentArray, local_matches: DocumentArray):
                     new_matches.append(m)
                 query_data[doc.id, 'matches'] = new_matches
 
+        if ground_truth and label_tag in ground_truth[0].tags:
+            max_rel_per_label = dict(Counter([d.tags[label_tag] for d in ground_truth]))
+        elif not ground_truth and label_tag in query_data[0].tags:
+            max_rel_per_label = dict(Counter([d.tags[label_tag] for d in query_data]))
+        else:
+            max_rel_per_label = None
+
         metrics_resp = query_data.evaluate(
             ground_truth=ground_truth,
             metrics=metrics,
             metric_names=metric_names,
             strict=strict,
             label_tag=label_tag,
+            max_rel_per_label=max_rel_per_label,
             **kwargs,
         )
 
diff --git a/tests/unit/array/mixins/oldproto/test_eval_class.py b/tests/unit/array/mixins/oldproto/test_eval_class.py
index 3eb0b79a3a2..e00520d4e77 100644
--- a/tests/unit/array/mixins/oldproto/test_eval_class.py
+++ b/tests/unit/array/mixins/oldproto/test_eval_class.py
@@ -193,6 +193,34 @@ def test_eval_mixin_one_of_n_labeled(metric_fn, metric_score, label_tag):
     assert abs(r - metric_score) < 0.001
 
 
+@pytest.mark.parametrize('label_tag', ['label', 'custom_tag'])
+@pytest.mark.parametrize(
+    'metric_fn, metric_score',
+    [
+        ('recall_at_k', 1.0),
+        ('f1_score_at_k', 0.5),
+    ],
+)
+def test_max_rel_per_label(metric_fn, metric_score, label_tag):
+    da = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
+    max_rel_per_label = {i: 1 for i in range(3)}
+    for d in da:
+        d.matches = da
+    r = da.evaluate(
+        [metric_fn], label_tag=label_tag, max_rel_per_label=max_rel_per_label
+    )[metric_fn]
+    assert abs(r - metric_score) < 0.001
+
+
+def test_missing_max_rel_should_raise():
+    da = DocumentArray([Document(text=str(i), tags={'label': i}) for i in range(3)])
+    max_rel_per_label = {i: 1 for i in range(2)}
+    for d in da:
+        d.matches = da
+    with pytest.raises(ValueError):
+        da.evaluate(['recall_at_k'], max_rel_per_label=max_rel_per_label)
+
+
 @pytest.mark.parametrize(
     'storage, config',
     [
@@ -528,7 +556,11 @@ def test_embed_and_evaluate_two_das(storage, config, sample_size, start_storage)
         (False, {'precision_at_k': 1.0 / 3, 'reciprocal_rank': 1.0}, 'label'),
         (
             True,
-            {'precision_at_k': 1.0 / 3, 'reciprocal_rank': 11.0 / 18.0},
+            {
+                'precision_at_k': 1.0 / 3,
+                'reciprocal_rank': 11.0 / 18.0,
+                'recall_at_k': 1.0,
+            },
             'custom_tag',
         ),
     ],

From 29777b7ec06196895b9a600f82da2b033e9891c5 Mon Sep 17 00:00:00 2001
From: Michael Guenther <guenthermi50@gmail.com>
Date: Thu, 24 Nov 2022 09:10:43 +0100
Subject: [PATCH 2/7] refactor: change logic of setting max_rel

Signed-off-by: Michael Guenther <guenthermi50@gmail.com>
---
 docarray/array/mixins/evaluation.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/docarray/array/mixins/evaluation.py b/docarray/array/mixins/evaluation.py
index d5eab4c350c..88a5aeca306 100644
--- a/docarray/array/mixins/evaluation.py
+++ b/docarray/array/mixins/evaluation.py
@@ -168,12 +168,16 @@ def evaluate(
         for d, gd in zip(self, ground_truth):
             if caller_max_rel:
                 max_rel = caller_max_rel
-            elif max_rel_per_label and ground_truth_type == 'labels':
-                max_rel = max_rel_per_label.get(d.tags[label_tag], None)
-                if max_rel is None:
-                    raise ValueError(
-                        '`max_rel_per_label` misses the label ' + str(d.tags[label_tag])
-                    )
+            elif ground_truth_type == 'labels':
+                if max_rel_per_label:
+                    max_rel = max_rel_per_label.get(d.tags[label_tag], None)
+                    if max_rel is None:
+                        raise ValueError(
+                            '`max_rel_per_label` misses the label '
+                            + str(d.tags[label_tag])
+                        )
+                else:
+                    max_rel = None
             else:
                 max_rel = len(gd.matches)
             if strict and hash_fn(d) != hash_fn(gd):

From 0c10f35960e9b0c4a5b9efa0cd9edf2b599629c1 Mon Sep 17 00:00:00 2001
From: Michael Guenther <guenthermi50@gmail.com>
Date: Mon, 28 Nov 2022 11:32:13 +0100
Subject: [PATCH 3/7] refactor: change name of max_rel_per_label

Signed-off-by: Michael Guenther <guenthermi50@gmail.com>
---
 docarray/array/mixins/evaluation.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/docarray/array/mixins/evaluation.py b/docarray/array/mixins/evaluation.py
index 88a5aeca306..83a028eb3e2 100644
--- a/docarray/array/mixins/evaluation.py
+++ b/docarray/array/mixins/evaluation.py
@@ -79,7 +79,7 @@ def evaluate(
         metric_names: Optional[List[str]] = None,
         strict: bool = True,
         label_tag: str = 'label',
-        max_rel_per_label: Optional[Dict[Any, int]] = None,
+        num_relevant_document_per_label: Optional[Dict[Any, int]] = None,
         **kwargs,
     ) -> Dict[str, float]:
         """
@@ -110,9 +110,9 @@ def evaluate(
             aligned: on the length, and on the semantic of length. These are preventing
             you to evaluate on irrelevant matches accidentally.
         :param label_tag: Specifies the tag which contains the labels.
-        :param max_rel_per_label: Some metrics, e.g., recall@k, require the
-            number of relevant documents. To apply those to a labeled dataset, one can
-            provide a dictionary which maps labels to the total number of documents
+        :param num_relevant_document_per_label: Some metrics, e.g., recall@k, require
+            the number of relevant documents. To apply those to a labeled dataset, one
+            can provide a dictionary which maps labels to the total number of documents
             with this label.
         :param kwargs: Additional keyword arguments to be passed to the metric
             functions.
@@ -169,11 +169,13 @@ def evaluate(
             if caller_max_rel:
                 max_rel = caller_max_rel
             elif ground_truth_type == 'labels':
-                if max_rel_per_label:
-                    max_rel = max_rel_per_label.get(d.tags[label_tag], None)
+                if num_relevant_document_per_label:
+                    max_rel = num_relevant_document_per_label.get(
+                        d.tags[label_tag], None
+                    )
                     if max_rel is None:
                         raise ValueError(
-                            '`max_rel_per_label` misses the label '
+                            '`num_relevant_document_per_label` misses the label '
                             + str(d.tags[label_tag])
                         )
                 else:
@@ -457,11 +459,15 @@ def fuse_matches(global_matches: DocumentArray, local_matches: DocumentArray):
                 query_data[doc.id, 'matches'] = new_matches
 
         if ground_truth and label_tag in ground_truth[0].tags:
-            max_rel_per_label = dict(Counter([d.tags[label_tag] for d in ground_truth]))
+            num_relevant_document_per_label = dict(
+                Counter([d.tags[label_tag] for d in ground_truth])
+            )
         elif not ground_truth and label_tag in query_data[0].tags:
-            max_rel_per_label = dict(Counter([d.tags[label_tag] for d in query_data]))
+            num_relevant_document_per_label = dict(
+                Counter([d.tags[label_tag] for d in query_data])
+            )
         else:
-            max_rel_per_label = None
+            num_relevant_document_per_label = None
 
         metrics_resp = query_data.evaluate(
             ground_truth=ground_truth,
@@ -469,7 +475,7 @@ def fuse_matches(global_matches: DocumentArray, local_matches: DocumentArray):
             metric_names=metric_names,
             strict=strict,
             label_tag=label_tag,
-            max_rel_per_label=max_rel_per_label,
+            num_relevant_document_per_label=num_relevant_document_per_label,
             **kwargs,
         )
 

From 22f0dc9344283fc58702fb07f96ee0377d333735 Mon Sep 17 00:00:00 2001
From: Michael Guenther <guenthermi50@gmail.com>
Date: Mon, 28 Nov 2022 11:39:23 +0100
Subject: [PATCH 4/7] fix: add missing s

Signed-off-by: Michael Guenther <guenthermi50@gmail.com>
---
 docarray/array/mixins/evaluation.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docarray/array/mixins/evaluation.py b/docarray/array/mixins/evaluation.py
index 83a028eb3e2..2de9433fb33 100644
--- a/docarray/array/mixins/evaluation.py
+++ b/docarray/array/mixins/evaluation.py
@@ -79,7 +79,7 @@ def evaluate(
         metric_names: Optional[List[str]] = None,
         strict: bool = True,
         label_tag: str = 'label',
-        num_relevant_document_per_label: Optional[Dict[Any, int]] = None,
+        num_relevant_documents_per_label: Optional[Dict[Any, int]] = None,
         **kwargs,
     ) -> Dict[str, float]:
         """
@@ -110,7 +110,7 @@ def evaluate(
             aligned: on the length, and on the semantic of length. These are preventing
             you to evaluate on irrelevant matches accidentally.
         :param label_tag: Specifies the tag which contains the labels.
-        :param num_relevant_document_per_label: Some metrics, e.g., recall@k, require
+        :param num_relevant_documents_per_label: Some metrics, e.g., recall@k, require
             the number of relevant documents. To apply those to a labeled dataset, one
             can provide a dictionary which maps labels to the total number of documents
             with this label.
@@ -169,13 +169,13 @@ def evaluate(
             if caller_max_rel:
                 max_rel = caller_max_rel
             elif ground_truth_type == 'labels':
-                if num_relevant_document_per_label:
-                    max_rel = num_relevant_document_per_label.get(
+                if num_relevant_documents_per_label:
+                    max_rel = num_relevant_documents_per_label.get(
                         d.tags[label_tag], None
                     )
                     if max_rel is None:
                         raise ValueError(
-                            '`num_relevant_document_per_label` misses the label '
+                            '`num_relevant_documents_per_label` misses the label '
                             + str(d.tags[label_tag])
                         )
                 else:
@@ -459,15 +459,15 @@ def fuse_matches(global_matches: DocumentArray, local_matches: DocumentArray):
                 query_data[doc.id, 'matches'] = new_matches
 
         if ground_truth and label_tag in ground_truth[0].tags:
-            num_relevant_document_per_label = dict(
+            num_relevant_documents_per_label = dict(
                 Counter([d.tags[label_tag] for d in ground_truth])
             )
         elif not ground_truth and label_tag in query_data[0].tags:
-            num_relevant_document_per_label = dict(
+            num_relevant_documents_per_label = dict(
                 Counter([d.tags[label_tag] for d in query_data])
             )
         else:
-            num_relevant_document_per_label = None
+            num_relevant_documents_per_label = None
 
         metrics_resp = query_data.evaluate(
             ground_truth=ground_truth,
@@ -475,7 +475,7 @@ def fuse_matches(global_matches: DocumentArray, local_matches: DocumentArray):
             metric_names=metric_names,
             strict=strict,
             label_tag=label_tag,
-            num_relevant_document_per_label=num_relevant_document_per_label,
+            num_relevant_documents_per_label=num_relevant_documents_per_label,
             **kwargs,
         )
 

From 5739b4ff4ab57476e67a43023355d3b23f00d903 Mon Sep 17 00:00:00 2001
From: Michael Guenther <guenthermi50@gmail.com>
Date: Mon, 28 Nov 2022 14:43:54 +0100
Subject: [PATCH 5/7] fix: tests

Signed-off-by: Michael Guenther <guenthermi50@gmail.com>
---
 .../array/mixins/oldproto/test_eval_class.py    | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/tests/unit/array/mixins/oldproto/test_eval_class.py b/tests/unit/array/mixins/oldproto/test_eval_class.py
index bbfabdd2b56..78addbefb82 100644
--- a/tests/unit/array/mixins/oldproto/test_eval_class.py
+++ b/tests/unit/array/mixins/oldproto/test_eval_class.py
@@ -192,7 +192,7 @@ def test_eval_mixin_one_of_n_labeled(metric_fn, metric_score, label_tag):
     da = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
     for d in da:
         d.matches = da
-    r = da.evaluate([metric_fn], label_tag=label_tag)[metric_fn]
+    r = da.evaluate([metric_fn], label_tag=label_tag, max_rel=3)[metric_fn]
     assert abs(r - metric_score) < 0.001
 
 
@@ -204,24 +204,29 @@ def test_eval_mixin_one_of_n_labeled(metric_fn, metric_score, label_tag):
         ('f1_score_at_k', 0.5),
     ],
 )
-def test_max_rel_per_label(metric_fn, metric_score, label_tag):
+def test_num_relevant_documents_per_label(metric_fn, metric_score, label_tag):
     da = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
-    max_rel_per_label = {i: 1 for i in range(3)}
+    num_relevant_documents_per_label = {i: 1 for i in range(3)}
     for d in da:
         d.matches = da
     r = da.evaluate(
-        [metric_fn], label_tag=label_tag, max_rel_per_label=max_rel_per_label
+        [metric_fn],
+        label_tag=label_tag,
+        num_relevant_documents_per_label=num_relevant_documents_per_label,
     )[metric_fn]
     assert abs(r - metric_score) < 0.001
 
 
 def test_missing_max_rel_should_raise():
     da = DocumentArray([Document(text=str(i), tags={'label': i}) for i in range(3)])
-    max_rel_per_label = {i: 1 for i in range(2)}
+    num_relevant_documents_per_label = {i: 1 for i in range(2)}
     for d in da:
         d.matches = da
     with pytest.raises(ValueError):
-        da.evaluate(['recall_at_k'], max_rel_per_label=max_rel_per_label)
+        da.evaluate(
+            ['recall_at_k'],
+            num_relevant_documents_per_label=num_relevant_documents_per_label,
+        )
 
 
 @pytest.mark.parametrize(

From 218574e9c6f5d048b7a1ef6caa08a8c2d136fbd8 Mon Sep 17 00:00:00 2001
From: Michael Guenther <guenthermi50@gmail.com>
Date: Tue, 29 Nov 2022 12:03:05 +0100
Subject: [PATCH 6/7] docs: add documentation for max_rel

Signed-off-by: Michael Guenther <guenthermi50@gmail.com>
---
 docarray/math/evaluation.py                   |  2 +
 docs/fundamentals/documentarray/evaluation.md | 48 ++++++++++++++++++-
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/docarray/math/evaluation.py b/docarray/math/evaluation.py
index d05a0642a43..eb2e0ef16fc 100644
--- a/docarray/math/evaluation.py
+++ b/docarray/math/evaluation.py
@@ -98,6 +98,8 @@ def recall_at_k(
     """
     _check_k(k)
     binary_relevance = np.array(binary_relevance[:k]) != 0
+    if max_rel is None:
+        raise ValueError('The metric recall_at_k requires a max_rel parameter')
     if np.sum(binary_relevance) > max_rel:
         raise ValueError(f'Number of relevant Documents retrieved > {max_rel}')
     return np.sum(binary_relevance) / max_rel
diff --git a/docs/fundamentals/documentarray/evaluation.md b/docs/fundamentals/documentarray/evaluation.md
index 7ad289ca2ff..e3f82f19b5c 100644
--- a/docs/fundamentals/documentarray/evaluation.md
+++ b/docs/fundamentals/documentarray/evaluation.md
@@ -69,7 +69,7 @@ da_prediction['@m'].summary()
 To evaluate the matches against a ground truth array, you simply provide a DocumentArray to the evaluate function like `da_groundtruth` in the call below:
 
 ```python
-da_predict.evaluate(ground_truth=da_groundtruth, metrics=['...'], **kwargs)
+da_prediction.evaluate(ground_truth=da_groundtruth, metrics=['...'], **kwargs)
 ```
 
 Thereby, `da_groundtruth` should contain the same documents as in `da_prediction` where each `matches` attribute contains exactly those documents which are relevant to the respective root document.
@@ -215,6 +215,50 @@ da_prediction.evaluate(
 
 In this case, the keyword argument `k` is passed to all metric functions, even though it does not fulfill any specific function for the calculation of the reciprocal rank.
 
+### The max-rel parameter
+
+Some metric functions shown in the table above require a `max_rel` parameter.
+This parameter should be set to the number of relevant documents in the document collection.
+Without the knowledge of this number, metrics like `recall_at_k` and `f1_score_at_k` can be calculated.
+
+In the `evaluate` function, one can provide a keyword argument `max_rel`, which is then used for all queries.
+In the example below, we can use the datasets `da_prediction` and `da_original` from the beginning, where each query has nine relevant documents.
+Therefore, we set `max_rel=9`.
+
+```python
+da_prediction.evaluate(ground_truth=da_original, metrics=['recall_at_k'], max_rel=9)
+```
+
+```text
+{'recall_at_k': 1.0}
+```
+
+Since all relevant documents are in the matches, the recall is one.
+However, this only makes sense if the number of relevant documents is equal for each query.
+If one provides a `ground_truth` parameter to the `evaluate` function, `max_rel` is set to the number of matches of the query document.
+
+```python
+da_prediction.evaluate(ground_truth=da_original, metrics=['recall_at_k'])
+```
+```text
+{'recall_at_k': 1.0}
+```
+
+For labeled datasets, this is not possible.
+Here, one can set the `num_relevant_documents_per_label` parameter of `evaluate`.
+It accepts a dictionary that contains the number of relevant documents for each label.
+In this way, the function can set `max_rel` to the correct value for each query document.
+
+```python
+example_da.evaluate(
+    metrics=['recall_at_k'], num_relevant_documents_per_label={0: 5, 1: 5}
+)
+```
+
+```text
+{'recall_at_k': 1.0}
+```
+
 ### Custom metrics
 
 If the pre-defined metrics do not fit your use-case, you can define a custom metric function.
@@ -282,6 +326,8 @@ print(result)
 {'reciprocal_rank': 0.7583333333333333}
 ```
 
+For metric functions which require a `max_rel` parameter, the `embed_and_evaluate` function (described later in this section) automatically constructs the dictionary for `num_relevant_documents_per_label` based on the `index_data` argument.
+
 ### Batch-wise matching
 
 The ``embed_and_evaluate`` function is especially useful, when you need to evaluate the queries on a very large document collection (`example_index` in the code snippet above), which is too large to store the embeddings of all documents in main-memory.

From f61e8ed5c64351d8bd61e73d5d16250f5846cbd4 Mon Sep 17 00:00:00 2001
From: Michael Guenther <guenthermi50@gmail.com>
Date: Tue, 29 Nov 2022 15:34:28 +0100
Subject: [PATCH 7/7] docs: implement review notes

Signed-off-by: Michael Guenther <guenthermi50@gmail.com>
---
 docs/fundamentals/documentarray/evaluation.md | 47 ++++++++++---------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/docs/fundamentals/documentarray/evaluation.md b/docs/fundamentals/documentarray/evaluation.md
index e3f82f19b5c..a0a45fbf14b 100644
--- a/docs/fundamentals/documentarray/evaluation.md
+++ b/docs/fundamentals/documentarray/evaluation.md
@@ -72,7 +72,7 @@ To evaluate the matches against a ground truth array, you simply provide a Docum
 da_prediction.evaluate(ground_truth=da_groundtruth, metrics=['...'], **kwargs)
 ```
 
-Thereby, `da_groundtruth` should contain the same documents as in `da_prediction` where each `matches` attribute contains exactly those documents which are relevant to the respective root document.
+Here, `da_groundtruth` should contain the same Documents as `da_prediction`, where each `matches` attribute contains exactly those Documents which are relevant to the respective root Document.
 The `metrics` argument determines the metric you want to use for your evaluation, e.g., `precision_at_k`.
 
 In the code cell below, we evaluate the array `da_prediction` with the noisy matches against the original one `da_original`:
@@ -111,7 +111,8 @@ for d in da_prediction:
 Note that the evaluation against a ground truth DocumentArray only works if both DocumentArrays have the same length and their nested structure is the same.
 It makes no sense to evaluate with a completely different DocumentArray.
 
-While evaluating, Document pairs are recognized as correct if they share the same identifier. By default, it simply uses {attr}`~docarray.Document.id`. One can customize this behavior by specifying `hash_fn`.
+While evaluating, Document pairs are recognized as correct if they share the same identifier. By default, it simply uses {attr}`~docarray.Document.id`.
+You can customize this behavior by specifying `hash_fn`.
 
 Let's see an example by creating two DocumentArrays with some matches with identical texts.
 
@@ -157,8 +158,8 @@ It is correct as we define the evaluation as checking if the first two character
 
 ## Evaluation via labels
 
-Alternatively, you can add labels to your documents to evaluate them.
-In this case, a match is considered relevant to its root document if it has the same label:
+Alternatively, you can add labels to your Documents to evaluate them.
+In this case, a match is considered relevant to its root Document if it has the same label:
 
 ```python
 import numpy as np
@@ -198,7 +199,7 @@ Some of those metrics accept additional arguments as `kwargs` which you can simp
 ```{danger}
 These metric scores might change if the `limit` argument of the match function is set differently.
 
-**Note:** Not all of these metrics can be applied to a Top-K result, i.e., `ndcg_at_k` and `r_precision` are calculated correctly only if the limit is set equal or higher than the number of documents in the `DocumentArray` provided to the match function.
+**Note:** Not all of these metrics can be applied to a Top-K result, i.e., `ndcg_at_k` and `r_precision` are calculated correctly only if the limit is set equal to or higher than the number of Documents in the `DocumentArray` provided to the match function.
 ```
 
 You can evaluate multiple metric functions at once, as you can see below:
@@ -215,14 +216,14 @@ da_prediction.evaluate(
 
 In this case, the keyword argument `k` is passed to all metric functions, even though it does not fulfill any specific function for the calculation of the reciprocal rank.
 
-### The max-rel parameter
+### The max_rel parameter
 
 Some metric functions shown in the table above require a `max_rel` parameter.
-This parameter should be set to the number of relevant documents in the document collection.
-Without the knowledge of this number, metrics like `recall_at_k` and `f1_score_at_k` can be calculated.
+This parameter should be set to the number of relevant Documents in the Document collection.
+Without the knowledge of this number, metrics like `recall_at_k` and `f1_score_at_k` cannot be calculated.
 
-In the `evaluate` function, one can provide a keyword argument `max_rel`, which is then used for all queries.
-In the example below, we can use the datasets `da_prediction` and `da_original` from the beginning, where each query has nine relevant documents.
+In the `evaluate` function, you can provide a keyword argument `max_rel`, which is then used for all queries.
+In the example below, we can use the datasets `da_prediction` and `da_original` from the beginning, where each query has nine relevant Documents.
 Therefore, we set `max_rel=9`.
 
 ```python
@@ -233,9 +234,9 @@ da_prediction.evaluate(ground_truth=da_original, metrics=['recall_at_k'], max_re
 {'recall_at_k': 1.0}
 ```
 
-Since all relevant documents are in the matches, the recall is one.
-However, this only makes sense if the number of relevant documents is equal for each query.
-If one provides a `ground_truth` parameter to the `evaluate` function, `max_rel` is set to the number of matches of the query document.
+Since all relevant Documents are in the matches, the recall is one.
+However, this only makes sense if the number of relevant Documents is equal for each query.
+If you provide a `ground_truth` parameter to the `evaluate` function and do not pass `max_rel`, `max_rel` is set for each query to the number of matches of the corresponding ground truth Document.
 
 ```python
 da_prediction.evaluate(ground_truth=da_original, metrics=['recall_at_k'])
@@ -245,9 +246,9 @@ da_prediction.evaluate(ground_truth=da_original, metrics=['recall_at_k'])
 ```
 
 For labeled datasets, this is not possible.
-Here, one can set the `num_relevant_documents_per_label` parameter of `evaluate`.
-It accepts a dictionary that contains the number of relevant documents for each label.
-In this way, the function can set `max_rel` to the correct value for each query document.
+Here, you can set the `num_relevant_documents_per_label` parameter of `evaluate`.
+It accepts a dictionary that contains the number of relevant Documents for each label.
+In this way, the function can set `max_rel` to the correct value for each query Document.
 
 ```python
 example_da.evaluate(
@@ -265,7 +266,7 @@ If the pre-defined metrics do not fit your use-case, you can define a custom met
 It should take as input a list of binary relevance judgements of a query (`1` and `0` values).
 The evaluate function already calculates this binary list from the `matches` attribute so that each number represents the relevancy of a match.
 
-Let's write a custom metric function, which counts the number of relevant documents per query:
+Let's write a custom metric function, which counts the number of relevant Documents per query:
 
 ```python
 def count_relevant(binary_relevance):
@@ -330,18 +331,18 @@ For metric functions which require a `max_rel` parameter, the `embed_and_evaluat
 
 ### Batch-wise matching
 
-The ``embed_and_evaluate`` function is especially useful, when you need to evaluate the queries on a very large document collection (`example_index` in the code snippet above), which is too large to store the embeddings of all documents in main-memory.
-In this case, ``embed_and_evaluate`` matches the queries to batches of the document collection.
+The ``embed_and_evaluate`` function is especially useful when you need to evaluate the queries on a very large Document collection (`example_index` in the code snippet above), which is too large to store the embeddings of all Documents in main memory.
+In this case, ``embed_and_evaluate`` matches the queries to batches of the Document collection.
 After the batch is processed all embeddings are deleted.
 By default, the batch size for the matching (`match_batch_size`) is set to `100_000`.
 If you want to reduce the memory footprint, you can set it to a lower value.
 
 ### Sampling Queries
 
-If you want to evaluate a large dataset, it might be useful to sample query documents.
+If you want to evaluate a large dataset, it might be useful to sample query Documents.
 Since the metric values returned by the `embed_and_evaluate` are mean values, sampling should not change the result significantly if the sample is large enough.
-By default, sampling is applied for `DocumentArray` objects with more than 1,000 documents.
-However, it is only applied on the `DocumentArray` itself and not on the document provided in `index_data`.
+By default, sampling is applied for `DocumentArray` objects with more than 1,000 Documents.
+However, it is only applied on the `DocumentArray` itself and not on the Documents provided in `index_data`.
 If you want to change the number of samples, you can ajust the `query_sample_size` argument.
 In the following code block an evaluation is done with 100 samples:
 
@@ -369,7 +370,7 @@ da.embed_and_evaluate(
 {'precision_at_k': 0.13649999999999998}
 ```
 
-Please note that in this way only documents which are actually evaluated obtain an `.evaluations` attribute.
+Please note that in this way only Documents which are actually evaluated obtain an `.evaluations` attribute.
 
 To test how close it is to the exact result, we execute the function again with `query_sample_size` set to 1,000: