diff --git a/docarray/math/evaluation.py b/docarray/math/evaluation.py index 7b349db83bc..d05a0642a43 100644 --- a/docarray/math/evaluation.py +++ b/docarray/math/evaluation.py @@ -12,32 +12,40 @@ def _check_k(k): def r_precision(binary_relevance: List[int], **kwargs) -> float: - """R Precision after all relevant documents have been retrieved - Relevance is binary (nonzero is relevant). + """R-Precision determines the precision in the first R documents, where R is the + number of documents relevant to the query. + + Relevance is considered binary by this function (nonzero is relevant). + + Please note that it is necessary to provide relevance scores for all documents, + i.e., the calculated metric is wrong if you apply it on the Top-K scores only. .. seealso:: https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#R-precision :param binary_relevance: binary relevancy in rank order - :return: precision + :return: R-Precision """ binary_relevance = np.array(binary_relevance) != 0 z = binary_relevance.nonzero()[0] if not z.size: return 0.0 - return float(np.mean(binary_relevance[: z[-1] + 1])) + return float(np.mean(binary_relevance[: z.size])) def precision_at_k( binary_relevance: List[int], k: Optional[int] = None, **kwargs ) -> float: """Precision @K. + If `binary_relevance` is empty, 0.0 is returned. :param binary_relevance: binary relevancy in rank order :param k: measured on top-k :return: precision @k """ _check_k(k) + if len(binary_relevance) == 0: + return 0.0 binary_relevance = np.array(binary_relevance)[:k] != 0 return float(np.mean(binary_relevance)) @@ -147,9 +155,12 @@ def dcg_at_k( def ndcg_at_k( relevance: List[float], method: int = 0, k: Optional[int] = None, **kwargs ): - """Score is normalized discounted cumulative gain (ndcg) - Relevance is positive real values. Can use binary - as the previous methods. + """Calculates a normalized discounted cumulative gain (ndcg). + Relevance values can be positive real values. 
However, one can also use binary + scores as in other evaluation methods. + + Please note that it is necessary to provide relevance scores for all documents, + i.e., the calculated metric is wrong if you apply it on the Top-K scores only. Example from http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf diff --git a/docs/fundamentals/documentarray/evaluation.md b/docs/fundamentals/documentarray/evaluation.md index 65a9be846fa..9c69548abb7 100644 --- a/docs/fundamentals/documentarray/evaluation.md +++ b/docs/fundamentals/documentarray/evaluation.md @@ -10,17 +10,23 @@ The results are stored in `.evaluations` field of each Document. DocArray provides some common metrics used in the information retrieval community that allows one to evaluate the nearest-neighbour matches. Different metric accepts different arguments as `kwargs`: -| Metric | Accept `kwargs` | -|---------------------|------------------| -| `r_precision` | None | -| `average_precision` | None | -| `reciprocal_rank` | None | -| `precision_at_k` | `k` | -| `hit_at_k` | `k` | -| `recall_at_k` | `max_rel`, `k` | -| `f1_score_at_k` | `max_rel`, `k` | -| `dcg_at_k` | `method`, `k` | -| `ndcg_at_k` | `method`, `k` | +| Metric | Accept `kwargs` | +|-----------------------------------------------------|------------------| +| {meth}`~docarray.math.evaluation.r_precision` | None | +| {meth}`~docarray.math.evaluation.average_precision` | None | +| {meth}`~docarray.math.evaluation.reciprocal_rank` | None | +| {meth}`~docarray.math.evaluation.precision_at_k` | `k` | +| {meth}`~docarray.math.evaluation.hit_at_k` | `k` | +| {meth}`~docarray.math.evaluation.recall_at_k` | `max_rel`, `k` | +| {meth}`~docarray.math.evaluation.f1_score_at_k` | `max_rel`, `k` | +| {meth}`~docarray.math.evaluation.dcg_at_k` | `method`, `k` | +| {meth}`~docarray.math.evaluation.ndcg_at_k` | `method`, `k` | + +```{danger} These metric scores might change if the `limit` attribute of the match function is set 
differently. + + **Note:** Not all of these metrics can be applied to a Top-K result, i.e., `ndcg_at_k` and `r_precision` are calculated correctly only if the limit is set equal to or higher than the number of documents in the `DocumentArray` provided to the match function. + ``` For example, let's create a DocumentArray with random embeddings and matching it to itself: diff --git a/tests/unit/math/test_evaluation_metrics.py b/tests/unit/math/test_evaluation_metrics.py new file mode 100644 index 00000000000..0276080bb95 --- /dev/null +++ b/tests/unit/math/test_evaluation_metrics.py @@ -0,0 +1,142 @@ +import pytest + +from docarray.math.evaluation import ( + average_precision, + dcg_at_k, + f1_score_at_k, + hit_at_k, + ndcg_at_k, + precision_at_k, + r_precision, + recall_at_k, + reciprocal_rank, +) + + +@pytest.mark.parametrize( + "binary_relevance, score", + [ + ([0, 1, 0, 0, 1, 1, 1], 0.25), + ([], 0), + ([1, 1, 1], 1), + ([0, 0], 0), + ], +) +def test_r_precision(binary_relevance, score): + assert abs(r_precision(binary_relevance) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 4.0 / 7, None), + ([0, 1, 0, 0, 1, 1, 1], 0.5, 2), + ([], 0, None), + ([1, 1, 1], 1, None), + ([0, 0], 0, None), + ], +) +def test_precision_at_k(binary_relevance, score, k): + assert abs(precision_at_k(binary_relevance, k=k) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 1, None), + ([0, 1, 0, 0, 1, 1, 1], 0, 1), + ([], 0, None), + ([1, 1, 1], 1, None), + ([0, 0], 0, None), + ], +) +def test_hit_at_k(binary_relevance, score, k): + assert abs(hit_at_k(binary_relevance, k=k) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score", + [ + ([0, 1, 0, 0, 1, 1, 1], (1.0 / 2 + 2.0 / 5 + 3.0 / 6 + 4.0 / 7) / 4), + ([], 0), + ([1, 1, 1], 1), + ([0, 0], 0), + ], +) +def test_average_precision(binary_relevance, score): + assert 
abs(average_precision(binary_relevance) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score", + [ + ([0, 1, 0, 0, 1, 1, 1], 0.5), + ([], 0), + ([1, 1, 1], 1.0), + ([0, 0], 0), + ], +) +def test_reciprocal_rank(binary_relevance, score): + assert abs(reciprocal_rank(binary_relevance) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, max_rel, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 4.0 / 7, 7, None), + ([0, 1, 0, 0, 1, 1, 1], 1, 4, None), + ([0, 1, 0, 0, 1, 1, 1], 0.25, 4, 2), + ([], 0, 4, None), + ([1, 1, 1], 0.75, 4, None), + ([0, 0], 0, 4, None), + ], +) +def test_recall_at_k(binary_relevance, score, max_rel, k): + calculated_score = recall_at_k(binary_relevance, max_rel=max_rel, k=k) + assert abs(calculated_score - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, max_rel, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 4.0 / 7, 7, None), + ([0, 1, 0, 0, 1, 1, 1], 2 / (1 / (4 / 7) + 1), 4, None), + ([0, 1, 0, 0, 1, 1, 1], 2 / (1 / 0.5 + 1 / 0.25), 4, 2), + ([], 0, 4, None), + ([1, 1, 1], 2 / (1 / 0.75 + 1), 4, None), + ([0, 0], 0, 4, None), + ], +) +def test_f1_score_at_k(binary_relevance, score, max_rel, k): + calculated_score = f1_score_at_k(binary_relevance, max_rel=max_rel, k=k) + assert abs(calculated_score - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, method, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 2.1737, 0, None), + ([0, 1, 0, 0, 1, 1, 1], 1.7073, 1, None), + ([0, 1, 0, 0, 1, 1, 1], 1, 0, 4), + ([], 0, 0, None), + ([1, 1, 1], 2.6309, 0, None), + ([0, 0], 0, 0, None), + ], +) +def test_dcg_at_k(binary_relevance, score, method, k): + assert abs(dcg_at_k(binary_relevance, method=method, k=k) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, method, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 0.6942, 0, None), + ([0, 1, 0, 0, 1, 1, 1], 0.6665, 1, None), + ([0, 1, 0, 0, 1, 1, 1], 0.3194, 0, 4), + ([], 0, 0, None), + ([1, 1, 1], 1, 0, None), + ([0, 0], 0, 0, None), + ], 
+) +def test_ndcg_at_k(binary_relevance, score, method, k): + assert abs(ndcg_at_k(binary_relevance, method=method, k=k) - score) < 0.001