diff --git a/docarray/math/evaluation.py b/docarray/math/evaluation.py index 7b349db83bc..d05a0642a43 100644 --- a/docarray/math/evaluation.py +++ b/docarray/math/evaluation.py @@ -12,32 +12,40 @@ def _check_k(k): def r_precision(binary_relevance: List[int], **kwargs) -> float: - """R Precision after all relevant documents have been retrieved - Relevance is binary (nonzero is relevant). + """R-Precision determines the precision in the first R documents, where R is the + number of documents relevant to the query. + + Relevance is considered binary by this function (nonzero is relevant). + + Please note that it is necessary to provide relevance scores for all documents, + i.e., the calculated metric is wrong if you apply it on the Top-K scores only. .. seealso:: https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#R-precision :param binary_relevance: binary relevancy in rank order - :return: precision + :return: R-Precision """ binary_relevance = np.array(binary_relevance) != 0 z = binary_relevance.nonzero()[0] if not z.size: return 0.0 - return float(np.mean(binary_relevance[: z[-1] + 1])) + return float(np.mean(binary_relevance[: z.size])) def precision_at_k( binary_relevance: List[int], k: Optional[int] = None, **kwargs ) -> float: """Precision @K. + If `binary_relevance` is empty, 0.0 is returned. :param binary_relevance: binary relevancy in rank order :param k: measured on top-k :return: precision @k """ _check_k(k) + if len(binary_relevance) == 0: + return 0.0 binary_relevance = np.array(binary_relevance)[:k] != 0 return float(np.mean(binary_relevance)) @@ -147,9 +155,12 @@ def dcg_at_k( def ndcg_at_k( relevance: List[float], method: int = 0, k: Optional[int] = None, **kwargs ): - """Score is normalized discounted cumulative gain (ndcg) - Relevance is positive real values. Can use binary - as the previous methods. + """Calculates a normalized discounted cumulative gain (ndcg). + Relevance values can be positive real values. 
However, one can also use binary + scores as in other evaluation methods. + + Please note that it is necessary to provide relevance scores for all documents, + i.e., the calculated metric is wrong if you apply it on the Top-K scores only. Example from http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf diff --git a/docs/fundamentals/documentarray/evaluation.md b/docs/fundamentals/documentarray/evaluation.md index 65a9be846fa..9c69548abb7 100644 --- a/docs/fundamentals/documentarray/evaluation.md +++ b/docs/fundamentals/documentarray/evaluation.md @@ -10,17 +10,23 @@ The results are stored in `.evaluations` field of each Document. DocArray provides some common metrics used in the information retrieval community that allows one to evaluate the nearest-neighbour matches. Different metric accepts different arguments as `kwargs`: -| Metric | Accept `kwargs` | -|---------------------|------------------| -| `r_precision` | None | -| `average_precision` | None | -| `reciprocal_rank` | None | -| `precision_at_k` | `k` | -| `hit_at_k` | `k` | -| `recall_at_k` | `max_rel`, `k` | -| `f1_score_at_k` | `max_rel`, `k` | -| `dcg_at_k` | `method`, `k` | -| `ndcg_at_k` | `method`, `k` | +| Metric | Accept `kwargs` | +|-----------------------------------------------------|------------------| +| {meth}`~docarray.math.evaluation.r_precision` | None | +| {meth}`~docarray.math.evaluation.average_precision` | None | +| {meth}`~docarray.math.evaluation.reciprocal_rank` | None | +| {meth}`~docarray.math.evaluation.precision_at_k` | `k` | +| {meth}`~docarray.math.evaluation.hit_at_k` | `k` | +| {meth}`~docarray.math.evaluation.recall_at_k` | `max_rel`, `k` | +| {meth}`~docarray.math.evaluation.f1_score_at_k` | `max_rel`, `k` | +| {meth}`~docarray.math.evaluation.dcg_at_k` | `method`, `k` | +| {meth}`~docarray.math.evaluation.ndcg_at_k` | `method`, `k` | + +```{danger} These metric scores might change if the `limit` attribute of the match function is set 
differently. + + **Note:** Not all of these metrics can be applied to a Top-K result, i.e., `ndcg_at_k` and `r_precision` are calculated correctly only if the limit is set equal to or higher than the number of documents in the `DocumentArray` provided to the match function. + ``` For example, let's create a DocumentArray with random embeddings and matching it to itself: diff --git a/tests/unit/math/test_evaluation_metrics.py b/tests/unit/math/test_evaluation_metrics.py new file mode 100644 index 00000000000..0276080bb95 --- /dev/null +++ b/tests/unit/math/test_evaluation_metrics.py @@ -0,0 +1,142 @@ +import pytest + +from docarray.math.evaluation import ( + average_precision, + dcg_at_k, + f1_score_at_k, + hit_at_k, + ndcg_at_k, + precision_at_k, + r_precision, + recall_at_k, + reciprocal_rank, +) + + +@pytest.mark.parametrize( + "binary_relevance, score", + [ + ([0, 1, 0, 0, 1, 1, 1], 0.25), + ([], 0), + ([1, 1, 1], 1), + ([0, 0], 0), + ], +) +def test_r_precision(binary_relevance, score): + assert abs(r_precision(binary_relevance) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 4.0 / 7, None), + ([0, 1, 0, 0, 1, 1, 1], 0.5, 2), + ([], 0, None), + ([1, 1, 1], 1, None), + ([0, 0], 0, None), + ], +) +def test_precision_at_k(binary_relevance, score, k): + assert abs(precision_at_k(binary_relevance, k=k) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 1, None), + ([0, 1, 0, 0, 1, 1, 1], 0, 1), + ([], 0, None), + ([1, 1, 1], 1, None), + ([0, 0], 0, None), + ], +) +def test_hit_at_k(binary_relevance, score, k): + assert abs(hit_at_k(binary_relevance, k=k) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score", + [ + ([0, 1, 0, 0, 1, 1, 1], (1.0 / 2 + 2.0 / 5 + 3.0 / 6 + 4.0 / 7) / 4), + ([], 0), + ([1, 1, 1], 1), + ([0, 0], 0), + ], +) +def test_average_precision(binary_relevance, score): + assert 
abs(average_precision(binary_relevance) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score", + [ + ([0, 1, 0, 0, 1, 1, 1], 0.5), + ([], 0), + ([1, 1, 1], 1.0), + ([0, 0], 0), + ], +) +def test_reciprocal_rank(binary_relevance, score): + assert abs(reciprocal_rank(binary_relevance) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, max_rel, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 4.0 / 7, 7, None), + ([0, 1, 0, 0, 1, 1, 1], 1, 4, None), + ([0, 1, 0, 0, 1, 1, 1], 0.25, 4, 2), + ([], 0, 4, None), + ([1, 1, 1], 0.75, 4, None), + ([0, 0], 0, 4, None), + ], +) +def test_recall_at_k(binary_relevance, score, max_rel, k): + calculated_score = recall_at_k(binary_relevance, max_rel=max_rel, k=k) + assert abs(calculated_score - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, max_rel, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 4.0 / 7, 7, None), + ([0, 1, 0, 0, 1, 1, 1], 2 / (1 / (4 / 7) + 1), 4, None), + ([0, 1, 0, 0, 1, 1, 1], 2 / (1 / 0.5 + 1 / 0.25), 4, 2), + ([], 0, 4, None), + ([1, 1, 1], 2 / (1 / 0.75 + 1), 4, None), + ([0, 0], 0, 4, None), + ], +) +def test_f1_score_at_k(binary_relevance, score, max_rel, k): + calculated_score = f1_score_at_k(binary_relevance, max_rel=max_rel, k=k) + assert abs(calculated_score - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, method, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 2.1737, 0, None), + ([0, 1, 0, 0, 1, 1, 1], 1.7073, 1, None), + ([0, 1, 0, 0, 1, 1, 1], 1, 0, 4), + ([], 0, 0, None), + ([1, 1, 1], 2.6309, 0, None), + ([0, 0], 0, 0, None), + ], +) +def test_dcg_at_k(binary_relevance, score, method, k): + assert abs(dcg_at_k(binary_relevance, method=method, k=k) - score) < 0.001 + + +@pytest.mark.parametrize( + "binary_relevance, score, method, k", + [ + ([0, 1, 0, 0, 1, 1, 1], 0.6942, 0, None), + ([0, 1, 0, 0, 1, 1, 1], 0.6665, 1, None), + ([0, 1, 0, 0, 1, 1, 1], 0.3194, 0, 4), + ([], 0, 0, None), + ([1, 1, 1], 1, 0, None), + ([0, 0], 0, 0, None), + ], 
+) +def test_ndcg_at_k(binary_relevance, score, method, k): + assert abs(ndcg_at_k(binary_relevance, method=method, k=k) - score) < 0.001