From 0a52258475d207da9c3566123e00a789db72fa6d Mon Sep 17 00:00:00 2001 From: guenthermi Date: Fri, 21 Oct 2022 14:40:47 +0200 Subject: [PATCH] docs: complement and rewrite evaluation docs --- docarray/array/mixins/evaluation.py | 7 +- docs/fundamentals/documentarray/evaluation.md | 224 +++++++++++------- 2 files changed, 144 insertions(+), 87 deletions(-) diff --git a/docarray/array/mixins/evaluation.py b/docarray/array/mixins/evaluation.py index 893913a8d5a..d9098900bba 100644 --- a/docarray/array/mixins/evaluation.py +++ b/docarray/array/mixins/evaluation.py @@ -81,7 +81,7 @@ def evaluate( ) -> Dict[str, float]: """ Compute ranking evaluation metrics for a given `DocumentArray` when compared - with a groundtruth. + with a ground truth. If one provides a `ground_truth` DocumentArray that is structurally identical to `self`, this function compares the `matches` of `documents` inside the @@ -97,8 +97,9 @@ def evaluate( :param metrics: list of metric names or metric functions to be computed :param ground_truth: The ground_truth `DocumentArray` that the `DocumentArray` compares to. - :param hash_fn: The function used for identifying the uniqueness of Documents. - If not given, then ``Document.id`` is used. + :param hash_fn: For the evaluation against a `ground_truth` DocumentArray, + this function is used for generating hashes which are used to compare the + documents. If not given, ``Document.id`` is used. :param metric_names: If provided, the results of the metrics computation will be stored in the `evaluations` field of each Document with this names. If not provided, the names will be derived from the metric function names. 
diff --git a/docs/fundamentals/documentarray/evaluation.md b/docs/fundamentals/documentarray/evaluation.md index 33454c09c19..ba5d0702faf 100644 --- a/docs/fundamentals/documentarray/evaluation.md +++ b/docs/fundamentals/documentarray/evaluation.md @@ -1,63 +1,22 @@ # Evaluate Matches -After you get `.matches`, you can evaluate matches against the groundtruth via {meth}`~docarray.array.mixins.evaluation.EvaluationMixin.evaluate`. +After the execution of {meth}`~docarray.array.mixins.match.MatchMixin.match`, your `DocumentArray` receives a `.matches` attribute. +You can evaluate those matches against the ground truth via {meth}`~docarray.array.mixins.evaluation.EvaluationMixin.evaluate`. +The ground truth describes which matches are relevant and non-relevant and can be provided in two formats: (1) a ground truth array or (2) in the form of labels. -```python -da_predict.evaluate(ground_truth=da_groundtruth, metrics=['...'], **kwargs) -``` - -Alternatively, you can add labels to your documents to evaluate them. -In this case, a match is considered as relevant to its root document, if it has the same label. - -```python -import numpy as np -from docarray import Document, DocumentArray - -example_da = DocumentArray([Document(tags={'label': (i % 2)}) for i in range(10)]) -example_da.embeddings = np.random.random([10, 3]) - -example_da.match(example_da) - -example_da.evaluate(metrics=['precision_at_k']) -``` - -The results are stored in `.evaluations` field of each Document. - -DocArray provides some common metrics used in the information retrieval community that allows one to evaluate the nearest-neighbour matches. 
Different metric accepts different arguments as `kwargs`: - -| Metric | Accept `kwargs` | -|-----------------------------------------------------|------------------| -| {meth}`~docarray.math.evaluation.r_precision` | None | -| {meth}`~docarray.math.evaluation.average_precision` | None | -| {meth}`~docarray.math.evaluation.reciprocal_rank` | None | -| {meth}`~docarray.math.evaluation.precision_at_k` | `k` | -| {meth}`~docarray.math.evaluation.hit_at_k` | `k` | -| {meth}`~docarray.math.evaluation.recall_at_k` | `max_rel`, `k` | -| {meth}`~docarray.math.evaluation.f1_score_at_k` | `max_rel`, `k` | -| {meth}`~docarray.math.evaluation.dcg_at_k` | `method`, `k` | -| {meth}`~docarray.math.evaluation.ndcg_at_k` | `method`, `k` | - -```{danger} -This metric scores might change if the `limit` attribute of the match function is set differently. - -**Note:** Not all of these metrics can be applied to a Top-K result, i.e., `ndcg_at_k` and `r_precision` are calculated correctly only if the limit is set equal or higher than the number of documents in the `DocumentArray` provided to the match function. -``` - - -For example, let's create a DocumentArray with random embeddings and matching it to itself: +To demonstrate this, let's create a DocumentArray with random embeddings and match it to itself: ```python import numpy as np from docarray import DocumentArray -da = DocumentArray.empty(10) -da.embeddings = np.random.random([10, 3]) -da.match(da, exclude_self=True) +da_original = DocumentArray.empty(10) +da_original.embeddings = np.random.random([10, 3]) +da_original.match(da_original, exclude_self=True) -da.summary() +da_original.summary() ``` - ```text Documents Summary @@ -74,19 +33,17 @@ da.summary() id ('str',) 10 False matches ('MatchArray',) 10 False ``` - -Now `da.matches` contains the nearest neighbours. Let's use it as the groundtruth. - -Let's create imperfect matches by mixing in ten "noise Documents" to every `d.matches`. 
+Now `da_original.matches` contains the nearest neighbours. +To make our scenario more interesting, we mix in ten "noise Documents" to every `d.matches`: ```python -da2 = DocumentArray(da, copy=True) +da_prediction = DocumentArray(da_original, copy=True) -for d in da2: +for d in da_prediction: d.matches.extend(DocumentArray.empty(10)) d.matches = d.matches.shuffle() -da2['@m'].summary() +da_prediction['@m'].summary() ``` ```text @@ -107,53 +64,52 @@ da2['@m'].summary() scores ('defaultdict',) 190 False ``` +## Evaluation against a ground truth array +To evaluate the matches against a ground truth array, you simply provide a DocumentArray to the evaluate function like `da_groundtruth` in the call below: -Now `da2` is our prediction, and `da` is our groundtruth. If we evaluate the average Precision@10, we should get something close to 0.47 (we have 9 real matches, we mixed in 10 fake matches and shuffle it, so top-10 would have approximate 9/19 real matches): +```python +da_predict.evaluate(ground_truth=da_groundtruth, metrics=['...'], **kwargs) +``` + +Here, `da_groundtruth` should contain the same documents as in `da_prediction` where each `matches` attribute contains exactly those documents which are relevant to the respective root document. +The `metrics` argument determines the metric you want to use for your evaluation, e.g., `precision_at_k`. + +In the code cell below, we evaluate the array `da_prediction` with the noisy matches against the original one `da_original`: ```python -da2.evaluate(ground_truth=da, metrics=['precision_at_k'], k=10) +da_prediction.evaluate(ground_truth=da_original, metrics=['precision_at_k'], k=10) ``` ```text -{'precision_at_k': 0.48} +{'precision_at_k': 0.45} ``` - -Note that this value is an average number over all Documents of `da2`. If you want to look at the individual evaluation, you can check {attr}`~docarray.Document.evaluations` attribute, e.g. +It returns the average value for the `precision_at_k` metric. 
+The average is calculated over all Documents of `da_prediction`. +If you want to look at the individual evaluation values, you can check the {attr}`~docarray.Document.evaluations` attribute, e.g.: ```python -for d in da2: +for d in da_prediction: print(d.evaluations['precision_at_k'].value) ``` ```text 0.5 -0.4 -0.3 -0.6 0.5 +0.5 +0.6 0.3 0.4 -0.6 0.5 -0.7 -``` - -If you want to evaluate your data with multiple metric functions, you can pass a list of metrics: - -```python -da2.evaluate(ground_truth=da, metrics=['precision_at_k', 'reciprocal_rank'], k=10) -``` - -```text -{'precision_at_k': 0.48, 'reciprocal_rank': 0.6333333333333333} +0.4 +0.5 +0.3 ``` -In this case, the keyword attribute `k` is passed to all metric functions, even though it does not fulfill any specific function for the calculation of the reciprocal rank. +### Document identifier -## Document identifier - -Note that `.evaluate()` works only when two DocumentArray have the same length and their nested structure are same. It makes no sense to evaluate on two completely irrelevant DocumentArrays. +Note that the evaluation against a ground truth DocumentArray only works if both DocumentArrays have the same length and their nested structure is the same. +It makes no sense to evaluate with a completely different DocumentArray. While evaluating, Document pairs are recognized as correct if they share the same identifier. By default, it simply uses {attr}`~docarray.Document.id`. One can customize this behavior by specifying `hash_fn`. @@ -169,7 +125,7 @@ for d in p_da: g_da = DocumentArray.empty(3) for d in g_da: - d.matches.append(Document(text='my groundtruth')) + d.matches.append(Document(text='my ground truth')) ``` Now when you do evaluate, you will receive an error: @@ -182,9 +138,10 @@ p_da.evaluate('average_precision', ground_truth=g_da) ValueError: Document from the left-hand side and from the right-hand are not hashed to the same value. 
This means your left and right DocumentArray may not be aligned; or it means your `hash_fn` is badly designed. ``` -This basically saying that based on `.id` (default identifier), the given two DocumentArrays are so different that they can't be evaluated. It is a valid point because our two DocumentArrays have completely random `.id`. +This says that based on `.id` (default identifier), the given two DocumentArrays are so different that they can't be evaluated. +It is a valid point because our two DocumentArrays have completely random `.id`. -If we override the hash function as following the evaluation can be conducted: +If we override the hash function as follows, the evaluation can be conducted: ```python p_da.evaluate('average_precision', ground_truth=g_da, hash_fn=lambda d: d.text[:2]) @@ -196,3 +153,102 @@ p_da.evaluate('average_precision', ground_truth=g_da, hash_fn=lambda d: d.text[: It is correct as we define the evaluation as checking if the first two characters in `.text` are the same. + + +## Evaluation via labels + +Alternatively, you can add labels to your documents to evaluate them. +In this case, a match is considered relevant to its root document if it has the same label: + +```python +import numpy as np +from docarray import Document, DocumentArray + +example_da = DocumentArray([Document(tags={'label': (i % 2)}) for i in range(10)]) +example_da.embeddings = np.random.random([10, 3]) + +example_da.match(example_da) + +example_da.evaluate(metrics=['precision_at_k']) +``` + +```text +{'precision_at_k': 0.5} +``` + +Also here, the results are stored in the `.evaluations` field of each Document. + +## Metric functions + +DocArray provides common metrics used in the information retrieval community for evaluating the nearest-neighbour matches. 
+Some of those metrics accept additional arguments as `kwargs` which you can simply add to the call of the evaluate function: + +| Metric | Accept `kwargs` | +|-----------------------------------------------------|------------------| +| {meth}`~docarray.math.evaluation.r_precision` | None | +| {meth}`~docarray.math.evaluation.average_precision` | None | +| {meth}`~docarray.math.evaluation.reciprocal_rank` | None | +| {meth}`~docarray.math.evaluation.precision_at_k` | `k` | +| {meth}`~docarray.math.evaluation.hit_at_k` | `k` | +| {meth}`~docarray.math.evaluation.recall_at_k` | `max_rel`, `k` | +| {meth}`~docarray.math.evaluation.f1_score_at_k` | `max_rel`, `k` | +| {meth}`~docarray.math.evaluation.dcg_at_k` | `method`, `k` | +| {meth}`~docarray.math.evaluation.ndcg_at_k` | `method`, `k` | + +```{danger} +These metric scores might change if the `limit` argument of the match function is set differently. + +**Note:** Not all of these metrics can be applied to a Top-K result, i.e., `ndcg_at_k` and `r_precision` are calculated correctly only if the limit is set equal or higher than the number of documents in the `DocumentArray` provided to the match function. +``` + +You can evaluate multiple metric functions at once, as you can see below: + +```python +da_prediction.evaluate( + ground_truth=da_original, metrics=['precision_at_k', 'reciprocal_rank'], k=10 +) +``` + +```text +{'precision_at_k': 0.45, 'reciprocal_rank': 0.8166666666666667} +``` + +In this case, the keyword argument `k` is passed to all metric functions, even though it does not fulfill any specific function for the calculation of the reciprocal rank. + +### Custom metrics + +If the pre-defined metrics do not fit your use-case, you can define a custom metric function. +It should take as input a list of binary relevance judgements of a query (`1` and `0` values). +The evaluate function already calculates this binary list from the `matches` attribute so that each number represents the relevancy of a match. 
+ +Let's write a custom metric function, which counts the number of relevant documents per query: + +```python +def count_relevant(binary_relevance): + return sum(binary_relevance) + +da_prediction.evaluate(ground_truth=da_original, metrics=[count_relevant]) +``` + +```text +{'count_relevant': 9.0} +``` + +For inspiration when writing your own metric function, you can take a look at DocArray's {mod}`~docarray.math.evaluation` module, which contains the implementations of the pre-defined metric functions. + +### Custom names + +By default, the metrics are stored with the name of the metric function. +Alternatively, you can customize those names via the `metric_names` argument of the `evaluate` function: + +```python +da_prediction.evaluate( + ground_truth=da_original, + metrics=[count_relevant, 'precision_at_k'], + metric_names=['#Relevant', 'Precision@K'], +) +``` + +```text +{'#Relevant': 9.0, 'Precision@K': 0.47368421052631576} +```