From d2bfe2401839bce08aac94a2332a6edf134a6391 Mon Sep 17 00:00:00 2001
From: Johannes Messner <messnerjo@gmail.com>
Date: Fri, 2 Dec 2022 11:12:02 +0100
Subject: [PATCH 1/8] feat: native len for milvus

Signed-off-by: Johannes Messner <messnerjo@gmail.com>
---
 docarray/array/storage/milvus/seqlike.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/docarray/array/storage/milvus/seqlike.py b/docarray/array/storage/milvus/seqlike.py
index 1711c5b8080..d1ce651c0c6 100644
--- a/docarray/array/storage/milvus/seqlike.py
+++ b/docarray/array/storage/milvus/seqlike.py
@@ -1,6 +1,6 @@
 from typing import Iterable, Iterator, Union, TYPE_CHECKING
 from docarray.array.storage.base.seqlike import BaseSequenceLikeMixin
-from docarray.array.storage.milvus.backend import _batch_list
+from docarray.array.storage.milvus.backend import _batch_list, _always_true_expr
 from docarray import Document
 
 
@@ -56,3 +56,11 @@ def _extend(self, values: Iterable['Document'], **kwargs):
             payload = self._docs_to_milvus_payload(docs_batch)
             self._collection.insert(payload, **kwargs)
             self._offset2ids.extend([doc.id for doc in docs_batch])
+
+    def __len__(self):  # length comes from a Milvus query, not from offset2ids
+        with self.loaded_collection():
+            res = self._collection.query(
+                expr=_always_true_expr('document_id'),  # presumably matches every row
+                output_fields=['document_id'],  # fetch only the id field
+            )
+            return len(res)  # number of entries the query returned

From e8422c8a2f8990fcf2689d6506bcb220a457e3a6 Mon Sep 17 00:00:00 2001
From: Johannes Messner <messnerjo@gmail.com>
Date: Fri, 2 Dec 2022 11:15:32 +0100
Subject: [PATCH 2/8] fix: make implementing len non-optional

Signed-off-by: Johannes Messner <messnerjo@gmail.com>
---
 docarray/array/storage/base/seqlike.py | 2 +-
 docs/advanced/document-store/extend.md | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/docarray/array/storage/base/seqlike.py b/docarray/array/storage/base/seqlike.py
index 5e46cafe607..ce89b82a3bf 100644
--- a/docarray/array/storage/base/seqlike.py
+++ b/docarray/array/storage/base/seqlike.py
@@ -50,7 +50,7 @@ def __eq__(self, other):
         ...
 
     def __len__(self):
-        return len(self._offset2ids)
+        raise NotImplementedError  # each storage backend must supply its own length
 
     def __iter__(self) -> Iterator['Document']:
         for _id in self._offset2ids:
diff --git a/docs/advanced/document-store/extend.md b/docs/advanced/document-store/extend.md
index a65d5ac32bb..591d2ce8832 100644
--- a/docs/advanced/document-store/extend.md
+++ b/docs/advanced/document-store/extend.md
@@ -145,6 +145,9 @@ class SequenceLikeMixin(BaseSequenceLikeMixin):
     def __add__(self, other: Union['Document', Iterable['Document']]):
         ...
 
+    def __len__(self):
+        ...
+
     def insert(self, index: int, value: 'Document'):
         # Optional. By default, this will add a new item and update offset2id
         # if you want to customize this, make sure to handle offset2id
@@ -158,10 +161,6 @@ class SequenceLikeMixin(BaseSequenceLikeMixin):
         # Optional. Override this if you have better implementation than appending one by one
         ...
 
-    def __len__(self):
-        # Optional. By default, this will rely on offset2id to get the length
-        ...
-
     def __iter__(self) -> Iterator['Document']:
         # Optional. By default, this will rely on offset2id to iterate
         ...

From cbfc6e6725eb4ddbc81e3a682ba4d5ab7bdf7a4e Mon Sep 17 00:00:00 2001
From: Johannes Messner <messnerjo@gmail.com>
Date: Tue, 10 Jan 2023 13:13:28 +0100
Subject: [PATCH 3/8] ci: trigger the ci for debugging purposes

Signed-off-by: Johannes Messner <messnerjo@gmail.com>
---
 docarray/array/mixins/getattr.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docarray/array/mixins/getattr.py b/docarray/array/mixins/getattr.py
index 588b03e12aa..120ff951e11 100644
--- a/docarray/array/mixins/getattr.py
+++ b/docarray/array/mixins/getattr.py
@@ -11,8 +11,9 @@ def _get_attributes(self, *fields: str) -> List:
         :return: Returns a list of the values for these fields.
             When `fields` has multiple values, then it returns a list of list.
         """
-        e_index, b_index = None, None
+        # small change just to trigger CI tests
         fields = list(fields)
+        e_index, b_index = None, None
         if 'embedding' in fields:
             e_index = fields.index('embedding')
         if 'tensor' in fields:

From 1e632cbcd6fa7997644ce887859eb1e0d39f4ccd Mon Sep 17 00:00:00 2001
From: Johannes Messner <messnerjo@gmail.com>
Date: Tue, 10 Jan 2023 15:07:07 +0100
Subject: [PATCH 4/8] ci: only run oldproto tests

Signed-off-by: Johannes Messner <messnerjo@gmail.com>
---
 .github/workflows/ci.yml | 100 +++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index eda1318e7b1..5f72b5ced0a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -157,55 +157,55 @@ jobs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
       matrix-oldproto: ${{ steps.set-matrix.outputs.matrix-oldproto }}
 
-  docarray-test:
-    needs: prep-testbed
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.8]
-        test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}}
-    steps:
-      - uses: actions/checkout@v2.5.0
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Prepare environment
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install wheel
-          # pip does not properly resolve dependency versions with syntax pip install --no-cache-dir ".[test,full]"
-          pip install --no-cache-dir ".[test]"
-          pip install --no-cache-dir ".[qdrant]"
-          pip install --no-cache-dir ".[annlite]"
-          pip install --no-cache-dir ".[weaviate]"
-          pip install --no-cache-dir ".[elasticsearch]"
-          pip install --no-cache-dir ".[redis]"
-          pip install --no-cache-dir ".[full]"
-          sudo apt-get install libsndfile1
-      - name: Test
-        id: test
-        run: |
-          pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \
-            -v -s -m "not gpu" ${{ matrix.test-path }}
-          echo "codecov_flag=docarray" >> $GITHUB_OUTPUT
-        timeout-minutes: 60
-        env:
-          JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
-      - name: Check codecov file
-        id: check_files
-        uses: andstor/file-existence-action@v1
-        with:
-          files: "coverage.xml"
-      - name: Upload coverage from test to Codecov
-        uses: codecov/codecov-action@v3.1.1
-        if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version }} == '3.8'
-        with:
-          file: coverage.xml
-          flags: ${{ steps.test.outputs.codecov_flag }}
-          fail_ci_if_error: false
-          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
+#  docarray-test:
+#    needs: prep-testbed
+#    runs-on: ubuntu-latest
+#    strategy:
+#      fail-fast: false
+#      matrix:
+#        python-version: [3.8]
+#        test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}}
+#    steps:
+#      - uses: actions/checkout@v2.5.0
+#      - name: Set up Python ${{ matrix.python-version }}
+#        uses: actions/setup-python@v4
+#        with:
+#          python-version: ${{ matrix.python-version }}
+#      - name: Prepare environment
+#        run: |
+#          python -m pip install --upgrade pip
+#          python -m pip install wheel
+#          # pip does not properly resolve dependency versions with syntax pip install --no-cache-dir ".[test,full]"
+#          pip install --no-cache-dir ".[test]"
+#          pip install --no-cache-dir ".[qdrant]"
+#          pip install --no-cache-dir ".[annlite]"
+#          pip install --no-cache-dir ".[weaviate]"
+#          pip install --no-cache-dir ".[elasticsearch]"
+#          pip install --no-cache-dir ".[redis]"
+#          pip install --no-cache-dir ".[full]"
+#          sudo apt-get install libsndfile1
+#      - name: Test
+#        id: test
+#        run: |
+#          pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \
+#            -v -s -m "not gpu" ${{ matrix.test-path }}
+#          echo "codecov_flag=docarray" >> $GITHUB_OUTPUT
+#        timeout-minutes: 60
+#        env:
+#          JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
+#      - name: Check codecov file
+#        id: check_files
+#        uses: andstor/file-existence-action@v1
+#        with:
+#          files: "coverage.xml"
+#      - name: Upload coverage from test to Codecov
+#        uses: codecov/codecov-action@v3.1.1
+#        if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version }} == '3.8'
+#        with:
+#          file: coverage.xml
+#          flags: ${{ steps.test.outputs.codecov_flag }}
+#          fail_ci_if_error: false
+#          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
 
   docarray-oldproto-test:
     needs: prep-testbed
@@ -260,7 +260,7 @@ jobs:
 
   # just for blocking the merge until all parallel core-test are successful
   success-all-test:
-    needs: [commit-lint, docarray-test, docarray-oldproto-test]
+    needs: [commit-lint, docarray-oldproto-test]
     if: always()
     runs-on: ubuntu-latest
     steps:

From 0654713701f15a19ada78a8e767e6d731c75b847 Mon Sep 17 00:00:00 2001
From: Johannes Messner <messnerjo@gmail.com>
Date: Tue, 10 Jan 2023 15:59:48 +0100
Subject: [PATCH 5/8] test: add some context managers

Signed-off-by: Johannes Messner <messnerjo@gmail.com>
---
 .../array/mixins/oldproto/test_eval_class.py  | 144 ++++++++++--------
 1 file changed, 80 insertions(+), 64 deletions(-)

diff --git a/tests/unit/array/mixins/oldproto/test_eval_class.py b/tests/unit/array/mixins/oldproto/test_eval_class.py
index 2645a4cab07..8d0278a0f8c 100644
--- a/tests/unit/array/mixins/oldproto/test_eval_class.py
+++ b/tests/unit/array/mixins/oldproto/test_eval_class.py
@@ -42,7 +42,8 @@ def test_eval_mixin_perfect_match(metric_fn, kwargs, storage, config, start_stor
     da1 = DocumentArray.empty(10)
     da1.embeddings = np.random.random([10, 256])
     da1_index = DocumentArray(da1, storage=storage, config=config)
-    da1.match(da1_index, exclude_self=True)
+    with da1_index:
+        da1.match(da1_index, exclude_self=True)
     r = da1.evaluate(ground_truth=da1, metrics=[metric_fn], strict=False, **kwargs)[
         metric_fn
     ]
@@ -80,7 +81,8 @@ def test_eval_mixin_perfect_match_multiple_metrics(storage, config, start_storag
     da1 = DocumentArray.empty(10)
     da1.embeddings = np.random.random([10, 256])
     da1_index = DocumentArray(da1, storage=storage, config=config)
-    da1.match(da1_index, exclude_self=True)
+    with da1_index:
+        da1.match(da1_index, exclude_self=True)
     r = da1.evaluate(ground_truth=da1, metrics=metric_fns, strict=False, **kwargs)
     for metric_fn in metric_fns:
         assert metric_fn in r
@@ -123,7 +125,8 @@ def test_eval_mixin_perfect_match_labeled(
         d.tags = {'label': 'A'}
     da1.embeddings = np.random.random([10, 256])
     da1_index = DocumentArray(da1, storage=storage, config=config)
-    da1.match(da1_index, exclude_self=True)
+    with da1_index:
+        da1.match(da1_index, exclude_self=True)
     r = da1.evaluate(metrics=[metric_fn], **kwargs)[metric_fn]
     assert isinstance(r, float)
     assert r == 1.0
@@ -166,7 +169,8 @@ def test_eval_mixin_zero_labeled(storage, config, metric_fn, start_storage, kwar
     for d in da2:
         d.tags = {'label': 'B'}
     da1_index = DocumentArray(da2, storage=storage, config=config)
-    da1.match(da1_index, exclude_self=True)
+    with da1_index:
+        da1.match(da1_index, exclude_self=True)
     r = da1.evaluate([metric_fn], **kwargs)[metric_fn]
     assert isinstance(r, float)
     assert r == 0.0
@@ -264,9 +268,10 @@ def test_eval_mixin_zero_match(storage, config, metric_fn, start_storage, kwargs
     da2 = copy.deepcopy(da1)
     da2.embeddings = np.random.random([10, 256])
     da2_index = DocumentArray(da2, storage=storage, config=config)
-    da2.match(da2_index, exclude_self=True)
+    with da2_index:
+        da2.match(da2_index, exclude_self=True)
 
-    r = da1.evaluate(ground_truth=da2, metrics=[metric_fn], **kwargs)[metric_fn]
+        r = da1.evaluate(ground_truth=da2, metrics=[metric_fn], **kwargs)[metric_fn]
     assert isinstance(r, float)
     assert r == 1.0
     for d in da1:
@@ -337,17 +342,20 @@ def test_same_hash_same_len_fun_should_work(storage, config, start_storage):
     da1 = DocumentArray.empty(10)
     da1.embeddings = np.random.random([10, 3])
     da1_index = DocumentArray(da1, storage=storage, config=config)
-    da1.match(da1_index)
+    with da1_index:
+        da1.match(da1_index)
     da2 = DocumentArray.empty(10)
     da2.embeddings = np.random.random([10, 3])
     da2_index = DocumentArray(da1, storage=storage, config=config)
-    da2.match(da2_index)
-    with pytest.raises(ValueError):
-        da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
-    for d1, d2 in zip(da1, da2):
-        d1.id = d2.id
+    with da2_index:
+        da2.match(da2_index)
+    with da1_index, da2_index:
+        with pytest.raises(ValueError):
+            da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
+        for d1, d2 in zip(da1, da2):
+            d1.id = d2.id
 
-    da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
+        da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
 
 
 @pytest.mark.parametrize(
@@ -368,7 +376,8 @@ def test_adding_noise(storage, config, start_storage):
 
     da.embeddings = np.random.random([10, 3])
     da_index = DocumentArray(da, storage=storage, config=config)
-    da.match(da_index, exclude_self=True)
+    with da_index:
+        da.match(da_index, exclude_self=True)
 
     da2 = copy.deepcopy(da)
 
@@ -410,17 +419,18 @@ def test_adding_noise(storage, config, start_storage):
 def test_diff_match_len_in_gd(storage, config, metric_fn, start_storage, kwargs):
     da1 = DocumentArray.empty(10)
     da1.embeddings = np.random.random([10, 128])
-    da1_index = DocumentArray(da1, storage=storage, config=config)
+    # da1_index = DocumentArray(da1, storage=storage, config=config)
     da1.match(da1, exclude_self=True)
 
     da2 = copy.deepcopy(da1)
     da2.embeddings = np.random.random([10, 128])
     da2_index = DocumentArray(da2, storage=storage, config=config)
-    da2.match(da2_index, exclude_self=True)
-    # pop some matches from first document
-    da2[0].matches.pop(8)
+    with da2_index:
+        da2.match(da2_index, exclude_self=True)
+        # pop some matches from first document
+        da2[0].matches.pop(8)
 
-    r = da1.evaluate(ground_truth=da2, metrics=[metric_fn], **kwargs)[metric_fn]
+        r = da1.evaluate(ground_truth=da2, metrics=[metric_fn], **kwargs)[metric_fn]
     assert isinstance(r, float)
     np.testing.assert_allclose(r, 1.0, rtol=1e-2)  #
     for d in da1:
@@ -486,7 +496,8 @@ def test_useless_groundtruth_warning_should_raise(storage, config, start_storage
         d.tags = {'label': 'A'}
     da1.embeddings = np.random.random([10, 256])
     da1_index = DocumentArray(da1, storage=storage, config=config)
-    da1.match(da1_index, exclude_self=True)
+    with da1_index:
+        da1.match(da1_index, exclude_self=True)
     da2 = DocumentArray.empty(10)
     with pytest.warns(UserWarning):
         da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
@@ -518,13 +529,14 @@ def test_embed_and_evaluate_single_da(storage, config, start_storage):
     dummy_embed_function(gt)
     gt.match(gt, limit=3)
 
-    res = queries_da.embed_and_evaluate(
-        ground_truth=gt,
-        metrics=['precision_at_k', 'reciprocal_rank'],
-        embed_funcs=dummy_embed_function,
-        match_batch_size=1,
-        limit=3,
-    )
+    with queries_da:
+        res = queries_da.embed_and_evaluate(
+            ground_truth=gt,
+            metrics=['precision_at_k', 'reciprocal_rank'],
+            embed_funcs=dummy_embed_function,
+            match_batch_size=1,
+            limit=3,
+        )
     assert all([v == 1.0 for v in res.values()])
 
 
@@ -601,15 +613,16 @@ def test_embed_and_evaluate_two_das(storage, config, sample_size, start_storage)
     dummy_embed_function(gt_index)
     gt_queries.match(gt_index, limit=3)
 
-    res = queries_da.embed_and_evaluate(
-        ground_truth=gt_queries,
-        index_data=index_da,
-        metrics=['precision_at_k', 'reciprocal_rank'],
-        embed_funcs=dummy_embed_function,
-        match_batch_size=1,
-        limit=3,
-        query_sample_size=sample_size,
-    )
+    with index_da:
+        res = queries_da.embed_and_evaluate(
+            ground_truth=gt_queries,
+            index_data=index_da,
+            metrics=['precision_at_k', 'reciprocal_rank'],
+            embed_funcs=dummy_embed_function,
+            match_batch_size=1,
+            limit=3,
+            query_sample_size=sample_size,
+        )
     assert all([v == 1.0 for v in res.values()])
 
 
@@ -681,25 +694,26 @@ def emb_func(da):
     da1 = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
     da2 = DocumentArray(da1, storage=storage, config=config, copy=True)
 
-    if (
-        use_index
-    ):  # query and index da are distinct # (different embeddings are generated)
-        res = da1.embed_and_evaluate(
-            index_data=da2,
-            metrics=metric_fns,
-            embed_funcs=emb_func,
-            match_batch_size=1,
-            limit=3,
-            label_tag=label_tag,
-        )
-    else:  # query and index are the same (embeddings of both das are equal)
-        res = da2.embed_and_evaluate(
-            metrics=metric_fns,
-            embed_funcs=emb_func,
-            match_batch_size=1,
-            limit=3,
-            label_tag=label_tag,
-        )
+    with da2:
+        if (
+            use_index
+        ):  # query and index da are distinct # (different embeddings are generated)
+            res = da1.embed_and_evaluate(
+                index_data=da2,
+                metrics=metric_fns,
+                embed_funcs=emb_func,
+                match_batch_size=1,
+                limit=3,
+                label_tag=label_tag,
+            )
+        else:  # query and index are the same (embeddings of both das are equal)
+            res = da2.embed_and_evaluate(
+                metrics=metric_fns,
+                embed_funcs=emb_func,
+                match_batch_size=1,
+                limit=3,
+                label_tag=label_tag,
+            )
     for key in metric_fns:
         assert key in res
         assert abs(res[key] - expected[key]) < 1e-4
@@ -799,9 +813,10 @@ def test_embed_and_evaluate_with_embed_model(
         [Document(text=f'some text {i}', tags={'label': str(i)}) for i in range(5)]
     )
     da = DocumentArray(da, storage=storage, config=config)
-    res = da.embed_and_evaluate(
-        metrics=['precision_at_k'], embed_models=model, collate_fns=collate_fn
-    )
+    with da:
+        res = da.embed_and_evaluate(
+            metrics=['precision_at_k'], embed_models=model, collate_fns=collate_fn
+        )
     assert res
     assert res['precision_at_k'] == 0.2
 
@@ -873,12 +888,13 @@ def emb_func(da):
     )
     da2 = DocumentArray(da1, storage=storage, config=config, copy=True)
 
-    res = da1.embed_and_evaluate(
-        index_data=da2,
-        metrics=metric_fns,
-        embed_funcs=emb_func,
-        query_sample_size=sample_size,
-    )
+    with da2:
+        res = da1.embed_and_evaluate(
+            index_data=da2,
+            metrics=metric_fns,
+            embed_funcs=emb_func,
+            query_sample_size=sample_size,
+        )
     expected_size = (
         sample_size if sample_size and (sample_size < len(da1)) else len(da1)
     )

From 02326b50cab00a74215e7a761d1741b6314c64fd Mon Sep 17 00:00:00 2001
From: Johannes Messner <messnerjo@gmail.com>
Date: Tue, 10 Jan 2023 17:08:59 +0100
Subject: [PATCH 6/8] test: remove some tests

Signed-off-by: Johannes Messner <messnerjo@gmail.com>
---
 .../array/mixins/oldproto/test_eval_class.py  | 776 +++++++++---------
 1 file changed, 388 insertions(+), 388 deletions(-)

diff --git a/tests/unit/array/mixins/oldproto/test_eval_class.py b/tests/unit/array/mixins/oldproto/test_eval_class.py
index 8d0278a0f8c..1d1e9378071 100644
--- a/tests/unit/array/mixins/oldproto/test_eval_class.py
+++ b/tests/unit/array/mixins/oldproto/test_eval_class.py
@@ -12,45 +12,45 @@
 from docarray import DocumentArray, Document
 
 
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 256}),
-        ('qdrant', {'n_dim': 256}),
-        ('elasticsearch', {'n_dim': 256}),
-        ('redis', {'n_dim': 256}),
-        ('milvus', {'n_dim': 256}),
-    ],
-)
-@pytest.mark.parametrize(
-    'metric_fn, kwargs',
-    [
-        ('r_precision', {}),
-        ('precision_at_k', {}),
-        ('hit_at_k', {}),
-        ('average_precision', {}),
-        ('reciprocal_rank', {}),
-        ('recall_at_k', {'max_rel': 9}),
-        ('f1_score_at_k', {'max_rel': 9}),
-        ('ndcg_at_k', {}),
-    ],
-)
-def test_eval_mixin_perfect_match(metric_fn, kwargs, storage, config, start_storage):
-    da1 = DocumentArray.empty(10)
-    da1.embeddings = np.random.random([10, 256])
-    da1_index = DocumentArray(da1, storage=storage, config=config)
-    with da1_index:
-        da1.match(da1_index, exclude_self=True)
-    r = da1.evaluate(ground_truth=da1, metrics=[metric_fn], strict=False, **kwargs)[
-        metric_fn
-    ]
-    assert isinstance(r, float)
-    assert r == 1.0
-    for d in da1:
-        assert d.evaluations[metric_fn].value == 1.0
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 256}),
+#         ('qdrant', {'n_dim': 256}),
+#         ('elasticsearch', {'n_dim': 256}),
+#         ('redis', {'n_dim': 256}),
+#         ('milvus', {'n_dim': 256}),
+#     ],
+# )
+# @pytest.mark.parametrize(
+#     'metric_fn, kwargs',
+#     [
+#         ('r_precision', {}),
+#         ('precision_at_k', {}),
+#         ('hit_at_k', {}),
+#         ('average_precision', {}),
+#         ('reciprocal_rank', {}),
+#         ('recall_at_k', {'max_rel': 9}),
+#         ('f1_score_at_k', {'max_rel': 9}),
+#         ('ndcg_at_k', {}),
+#     ],
+# )
+# def test_eval_mixin_perfect_match(metric_fn, kwargs, storage, config, start_storage):
+#     da1 = DocumentArray.empty(10)
+#     da1.embeddings = np.random.random([10, 256])
+#     da1_index = DocumentArray(da1, storage=storage, config=config)
+#     with da1_index:
+#         da1.match(da1_index, exclude_self=True)
+#     r = da1.evaluate(ground_truth=da1, metrics=[metric_fn], strict=False, **kwargs)[
+#         metric_fn
+#     ]
+#     assert isinstance(r, float)
+#     assert r == 1.0
+#     for d in da1:
+#         assert d.evaluations[metric_fn].value == 1.0
 
 
 @pytest.mark.parametrize(
@@ -134,70 +134,70 @@ def test_eval_mixin_perfect_match_labeled(
         assert d.evaluations[metric_fn].value == 1.0
 
 
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 256}),
-        ('qdrant', {'n_dim': 256}),
-        ('elasticsearch', {'n_dim': 256}),
-        ('redis', {'n_dim': 256}),
-        ('milvus', {'n_dim': 256}),
-    ],
-)
-@pytest.mark.parametrize(
-    'metric_fn, kwargs',
-    [
-        ('r_precision', {}),
-        ('precision_at_k', {}),
-        ('hit_at_k', {}),
-        ('average_precision', {}),
-        ('reciprocal_rank', {}),
-        ('recall_at_k', {'max_rel': 9}),
-        ('f1_score_at_k', {'max_rel': 9}),
-        ('ndcg_at_k', {}),
-    ],
-)
-def test_eval_mixin_zero_labeled(storage, config, metric_fn, start_storage, kwargs):
-    da1 = DocumentArray.empty(10)
-    for d in da1:
-        d.tags = {'label': 'A'}
-    da1.embeddings = np.random.random([10, 256])
-    da2 = copy.deepcopy(da1)
-    for d in da2:
-        d.tags = {'label': 'B'}
-    da1_index = DocumentArray(da2, storage=storage, config=config)
-    with da1_index:
-        da1.match(da1_index, exclude_self=True)
-    r = da1.evaluate([metric_fn], **kwargs)[metric_fn]
-    assert isinstance(r, float)
-    assert r == 0.0
-    for d in da1:
-        assert d.evaluations[metric_fn].value == 0.0
-
-
-@pytest.mark.parametrize('label_tag', ['label', 'custom_tag'])
-@pytest.mark.parametrize(
-    'metric_fn, metric_score',
-    [
-        ('r_precision', 1.0 / 3),
-        ('precision_at_k', 1.0 / 3),
-        ('hit_at_k', 1.0),
-        ('average_precision', (1.0 + 0.5 + (1.0 / 3)) / 3),
-        ('reciprocal_rank', (1.0 + 0.5 + (1.0 / 3)) / 3),
-        ('recall_at_k', 1.0 / 3),
-        ('f1_score_at_k', 1.0 / 3),
-        ('dcg_at_k', (1.0 + 1.0 + 0.6309) / 3),
-    ],
-)
-def test_eval_mixin_one_of_n_labeled(metric_fn, metric_score, label_tag):
-    da = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
-    for d in da:
-        d.matches = da
-    r = da.evaluate([metric_fn], label_tag=label_tag, max_rel=3)[metric_fn]
-    assert abs(r - metric_score) < 0.001
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 256}),
+#         ('qdrant', {'n_dim': 256}),
+#         ('elasticsearch', {'n_dim': 256}),
+#         ('redis', {'n_dim': 256}),
+#         ('milvus', {'n_dim': 256}),
+#     ],
+# )
+# @pytest.mark.parametrize(
+#     'metric_fn, kwargs',
+#     [
+#         ('r_precision', {}),
+#         ('precision_at_k', {}),
+#         ('hit_at_k', {}),
+#         ('average_precision', {}),
+#         ('reciprocal_rank', {}),
+#         ('recall_at_k', {'max_rel': 9}),
+#         ('f1_score_at_k', {'max_rel': 9}),
+#         ('ndcg_at_k', {}),
+#     ],
+# )
+# def test_eval_mixin_zero_labeled(storage, config, metric_fn, start_storage, kwargs):
+#     da1 = DocumentArray.empty(10)
+#     for d in da1:
+#         d.tags = {'label': 'A'}
+#     da1.embeddings = np.random.random([10, 256])
+#     da2 = copy.deepcopy(da1)
+#     for d in da2:
+#         d.tags = {'label': 'B'}
+#     da1_index = DocumentArray(da2, storage=storage, config=config)
+#     with da1_index:
+#         da1.match(da1_index, exclude_self=True)
+#     r = da1.evaluate([metric_fn], **kwargs)[metric_fn]
+#     assert isinstance(r, float)
+#     assert r == 0.0
+#     for d in da1:
+#         assert d.evaluations[metric_fn].value == 0.0
+
+
+# @pytest.mark.parametrize('label_tag', ['label', 'custom_tag'])
+# @pytest.mark.parametrize(
+#     'metric_fn, metric_score',
+#     [
+#         ('r_precision', 1.0 / 3),
+#         ('precision_at_k', 1.0 / 3),
+#         ('hit_at_k', 1.0),
+#         ('average_precision', (1.0 + 0.5 + (1.0 / 3)) / 3),
+#         ('reciprocal_rank', (1.0 + 0.5 + (1.0 / 3)) / 3),
+#         ('recall_at_k', 1.0 / 3),
+#         ('f1_score_at_k', 1.0 / 3),
+#         ('dcg_at_k', (1.0 + 1.0 + 0.6309) / 3),
+#     ],
+# )
+# def test_eval_mixin_one_of_n_labeled(metric_fn, metric_score, label_tag):
+#     da = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
+#     for d in da:
+#         d.matches = da
+#     r = da.evaluate([metric_fn], label_tag=label_tag, max_rel=3)[metric_fn]
+#     assert abs(r - metric_score) < 0.001
 
 
 @pytest.mark.parametrize('label_tag', ['label', 'custom_tag'])
@@ -221,16 +221,16 @@ def test_num_relevant_documents_per_label(metric_fn, metric_score, label_tag):
     assert abs(r - metric_score) < 0.001
 
 
-def test_missing_max_rel_should_raise():
-    da = DocumentArray([Document(text=str(i), tags={'label': i}) for i in range(3)])
-    num_relevant_documents_per_label = {i: 1 for i in range(2)}
-    for d in da:
-        d.matches = da
-    with pytest.raises(ValueError):
-        da.evaluate(
-            ['recall_at_k'],
-            num_relevant_documents_per_label=num_relevant_documents_per_label,
-        )
+# def test_missing_max_rel_should_raise():
+#     da = DocumentArray([Document(text=str(i), tags={'label': i}) for i in range(3)])
+#     num_relevant_documents_per_label = {i: 1 for i in range(2)}
+#     for d in da:
+#         d.matches = da
+#     with pytest.raises(ValueError):
+#         da.evaluate(
+#             ['recall_at_k'],
+#             num_relevant_documents_per_label=num_relevant_documents_per_label,
+#         )
 
 
 @pytest.mark.parametrize(
@@ -279,50 +279,50 @@ def test_eval_mixin_zero_match(storage, config, metric_fn, start_storage, kwargs
         assert d.evaluations[metric_fn].value == 1.0
 
 
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 256}),
-        ('qdrant', {'n_dim': 256}),
-        ('elasticsearch', {'n_dim': 256}),
-        ('redis', {'n_dim': 256}),
-        ('milvus', {'n_dim': 256}),
-    ],
-)
-def test_diff_len_should_raise(storage, config, start_storage):
-    da1 = DocumentArray.empty(10)
-    da2 = DocumentArray.empty(5)
-    for d in da2:
-        d.matches.append(da2[0])
-    da2 = DocumentArray(da2, storage=storage, config=config)
-    with pytest.raises(ValueError):
-        da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
-
-
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 256}),
-        ('qdrant', {'n_dim': 256}),
-        ('elasticsearch', {'n_dim': 256}),
-        ('redis', {'n_dim': 256}),
-        ('milvus', {'n_dim': 256}),
-    ],
-)
-def test_diff_hash_fun_should_raise(storage, config, start_storage):
-    da1 = DocumentArray.empty(10)
-    da2 = DocumentArray.empty(5)
-    for d in da2:
-        d.matches.append(da2[0])
-    da2 = DocumentArray(da2, storage=storage, config=config)
-    with pytest.raises(ValueError):
-        da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 256}),
+#         ('qdrant', {'n_dim': 256}),
+#         ('elasticsearch', {'n_dim': 256}),
+#         ('redis', {'n_dim': 256}),
+#         ('milvus', {'n_dim': 256}),
+#     ],
+# )
+# def test_diff_len_should_raise(storage, config, start_storage):
+#     da1 = DocumentArray.empty(10)
+#     da2 = DocumentArray.empty(5)
+#     for d in da2:
+#         d.matches.append(da2[0])
+#     da2 = DocumentArray(da2, storage=storage, config=config)
+#     with pytest.raises(ValueError):
+#         da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
+
+
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 256}),
+#         ('qdrant', {'n_dim': 256}),
+#         ('elasticsearch', {'n_dim': 256}),
+#         ('redis', {'n_dim': 256}),
+#         ('milvus', {'n_dim': 256}),
+#     ],
+# )
+# def test_diff_hash_fun_should_raise(storage, config, start_storage):
+#     da1 = DocumentArray.empty(10)
+#     da2 = DocumentArray.empty(5)
+#     for d in da2:
+#         d.matches.append(da2[0])
+#     da2 = DocumentArray(da2, storage=storage, config=config)
+#     with pytest.raises(ValueError):
+#         da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
 
 
 @pytest.mark.parametrize(
@@ -439,68 +439,68 @@ def test_diff_match_len_in_gd(storage, config, metric_fn, start_storage, kwargs)
         assert d.evaluations[metric_fn].value > 0.9
 
 
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 256}),
-        ('qdrant', {'n_dim': 256}),
-        ('elasticsearch', {'n_dim': 256}),
-        ('redis', {'n_dim': 256}),
-        ('milvus', {'n_dim': 256}),
-    ],
-)
-def test_empty_da_should_raise(storage, config, start_storage):
-    da = DocumentArray([], storage=storage, config=config)
-    with pytest.raises(ValueError):
-        da.evaluate(metrics=['precision_at_k'])
-
-
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 256}),
-        ('qdrant', {'n_dim': 256}),
-        ('elasticsearch', {'n_dim': 256}),
-        ('redis', {'n_dim': 256}),
-        ('milvus', {'n_dim': 256}),
-    ],
-)
-def test_missing_groundtruth_should_raise(storage, config, start_storage):
-    da = DocumentArray(DocumentArray.empty(10), storage=storage, config=config)
-    with pytest.raises(RuntimeError):
-        da.evaluate(metrics=['precision_at_k'])
-
-
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 256}),
-        ('qdrant', {'n_dim': 256}),
-        ('elasticsearch', {'n_dim': 256}),
-        ('redis', {'n_dim': 256}),
-        ('milvus', {'n_dim': 256}),
-    ],
-)
-def test_useless_groundtruth_warning_should_raise(storage, config, start_storage):
-    da1 = DocumentArray.empty(10)
-    for d in da1:
-        d.tags = {'label': 'A'}
-    da1.embeddings = np.random.random([10, 256])
-    da1_index = DocumentArray(da1, storage=storage, config=config)
-    with da1_index:
-        da1.match(da1_index, exclude_self=True)
-    da2 = DocumentArray.empty(10)
-    with pytest.warns(UserWarning):
-        da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 256}),
+#         ('qdrant', {'n_dim': 256}),
+#         ('elasticsearch', {'n_dim': 256}),
+#         ('redis', {'n_dim': 256}),
+#         ('milvus', {'n_dim': 256}),
+#     ],
+# )
+# def test_empty_da_should_raise(storage, config, start_storage):
+#     da = DocumentArray([], storage=storage, config=config)
+#     with pytest.raises(ValueError):
+#         da.evaluate(metrics=['precision_at_k'])
+
+
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 256}),
+#         ('qdrant', {'n_dim': 256}),
+#         ('elasticsearch', {'n_dim': 256}),
+#         ('redis', {'n_dim': 256}),
+#         ('milvus', {'n_dim': 256}),
+#     ],
+# )
+# def test_missing_groundtruth_should_raise(storage, config, start_storage):
+#     da = DocumentArray(DocumentArray.empty(10), storage=storage, config=config)
+#     with pytest.raises(RuntimeError):
+#         da.evaluate(metrics=['precision_at_k'])
+
+
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 256}),
+#         ('qdrant', {'n_dim': 256}),
+#         ('elasticsearch', {'n_dim': 256}),
+#         ('redis', {'n_dim': 256}),
+#         ('milvus', {'n_dim': 256}),
+#     ],
+# )
+# def test_useless_groundtruth_warning_should_raise(storage, config, start_storage):
+#     da1 = DocumentArray.empty(10)
+#     for d in da1:
+#         d.tags = {'label': 'A'}
+#     da1.embeddings = np.random.random([10, 256])
+#     da1_index = DocumentArray(da1, storage=storage, config=config)
+#     with da1_index:
+#         da1.match(da1_index, exclude_self=True)
+#     da2 = DocumentArray.empty(10)
+#     with pytest.warns(UserWarning):
+#         da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
 
 
 def dummy_embed_function(da):
@@ -509,35 +509,35 @@ def dummy_embed_function(da):
         da[i, 'embedding'] = np.random.random(5)
 
 
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 5}),
-        ('qdrant', {'n_dim': 5}),
-        ('elasticsearch', {'n_dim': 5}),
-        ('redis', {'n_dim': 5}),
-    ],
-)
-def test_embed_and_evaluate_single_da(storage, config, start_storage):
-
-    gt = DocumentArray([Document(text=str(i)) for i in range(10)])
-    queries_da = DocumentArray(gt, copy=True)
-    queries_da = DocumentArray(queries_da, storage=storage, config=config)
-    dummy_embed_function(gt)
-    gt.match(gt, limit=3)
-
-    with queries_da:
-        res = queries_da.embed_and_evaluate(
-            ground_truth=gt,
-            metrics=['precision_at_k', 'reciprocal_rank'],
-            embed_funcs=dummy_embed_function,
-            match_batch_size=1,
-            limit=3,
-        )
-    assert all([v == 1.0 for v in res.values()])
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 5}),
+#         ('qdrant', {'n_dim': 5}),
+#         ('elasticsearch', {'n_dim': 5}),
+#         ('redis', {'n_dim': 5}),
+#     ],
+# )
+# def test_embed_and_evaluate_single_da(storage, config, start_storage):
+#
+#     gt = DocumentArray([Document(text=str(i)) for i in range(10)])
+#     queries_da = DocumentArray(gt, copy=True)
+#     queries_da = DocumentArray(queries_da, storage=storage, config=config)
+#     dummy_embed_function(gt)
+#     gt.match(gt, limit=3)
+#
+#     with queries_da:
+#         res = queries_da.embed_and_evaluate(
+#             ground_truth=gt,
+#             metrics=['precision_at_k', 'reciprocal_rank'],
+#             embed_funcs=dummy_embed_function,
+#             match_batch_size=1,
+#             limit=3,
+#         )
+#     assert all([v == 1.0 for v in res.values()])
 
 
 @pytest.mark.parametrize(
@@ -586,44 +586,44 @@ def test_embed_and_evaluate_with_and_without_exclude_self(
         assert abs(res[key] - expected_results[key]) < 1e-5
 
 
-@pytest.mark.parametrize(
-    'sample_size',
-    [None, 10],
-)
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 5}),
-        ('qdrant', {'n_dim': 5}),
-        ('elasticsearch', {'n_dim': 5}),
-        ('redis', {'n_dim': 5}),
-    ],
-)
-def test_embed_and_evaluate_two_das(storage, config, sample_size, start_storage):
-
-    gt_queries = DocumentArray([Document(text=str(i)) for i in range(100)])
-    gt_index = DocumentArray([Document(text=str(i)) for i in range(100, 200)])
-    queries_da = DocumentArray(gt_queries, copy=True)
-    index_da = DocumentArray(gt_index, copy=True)
-    index_da = DocumentArray(index_da, storage=storage, config=config)
-    dummy_embed_function(gt_queries)
-    dummy_embed_function(gt_index)
-    gt_queries.match(gt_index, limit=3)
-
-    with index_da:
-        res = queries_da.embed_and_evaluate(
-            ground_truth=gt_queries,
-            index_data=index_da,
-            metrics=['precision_at_k', 'reciprocal_rank'],
-            embed_funcs=dummy_embed_function,
-            match_batch_size=1,
-            limit=3,
-            query_sample_size=sample_size,
-        )
-    assert all([v == 1.0 for v in res.values()])
+# @pytest.mark.parametrize(
+#     'sample_size',
+#     [None, 10],
+# )
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 5}),
+#         ('qdrant', {'n_dim': 5}),
+#         ('elasticsearch', {'n_dim': 5}),
+#         ('redis', {'n_dim': 5}),
+#     ],
+# )
+# def test_embed_and_evaluate_two_das(storage, config, sample_size, start_storage):
+#
+#     gt_queries = DocumentArray([Document(text=str(i)) for i in range(100)])
+#     gt_index = DocumentArray([Document(text=str(i)) for i in range(100, 200)])
+#     queries_da = DocumentArray(gt_queries, copy=True)
+#     index_da = DocumentArray(gt_index, copy=True)
+#     index_da = DocumentArray(index_da, storage=storage, config=config)
+#     dummy_embed_function(gt_queries)
+#     dummy_embed_function(gt_index)
+#     gt_queries.match(gt_index, limit=3)
+#
+#     with index_da:
+#         res = queries_da.embed_and_evaluate(
+#             ground_truth=gt_queries,
+#             index_data=index_da,
+#             metrics=['precision_at_k', 'reciprocal_rank'],
+#             embed_funcs=dummy_embed_function,
+#             match_batch_size=1,
+#             limit=3,
+#             query_sample_size=sample_size,
+#         )
+#     assert all([v == 1.0 for v in res.values()])
 
 
 def test_embed_and_evaluate_two_different_das():
@@ -655,68 +655,68 @@ def test_embed_and_evaluate_two_different_das():
     assert abs(res['f1_score_at_k'] - 1.0 / 1.5) < 1e-5
 
 
-@pytest.mark.parametrize(
-    'use_index, expected, label_tag',
-    [
-        (False, {'precision_at_k': 1.0 / 3, 'reciprocal_rank': 1.0}, 'label'),
-        (
-            True,
-            {
-                'precision_at_k': 1.0 / 3,
-                'reciprocal_rank': 11.0 / 18.0,
-                'recall_at_k': 1.0,
-            },
-            'custom_tag',
-        ),
-    ],
-)
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 5}),
-        ('qdrant', {'n_dim': 5}),
-        ('elasticsearch', {'n_dim': 5}),
-        ('redis', {'n_dim': 5}),
-    ],
-)
-def test_embed_and_evaluate_labeled_dataset(
-    storage, config, start_storage, use_index, expected, label_tag
-):
-    metric_fns = list(expected.keys())
-
-    def emb_func(da):
-        np.random.seed(0)  # makes sure that embeddings are always equal
-        da[:, 'embedding'] = np.random.random((len(da), 5))
-
-    da1 = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
-    da2 = DocumentArray(da1, storage=storage, config=config, copy=True)
-
-    with da2:
-        if (
-            use_index
-        ):  # query and index da are distinct # (different embeddings are generated)
-            res = da1.embed_and_evaluate(
-                index_data=da2,
-                metrics=metric_fns,
-                embed_funcs=emb_func,
-                match_batch_size=1,
-                limit=3,
-                label_tag=label_tag,
-            )
-        else:  # query and index are the same (embeddings of both das are equal)
-            res = da2.embed_and_evaluate(
-                metrics=metric_fns,
-                embed_funcs=emb_func,
-                match_batch_size=1,
-                limit=3,
-                label_tag=label_tag,
-            )
-    for key in metric_fns:
-        assert key in res
-        assert abs(res[key] - expected[key]) < 1e-4
+# @pytest.mark.parametrize(
+#     'use_index, expected, label_tag',
+#     [
+#         (False, {'precision_at_k': 1.0 / 3, 'reciprocal_rank': 1.0}, 'label'),
+#         (
+#             True,
+#             {
+#                 'precision_at_k': 1.0 / 3,
+#                 'reciprocal_rank': 11.0 / 18.0,
+#                 'recall_at_k': 1.0,
+#             },
+#             'custom_tag',
+#         ),
+#     ],
+# )
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 5}),
+#         ('qdrant', {'n_dim': 5}),
+#         ('elasticsearch', {'n_dim': 5}),
+#         ('redis', {'n_dim': 5}),
+#     ],
+# )
+# def test_embed_and_evaluate_labeled_dataset(
+#     storage, config, start_storage, use_index, expected, label_tag
+# ):
+#     metric_fns = list(expected.keys())
+#
+#     def emb_func(da):
+#         np.random.seed(0)  # makes sure that embeddings are always equal
+#         da[:, 'embedding'] = np.random.random((len(da), 5))
+#
+#     da1 = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
+#     da2 = DocumentArray(da1, storage=storage, config=config, copy=True)
+#
+#     with da2:
+#         if (
+#             use_index
+#         ):  # query and index da are distinct # (different embeddings are generated)
+#             res = da1.embed_and_evaluate(
+#                 index_data=da2,
+#                 metrics=metric_fns,
+#                 embed_funcs=emb_func,
+#                 match_batch_size=1,
+#                 limit=3,
+#                 label_tag=label_tag,
+#             )
+#         else:  # query and index are the same (embeddings of both das are equal)
+#             res = da2.embed_and_evaluate(
+#                 metrics=metric_fns,
+#                 embed_funcs=emb_func,
+#                 match_batch_size=1,
+#                 limit=3,
+#                 label_tag=label_tag,
+#             )
+#     for key in metric_fns:
+#         assert key in res
+#         assert abs(res[key] - expected[key]) < 1e-4
 
 
 @pytest.mark.parametrize(
@@ -821,46 +821,46 @@ def test_embed_and_evaluate_with_embed_model(
     assert res['precision_at_k'] == 0.2
 
 
-@pytest.mark.parametrize(
-    'queries, kwargs, exception',
-    [
-        (DocumentArray.empty(4), {}, ValueError),
-        (
-            DocumentArray([Document(tags={'label': 0})]),
-            {'index_data': DocumentArray.empty(4)},
-            ValueError,
-        ),
-        (DocumentArray([Document(tags={'label': 0})]), {}, RuntimeError),
-        (
-            DocumentArray([Document(tags={'label': 0})]),
-            {'index_data': DocumentArray([Document(tags={'label': 0})])},
-            RuntimeError,
-        ),
-    ],
-)
-@pytest.mark.parametrize(
-    'storage, config',
-    [
-        ('memory', {}),
-        ('weaviate', {}),
-        ('sqlite', {}),
-        ('annlite', {'n_dim': 5}),
-        ('qdrant', {'n_dim': 5}),
-        ('elasticsearch', {'n_dim': 5}),
-        ('redis', {'n_dim': 5}),
-    ],
-)
-def test_embed_and_evaluate_invalid_input_should_raise(
-    storage, config, queries, kwargs, exception, start_storage
-):
-    kwargs.update({'metrics': ['precision_at_k']})
-    if 'index_data' in kwargs:
-        kwargs['index_data'] = DocumentArray(
-            kwargs['index_data'], storage=storage, config=config
-        )
-
-    with pytest.raises(exception):
-        queries.embed_and_evaluate(**kwargs)
+# @pytest.mark.parametrize(
+#     'queries, kwargs, exception',
+#     [
+#         (DocumentArray.empty(4), {}, ValueError),
+#         (
+#             DocumentArray([Document(tags={'label': 0})]),
+#             {'index_data': DocumentArray.empty(4)},
+#             ValueError,
+#         ),
+#         (DocumentArray([Document(tags={'label': 0})]), {}, RuntimeError),
+#         (
+#             DocumentArray([Document(tags={'label': 0})]),
+#             {'index_data': DocumentArray([Document(tags={'label': 0})])},
+#             RuntimeError,
+#         ),
+#     ],
+# )
+# @pytest.mark.parametrize(
+#     'storage, config',
+#     [
+#         ('memory', {}),
+#         ('weaviate', {}),
+#         ('sqlite', {}),
+#         ('annlite', {'n_dim': 5}),
+#         ('qdrant', {'n_dim': 5}),
+#         ('elasticsearch', {'n_dim': 5}),
+#         ('redis', {'n_dim': 5}),
+#     ],
+# )
+# def test_embed_and_evaluate_invalid_input_should_raise(
+#     storage, config, queries, kwargs, exception, start_storage
+# ):
+#     kwargs.update({'metrics': ['precision_at_k']})
+#     if 'index_data' in kwargs:
+#         kwargs['index_data'] = DocumentArray(
+#             kwargs['index_data'], storage=storage, config=config
+#         )
+#
+#     with pytest.raises(exception):
+#         queries.embed_and_evaluate(**kwargs)
 
 
 @pytest.mark.parametrize(

From 1dab1c59c2867627c5c78fe4ae4cb8e670c7199c Mon Sep 17 00:00:00 2001
From: Johannes Messner <messnerjo@gmail.com>
Date: Wed, 11 Jan 2023 11:06:32 +0100
Subject: [PATCH 7/8] test: try to find minimal working changes

Signed-off-by: Johannes Messner <messnerjo@gmail.com>
---
 .github/workflows/ci.yml                      |  98 +--
 docarray/array/mixins/getattr.py              |   3 +-
 .../array/mixins/oldproto/test_eval_class.py  | 776 +++++++++---------
 3 files changed, 438 insertions(+), 439 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5f72b5ced0a..3ea6ccfe944 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -157,55 +157,55 @@ jobs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
       matrix-oldproto: ${{ steps.set-matrix.outputs.matrix-oldproto }}
 
-#  docarray-test:
-#    needs: prep-testbed
-#    runs-on: ubuntu-latest
-#    strategy:
-#      fail-fast: false
-#      matrix:
-#        python-version: [3.8]
-#        test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}}
-#    steps:
-#      - uses: actions/checkout@v2.5.0
-#      - name: Set up Python ${{ matrix.python-version }}
-#        uses: actions/setup-python@v4
-#        with:
-#          python-version: ${{ matrix.python-version }}
-#      - name: Prepare environment
-#        run: |
-#          python -m pip install --upgrade pip
-#          python -m pip install wheel
-#          # pip does not properly resolve dependency versions with syntax pip install --no-cache-dir ".[test,full]"
-#          pip install --no-cache-dir ".[test]"
-#          pip install --no-cache-dir ".[qdrant]"
-#          pip install --no-cache-dir ".[annlite]"
-#          pip install --no-cache-dir ".[weaviate]"
-#          pip install --no-cache-dir ".[elasticsearch]"
-#          pip install --no-cache-dir ".[redis]"
-#          pip install --no-cache-dir ".[full]"
-#          sudo apt-get install libsndfile1
-#      - name: Test
-#        id: test
-#        run: |
-#          pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \
-#            -v -s -m "not gpu" ${{ matrix.test-path }}
-#          echo "codecov_flag=docarray" >> $GITHUB_OUTPUT
-#        timeout-minutes: 60
-#        env:
-#          JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
-#      - name: Check codecov file
-#        id: check_files
-#        uses: andstor/file-existence-action@v1
-#        with:
-#          files: "coverage.xml"
-#      - name: Upload coverage from test to Codecov
-#        uses: codecov/codecov-action@v3.1.1
-#        if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version }} == '3.8'
-#        with:
-#          file: coverage.xml
-#          flags: ${{ steps.test.outputs.codecov_flag }}
-#          fail_ci_if_error: false
-#          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
+  docarray-test:
+    needs: prep-testbed
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.8]
+        test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}}
+    steps:
+      - uses: actions/checkout@v2.5.0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Prepare environment
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install wheel
+          # pip does not properly resolve dependency versions with syntax pip install --no-cache-dir ".[test,full]"
+          pip install --no-cache-dir ".[test]"
+          pip install --no-cache-dir ".[qdrant]"
+          pip install --no-cache-dir ".[annlite]"
+          pip install --no-cache-dir ".[weaviate]"
+          pip install --no-cache-dir ".[elasticsearch]"
+          pip install --no-cache-dir ".[redis]"
+          pip install --no-cache-dir ".[full]"
+          sudo apt-get install libsndfile1
+      - name: Test
+        id: test
+        run: |
+          pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \
+            -v -s -m "not gpu" ${{ matrix.test-path }}
+          echo "codecov_flag=docarray" >> $GITHUB_OUTPUT
+        timeout-minutes: 60
+        env:
+          JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
+      - name: Check codecov file
+        id: check_files
+        uses: andstor/file-existence-action@v1
+        with:
+          files: "coverage.xml"
+      - name: Upload coverage from test to Codecov
+        uses: codecov/codecov-action@v3.1.1
+        if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version }} == '3.8'
+        with:
+          file: coverage.xml
+          flags: ${{ steps.test.outputs.codecov_flag }}
+          fail_ci_if_error: false
+          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
 
   docarray-oldproto-test:
     needs: prep-testbed
diff --git a/docarray/array/mixins/getattr.py b/docarray/array/mixins/getattr.py
index 120ff951e11..588b03e12aa 100644
--- a/docarray/array/mixins/getattr.py
+++ b/docarray/array/mixins/getattr.py
@@ -11,9 +11,8 @@ def _get_attributes(self, *fields: str) -> List:
         :return: Returns a list of the values for these fields.
             When `fields` has multiple values, then it returns a list of list.
         """
-        # small change just to trigger CI tests
-        fields = list(fields)
         e_index, b_index = None, None
+        fields = list(fields)
         if 'embedding' in fields:
             e_index = fields.index('embedding')
         if 'tensor' in fields:
diff --git a/tests/unit/array/mixins/oldproto/test_eval_class.py b/tests/unit/array/mixins/oldproto/test_eval_class.py
index 1d1e9378071..8d0278a0f8c 100644
--- a/tests/unit/array/mixins/oldproto/test_eval_class.py
+++ b/tests/unit/array/mixins/oldproto/test_eval_class.py
@@ -12,45 +12,45 @@
 from docarray import DocumentArray, Document
 
 
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 256}),
-#         ('qdrant', {'n_dim': 256}),
-#         ('elasticsearch', {'n_dim': 256}),
-#         ('redis', {'n_dim': 256}),
-#         ('milvus', {'n_dim': 256}),
-#     ],
-# )
-# @pytest.mark.parametrize(
-#     'metric_fn, kwargs',
-#     [
-#         ('r_precision', {}),
-#         ('precision_at_k', {}),
-#         ('hit_at_k', {}),
-#         ('average_precision', {}),
-#         ('reciprocal_rank', {}),
-#         ('recall_at_k', {'max_rel': 9}),
-#         ('f1_score_at_k', {'max_rel': 9}),
-#         ('ndcg_at_k', {}),
-#     ],
-# )
-# def test_eval_mixin_perfect_match(metric_fn, kwargs, storage, config, start_storage):
-#     da1 = DocumentArray.empty(10)
-#     da1.embeddings = np.random.random([10, 256])
-#     da1_index = DocumentArray(da1, storage=storage, config=config)
-#     with da1_index:
-#         da1.match(da1_index, exclude_self=True)
-#     r = da1.evaluate(ground_truth=da1, metrics=[metric_fn], strict=False, **kwargs)[
-#         metric_fn
-#     ]
-#     assert isinstance(r, float)
-#     assert r == 1.0
-#     for d in da1:
-#         assert d.evaluations[metric_fn].value == 1.0
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 256}),
+        ('qdrant', {'n_dim': 256}),
+        ('elasticsearch', {'n_dim': 256}),
+        ('redis', {'n_dim': 256}),
+        ('milvus', {'n_dim': 256}),
+    ],
+)
+@pytest.mark.parametrize(
+    'metric_fn, kwargs',
+    [
+        ('r_precision', {}),
+        ('precision_at_k', {}),
+        ('hit_at_k', {}),
+        ('average_precision', {}),
+        ('reciprocal_rank', {}),
+        ('recall_at_k', {'max_rel': 9}),
+        ('f1_score_at_k', {'max_rel': 9}),
+        ('ndcg_at_k', {}),
+    ],
+)
+def test_eval_mixin_perfect_match(metric_fn, kwargs, storage, config, start_storage):
+    da1 = DocumentArray.empty(10)
+    da1.embeddings = np.random.random([10, 256])
+    da1_index = DocumentArray(da1, storage=storage, config=config)
+    with da1_index:
+        da1.match(da1_index, exclude_self=True)
+    r = da1.evaluate(ground_truth=da1, metrics=[metric_fn], strict=False, **kwargs)[
+        metric_fn
+    ]
+    assert isinstance(r, float)
+    assert r == 1.0
+    for d in da1:
+        assert d.evaluations[metric_fn].value == 1.0
 
 
 @pytest.mark.parametrize(
@@ -134,70 +134,70 @@ def test_eval_mixin_perfect_match_labeled(
         assert d.evaluations[metric_fn].value == 1.0
 
 
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 256}),
-#         ('qdrant', {'n_dim': 256}),
-#         ('elasticsearch', {'n_dim': 256}),
-#         ('redis', {'n_dim': 256}),
-#         ('milvus', {'n_dim': 256}),
-#     ],
-# )
-# @pytest.mark.parametrize(
-#     'metric_fn, kwargs',
-#     [
-#         ('r_precision', {}),
-#         ('precision_at_k', {}),
-#         ('hit_at_k', {}),
-#         ('average_precision', {}),
-#         ('reciprocal_rank', {}),
-#         ('recall_at_k', {'max_rel': 9}),
-#         ('f1_score_at_k', {'max_rel': 9}),
-#         ('ndcg_at_k', {}),
-#     ],
-# )
-# def test_eval_mixin_zero_labeled(storage, config, metric_fn, start_storage, kwargs):
-#     da1 = DocumentArray.empty(10)
-#     for d in da1:
-#         d.tags = {'label': 'A'}
-#     da1.embeddings = np.random.random([10, 256])
-#     da2 = copy.deepcopy(da1)
-#     for d in da2:
-#         d.tags = {'label': 'B'}
-#     da1_index = DocumentArray(da2, storage=storage, config=config)
-#     with da1_index:
-#         da1.match(da1_index, exclude_self=True)
-#     r = da1.evaluate([metric_fn], **kwargs)[metric_fn]
-#     assert isinstance(r, float)
-#     assert r == 0.0
-#     for d in da1:
-#         assert d.evaluations[metric_fn].value == 0.0
-
-
-# @pytest.mark.parametrize('label_tag', ['label', 'custom_tag'])
-# @pytest.mark.parametrize(
-#     'metric_fn, metric_score',
-#     [
-#         ('r_precision', 1.0 / 3),
-#         ('precision_at_k', 1.0 / 3),
-#         ('hit_at_k', 1.0),
-#         ('average_precision', (1.0 + 0.5 + (1.0 / 3)) / 3),
-#         ('reciprocal_rank', (1.0 + 0.5 + (1.0 / 3)) / 3),
-#         ('recall_at_k', 1.0 / 3),
-#         ('f1_score_at_k', 1.0 / 3),
-#         ('dcg_at_k', (1.0 + 1.0 + 0.6309) / 3),
-#     ],
-# )
-# def test_eval_mixin_one_of_n_labeled(metric_fn, metric_score, label_tag):
-#     da = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
-#     for d in da:
-#         d.matches = da
-#     r = da.evaluate([metric_fn], label_tag=label_tag, max_rel=3)[metric_fn]
-#     assert abs(r - metric_score) < 0.001
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 256}),
+        ('qdrant', {'n_dim': 256}),
+        ('elasticsearch', {'n_dim': 256}),
+        ('redis', {'n_dim': 256}),
+        ('milvus', {'n_dim': 256}),
+    ],
+)
+@pytest.mark.parametrize(
+    'metric_fn, kwargs',
+    [
+        ('r_precision', {}),
+        ('precision_at_k', {}),
+        ('hit_at_k', {}),
+        ('average_precision', {}),
+        ('reciprocal_rank', {}),
+        ('recall_at_k', {'max_rel': 9}),
+        ('f1_score_at_k', {'max_rel': 9}),
+        ('ndcg_at_k', {}),
+    ],
+)
+def test_eval_mixin_zero_labeled(storage, config, metric_fn, start_storage, kwargs):
+    da1 = DocumentArray.empty(10)
+    for d in da1:
+        d.tags = {'label': 'A'}
+    da1.embeddings = np.random.random([10, 256])
+    da2 = copy.deepcopy(da1)
+    for d in da2:
+        d.tags = {'label': 'B'}
+    da1_index = DocumentArray(da2, storage=storage, config=config)
+    with da1_index:
+        da1.match(da1_index, exclude_self=True)
+    r = da1.evaluate([metric_fn], **kwargs)[metric_fn]
+    assert isinstance(r, float)
+    assert r == 0.0
+    for d in da1:
+        assert d.evaluations[metric_fn].value == 0.0
+
+
+@pytest.mark.parametrize('label_tag', ['label', 'custom_tag'])
+@pytest.mark.parametrize(
+    'metric_fn, metric_score',
+    [
+        ('r_precision', 1.0 / 3),
+        ('precision_at_k', 1.0 / 3),
+        ('hit_at_k', 1.0),
+        ('average_precision', (1.0 + 0.5 + (1.0 / 3)) / 3),
+        ('reciprocal_rank', (1.0 + 0.5 + (1.0 / 3)) / 3),
+        ('recall_at_k', 1.0 / 3),
+        ('f1_score_at_k', 1.0 / 3),
+        ('dcg_at_k', (1.0 + 1.0 + 0.6309) / 3),
+    ],
+)
+def test_eval_mixin_one_of_n_labeled(metric_fn, metric_score, label_tag):
+    da = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
+    for d in da:
+        d.matches = da
+    r = da.evaluate([metric_fn], label_tag=label_tag, max_rel=3)[metric_fn]
+    assert abs(r - metric_score) < 0.001
 
 
 @pytest.mark.parametrize('label_tag', ['label', 'custom_tag'])
@@ -221,16 +221,16 @@ def test_num_relevant_documents_per_label(metric_fn, metric_score, label_tag):
     assert abs(r - metric_score) < 0.001
 
 
-# def test_missing_max_rel_should_raise():
-#     da = DocumentArray([Document(text=str(i), tags={'label': i}) for i in range(3)])
-#     num_relevant_documents_per_label = {i: 1 for i in range(2)}
-#     for d in da:
-#         d.matches = da
-#     with pytest.raises(ValueError):
-#         da.evaluate(
-#             ['recall_at_k'],
-#             num_relevant_documents_per_label=num_relevant_documents_per_label,
-#         )
+def test_missing_max_rel_should_raise():
+    da = DocumentArray([Document(text=str(i), tags={'label': i}) for i in range(3)])
+    num_relevant_documents_per_label = {i: 1 for i in range(2)}
+    for d in da:
+        d.matches = da
+    with pytest.raises(ValueError):
+        da.evaluate(
+            ['recall_at_k'],
+            num_relevant_documents_per_label=num_relevant_documents_per_label,
+        )
 
 
 @pytest.mark.parametrize(
@@ -279,50 +279,50 @@ def test_eval_mixin_zero_match(storage, config, metric_fn, start_storage, kwargs
         assert d.evaluations[metric_fn].value == 1.0
 
 
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 256}),
-#         ('qdrant', {'n_dim': 256}),
-#         ('elasticsearch', {'n_dim': 256}),
-#         ('redis', {'n_dim': 256}),
-#         ('milvus', {'n_dim': 256}),
-#     ],
-# )
-# def test_diff_len_should_raise(storage, config, start_storage):
-#     da1 = DocumentArray.empty(10)
-#     da2 = DocumentArray.empty(5)
-#     for d in da2:
-#         d.matches.append(da2[0])
-#     da2 = DocumentArray(da2, storage=storage, config=config)
-#     with pytest.raises(ValueError):
-#         da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
-
-
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 256}),
-#         ('qdrant', {'n_dim': 256}),
-#         ('elasticsearch', {'n_dim': 256}),
-#         ('redis', {'n_dim': 256}),
-#         ('milvus', {'n_dim': 256}),
-#     ],
-# )
-# def test_diff_hash_fun_should_raise(storage, config, start_storage):
-#     da1 = DocumentArray.empty(10)
-#     da2 = DocumentArray.empty(5)
-#     for d in da2:
-#         d.matches.append(da2[0])
-#     da2 = DocumentArray(da2, storage=storage, config=config)
-#     with pytest.raises(ValueError):
-#         da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 256}),
+        ('qdrant', {'n_dim': 256}),
+        ('elasticsearch', {'n_dim': 256}),
+        ('redis', {'n_dim': 256}),
+        ('milvus', {'n_dim': 256}),
+    ],
+)
+def test_diff_len_should_raise(storage, config, start_storage):
+    da1 = DocumentArray.empty(10)
+    da2 = DocumentArray.empty(5)
+    for d in da2:
+        d.matches.append(da2[0])
+    da2 = DocumentArray(da2, storage=storage, config=config)
+    with pytest.raises(ValueError):
+        da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
+
+
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 256}),
+        ('qdrant', {'n_dim': 256}),
+        ('elasticsearch', {'n_dim': 256}),
+        ('redis', {'n_dim': 256}),
+        ('milvus', {'n_dim': 256}),
+    ],
+)
+def test_diff_hash_fun_should_raise(storage, config, start_storage):
+    da1 = DocumentArray.empty(10)
+    da2 = DocumentArray.empty(5)
+    for d in da2:
+        d.matches.append(da2[0])
+    da2 = DocumentArray(da2, storage=storage, config=config)
+    with pytest.raises(ValueError):
+        da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
 
 
 @pytest.mark.parametrize(
@@ -439,68 +439,68 @@ def test_diff_match_len_in_gd(storage, config, metric_fn, start_storage, kwargs)
         assert d.evaluations[metric_fn].value > 0.9
 
 
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 256}),
-#         ('qdrant', {'n_dim': 256}),
-#         ('elasticsearch', {'n_dim': 256}),
-#         ('redis', {'n_dim': 256}),
-#         ('milvus', {'n_dim': 256}),
-#     ],
-# )
-# def test_empty_da_should_raise(storage, config, start_storage):
-#     da = DocumentArray([], storage=storage, config=config)
-#     with pytest.raises(ValueError):
-#         da.evaluate(metrics=['precision_at_k'])
-
-
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 256}),
-#         ('qdrant', {'n_dim': 256}),
-#         ('elasticsearch', {'n_dim': 256}),
-#         ('redis', {'n_dim': 256}),
-#         ('milvus', {'n_dim': 256}),
-#     ],
-# )
-# def test_missing_groundtruth_should_raise(storage, config, start_storage):
-#     da = DocumentArray(DocumentArray.empty(10), storage=storage, config=config)
-#     with pytest.raises(RuntimeError):
-#         da.evaluate(metrics=['precision_at_k'])
-
-
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 256}),
-#         ('qdrant', {'n_dim': 256}),
-#         ('elasticsearch', {'n_dim': 256}),
-#         ('redis', {'n_dim': 256}),
-#         ('milvus', {'n_dim': 256}),
-#     ],
-# )
-# def test_useless_groundtruth_warning_should_raise(storage, config, start_storage):
-#     da1 = DocumentArray.empty(10)
-#     for d in da1:
-#         d.tags = {'label': 'A'}
-#     da1.embeddings = np.random.random([10, 256])
-#     da1_index = DocumentArray(da1, storage=storage, config=config)
-#     with da1_index:
-#         da1.match(da1_index, exclude_self=True)
-#     da2 = DocumentArray.empty(10)
-#     with pytest.warns(UserWarning):
-#         da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 256}),
+        ('qdrant', {'n_dim': 256}),
+        ('elasticsearch', {'n_dim': 256}),
+        ('redis', {'n_dim': 256}),
+        ('milvus', {'n_dim': 256}),
+    ],
+)
+def test_empty_da_should_raise(storage, config, start_storage):
+    da = DocumentArray([], storage=storage, config=config)
+    with pytest.raises(ValueError):
+        da.evaluate(metrics=['precision_at_k'])
+
+
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 256}),
+        ('qdrant', {'n_dim': 256}),
+        ('elasticsearch', {'n_dim': 256}),
+        ('redis', {'n_dim': 256}),
+        ('milvus', {'n_dim': 256}),
+    ],
+)
+def test_missing_groundtruth_should_raise(storage, config, start_storage):
+    da = DocumentArray(DocumentArray.empty(10), storage=storage, config=config)
+    with pytest.raises(RuntimeError):
+        da.evaluate(metrics=['precision_at_k'])
+
+
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 256}),
+        ('qdrant', {'n_dim': 256}),
+        ('elasticsearch', {'n_dim': 256}),
+        ('redis', {'n_dim': 256}),
+        ('milvus', {'n_dim': 256}),
+    ],
+)
+def test_useless_groundtruth_warning_should_raise(storage, config, start_storage):
+    da1 = DocumentArray.empty(10)
+    for d in da1:
+        d.tags = {'label': 'A'}
+    da1.embeddings = np.random.random([10, 256])
+    da1_index = DocumentArray(da1, storage=storage, config=config)
+    with da1_index:
+        da1.match(da1_index, exclude_self=True)
+    da2 = DocumentArray.empty(10)
+    with pytest.warns(UserWarning):
+        da1.evaluate(ground_truth=da2, metrics=['precision_at_k'])
 
 
 def dummy_embed_function(da):
@@ -509,35 +509,35 @@ def dummy_embed_function(da):
         da[i, 'embedding'] = np.random.random(5)
 
 
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 5}),
-#         ('qdrant', {'n_dim': 5}),
-#         ('elasticsearch', {'n_dim': 5}),
-#         ('redis', {'n_dim': 5}),
-#     ],
-# )
-# def test_embed_and_evaluate_single_da(storage, config, start_storage):
-#
-#     gt = DocumentArray([Document(text=str(i)) for i in range(10)])
-#     queries_da = DocumentArray(gt, copy=True)
-#     queries_da = DocumentArray(queries_da, storage=storage, config=config)
-#     dummy_embed_function(gt)
-#     gt.match(gt, limit=3)
-#
-#     with queries_da:
-#         res = queries_da.embed_and_evaluate(
-#             ground_truth=gt,
-#             metrics=['precision_at_k', 'reciprocal_rank'],
-#             embed_funcs=dummy_embed_function,
-#             match_batch_size=1,
-#             limit=3,
-#         )
-#     assert all([v == 1.0 for v in res.values()])
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 5}),
+        ('qdrant', {'n_dim': 5}),
+        ('elasticsearch', {'n_dim': 5}),
+        ('redis', {'n_dim': 5}),
+    ],
+)
+def test_embed_and_evaluate_single_da(storage, config, start_storage):
+
+    gt = DocumentArray([Document(text=str(i)) for i in range(10)])
+    queries_da = DocumentArray(gt, copy=True)
+    queries_da = DocumentArray(queries_da, storage=storage, config=config)
+    dummy_embed_function(gt)
+    gt.match(gt, limit=3)
+
+    with queries_da:
+        res = queries_da.embed_and_evaluate(
+            ground_truth=gt,
+            metrics=['precision_at_k', 'reciprocal_rank'],
+            embed_funcs=dummy_embed_function,
+            match_batch_size=1,
+            limit=3,
+        )
+    assert all([v == 1.0 for v in res.values()])
 
 
 @pytest.mark.parametrize(
@@ -586,44 +586,44 @@ def test_embed_and_evaluate_with_and_without_exclude_self(
         assert abs(res[key] - expected_results[key]) < 1e-5
 
 
-# @pytest.mark.parametrize(
-#     'sample_size',
-#     [None, 10],
-# )
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 5}),
-#         ('qdrant', {'n_dim': 5}),
-#         ('elasticsearch', {'n_dim': 5}),
-#         ('redis', {'n_dim': 5}),
-#     ],
-# )
-# def test_embed_and_evaluate_two_das(storage, config, sample_size, start_storage):
-#
-#     gt_queries = DocumentArray([Document(text=str(i)) for i in range(100)])
-#     gt_index = DocumentArray([Document(text=str(i)) for i in range(100, 200)])
-#     queries_da = DocumentArray(gt_queries, copy=True)
-#     index_da = DocumentArray(gt_index, copy=True)
-#     index_da = DocumentArray(index_da, storage=storage, config=config)
-#     dummy_embed_function(gt_queries)
-#     dummy_embed_function(gt_index)
-#     gt_queries.match(gt_index, limit=3)
-#
-#     with index_da:
-#         res = queries_da.embed_and_evaluate(
-#             ground_truth=gt_queries,
-#             index_data=index_da,
-#             metrics=['precision_at_k', 'reciprocal_rank'],
-#             embed_funcs=dummy_embed_function,
-#             match_batch_size=1,
-#             limit=3,
-#             query_sample_size=sample_size,
-#         )
-#     assert all([v == 1.0 for v in res.values()])
+@pytest.mark.parametrize(
+    'sample_size',
+    [None, 10],
+)
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 5}),
+        ('qdrant', {'n_dim': 5}),
+        ('elasticsearch', {'n_dim': 5}),
+        ('redis', {'n_dim': 5}),
+    ],
+)
+def test_embed_and_evaluate_two_das(storage, config, sample_size, start_storage):
+
+    gt_queries = DocumentArray([Document(text=str(i)) for i in range(100)])
+    gt_index = DocumentArray([Document(text=str(i)) for i in range(100, 200)])
+    queries_da = DocumentArray(gt_queries, copy=True)
+    index_da = DocumentArray(gt_index, copy=True)
+    index_da = DocumentArray(index_da, storage=storage, config=config)
+    dummy_embed_function(gt_queries)
+    dummy_embed_function(gt_index)
+    gt_queries.match(gt_index, limit=3)
+
+    with index_da:
+        res = queries_da.embed_and_evaluate(
+            ground_truth=gt_queries,
+            index_data=index_da,
+            metrics=['precision_at_k', 'reciprocal_rank'],
+            embed_funcs=dummy_embed_function,
+            match_batch_size=1,
+            limit=3,
+            query_sample_size=sample_size,
+        )
+    assert all([v == 1.0 for v in res.values()])
 
 
 def test_embed_and_evaluate_two_different_das():
@@ -655,68 +655,68 @@ def test_embed_and_evaluate_two_different_das():
     assert abs(res['f1_score_at_k'] - 1.0 / 1.5) < 1e-5
 
 
-# @pytest.mark.parametrize(
-#     'use_index, expected, label_tag',
-#     [
-#         (False, {'precision_at_k': 1.0 / 3, 'reciprocal_rank': 1.0}, 'label'),
-#         (
-#             True,
-#             {
-#                 'precision_at_k': 1.0 / 3,
-#                 'reciprocal_rank': 11.0 / 18.0,
-#                 'recall_at_k': 1.0,
-#             },
-#             'custom_tag',
-#         ),
-#     ],
-# )
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 5}),
-#         ('qdrant', {'n_dim': 5}),
-#         ('elasticsearch', {'n_dim': 5}),
-#         ('redis', {'n_dim': 5}),
-#     ],
-# )
-# def test_embed_and_evaluate_labeled_dataset(
-#     storage, config, start_storage, use_index, expected, label_tag
-# ):
-#     metric_fns = list(expected.keys())
-#
-#     def emb_func(da):
-#         np.random.seed(0)  # makes sure that embeddings are always equal
-#         da[:, 'embedding'] = np.random.random((len(da), 5))
-#
-#     da1 = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
-#     da2 = DocumentArray(da1, storage=storage, config=config, copy=True)
-#
-#     with da2:
-#         if (
-#             use_index
-#         ):  # query and index da are distinct # (different embeddings are generated)
-#             res = da1.embed_and_evaluate(
-#                 index_data=da2,
-#                 metrics=metric_fns,
-#                 embed_funcs=emb_func,
-#                 match_batch_size=1,
-#                 limit=3,
-#                 label_tag=label_tag,
-#             )
-#         else:  # query and index are the same (embeddings of both das are equal)
-#             res = da2.embed_and_evaluate(
-#                 metrics=metric_fns,
-#                 embed_funcs=emb_func,
-#                 match_batch_size=1,
-#                 limit=3,
-#                 label_tag=label_tag,
-#             )
-#     for key in metric_fns:
-#         assert key in res
-#         assert abs(res[key] - expected[key]) < 1e-4
+@pytest.mark.parametrize(
+    'use_index, expected, label_tag',
+    [
+        (False, {'precision_at_k': 1.0 / 3, 'reciprocal_rank': 1.0}, 'label'),
+        (
+            True,
+            {
+                'precision_at_k': 1.0 / 3,
+                'reciprocal_rank': 11.0 / 18.0,
+                'recall_at_k': 1.0,
+            },
+            'custom_tag',
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 5}),
+        ('qdrant', {'n_dim': 5}),
+        ('elasticsearch', {'n_dim': 5}),
+        ('redis', {'n_dim': 5}),
+    ],
+)
+def test_embed_and_evaluate_labeled_dataset(
+    storage, config, start_storage, use_index, expected, label_tag
+):
+    metric_fns = list(expected.keys())
+
+    def emb_func(da):
+        np.random.seed(0)  # makes sure that embeddings are always equal
+        da[:, 'embedding'] = np.random.random((len(da), 5))
+
+    da1 = DocumentArray([Document(text=str(i), tags={label_tag: i}) for i in range(3)])
+    da2 = DocumentArray(da1, storage=storage, config=config, copy=True)
+
+    with da2:
+        if (
+            use_index
+        ):  # query and index da are distinct # (different embeddings are generated)
+            res = da1.embed_and_evaluate(
+                index_data=da2,
+                metrics=metric_fns,
+                embed_funcs=emb_func,
+                match_batch_size=1,
+                limit=3,
+                label_tag=label_tag,
+            )
+        else:  # query and index are the same (embeddings of both das are equal)
+            res = da2.embed_and_evaluate(
+                metrics=metric_fns,
+                embed_funcs=emb_func,
+                match_batch_size=1,
+                limit=3,
+                label_tag=label_tag,
+            )
+    for key in metric_fns:
+        assert key in res
+        assert abs(res[key] - expected[key]) < 1e-4
 
 
 @pytest.mark.parametrize(
@@ -821,46 +821,46 @@ def test_embed_and_evaluate_with_embed_model(
     assert res['precision_at_k'] == 0.2
 
 
-# @pytest.mark.parametrize(
-#     'queries, kwargs, exception',
-#     [
-#         (DocumentArray.empty(4), {}, ValueError),
-#         (
-#             DocumentArray([Document(tags={'label': 0})]),
-#             {'index_data': DocumentArray.empty(4)},
-#             ValueError,
-#         ),
-#         (DocumentArray([Document(tags={'label': 0})]), {}, RuntimeError),
-#         (
-#             DocumentArray([Document(tags={'label': 0})]),
-#             {'index_data': DocumentArray([Document(tags={'label': 0})])},
-#             RuntimeError,
-#         ),
-#     ],
-# )
-# @pytest.mark.parametrize(
-#     'storage, config',
-#     [
-#         ('memory', {}),
-#         ('weaviate', {}),
-#         ('sqlite', {}),
-#         ('annlite', {'n_dim': 5}),
-#         ('qdrant', {'n_dim': 5}),
-#         ('elasticsearch', {'n_dim': 5}),
-#         ('redis', {'n_dim': 5}),
-#     ],
-# )
-# def test_embed_and_evaluate_invalid_input_should_raise(
-#     storage, config, queries, kwargs, exception, start_storage
-# ):
-#     kwargs.update({'metrics': ['precision_at_k']})
-#     if 'index_data' in kwargs:
-#         kwargs['index_data'] = DocumentArray(
-#             kwargs['index_data'], storage=storage, config=config
-#         )
-#
-#     with pytest.raises(exception):
-#         queries.embed_and_evaluate(**kwargs)
+@pytest.mark.parametrize(
+    'queries, kwargs, exception',
+    [
+        (DocumentArray.empty(4), {}, ValueError),
+        (
+            DocumentArray([Document(tags={'label': 0})]),
+            {'index_data': DocumentArray.empty(4)},
+            ValueError,
+        ),
+        (DocumentArray([Document(tags={'label': 0})]), {}, RuntimeError),
+        (
+            DocumentArray([Document(tags={'label': 0})]),
+            {'index_data': DocumentArray([Document(tags={'label': 0})])},
+            RuntimeError,
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    'storage, config',
+    [
+        ('memory', {}),
+        ('weaviate', {}),
+        ('sqlite', {}),
+        ('annlite', {'n_dim': 5}),
+        ('qdrant', {'n_dim': 5}),
+        ('elasticsearch', {'n_dim': 5}),
+        ('redis', {'n_dim': 5}),
+    ],
+)
+def test_embed_and_evaluate_invalid_input_should_raise(
+    storage, config, queries, kwargs, exception, start_storage
+):
+    kwargs.update({'metrics': ['precision_at_k']})
+    if 'index_data' in kwargs:
+        kwargs['index_data'] = DocumentArray(
+            kwargs['index_data'], storage=storage, config=config
+        )
+
+    with pytest.raises(exception):
+        queries.embed_and_evaluate(**kwargs)
 
 
 @pytest.mark.parametrize(

From b3387a45a6e0cfa57d6cd4e8214466e8f72d3d97 Mon Sep 17 00:00:00 2001
From: Johannes Messner <messnerjo@gmail.com>
Date: Wed, 11 Jan 2023 11:08:34 +0100
Subject: [PATCH 8/8] ci: fix success all tests

Signed-off-by: Johannes Messner <messnerjo@gmail.com>
---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3ea6ccfe944..eda1318e7b1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -260,7 +260,7 @@ jobs:
 
   # just for blocking the merge until all parallel core-test are successful
   success-all-test:
-    needs: [commit-lint, docarray-oldproto-test]
+    needs: [commit-lint, docarray-test, docarray-oldproto-test]
     if: always()
     runs-on: ubuntu-latest
     steps: