From 7af2e98fd3e85464b1030dfb466d0e78819d8bbe Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 13:26:03 +0100 Subject: [PATCH 01/25] test: test_embed covers weaviate --- tests/unit/array/mixins/test_embed.py | 9 +++++++-- tests/unit/array/test_advance_indexing.py | 1 - 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/unit/array/mixins/test_embed.py b/tests/unit/array/mixins/test_embed.py index 442cdb1ea8c..ebea5d9946f 100644 --- a/tests/unit/array/mixins/test_embed.py +++ b/tests/unit/array/mixins/test_embed.py @@ -11,6 +11,7 @@ from docarray import DocumentArray from docarray.array.memory import DocumentArrayInMemory from docarray.array.sqlite import DocumentArraySqlite +from docarray.array.weaviate import DocumentArrayWeaviate random_embed_models = { 'keras': lambda: tf.keras.Sequential( @@ -43,11 +44,15 @@ @pytest.mark.parametrize('framework', ['onnx', 'keras', 'pytorch', 'paddle']) -@pytest.mark.parametrize('da', [DocumentArray, DocumentArraySqlite]) +@pytest.mark.parametrize( + 'da', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) @pytest.mark.parametrize('N', [2, 1000]) @pytest.mark.parametrize('batch_size', [1, 256]) @pytest.mark.parametrize('to_numpy', [True, False]) -def test_embedding_on_random_network(framework, da, N, batch_size, to_numpy): +def test_embedding_on_random_network( + framework, da, N, batch_size, to_numpy, start_weaviate +): docs = da.empty(N) docs.tensors = np.random.random([N, 128]).astype(np.float32) embed_model = random_embed_models[framework]() diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index e12c5080faf..7ef71ff5bbb 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -2,7 +2,6 @@ import pytest from docarray import DocumentArray, Document -from docarray.array.weaviate import DocumentArrayWeaviate @pytest.fixture From ec1c644df021ff8c0c7f8dd51e7a9d78106394ff Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 13:56:13 +0100 Subject: [PATCH 02/25] fix: drop table only if exists --- docarray/array/storage/sqlite/seqlike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/sqlite/seqlike.py b/docarray/array/storage/sqlite/seqlike.py index 6265aaf7919..8ca74816b1d 100644 --- a/docarray/array/storage/sqlite/seqlike.py +++ b/docarray/array/storage/sqlite/seqlike.py @@ -57,7 +57,7 @@ def __del__(self) -> None: 'DELETE FROM metadata WHERE table_name=? AND container_type=?', (self._table_name, self.__class__.__name__), ) - self._sql(f'DROP TABLE {self._table_name}') + self._sql(f'DROP TABLE IF EXISTS {self._table_name}') self._commit() def __contains__(self, item: Union[str, 'Document']): From 16fdd55ee7505ce239b04fa1fc75a96d119d24ba Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 13:56:40 +0100 Subject: [PATCH 03/25] feat: make sqlite document array pickable --- docarray/array/storage/sqlite/backend.py | 30 +++++++++++++++++++----- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/docarray/array/storage/sqlite/backend.py b/docarray/array/storage/sqlite/backend.py index 2de14f16d4b..02b222ebeea 100644 --- a/docarray/array/storage/sqlite/backend.py +++ b/docarray/array/storage/sqlite/backend.py @@ -76,16 +76,18 @@ def _init_storage( 'Document', lambda x: Document.from_bytes(x, **config.serialize_config) ) - _conn_kwargs = dict( - detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False - ) + _conn_kwargs = dict() _conn_kwargs.update(config.conn_config) if config.connection is None: + config.connection = NamedTemporaryFile().name + + if isinstance(config.connection, str): self._connection = sqlite3.connect( - NamedTemporaryFile().name, **_conn_kwargs + config.connection, + detect_types=sqlite3.PARSE_DECLTYPES, + check_same_thread=False, + **_conn_kwargs, ) - elif isinstance(config.connection, str): - self._connection = sqlite3.connect(config.connection, **_conn_kwargs) elif isinstance(config.connection, sqlite3.Connection): self._connection = config.connection else: @@ -118,3 +120,19 @@ def _init_storage( else: if isinstance(_docs, Document): self.append(_docs) + + def __getstate__(self): + d = dict(self.__dict__) + del d['_connection'] + return d + + def __setstate__(self, state): + self.__dict__ = state + _conn_kwargs = dict() + _conn_kwargs.update(state['_config'].conn_config) + self._connection = sqlite3.connect( + state['_config'].connection, + detect_types=sqlite3.PARSE_DECLTYPES, + check_same_thread=False, + **_conn_kwargs, + ) From 113a4c97a33cc0a2cac39fd63d9cfde8f2d980e6 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 14:58:58 +0100 Subject: [PATCH 04/25] ci: increase ci timeout --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d837164411e..4d2892a4789 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,7 +118,7 @@ jobs: pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \ -v -s -m "not gpu" ${{ matrix.test-path }} echo "::set-output name=codecov_flag::docarray" - timeout-minutes: 30 + timeout-minutes: 40 - name: Check codecov file id: check_files uses: andstor/file-existence-action@v1 From a0bc6139ebeb0a02c76fc04ce9aea3d347787c55 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 15:19:17 +0100 Subject: [PATCH 05/25] feat: reset default serialization protocol for sqlite to pickle --- docarray/array/storage/sqlite/__init__.py | 5 +- docarray/array/storage/sqlite/binary.py | 134 ---------------------- 2 files changed, 1 insertion(+), 138 deletions(-) delete mode 100644 docarray/array/storage/sqlite/binary.py diff --git a/docarray/array/storage/sqlite/__init__.py b/docarray/array/storage/sqlite/__init__.py index a7ab954ddd4..4d9bf2b291b 100644 --- a/docarray/array/storage/sqlite/__init__.py +++ b/docarray/array/storage/sqlite/__init__.py @@ -1,14 +1,11 @@ from abc import ABC from .backend import BackendMixin, SqliteConfig -from .binary import SqliteBinaryIOMixin from .getsetdel import GetSetDelMixin from .seqlike import SequenceLikeMixin __all__ = ['StorageMixins', 'SqliteConfig'] -class StorageMixins( - SqliteBinaryIOMixin, BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC -): +class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC): ... diff --git a/docarray/array/storage/sqlite/binary.py b/docarray/array/storage/sqlite/binary.py deleted file mode 100644 index e179418b7d9..00000000000 --- a/docarray/array/storage/sqlite/binary.py +++ /dev/null @@ -1,134 +0,0 @@ -from typing import Union, BinaryIO, TYPE_CHECKING, Type, Optional, Generator -from docarray.array.mixins import BinaryIOMixin - -if TYPE_CHECKING: - from ....types import T - from .... import Document, DocumentArray - - -def _check_protocol(protocol): - if protocol == 'pickle-array': - raise ValueError( - 'protocol pickle-array is not supported for DocumentArraySqlite' - ) - - -class SqliteBinaryIOMixin(BinaryIOMixin): - """Save/load an array to a binary file.""" - - @classmethod - def load_binary( - cls: Type['T'], - file: Union[str, BinaryIO, bytes], - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - _show_progress: bool = False, - streaming: bool = False, - ) -> Union['DocumentArray', Generator['Document', None, None]]: - """Load array elements from a compressed binary file. - - :param file: File or filename or serialized bytes where the data is stored. - :param protocol: protocol to use. 'pickle-array' is not supported for DocumentArraySqlite - :param compress: compress algorithm to use - :param _show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :param streaming: if `True` returns a generator over `Document` objects. - In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a DocumentArray object - """ - _check_protocol(protocol) - return super().load_binary( - file=file, - protocol=protocol, - compress=compress, - _show_progress=_show_progress, - streaming=streaming, - ) - - @classmethod - def from_bytes( - cls: Type['T'], - data: bytes, - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - _show_progress: bool = False, - ) -> 'T': - _check_protocol(protocol) - return super().from_bytes( - data=data, - protocol=protocol, - compress=compress, - _show_progress=_show_progress, - ) - - def save_binary( - self, - file: Union[str, BinaryIO], - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - ) -> None: - """Save array elements into a binary file. - - Comparing to :meth:`save_json`, it is faster and the file is smaller, but not human-readable. - - .. note:: - To get a binary presentation in memory, use ``bytes(...)``. - - :param protocol: protocol to use. 'pickle-array' is not supported for DocumentArraySqlite - :param compress: compress algorithm to use - :param file: File or filename to which the data is saved. - """ - _check_protocol(protocol) - super(SqliteBinaryIOMixin, self).save_binary( - file=file, protocol=protocol, compress=compress - ) - - def to_bytes( - self, - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - _file_ctx: Optional[BinaryIO] = None, - _show_progress: bool = False, - ) -> bytes: - """Serialize itself into bytes. - - For more Pythonic code, please use ``bytes(...)``. - - :param _file_ctx: File or filename or serialized bytes where the data is stored. - :param protocol: protocol to use. 'pickle-array' is not supported for DocumentArraySqlite - :param compress: compress algorithm to use - :param _show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :return: the binary serialization in bytes - """ - _check_protocol(protocol) - return super(SqliteBinaryIOMixin, self).to_bytes( - protocol=protocol, - compress=compress, - _file_ctx=_file_ctx, - _show_progress=_show_progress, - ) - - @classmethod - def from_base64( - cls: Type['T'], - data: str, - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - _show_progress: bool = False, - ) -> 'T': - _check_protocol(protocol) - return super().from_base64( - data=data, - protocol=protocol, - compress=compress, - _show_progress=_show_progress, - ) - - def to_base64( - self, - protocol: str = 'protobuf-array', - compress: Optional[str] = None, - _show_progress: bool = False, - ) -> str: - return super(SqliteBinaryIOMixin, self).to_base64( - protocol=protocol, compress=compress, _show_progress=_show_progress - ) From e443cf4dc51468973b2d9973e6f02d7ede530745 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 15:29:34 +0100 Subject: [PATCH 06/25] test: test_content covers weaviate --- tests/unit/array/mixins/test_content.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/unit/array/mixins/test_content.py b/tests/unit/array/mixins/test_content.py index 21f4825e05f..adb1c8ae0ca 100644 --- a/tests/unit/array/mixins/test_content.py +++ b/tests/unit/array/mixins/test_content.py @@ -3,6 +3,7 @@ from docarray import DocumentArray from docarray.array.sqlite import DocumentArraySqlite +from docarray.array.weaviate import DocumentArrayWeaviate @pytest.mark.parametrize('cls', [DocumentArray, DocumentArraySqlite]) @@ -31,7 +32,9 @@ def test_content_empty_setter(cls, content_attr): assert getattr(da, content_attr[0]) is None -@pytest.mark.parametrize('cls', [DocumentArray, DocumentArraySqlite]) +@pytest.mark.parametrize( + 'cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) @pytest.mark.parametrize( 'content_attr', [ @@ -40,7 +43,7 @@ def test_content_empty_setter(cls, content_attr): ('blobs', [b's'] * 10), ], ) -def test_content_getter_setter(cls, content_attr): +def test_content_getter_setter(cls, content_attr, start_weaviate): da = cls.empty(10) setattr(da, content_attr[0], content_attr[1]) np.testing.assert_equal(da.contents, content_attr[1]) @@ -52,8 +55,10 @@ def test_content_getter_setter(cls, content_attr): @pytest.mark.parametrize('da_len', [0, 1, 2]) -@pytest.mark.parametrize('cls', [DocumentArray, DocumentArraySqlite]) -def test_content_empty(da_len, cls): +@pytest.mark.parametrize( + 'cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_content_empty(da_len, cls, start_weaviate): da = cls.empty(da_len) assert not da.texts assert not da.contents @@ -71,8 +76,10 @@ def test_content_empty(da_len, cls): @pytest.mark.parametrize('da_len', [0, 1, 2]) -@pytest.mark.parametrize('cls', [DocumentArray, DocumentArraySqlite]) -def test_embeddings_setter(da_len, cls): +@pytest.mark.parametrize( + 'cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_embeddings_setter(da_len, cls, start_weaviate): da = cls.empty(da_len) da.embeddings = np.random.rand(da_len, 5) for doc in da: From 34ad9a6cd7328e9230c7cb06cfb48475bcdce714 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 15:34:59 +0100 Subject: [PATCH 07/25] test: test_parallel covers weaviate --- tests/unit/array/mixins/test_parallel.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/unit/array/mixins/test_parallel.py b/tests/unit/array/mixins/test_parallel.py index ea54de3e548..94f22c2c2a0 100644 --- a/tests/unit/array/mixins/test_parallel.py +++ b/tests/unit/array/mixins/test_parallel.py @@ -3,6 +3,7 @@ from docarray import DocumentArray, Document from docarray.array.sqlite import DocumentArraySqlite +from docarray.array.weaviate import DocumentArrayWeaviate def foo(d: Document): @@ -26,11 +27,11 @@ def foo_batch(da: DocumentArray): ) @pytest.mark.parametrize( 'da_cls', - [DocumentArray, DocumentArraySqlite], + [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate], ) @pytest.mark.parametrize('backend', ['process', 'thread']) @pytest.mark.parametrize('num_worker', [1, 2, None]) -def test_parallel_map(pytestconfig, da_cls, backend, num_worker): +def test_parallel_map(pytestconfig, da_cls, backend, num_worker, start_weaviate): da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] # use a generator @@ -57,12 +58,14 @@ def test_parallel_map(pytestconfig, da_cls, backend, num_worker): ) @pytest.mark.parametrize( 'da_cls', - [DocumentArray, DocumentArraySqlite], + [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate], ) @pytest.mark.parametrize('backend', ['thread']) @pytest.mark.parametrize('num_worker', [1, 2, None]) @pytest.mark.parametrize('b_size', [1, 2, 256]) -def test_parallel_map_batch(pytestconfig, da_cls, backend, num_worker, b_size): +def test_parallel_map_batch( + pytestconfig, da_cls, backend, num_worker, b_size, start_weaviate +): da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] # use a generator @@ -95,9 +98,9 @@ def test_parallel_map_batch(pytestconfig, da_cls, backend, num_worker, b_size): ) @pytest.mark.parametrize( 'da_cls', - [DocumentArray, DocumentArraySqlite], + [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate], ) -def test_map_lambda(pytestconfig, da_cls): +def test_map_lambda(pytestconfig, da_cls, start_weaviate): da = da_cls.from_files(f'{pytestconfig.rootdir}/**/*.jpeg')[:10] for d in da: @@ -107,9 +110,9 @@ def test_map_lambda(pytestconfig, da_cls): assert d.tensor is not None -@pytest.mark.parametrize('storage', ['memory', 'sqlite']) +@pytest.mark.parametrize('storage', ['memory', 'sqlite', 'weaviate']) @pytest.mark.parametrize('backend', ['thread', 'process']) -def test_apply_diff_backend_storage(storage, backend): +def test_apply_diff_backend_storage(storage, backend, start_weaviate): da = DocumentArray( (Document(text='hello world she smiled too much') for _ in range(1000)), storage=storage, From a8c4d07ceb417d74d28406c184bec537f4a7daa5 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 15:38:42 +0100 Subject: [PATCH 08/25] test: test_empty covers weaviate --- tests/unit/array/mixins/test_empty.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/unit/array/mixins/test_empty.py b/tests/unit/array/mixins/test_empty.py index 12da4462916..26185ad11f3 100644 --- a/tests/unit/array/mixins/test_empty.py +++ b/tests/unit/array/mixins/test_empty.py @@ -2,10 +2,13 @@ from docarray import DocumentArray from docarray.array.sqlite import DocumentArraySqlite +from docarray.array.weaviate import DocumentArrayWeaviate -@pytest.mark.parametrize('da_cls', [DocumentArray, DocumentArraySqlite]) -def test_empty_non_zero(da_cls): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_empty_non_zero(da_cls, start_weaviate): da = DocumentArray.empty(10) assert len(da) == 10 da = DocumentArray.empty() From cbfc4e44bc21b898a2139610e96c0cc3403ac372 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 1 Feb 2022 18:11:39 +0100 Subject: [PATCH 09/25] test: reduce the number of docs in test_embed --- tests/unit/array/mixins/test_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_embed.py b/tests/unit/array/mixins/test_embed.py index ebea5d9946f..b453fc39c9d 100644 --- a/tests/unit/array/mixins/test_embed.py +++ b/tests/unit/array/mixins/test_embed.py @@ -47,7 +47,7 @@ @pytest.mark.parametrize( 'da', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] ) -@pytest.mark.parametrize('N', [2, 1000]) +@pytest.mark.parametrize('N', [2, 10]) @pytest.mark.parametrize('batch_size', [1, 256]) @pytest.mark.parametrize('to_numpy', [True, False]) def test_embedding_on_random_network( From 3dac06320958ff627b8fda45cf1ef2620b9ac760 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 09:10:19 +0100 Subject: [PATCH 10/25] fix: protobuf as default backend for weaviate --- docarray/array/storage/weaviate/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index 693f3fe6266..fcc5bbe803e 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -33,7 +33,7 @@ class WeaviateConfig: client: Optional[Union[str, weaviate.Client]] = None n_dim: Optional[int] = None name: Optional[str] = None - serialize_config: Dict = field(default_factory=lambda: {'protocol': 'pickle'}) + serialize_config: Dict = field(default_factory=lambda: {'protocol': 'protobuf'}) class BackendMixin(BaseBackendMixin): From 907d5f99250e8c48cebd090c2af5cca8c20192b0 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 10:30:57 +0100 Subject: [PATCH 11/25] test: cover weaviate in getset --- tests/unit/array/mixins/test_getset.py | 97 ++++++++++++++++++-------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index 69f96754afe..4acf7f0632b 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -7,28 +7,25 @@ from docarray import DocumentArray, Document from docarray.array.sqlite import DocumentArraySqlite +from docarray.array.weaviate import DocumentArrayWeaviate from tests import random_docs rand_array = np.random.random([10, 3]) -def da_and_dam(): +@pytest.fixture() +def docs(): rand_docs = random_docs(100) - da = DocumentArray() - da.extend(rand_docs) - das = DocumentArraySqlite(rand_docs) - return (da, das) + return rand_docs -def nested_da_and_dam(): +@pytest.fixture() +def nested_docs(): docs = [ Document(id='r1', chunks=[Document(id='c1'), Document(id='c2')]), Document(id='r2', matches=[Document(id='m1'), Document(id='m2')]), ] - da = DocumentArray() - da.extend(docs) - das = DocumentArraySqlite(docs) - return (da, das) + return docs @pytest.mark.parametrize( @@ -45,14 +42,22 @@ def test_set_embeddings_multi_kind(array): da.embeddings = array -@pytest.mark.parametrize('da', da_and_dam()) -def test_da_get_embeddings(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_da_get_embeddings(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) np.testing.assert_almost_equal(da._get_attributes('embedding'), da.embeddings) np.testing.assert_almost_equal(da[:, 'embedding'], da.embeddings) -@pytest.mark.parametrize('da', da_and_dam()) -def test_embeddings_setter_da(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_embeddings_setter_da(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) emb = np.random.random((100, 128)) da.embeddings = emb np.testing.assert_almost_equal(da.embeddings, emb) @@ -66,16 +71,24 @@ def test_embeddings_setter_da(da): assert not da.embeddings -@pytest.mark.parametrize('da', da_and_dam()) -def test_embeddings_wrong_len(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_embeddings_wrong_len(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) embeddings = np.ones((2, 10)) with pytest.raises(ValueError): da.embeddings = embeddings -@pytest.mark.parametrize('da', da_and_dam()) -def test_tensors_getter_da(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_tensors_getter_da(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) tensors = np.random.random((100, 10, 10)) da.tensors = tensors assert len(da) == 100 @@ -85,8 +98,12 @@ def test_tensors_getter_da(da): assert da.tensors is None -@pytest.mark.parametrize('da', da_and_dam()) -def test_texts_getter_da(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_texts_getter_da(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) assert len(da.texts) == 100 assert da.texts == da[:, 'text'] texts = ['text' for _ in range(100)] @@ -105,8 +122,12 @@ def test_texts_getter_da(da): assert not da.texts -@pytest.mark.parametrize('da', da_and_dam()) -def test_setter_by_sequences_in_selected_docs_da(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_setter_by_sequences_in_selected_docs_da(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) da[[0, 1, 2], 'text'] = 'test' assert da[[0, 1, 2], 'text'] == ['test', 'test', 'test'] @@ -131,24 +152,36 @@ def test_setter_by_sequences_in_selected_docs_da(da): assert ['101', '102'] == da[[0, 1], 'id'] -@pytest.mark.parametrize('da', da_and_dam()) -def test_texts_wrong_len(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_texts_wrong_len(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) texts = ['hello'] with pytest.raises(ValueError): da.texts = texts -@pytest.mark.parametrize('da', da_and_dam()) -def test_tensors_wrong_len(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_tensors_wrong_len(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) tensors = np.ones((2, 10, 10)) with pytest.raises(ValueError): da.tensors = tensors -@pytest.mark.parametrize('da', da_and_dam()) -def test_blobs_getter_setter(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_blobs_getter_setter(docs, da_cls, start_weaviate): + da = da_cls() + da.extend(docs) with pytest.raises(ValueError): da.blobs = [b'cc', b'bb', b'aa', b'dd'] @@ -164,8 +197,12 @@ def test_blobs_getter_setter(da): assert not da.blobs -@pytest.mark.parametrize('da', nested_da_and_dam()) -def test_ellipsis_getter(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_ellipsis_getter(nested_docs, da_cls, start_weaviate): + da = da_cls() + da.extend(nested_docs) flattened = da[...] assert len(flattened) == 6 for d, doc_id in zip(flattened, ['c1', 'c2', 'r1', 'm1', 'm2', 'r2']): From b1b1060f850e9f6f183f01e54b084d21beb1a75e Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 10:36:44 +0100 Subject: [PATCH 12/25] fix: add start_weaviate --- tests/unit/array/mixins/test_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index 89888da835a..1660c542d18 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -114,7 +114,7 @@ def test_from_to_bytes(da_cls, start_weaviate): 'da_cls', [DocumentArrayInMemory, DocumentArrayWeaviate, DocumentArraySqlite] ) @pytest.mark.parametrize('show_progress', [True, False]) -def test_push_pull_io(da_cls, show_progress): +def test_push_pull_io(da_cls, show_progress, start_weaviate): da1 = da_cls.empty(10) da1[:, 'embedding'] = np.random.random([len(da1), 256]) random_texts = [str(uuid.uuid1()) for _ in da1] From e4dadfa62d728f3d55f77b6f2c3923209f2f7529 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 10:46:24 +0100 Subject: [PATCH 13/25] test: cover weaviate in test_magic --- tests/unit/array/mixins/test_magic.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/unit/array/mixins/test_magic.py b/tests/unit/array/mixins/test_magic.py index ece3c3e2458..50503d0bafc 100644 --- a/tests/unit/array/mixins/test_magic.py +++ b/tests/unit/array/mixins/test_magic.py @@ -2,6 +2,7 @@ from docarray import DocumentArray, Document from docarray.array.sqlite import DocumentArraySqlite +from docarray.array.weaviate import DocumentArrayWeaviate N = 100 @@ -17,8 +18,11 @@ def docs(): yield (Document(text=str(j)) for j in range(100)) -@pytest.mark.parametrize('da', da_and_dam()) -def test_iter_len_bool(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_iter_len_bool(da_cls, start_weaviate): + da = da_cls.empty(N) j = 0 for _ in da: j += 1 @@ -29,15 +33,17 @@ def test_iter_len_bool(da): assert not da -@pytest.mark.parametrize('da', da_and_dam()) -def test_repr(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_repr(da_cls, start_weaviate): + da = da_cls.empty(N) assert f'length={N}' in repr(da) -@pytest.mark.parametrize('storage', ['memory', 'sqlite']) -def test_repr_str(docs, storage): +@pytest.mark.parametrize('storage', ['memory', 'sqlite', 'weaviate']) +def test_repr_str(docs, storage, start_weaviate): da = DocumentArray(docs, storage=storage) - print(da) da.summary() assert da da.clear() @@ -45,8 +51,11 @@ def test_repr_str(docs, storage): print(da) -@pytest.mark.parametrize('da', da_and_dam()) -def test_iadd(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_iadd(da_cls, start_weaviate): + da = da_cls.empty(N) oid = id(da) dap = DocumentArray.empty(10) da += dap From 07b93e63c48989b2960c0eaa376cc1cb8501234b Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 10:52:54 +0100 Subject: [PATCH 14/25] test: cover weaviate in test_construct --- tests/unit/array/test_construct.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/unit/array/test_construct.py b/tests/unit/array/test_construct.py index 593c15320e4..158c81cbec7 100644 --- a/tests/unit/array/test_construct.py +++ b/tests/unit/array/test_construct.py @@ -41,9 +41,11 @@ def test_construct_docarray(da_cls, start_weaviate): assert len(da1) == 10 -@pytest.mark.parametrize('da_cls', [DocumentArrayInMemory, DocumentArraySqlite]) +@pytest.mark.parametrize( + 'da_cls', [DocumentArrayInMemory, DocumentArraySqlite, DocumentArrayWeaviate] +) @pytest.mark.parametrize('is_copy', [True, False]) -def test_docarray_copy_singleton(da_cls, is_copy): +def test_docarray_copy_singleton(da_cls, is_copy, start_weaviate): d = Document() da = da_cls(d, copy=is_copy) d.id = 'hello' @@ -56,9 +58,11 @@ def test_docarray_copy_singleton(da_cls, is_copy): assert da[0].id != 'hello' -@pytest.mark.parametrize('da_cls', [DocumentArrayInMemory, DocumentArraySqlite]) +@pytest.mark.parametrize( + 'da_cls', [DocumentArrayInMemory, DocumentArraySqlite, DocumentArrayWeaviate] +) @pytest.mark.parametrize('is_copy', [True, False]) -def test_docarray_copy_da(da_cls, is_copy): +def test_docarray_copy_da(da_cls, is_copy, start_weaviate): d1 = Document() d2 = Document() da = da_cls([d1, d2], copy=is_copy) @@ -74,7 +78,7 @@ def test_docarray_copy_da(da_cls, is_copy): @pytest.mark.parametrize('da_cls', [DocumentArrayInMemory, DocumentArraySqlite]) @pytest.mark.parametrize('is_copy', [True, False]) -def test_docarray_copy_list(da_cls, is_copy): +def test_docarray_copy_list(da_cls, is_copy, start_weaviate): d1 = Document() d2 = Document() da = da_cls([d1, d2], copy=is_copy) From 50090bf369ba31c861534342387f9d42468c2ca6 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 10:53:26 +0100 Subject: [PATCH 15/25] ci: reduce ci timeout --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4d2892a4789..d837164411e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -118,7 +118,7 @@ jobs: pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \ -v -s -m "not gpu" ${{ matrix.test-path }} echo "::set-output name=codecov_flag::docarray" - timeout-minutes: 40 + timeout-minutes: 30 - name: Check codecov file id: check_files uses: andstor/file-existence-action@v1 From 6d00102115894b4e570060e4ce50c3afeb240959 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 14:46:44 +0100 Subject: [PATCH 16/25] test: cover weaviate --- tests/unit/array/mixins/test_plot.py | 33 ++++++++++++++++-------- tests/unit/array/mixins/test_sample.py | 35 +++++++++++++++----------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index b49594396ca..f0c3fb59e40 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -7,10 +7,13 @@ from docarray import DocumentArray, Document from docarray.array.sqlite import DocumentArraySqlite +from docarray.array.weaviate import DocumentArrayWeaviate -@pytest.mark.parametrize('da_cls', [DocumentArray, DocumentArraySqlite]) -def test_sprite_fail_tensor_success_uri(pytestconfig, tmpdir, da_cls): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_sprite_fail_tensor_success_uri(pytestconfig, tmpdir, da_cls, start_weaviate): da = da_cls.from_files( [ f'{pytestconfig.rootdir}/**/*.png', @@ -28,8 +31,12 @@ def test_sprite_fail_tensor_success_uri(pytestconfig, tmpdir, da_cls): @pytest.mark.parametrize('image_source', ['tensor', 'uri']) -@pytest.mark.parametrize('da_cls', [DocumentArray, DocumentArraySqlite]) -def test_sprite_image_generator(pytestconfig, tmpdir, image_source, da_cls): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_sprite_image_generator( + pytestconfig, tmpdir, image_source, da_cls, start_weaviate +): da = da_cls.from_files( [ f'{pytestconfig.rootdir}/**/*.png', @@ -72,8 +79,10 @@ def test_plot_embeddings(da): assert config['embeddings'][0]['tensorShape'] == list(da.embeddings.shape) -@pytest.mark.parametrize('da_cls', [DocumentArray, DocumentArraySqlite]) -def test_plot_embeddings_same_path(tmpdir, da_cls): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_plot_embeddings_same_path(tmpdir, da_cls, start_weaviate): da1 = da_cls.empty(100) da1.embeddings = np.random.random([100, 5]) p1 = da1.plot_embeddings(start_server=False, path=tmpdir) @@ -87,8 +96,10 @@ def test_plot_embeddings_same_path(tmpdir, da_cls): assert len(config['embeddings']) == 2 -@pytest.mark.parametrize('da_cls', [DocumentArray, DocumentArraySqlite]) -def test_summary_homo_hetero(da_cls): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_summary_homo_hetero(da_cls, start_weaviate): da = da_cls.empty(100) da._get_attributes() da.summary() @@ -97,8 +108,10 @@ def test_summary_homo_hetero(da_cls): da.summary() -@pytest.mark.parametrize('da_cls', [DocumentArray, DocumentArraySqlite]) -def test_empty_get_attributes(da_cls): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_empty_get_attributes(da_cls, start_weaviate): da = da_cls.empty(10) da[0].pop('id') print(da[:, 'id']) diff --git a/tests/unit/array/mixins/test_sample.py b/tests/unit/array/mixins/test_sample.py index dabbe79155f..7f61f8b1c10 100644 --- a/tests/unit/array/mixins/test_sample.py +++ b/tests/unit/array/mixins/test_sample.py @@ -1,17 +1,15 @@ import pytest +from docarray.array.weaviate import DocumentArrayWeaviate from docarray import DocumentArray from docarray.array.sqlite import DocumentArraySqlite -def da_and_dam(N): - da = DocumentArray.empty(N) - dam = DocumentArraySqlite.empty(N) - return (da, dam) - - -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_sample(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_sample(da_cls, start_weaviate): + da = da_cls.empty(100) sampled = da.sample(1) assert len(sampled) == 1 sampled = da.sample(5) @@ -21,8 +19,11 @@ def test_sample(da): da.sample(101) # can not sample with k greater than lenth of document array. -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_sample_with_seed(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_sample_with_seed(da_cls, start_weaviate): + da = da_cls.empty(100) sampled_1 = da.sample(5, seed=1) sampled_2 = da.sample(5, seed=1) sampled_3 = da.sample(5, seed=2) @@ -31,8 +32,11 @@ def test_sample_with_seed(da): assert sampled_1 != sampled_3 -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_shuffle(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_shuffle(da_cls, start_weaviate): + da = da_cls.empty(100) shuffled = da.shuffle() assert len(shuffled) == len(da) assert isinstance(shuffled, DocumentArray) @@ -42,8 +46,11 @@ def test_shuffle(da): assert sorted(ids_before_shuffle) == sorted(ids_after_shuffle) -@pytest.mark.parametrize('da', da_and_dam(100)) -def test_shuffle_with_seed(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_shuffle_with_seed(da_cls, start_weaviate): + da = da_cls.empty(100) shuffled_1 = da.shuffle(seed=1) shuffled_2 = da.shuffle(seed=1) shuffled_3 = da.shuffle(seed=2) From 6d1a114d61bd90d09737dd3be63e4dae8bc4eaf9 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 14:53:49 +0100 Subject: [PATCH 17/25] test: cover weaviate in test_text --- tests/unit/array/mixins/test_text.py | 61 +++++++++++++++------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/tests/unit/array/mixins/test_text.py b/tests/unit/array/mixins/test_text.py index f9470f7e27f..fb340109190 100644 --- a/tests/unit/array/mixins/test_text.py +++ b/tests/unit/array/mixins/test_text.py @@ -3,31 +3,24 @@ from docarray import DocumentArray, Document from docarray.array.sqlite import DocumentArraySqlite +from docarray.array.weaviate import DocumentArrayWeaviate -def da_and_dam(): - da = DocumentArray( - [ - Document(text='hello'), - Document(text='hello world'), - Document(text='goodbye world!'), - ] - ) - - das = DocumentArraySqlite( - [ - Document(text='hello'), - Document(text='hello world'), - Document(text='goodbye world!'), - ] - ) - - return (da, das) +@pytest.fixture(scope='function') +def docs(): + return [ + Document(text='hello'), + Document(text='hello world'), + Document(text='goodbye world!'), + ] @pytest.mark.parametrize('min_freq', [1, 2, 3]) -@pytest.mark.parametrize('da', da_and_dam()) -def test_da_vocabulary(da, min_freq): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_da_vocabulary(da_cls, docs, min_freq, start_weaviate): + da = da_cls(docs) vocab = da.get_vocabulary(min_freq) if min_freq <= 1: assert set(vocab.values()) == {2, 3, 4} # 0,1 are reserved @@ -40,8 +33,11 @@ def test_da_vocabulary(da, min_freq): assert not vocab.keys() -@pytest.mark.parametrize('test_docs', da_and_dam()) -def test_da_text_to_tensor_non_max_len(test_docs): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_da_text_to_tensor_non_max_len(docs, da_cls, start_weaviate): + test_docs = da_cls(docs) vocab = test_docs.get_vocabulary() test_docs.apply(lambda d: d.convert_text_to_tensor(vocab)) np.testing.assert_array_equal(test_docs[0].tensor, [2]) @@ -54,8 +50,11 @@ def test_da_text_to_tensor_non_max_len(test_docs): assert test_docs[2].text == 'goodbye world' -@pytest.mark.parametrize('test_docs', da_and_dam()) -def test_da_text_to_tensor_max_len_3(test_docs): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_da_text_to_tensor_max_len_3(docs, da_cls, start_weaviate): + test_docs = da_cls(docs) vocab = test_docs.get_vocabulary() test_docs.apply(lambda d: d.convert_text_to_tensor(vocab, max_length=3)) @@ -70,8 +69,11 @@ def test_da_text_to_tensor_max_len_3(test_docs): assert test_docs[2].text == 'goodbye world' -@pytest.mark.parametrize('test_docs', da_and_dam()) -def test_da_text_to_tensor_max_len_1(test_docs): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_da_text_to_tensor_max_len_1(docs, da_cls, start_weaviate): + test_docs = da_cls(docs) vocab = test_docs.get_vocabulary() test_docs.apply(lambda d: d.convert_text_to_tensor(vocab, max_length=1)) @@ -86,8 +88,11 @@ def test_da_text_to_tensor_max_len_1(test_docs): assert test_docs[2].text == 'world' -@pytest.mark.parametrize('da', da_and_dam()) -def test_convert_text_tensor_random_text(da): +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_convert_text_tensor_random_text(da_cls, docs, start_weaviate): + da = da_cls(docs) texts = ['a short phrase', 'word', 'this is a much longer sentence'] da.clear() da.extend(Document(text=t) for t in texts) From 2088999f816e5e6bb454f3a84646750b93e37882 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 15:16:31 +0100 Subject: [PATCH 18/25] chore: add explaining comments to __setitem__ --- docarray/array/mixins/setitem.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/docarray/array/mixins/setitem.py b/docarray/array/mixins/setitem.py index 4c8d8440f50..e0ec66a6016 100644 --- a/docarray/array/mixins/setitem.py +++ b/docarray/array/mixins/setitem.py @@ -64,24 +64,41 @@ def __setitem__( value: Union['Document', Sequence['Document']], ): + # set by offset + # allows da[1] = Document() if isinstance(index, (int, np.generic)) and not isinstance(index, bool): self._set_doc_by_offset(int(index), value) elif isinstance(index, str): + # set by traversal paths + # allows da['@m,c] = [m1, m2, ..., mn, c1, c2, ..., cp] if index.startswith('@'): self._set_doc_value_pairs_nested(self.traverse_flat(index[1:]), value) + + # set by ID + # allows da['id_123'] = Document() else: self._set_doc_by_id(index, value) + # set by slice + # allows da[1:3] = [d1, d2] elif isinstance(index, slice): self._set_docs_by_slice(index, value) + + # flatten and set + # allows da[...] = [d1, d2,..., dn] elif index is Ellipsis: self._set_doc_value_pairs(self.flatten(), value) + + # index is sequence elif isinstance(index, Sequence): + # allows da[idx1, idx2] = value if isinstance(index, tuple) and len(index) == 2: self._set_by_pair(index[0], index[1], value) + # allows da[True, False, True, True] elif isinstance(index[0], bool): self._set_by_mask(index, value) + # allows da[id1, id2, id3] = [d1, d2, d3] elif isinstance(index[0], (int, str)): for si, _val in zip(index, value): self[si] = _val # leverage existing setter @@ -90,6 +107,7 @@ def __setitem__( f'{index} should be either a sequence of bool, int or str' ) + # set by ndarray elif isinstance(index, np.ndarray): index = index.squeeze() if index.ndim == 1: @@ -104,12 +122,15 @@ def __setitem__( def _set_by_pair(self, idx1, idx2, value): if isinstance(idx1, str) and not idx1.startswith('@'): # second is an ID + # allows da[id1, id2] = [d1, d2] if isinstance(idx2, str) and idx2 in self: self._set_doc_value_pairs((self[idx1], self[idx2]), value) # second is an attribute + # allows da[id, attr] = attr_value elif isinstance(idx2, str) and hasattr(self[idx1], idx2): self._set_doc_attr_by_id(idx1, idx2, value) # second is a list of attributes: + # allows da[id, [attr1, attr2, attr3]] = [v1, v2, v3] elif ( isinstance(idx2, Sequence) and all(isinstance(attr, str) for attr in idx2) @@ -121,12 +142,15 @@ def _set_by_pair(self, idx1, idx2, value): raise IndexError(f'`{idx2}` is neither a valid id nor attribute name') elif isinstance(idx1, int): # second is an offset + # allows da[offset1, offset2] = [d1, d2] if isinstance(idx2, int): self._set_doc_value_pairs((self[idx1], self[idx2]), value) # second is an attribute + # allows da[offset, attr] = value elif isinstance(idx2, str) and hasattr(self[idx1], idx2): self._set_doc_attr_by_offset(idx1, idx2, value) - # second is a list of attributes: + # second is a list of attributes + # allows da[offset, [attr1, attr2, attr3]] = [v1, v2, v3] elif ( isinstance(idx2, Sequence) and all(isinstance(attr, str) for attr in idx2) @@ -137,6 +161,7 @@ def _set_by_pair(self, idx1, idx2, value): else: raise IndexError(f'`{idx2}` must be an attribute or list of attributes') + # allows da[sequence/slice/ellipsis/traversal_path, attributes] = [v1, v2, ...] elif ( isinstance(idx1, (slice, Sequence)) or idx1 is Ellipsis From f767ff1fa85bbe9968098cda8b0fae22ac5bda6b Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 15:23:40 +0100 Subject: [PATCH 19/25] fix: accept extra kwargs --- docarray/array/storage/weaviate/backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index fcc5bbe803e..aa41dc7e1a1 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -43,11 +43,13 @@ def _init_storage( self, docs: Optional['DocumentArraySourceType'] = None, config: Optional[WeaviateConfig] = None, + **kwargs ): """Initialize weaviate storage. :param docs: the list of documents to initialize to :param config: the config object used to ininitialize connection to weaviate server + :param kwargs: extra keyword arguments :raises ValueError: only one of name or docs can be used for initialization, raise an error if both are provided """ From e803499f500d15f5ceb84a4266567149ce5eeaec Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 15:24:07 +0100 Subject: [PATCH 20/25] fix: fix change id in weaviate --- docarray/array/storage/weaviate/getsetdel.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/weaviate/getsetdel.py b/docarray/array/storage/weaviate/getsetdel.py index bebca98dc7b..24baa80545f 100644 --- a/docarray/array/storage/weaviate/getsetdel.py +++ b/docarray/array/storage/weaviate/getsetdel.py @@ -41,6 +41,13 @@ def _setitem(self, wid: str, value: Document): self._offset2ids[self._offset2ids.index(wid)] = self.wmap(value.id) self._update_offset2ids_meta() + def _change_doc_id(self, old_wid: str, doc: Document, new_wid: str): + payload = self._doc2weaviate_create_payload(doc) + self._client.data_object.delete(old_wid) + self._client.data_object.create(**payload) + self._offset2ids[self._offset2ids.index(old_wid)] = new_wid + self._update_offset2ids_meta() + def _delitem(self, wid: str): """Helper method for deleting an item with weaviate as storage @@ -119,8 +126,14 @@ def _set_doc_attr_by_id(self, _id: str, attr: str, value: Any): if attr == 'id' and value is None: raise ValueError('pop id from Document stored with weaviate is not allowed') doc = self[_id] - setattr(doc, attr, value) - self._setitem(self.wmap(doc.id), doc) + + if attr == 'id': + old_wid = self.wmap(doc.id) + setattr(doc, attr, value) + self._change_doc_id(old_wid, doc, self.wmap(value)) + else: + setattr(doc, attr, value) + self._setitem(self.wmap(doc.id), doc) def _set_docs_attrs(self, docs: 'DocumentArray', attr: str, values: Iterable[Any]): # TODO: remove this function to use _set_doc_attr_by_id once From c8f736a70166b815d7f5f61ce9c6a1a3063a72e0 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 16:25:00 +0100 Subject: [PATCH 21/25] feat: show storage information in summary --- docarray/array/mixins/plot.py | 8 ++++-- docarray/array/storage/base/backend.py | 8 ++++++ docarray/array/storage/memory/backend.py | 5 ++++ docarray/array/storage/sqlite/backend.py | 10 +++++++ docarray/array/storage/weaviate/backend.py | 31 ++++++++++++++++------ 5 files changed, 52 insertions(+), 10 deletions(-) diff --git a/docarray/array/mixins/plot.py b/docarray/array/mixins/plot.py index c9d776cdb5d..94a0b21dafa 100644 --- a/docarray/array/mixins/plot.py +++ b/docarray/array/mixins/plot.py @@ -57,7 +57,7 @@ def summary(self): table.add_row('Common Attributes', str(list(attr_counter.items())[0][0])) else: for _a, _n in attr_counter.most_common(): - if _n <= 1: + if _n == 1: _doc_text = f'{_n} Document has' else: _doc_text = f'{_n} Documents have' @@ -96,7 +96,11 @@ def summary(self): str(len(_a)), str(any(_aa is None for _aa in _a)), ) - console.print(table, attr_table) + + storage_table = Table(box=box.SIMPLE, title='Storage Summary') + self._fill_storage_table(storage_table) + + console.print(table, attr_table, storage_table) def plot_embeddings( self, diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py index 06ece2b12e6..4ff70d67fb4 100644 --- a/docarray/array/storage/base/backend.py +++ b/docarray/array/storage/base/backend.py @@ -1,7 +1,15 @@ from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from rich.table import Table class BaseBackendMixin(ABC): @abstractmethod def _init_storage(self, *args, **kwargs): ... + + def _fill_storage_table(self, table: 'Table'): + table.show_header = False + table.add_row('Class', self.__class__.__name__) diff --git a/docarray/array/storage/memory/backend.py b/docarray/array/storage/memory/backend.py index c38e1acaab0..42c2cc7005f 100644 --- a/docarray/array/storage/memory/backend.py +++ b/docarray/array/storage/memory/backend.py @@ -17,6 +17,7 @@ from ....types import ( DocumentArraySourceType, ) + from rich.table import Table def needs_id2offset_rebuild(func) -> Callable: @@ -86,3 +87,7 @@ def _init_storage( self.append(Document(_docs, copy=True)) else: self.append(_docs) + + def _fill_storage_table(self, table: 'Table'): + super()._fill_storage_table(table) + table.add_row('Backend', 'In Memory') diff --git a/docarray/array/storage/sqlite/backend.py b/docarray/array/storage/sqlite/backend.py index 02b222ebeea..b97b8719bee 100644 --- a/docarray/array/storage/sqlite/backend.py +++ b/docarray/array/storage/sqlite/backend.py @@ -21,6 +21,7 @@ from ....types import ( DocumentArraySourceType, ) + from rich.table import Table def _sanitize_table_name(table_name: str) -> str: @@ -136,3 +137,12 @@ def __setstate__(self, state): check_same_thread=False, **_conn_kwargs, ) + + def _fill_storage_table(self, table: 'Table'): + super()._fill_storage_table(table) + table.add_row('Backend', 'SQLite (https://www.sqlite.org)') + table.add_row('Connection', self._config.connection) + table.add_row('Table Name', self._table_name) + table.add_row( + 'Serialization Protocol', self._config.serialize_config.get('protocol') + ) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index aa41dc7e1a1..4195f062941 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -23,6 +23,7 @@ from ....types import ( DocumentArraySourceType, ) + from rich.table import Table @dataclass @@ -67,8 +68,9 @@ def _init_storage( import weaviate if config.client is None: - self._client = weaviate.Client('http://localhost:8080') - elif isinstance(config.client, str): + config.client = 'http://localhost:8080' + + if isinstance(config.client, str): self._client = weaviate.Client(config.client) else: self._client = config.client @@ -77,8 +79,9 @@ def _init_storage( raise ValueError( 'only one of name or docs can be provided for initialization' ) + self._config = config - self._schemas = self._load_or_create_weaviate_schema(config.name) + self._schemas = self._load_or_create_weaviate_schema() self._offset2ids, self._offset2ids_wid = self._get_offset2ids_meta() if docs is None and config.name: @@ -146,7 +149,7 @@ def _get_schema_by_name(self, cls_name: str) -> Dict: ] } - def _load_or_create_weaviate_schema(self, cls_name: Optional[str] = None): + def _load_or_create_weaviate_schema(self): """Create a new weaviate schema for this :class:`DocumentArrayWeaviate` object if not present in weaviate or if ``cls_name`` not provided, else if ``cls_name`` is provided load the object with the given ``cls_name`` @@ -156,14 +159,17 @@ def _load_or_create_weaviate_schema(self, cls_name: Optional[str] = None): with a newly generated class name. :return: the schemas of this :class`DocumentArrayWeaviate` object and its meta """ - if not cls_name: - doc_schemas = self._get_schema_by_name(self._get_weaviate_class_name()) + if not self._config.name: + name_candidate = self._get_weaviate_class_name() + doc_schemas = self._get_schema_by_name(name_candidate) while self._client.schema.contains(doc_schemas): - doc_schemas = self._get_schema_by_name(self._get_weaviate_class_name()) + name_candidate = self._get_weaviate_class_name() + doc_schemas = self._get_schema_by_name(name_candidate) self._client.schema.create(doc_schemas) + self._config.name = name_candidate return doc_schemas - doc_schemas = self._get_schema_by_name(cls_name) + doc_schemas = self._get_schema_by_name(self._config.name) if self._client.schema.contains(doc_schemas): return doc_schemas @@ -299,3 +305,12 @@ def wmap(self, doc_id: str): # daw2 = DocumentArrayWeaviate([Document(id=str(i), text='bye') for i in range(3)]) # daw2[0, 'text'] == 'hi' # this will be False if we don't append class name return str(uuid.uuid5(uuid.NAMESPACE_URL, doc_id + self._class_name)) + + def _fill_storage_table(self, table: 'Table'): + super()._fill_storage_table(table) + table.add_row('Backend', 'Weaviate (www.semi.technology/developers/weaviate)') + table.add_row('Hostname', self._config.client) + table.add_row('Schema Name', self._config.name) + table.add_row( + 'Serialization Protocol', self._config.serialize_config.get('protocol') + ) From 52a25263b860f971d9bf0673c649cc7c12b2a43e Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 16:42:45 +0100 Subject: [PATCH 22/25] test: fix test_from_to_base64 --- docarray/array/storage/weaviate/seqlike.py | 1 + tests/unit/array/mixins/test_io.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docarray/array/storage/weaviate/seqlike.py b/docarray/array/storage/weaviate/seqlike.py index c9759bfc390..be2882ba4b1 100644 --- a/docarray/array/storage/weaviate/seqlike.py +++ b/docarray/array/storage/weaviate/seqlike.py @@ -27,6 +27,7 @@ def __eq__(self, other): return ( type(self) is type(other) and self._client.get_meta() == other._client.get_meta() + and self._config == other._config ) def __len__(self): diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index 1660c542d18..0fa0273425c 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -128,14 +128,21 @@ def test_push_pull_io(da_cls, show_progress, start_weaviate): assert da1.texts == da2.texts == random_texts -@pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) -@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) @pytest.mark.parametrize( - 'da_cls', [DocumentArrayInMemory, DocumentArrayWeaviate, DocumentArraySqlite] + 'protocol', ['protobuf', 'pickle', 'protobuf-array', 'pickle-array'] ) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +@pytest.mark.parametrize('da_cls', [DocumentArrayWeaviate]) def test_from_to_base64(protocol, compress, da_cls): da = da_cls.empty(10) da[:, 'embedding'] = [[1, 2, 3]] * len(da) da_r = da_cls.from_base64(da.to_base64(protocol, compress), protocol, compress) - assert da_r == da + + # only pickle-array will serialize the configuration so we can assume DAs are equal + if protocol == 'pickle-array': + assert da_r == da + # for the rest, we can only check the docs content + else: + for d1, d2 in zip(da_r, da): + assert d1 == d2 assert da_r[0].embedding == [1, 2, 3] From b700904513ee01610cf208ffaec5901013d80d60 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 17:13:57 +0100 Subject: [PATCH 23/25] test: fix test_from_to_base64 --- docarray/array/storage/weaviate/getsetdel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/weaviate/getsetdel.py b/docarray/array/storage/weaviate/getsetdel.py index 24baa80545f..fe2bec499d0 100644 --- a/docarray/array/storage/weaviate/getsetdel.py +++ b/docarray/array/storage/weaviate/getsetdel.py @@ -191,7 +191,7 @@ def _del_all_docs(self): self._client.schema.delete_class(self._class_name) self._client.schema.delete_class(self._meta_name) self._offset2ids.clear() - self._load_or_create_weaviate_schema(self._class_name) + self._load_or_create_weaviate_schema() self._update_offset2ids_meta() def _del_docs_by_mask(self, mask: Sequence[bool]): From 5f2c0bde87a29d83dc60a7479d1b9c4d1b2addfa Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 17:22:52 +0100 Subject: [PATCH 24/25] fix: fix set many attributes by offset --- docarray/array/mixins/setitem.py | 2 +- tests/unit/array/test_advance_indexing.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/docarray/array/mixins/setitem.py b/docarray/array/mixins/setitem.py index e0ec66a6016..04f73305dc5 100644 --- a/docarray/array/mixins/setitem.py +++ b/docarray/array/mixins/setitem.py @@ -157,7 +157,7 @@ def _set_by_pair(self, idx1, idx2, value): and all(hasattr(self[idx1], attr) for attr in idx2) ): for attr, _v in zip(idx2, value): - self._set_doc_attr_by_id(idx1, attr, _v) + self._set_doc_attr_by_offset(idx1, attr, _v) else: raise IndexError(f'`{idx2}` must be an attribute or list of attributes') diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 7ef71ff5bbb..b3288fc9c3b 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -105,11 +105,7 @@ def test_sequence_bool_index(docs, storage, start_weaviate): # setter mask = [True, False] * 50 - # TODO: unifiy the following test logic - if storage == 'sqlite': - docs[mask, 'text'] = [f'repl{j}' for j in range(50)] - else: - docs[mask] = [Document(text=f'repl{j}') for j in range(50)] + docs[mask, 'text'] = [f'repl{j}' for j in range(50)] for idx, d in enumerate(docs): if idx % 2 == 0: @@ -276,8 +272,9 @@ def test_path_syntax_indexing_set(storage, start_weaviate): with pytest.raises(ValueError): da['@m'] = [Document() for _ in range(3 * 7)] - # TODO also test cases like da[1, ['text', 'id']], - # where first index is str/int and second is attr + da[2, ['text', 'id']] = ['new_text', 'new_id'] + assert da[2].text == 'new_text' + assert da[2].id == 'new_id' @pytest.mark.parametrize('size', [1, 5]) From b80d2f3cc2fea1c5a9dc106d0ad2a1f81732a5e5 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 2 Feb 2022 17:42:26 +0100 Subject: [PATCH 25/25] chore: handle todos --- docarray/array/mixins/setitem.py | 19 ++++++++++++-- docarray/array/storage/weaviate/getsetdel.py | 26 -------------------- tests/unit/array/mixins/test_getset.py | 14 ++++++++--- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/docarray/array/mixins/setitem.py b/docarray/array/mixins/setitem.py index 04f73305dc5..f023e2925b7 100644 --- a/docarray/array/mixins/setitem.py +++ b/docarray/array/mixins/setitem.py @@ -168,14 +168,14 @@ def _set_by_pair(self, idx1, idx2, value): or (isinstance(idx1, str) and idx1.startswith('@')) ): self._set_docs_attributes(idx1, idx2, value) - # TODO: else raise error + else: + raise IndexError(f'Unsupported first index type {typename(idx1)}: {idx1}') def _set_by_mask(self, mask: List[bool], value): _selected = itertools.compress(self, mask) self._set_doc_value_pairs(_selected, value) def _set_docs_attributes(self, index, attributes, value): - # TODO: handle index is Ellipsis if isinstance(attributes, str): # a -> [a] # [a, a] -> [a, a] @@ -184,6 +184,21 @@ def _set_docs_attributes(self, index, attributes, value): if isinstance(index, str) and index.startswith('@'): self._set_docs_attributes_traversal_paths(index, attributes, value) + elif index is Ellipsis: + _docs = self[index] + for _a, _v in zip(attributes, value): + if _a == 'tensor': + _docs.tensors = _v + elif _a == 'embedding': + _docs.embeddings = _v + else: + if not isinstance(_v, (list, tuple)): + for _d in _docs: + setattr(_d, _a, _v) + else: + for _d, _vv in zip(_docs, _v): + setattr(_d, _a, _vv) + self._set_doc_value_pairs_nested(_docs, _docs) else: _docs = self[index] if not _docs: diff --git a/docarray/array/storage/weaviate/getsetdel.py b/docarray/array/storage/weaviate/getsetdel.py index fe2bec499d0..8a692671b0f 100644 --- a/docarray/array/storage/weaviate/getsetdel.py +++ b/docarray/array/storage/weaviate/getsetdel.py @@ -135,32 +135,6 @@ def _set_doc_attr_by_id(self, _id: str, attr: str, value: Any): setattr(doc, attr, value) self._setitem(self.wmap(doc.id), doc) - def _set_docs_attrs(self, docs: 'DocumentArray', attr: str, values: Iterable[Any]): - # TODO: remove this function to use _set_doc_attr_by_id once - # we find a way to do - from ...memory import DocumentArrayInMemory - - if attr == 'embedding': - docs.embeddings = values - elif attr == 'tensor': - docs.tensors = values - else: - for d, v in zip(docs, values): - setattr(d, attr, v) - - def _set_attr_util(_docs: DocumentArrayInMemory): - for d in _docs: - if d in docs: - setattr(d, attr, getattr(docs[d.id], attr)) - _set_attr_util(d.chunks) - _set_attr_util(d.matches) - - res = DocumentArrayInMemory([d for d in self]) - _set_attr_util(res) - - for r in res: - self._setitem(self.wmap(r.id), r) - def _del_doc_by_offset(self, offset: int): """Concrete implementation of base class' ``_del_doc_by_offset`` diff --git a/tests/unit/array/mixins/test_getset.py b/tests/unit/array/mixins/test_getset.py index 4acf7f0632b..a6cdf90057b 100644 --- a/tests/unit/array/mixins/test_getset.py +++ b/tests/unit/array/mixins/test_getset.py @@ -134,10 +134,6 @@ def test_setter_by_sequences_in_selected_docs_da(docs, da_cls, start_weaviate): da[[3, 4], 'text'] = ['test', 'test'] assert da[[3, 4], 'text'] == ['test', 'test'] - # TODO Clarify whether this change can be accepted - # I think since the first element of the index (i.e. [0]) - # is a list, it might be more natural to expect a list - # as values? da[[0], 'text'] = ['jina'] assert da[[0], 'text'] == ['jina'] @@ -209,6 +205,16 @@ def test_ellipsis_getter(nested_docs, da_cls, start_weaviate): assert d.id == doc_id +@pytest.mark.parametrize( + 'da_cls', [DocumentArray, DocumentArraySqlite, DocumentArrayWeaviate] +) +def test_ellipsis_attribute_setter(nested_docs, da_cls, start_weaviate): + da = da_cls() + da.extend(nested_docs) + da[..., 'text'] = 'new' + assert all(d.text == 'new' for d in da[...]) + + def test_zero_embeddings(): a = np.zeros([10, 6]) da = DocumentArray.empty(10)