From f64edf0099f02f071b8c35355b46f46684ecf3c7 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Wed, 23 Feb 2022 11:37:58 +0100 Subject: [PATCH 01/16] feat: add compression and protocol in file name --- docarray/array/mixins/io/binary.py | 12 +++++- docarray/helper.py | 67 ++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py index d3a336bf51f..0959c843010 100644 --- a/docarray/array/mixins/io/binary.py +++ b/docarray/array/mixins/io/binary.py @@ -5,7 +5,13 @@ from contextlib import nullcontext from typing import Union, BinaryIO, TYPE_CHECKING, Type, Optional, Generator -from ....helper import __windows__, get_compress_ctx, decompress_bytes +from ....helper import ( + __windows__, + get_compress_ctx, + decompress_bytes, + protocol_and_compress_from_file_path, + add_protocol_and_compress_to_file_path, +) if TYPE_CHECKING: from ....types import T @@ -42,6 +48,9 @@ def load_binary( elif isinstance(file, bytes): file_ctx = nullcontext(file) elif os.path.exists(file): + protocol, compress = protocol_and_compress_from_file_path( + file, protocol, compress + ) file_ctx = open(file, 'rb') else: raise ValueError(f'unsupported input {file!r}') @@ -203,6 +212,7 @@ def save_binary( if isinstance(file, io.BufferedWriter): file_ctx = nullcontext(file) else: + file = add_protocol_and_compress_to_file_path(file, protocol, compress) file_ctx = open(file, 'wb') self.to_bytes(protocol=protocol, compress=compress, _file_ctx=file_ctx) diff --git a/docarray/helper.py b/docarray/helper.py index 811241e2da2..bbc650db3ed 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -2,6 +2,7 @@ import random import sys import uuid +import pathlib import warnings from typing import Any, Dict, Optional, Sequence @@ -321,3 +322,69 @@ def dataclass_from_dict(klass, dikt): if isinstance(dikt, (tuple, list)): return [dataclass_from_dict(klass.__args__[0], f) for f in dikt] return dikt + + +def protocol_and_compress_from_file_path( + file_path: str, + default_protocol: Optional[str] = None, + default_compress: Optional[str] = None, +) -> tuple[Optional[str], Opstional[str]]: + """Extract protocol and compression algorithm from a string, use defaults if not found. + + :param file_path: path of a file. + :param default_protocol: default serialization protocol used in case not found. + :param default_compress: default compression method used in case not found. + + Examples: + + >>> protocol_and_compress_from_file_path('./docarray_fashion_mnist.protobuff.gzip') + ('protobuf', 'gzip') + + >>> protocol_and_compress_from_file_path('/Documents/docarray_fashion_mnist.protobuff') + ('protobuf', None) + + >>> protocol_and_compress_from_file_path('/Documents/docarray_fashion_mnist.gzip') + (None, gzip) + """ + + ALLOWED_PROTOCOLS = {'pickle', 'protobuf', 'protobuf-array', 'pickle-array'} + ALLOWED_COMPRESSIONS = {'lz4', 'bz2', 'lzma', 'zlib', 'gzip'} + + protocol = default_protocol + compress = default_compress + + file_extensions = [e.replace('.', '') for e in pathlib.Path(file_path).suffixes] + for extension in file_extensions: + if extension in ALLOWED_PROTOCOLS: + protocol = extension + elif extension in ALLOWED_COMPRESSIONS: + compress = extension + + return protocol, compress + + +def add_protocol_and_compress_to_file_path( + file_path: str, protocol: Optional[str] = None, compress: Optional[str] = None +) -> str: + """Creates a new file path with the protocol and compression methods as extensions. + + :param file_path: path of a file. + :param protocol: chosen protocol. + :param compress: compression algorithm. + + Examples: + + >>> add_protocol_and_compress_to_file_path('docarray_fashion_mnist.bin') + 'docarray_fashion_mnist.bin' + + >>> add_protocol_and_compress_to_file_path('docarray_fashion_mnist', 'protobuf', 'gzip') + 'docarray_fashion_mnist.protobuf.gzip' + """ + + file_path_extended = file_path + if protocol: + file_path_extended += '.' + protocol + if compress: + file_path_extended += '.' + compress + + return file_path_extended From 4e002e1ea27b2708c7aa80b735de38e52ec71598 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Wed, 23 Feb 2022 11:39:33 +0100 Subject: [PATCH 02/16] fix: typo --- docarray/helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/helper.py b/docarray/helper.py index bbc650db3ed..cdf7faf1cb4 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -328,7 +328,7 @@ def protocol_and_compress_from_file_path( file_path: str, default_protocol: Optional[str] = None, default_compress: Optional[str] = None, -) -> tuple[Optional[str], Opstional[str]]: +) -> tuple[Optional[str], Optional[str]]: """Extract protocol and compression algorithm from a string, use defaults if not found. :param file_path: path of a file. From b072c6fe9e70374bfa1f670043df4e01704b05fb Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Wed, 23 Feb 2022 12:02:30 +0100 Subject: [PATCH 03/16] fix: typo type hint --- docarray/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docarray/helper.py b/docarray/helper.py index cdf7faf1cb4..5696efa7d5c 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -4,7 +4,7 @@ import uuid import pathlib import warnings -from typing import Any, Dict, Optional, Sequence +from typing import Any, Dict, Optional, Sequence, Tuple __windows__ = sys.platform == 'win32' @@ -328,7 +328,7 @@ def protocol_and_compress_from_file_path( file_path: str, default_protocol: Optional[str] = None, default_compress: Optional[str] = None, -) -> tuple[Optional[str], Optional[str]]: +) -> Tuple[Optional[str], Optional[str]]: """Extract protocol and compression algorithm from a string, use defaults if not found. :param file_path: path of a file. From d0cbb8208fa139a74db4b1933047f7f75a249389 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Wed, 23 Feb 2022 12:30:57 +0100 Subject: [PATCH 04/16] test: test helper functions --- tests/unit/test_helper.py | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 tests/unit/test_helper.py diff --git a/tests/unit/test_helper.py b/tests/unit/test_helper.py new file mode 100644 index 00000000000..88deb10d0de --- /dev/null +++ b/tests/unit/test_helper.py @@ -0,0 +1,48 @@ +import pytest +import pathlib + +from docarray.helper import ( + add_protocol_and_compress_to_file_path, + protocol_and_compress_from_file_path, +) + + +@pytest.mark.parametrize('file_path', ['doc_array', './some_folder/doc_array']) +@pytest.mark.parametrize( + 'protocol', ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] +) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip']) +def test_add_protocol_and_compress_to_file_path(file_path, compress, protocol): + file_path_extended = add_protocol_and_compress_to_file_path( + file_path, compress, protocol + ) + file_path_suffixes = [ + e.replace('.', '') for e in pathlib.Path(file_path_extended).suffixes + ] + + if compress: + assert compress in file_path_suffixes + if protocol: + assert protocol in file_path_suffixes + + +@pytest.mark.parametrize( + 'file_path', ['doc_array', '../docarray', './a_folder/docarray'] +) +@pytest.mark.parametrize( + 'protocol', ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] +) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) +def test_protocol_and_compress_from_file_path(file_path, protocol, compress): + + file_path_extended = file_path + if protocol: + file_path_extended += '.' + protocol + if compress: + file_path_extended += '.' + compress + + protocol, compress = protocol_and_compress_from_file_path(file_path_extended) + if protocol: + assert protocol in ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] + if compress: + assert compress in ['lz4', 'bz2', 'lzma', 'zlib', 'gzip'] From e6e1b84866472a1daa93474bdaae5ca1d65d3b9f Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Wed, 23 Feb 2022 12:39:39 +0100 Subject: [PATCH 05/16] refactor: global variables in binary --- docarray/array/mixins/io/binary.py | 3 +++ docarray/helper.py | 5 ++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py index 0959c843010..13a55dbd085 100644 --- a/docarray/array/mixins/io/binary.py +++ b/docarray/array/mixins/io/binary.py @@ -18,6 +18,9 @@ from ....proto.docarray_pb2 import DocumentArrayProto from .... import Document, DocumentArray +ALLOWED_PROTOCOLS = {'pickle', 'protobuf', 'protobuf-array', 'pickle-array'} +ALLOWED_COMPRESSIONS = {'lz4', 'bz2', 'lzma', 'zlib', 'gzip'} + class BinaryIOMixin: """Save/load an array to a binary file.""" diff --git a/docarray/helper.py b/docarray/helper.py index 5696efa7d5c..2c09698d73a 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -6,6 +6,8 @@ import warnings from typing import Any, Dict, Optional, Sequence, Tuple +from docarray.array.mixins.io.binary import ALLOWED_PROTOCOLS, ALLOWED_COMPRESSIONS + __windows__ = sys.platform == 'win32' __resources_path__ = os.path.join( @@ -347,9 +349,6 @@ def protocol_and_compress_from_file_path( (None, gzip) """ - ALLOWED_PROTOCOLS = {'pickle', 'protobuf', 'protobuf-array', 'pickle-array'} - ALLOWED_COMPRESSIONS = {'lz4', 'bz2', 'lzma', 'zlib', 'gzip'} - protocol = default_protocol compress = default_compress From ea68c6df87328b0644b3aefaedb223ab9cb366d6 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Wed, 23 Feb 2022 12:56:58 +0100 Subject: [PATCH 06/16] refactor: global variables in helper --- docarray/array/mixins/io/binary.py | 3 --- docarray/helper.py | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py index 13a55dbd085..0959c843010 100644 --- a/docarray/array/mixins/io/binary.py +++ b/docarray/array/mixins/io/binary.py @@ -18,9 +18,6 @@ from ....proto.docarray_pb2 import DocumentArrayProto from .... import Document, DocumentArray -ALLOWED_PROTOCOLS = {'pickle', 'protobuf', 'protobuf-array', 'pickle-array'} -ALLOWED_COMPRESSIONS = {'lz4', 'bz2', 'lzma', 'zlib', 'gzip'} - class BinaryIOMixin: """Save/load an array to a binary file.""" diff --git a/docarray/helper.py b/docarray/helper.py index 2c09698d73a..900a95dbf14 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -6,7 +6,8 @@ import warnings from typing import Any, Dict, Optional, Sequence, Tuple -from docarray.array.mixins.io.binary import ALLOWED_PROTOCOLS, ALLOWED_COMPRESSIONS +ALLOWED_PROTOCOLS = {'pickle', 'protobuf', 'protobuf-array', 'pickle-array'} +ALLOWED_COMPRESSIONS = {'lz4', 'bz2', 'lzma', 'zlib', 'gzip'} __windows__ = sys.platform == 'win32' From 7ab2bf06164e51baa1bc681ae4655e988639a2b6 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Wed, 23 Feb 2022 13:26:40 +0100 Subject: [PATCH 07/16] test: fix saving file with protocol and compression --- tests/unit/array/test_from_to_bytes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index 8ecc5083e0e..92f95a60a7b 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -106,6 +106,10 @@ def test_save_bytes_stream(tmpfile, protocol, compress): [Document(text='aaa'), Document(text='bbb'), Document(text='ccc')] ) da.save_binary(tmpfile, protocol=protocol, compress=compress) + + # note that save_binary will save protocol and compression in the filename + tmpfile += '.' + protocol + '.' + compress + da_reconstructed = DocumentArray.load_binary( tmpfile, protocol=protocol, compress=compress, streaming=True ) From b599620e8ad1088bd45f46dcebcefd13d12da563 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Wed, 23 Feb 2022 13:35:19 +0100 Subject: [PATCH 08/16] test: add protocol and compress --- tests/unit/array/test_from_to_bytes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index 92f95a60a7b..29be5f77c1a 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -108,7 +108,10 @@ def test_save_bytes_stream(tmpfile, protocol, compress): da.save_binary(tmpfile, protocol=protocol, compress=compress) # note that save_binary will save protocol and compression in the filename - tmpfile += '.' + protocol + '.' + compress + if protocol: + tmpfile += '.' + protocol + if compress: + tmpfile += '.' + compress da_reconstructed = DocumentArray.load_binary( tmpfile, protocol=protocol, compress=compress, streaming=True From 5ad298e2f16689d8d5d59863cae9062cac026251 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 24 Feb 2022 11:52:44 +0100 Subject: [PATCH 09/16] refactor: allow keyword and extension loading --- docarray/array/mixins/io/binary.py | 34 ++++++++++++++++++++++---- docarray/helper.py | 31 ++--------------------- tests/unit/array/test_from_to_bytes.py | 6 ----- tests/unit/test_helper.py | 33 ++++++------------------- 4 files changed, 38 insertions(+), 66 deletions(-) diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py index 0959c843010..ee56d0459af 100644 --- a/docarray/array/mixins/io/binary.py +++ b/docarray/array/mixins/io/binary.py @@ -1,6 +1,7 @@ import base64 import io import os.path +import os import pickle from contextlib import nullcontext from typing import Union, BinaryIO, TYPE_CHECKING, Type, Optional, Generator @@ -10,7 +11,6 @@ get_compress_ctx, decompress_bytes, protocol_and_compress_from_file_path, - add_protocol_and_compress_to_file_path, ) if TYPE_CHECKING: @@ -42,7 +42,15 @@ def load_binary( :param streaming: if `True` returns a generator over `Document` objects. In case protocol is pickle the `Documents` are streamed from disk to save memory usage :return: a DocumentArray object + + .. note:: + If `file` is `str` it can specify `protocol` and `compress` as file extensions. + This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a + string interpolation of the respective `protocol` and `compress` methods. + For example if `file=my_docarray.protobuf.lz4` then the binary data will be loaded assuming `protocol=protobuf` + and `compress=lz4`. """ + if isinstance(file, io.BufferedReader): file_ctx = nullcontext(file) elif isinstance(file, bytes): @@ -62,6 +70,7 @@ def load_binary( _show_progress=_show_progress, ) else: + print(protocol, compress) return cls._load_binary_all( file_ctx, protocol, compress, _show_progress, *args, **kwargs ) @@ -200,21 +209,36 @@ def save_binary( ) -> None: """Save array elements into a binary file. + :param file: File or filename to which the data is saved. + :param protocol: protocol to use + :param compress: compress algorithm to use + + .. note:: + If `file` is `str` it can specify `protocol` and `compress` as file extensions. + This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a + string interpolation of the respective `protocol` and `compress` methods. + For example if `file=my_docarray.protobuf.lz4` then the binary data will be created using `protocol=protobuf` + and `compress=lz4`. + Comparing to :meth:`save_json`, it is faster and the file is smaller, but not human-readable. .. note:: To get a binary presentation in memory, use ``bytes(...)``. - :param protocol: protocol to use - :param compress: compress algorithm to use - :param file: File or filename to which the data is saved. """ if isinstance(file, io.BufferedWriter): file_ctx = nullcontext(file) else: - file = add_protocol_and_compress_to_file_path(file, protocol, compress) + _protocol, _compress = protocol_and_compress_from_file_path(file) + + if _protocol is not None: + protocol = _protocol + if _compress is not None: + compress = _compress + file_ctx = open(file, 'wb') + print(protocol, compress) self.to_bytes(protocol=protocol, compress=compress, _file_ctx=file_ctx) def to_bytes( diff --git a/docarray/helper.py b/docarray/helper.py index 900a95dbf14..0036c961c30 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -340,10 +340,10 @@ def protocol_and_compress_from_file_path( Examples: - >>> protocol_and_compress_from_file_path('./docarray_fashion_mnist.protobuff.gzip') + >>> protocol_and_compress_from_file_path('./docarray_fashion_mnist.protobuf.gzip') ('protobuf', 'gzip') - >>> protocol_and_compress_from_file_path('/Documents/docarray_fashion_mnist.protobuff') + >>> protocol_and_compress_from_file_path('/Documents/docarray_fashion_mnist.protobuf') ('protobuf', None) >>> protocol_and_compress_from_file_path('/Documents/docarray_fashion_mnist.gzip') @@ -361,30 +361,3 @@ def protocol_and_compress_from_file_path( compress = extension return protocol, compress - - -def add_protocol_and_compress_to_file_path( - file_path: str, protocol: Optional[str] = None, compress: Optional[str] = None -) -> str: - """Creates a new file path with the protocol and compression methods as extensions. - - :param file_path: path of a file. - :param protocol: chosen protocol. - :param compress: compression algorithm. - - Examples: - - >>> add_protocol_and_compress_to_file_path('docarray_fashion_mnist.bin') - 'docarray_fashion_mnist.bin' - - >>> add_protocol_and_compress_to_file_path('docarray_fashion_mnist', 'protobuf', 'gzip') - 'docarray_fashion_mnist.protobuf.gzip' - """ - - file_path_extended = file_path - if protocol: - file_path_extended += '.' + protocol - if compress: - file_path_extended += '.' + compress - - return file_path_extended diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index 29be5f77c1a..e39acf8c56a 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -107,12 +107,6 @@ def test_save_bytes_stream(tmpfile, protocol, compress): ) da.save_binary(tmpfile, protocol=protocol, compress=compress) - # note that save_binary will save protocol and compression in the filename - if protocol: - tmpfile += '.' + protocol - if compress: - tmpfile += '.' + compress - da_reconstructed = DocumentArray.load_binary( tmpfile, protocol=protocol, compress=compress, streaming=True ) diff --git a/tests/unit/test_helper.py b/tests/unit/test_helper.py index 88deb10d0de..52a07cdbc51 100644 --- a/tests/unit/test_helper.py +++ b/tests/unit/test_helper.py @@ -1,31 +1,10 @@ import pytest -import pathlib from docarray.helper import ( - add_protocol_and_compress_to_file_path, protocol_and_compress_from_file_path, ) -@pytest.mark.parametrize('file_path', ['doc_array', './some_folder/doc_array']) -@pytest.mark.parametrize( - 'protocol', ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] -) -@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip']) -def test_add_protocol_and_compress_to_file_path(file_path, compress, protocol): - file_path_extended = add_protocol_and_compress_to_file_path( - file_path, compress, protocol - ) - file_path_suffixes = [ - e.replace('.', '') for e in pathlib.Path(file_path_extended).suffixes - ] - - if compress: - assert compress in file_path_suffixes - if protocol: - assert protocol in file_path_suffixes - - @pytest.mark.parametrize( 'file_path', ['doc_array', '../docarray', './a_folder/docarray'] ) @@ -41,8 +20,10 @@ def test_protocol_and_compress_from_file_path(file_path, protocol, compress): if compress: file_path_extended += '.' + compress - protocol, compress = protocol_and_compress_from_file_path(file_path_extended) - if protocol: - assert protocol in ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] - if compress: - assert compress in ['lz4', 'bz2', 'lzma', 'zlib', 'gzip'] + _protocol, _compress = protocol_and_compress_from_file_path(file_path_extended) + + assert _protocol in {'protobuf', 'protobuf-array', 'pickle', 'pickle-array', None} + assert _compress in {'lz4', 'bz2', 'lzma', 'zlib', 'gzip', None} + + assert protocol == _protocol + assert compress == _compress From 0c5a03c44a2d3e503679b53e12144b7ada97fc20 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 24 Feb 2022 12:11:09 +0100 Subject: [PATCH 10/16] test: simplify test with helper func --- docarray/helper.py | 27 ++++++++++++++++++++++++++ tests/unit/array/test_from_to_bytes.py | 18 ++++++++++++++++- tests/unit/test_helper.py | 20 +++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/docarray/helper.py b/docarray/helper.py index 0036c961c30..e1bb5f0d08e 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -361,3 +361,30 @@ def protocol_and_compress_from_file_path( compress = extension return protocol, compress + + +def add_protocol_and_compress_to_file_path( + file_path: str, protocol: Optional[str] = None, compress: Optional[str] = None +) -> str: + """Creates a new file path with the protocol and compression methods as extensions. + + :param file_path: path of a file. + :param protocol: chosen protocol. + :param compress: compression algorithm. + + Examples: + + >>> add_protocol_and_compress_to_file_path('docarray_fashion_mnist.bin') + 'docarray_fashion_mnist.bin' + + >>> add_protocol_and_compress_to_file_path('docarray_fashion_mnist', 'protobuf', 'gzip') + 'docarray_fashion_mnist.protobuf.gzip' + """ + + file_path_extended = file_path + if protocol: + file_path_extended += '.' + protocol + if compress: + file_path_extended += '.' + compress + + return file_path_extended diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index e39acf8c56a..50c94dbf5b5 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -11,6 +11,9 @@ from tests import random_docs +from docarray.helper import add_protocol_and_compress_to_file_path + + def get_ndarrays_for_ravel(): a = np.random.random([100, 3]) a[a > 0.5] = 0 @@ -60,17 +63,30 @@ def test_to_from_bytes(target_da, protocol, compress, ndarray_val, is_sparse): ) @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) def test_save_bytes(target_da, protocol, compress, tmpfile): + + # tests .save_binary(file, protocol=protocol, compress=compress) target_da.save_binary(tmpfile, protocol=protocol, compress=compress) target_da.save_binary(str(tmpfile), protocol=protocol, compress=compress) with open(tmpfile, 'wb') as fp: target_da.save_binary(fp, protocol=protocol, compress=compress) - DocumentArray.load_binary(tmpfile, protocol=protocol, compress=compress) + da_from_protocol_compress = DocumentArray.load_binary( + tmpfile, protocol=protocol, compress=compress + ) DocumentArray.load_binary(str(tmpfile), protocol=protocol, compress=compress) with open(tmpfile, 'rb') as fp: DocumentArray.load_binary(fp, protocol=protocol, compress=compress) + # tests .save_binary(file.protocol.compress) without arguments `compression` and `protocol` + file_path_extended = add_protocol_and_compress_to_file_path( + str(tmpfile), protocol, compress + ) + + target_da.save_binary(file_path_extended) + da_from_file_extension = DocumentArray.load_binary(file_path_extended) + assert da_from_protocol_compress == da_from_file_extension + @pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)]) def test_from_to_protobuf(target_da): diff --git a/tests/unit/test_helper.py b/tests/unit/test_helper.py index 52a07cdbc51..17a56db0499 100644 --- a/tests/unit/test_helper.py +++ b/tests/unit/test_helper.py @@ -2,6 +2,7 @@ from docarray.helper import ( protocol_and_compress_from_file_path, + add_protocol_and_compress_to_file_path, ) @@ -27,3 +28,22 @@ def test_protocol_and_compress_from_file_path(file_path, protocol, compress): assert protocol == _protocol assert compress == _compress + + +@pytest.mark.parametrize('file_path', ['doc_array', './some_folder/doc_array']) +@pytest.mark.parametrize( + 'protocol', ['protobuf', 'protobuf-array', 'pickle', 'pickle-array'] +) +@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip']) +def test_add_protocol_and_compress_to_file_path(file_path, compress, protocol): + file_path_extended = add_protocol_and_compress_to_file_path( + file_path, compress, protocol + ) + file_path_suffixes = [ + e.replace('.', '') for e in pathlib.Path(file_path_extended).suffixes + ] + + if compress: + assert compress in file_path_suffixes + if protocol: + assert protocol in file_path_suffixes From 919f534bb1ff98df9a125a7188bbd95a95d24bed Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 24 Feb 2022 12:25:32 +0100 Subject: [PATCH 11/16] test: add missing import --- tests/unit/test_helper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_helper.py b/tests/unit/test_helper.py index 17a56db0499..fb2057c6df0 100644 --- a/tests/unit/test_helper.py +++ b/tests/unit/test_helper.py @@ -1,4 +1,5 @@ import pytest +import pathlib from docarray.helper import ( protocol_and_compress_from_file_path, From 2eb81330cdaca55e9b6dd4e41ce119c8d1cfc5a0 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 24 Feb 2022 12:36:51 +0100 Subject: [PATCH 12/16] refactor: remove print --- docarray/array/mixins/io/binary.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py index ee56d0459af..14d539e84d4 100644 --- a/docarray/array/mixins/io/binary.py +++ b/docarray/array/mixins/io/binary.py @@ -70,7 +70,6 @@ def load_binary( _show_progress=_show_progress, ) else: - print(protocol, compress) return cls._load_binary_all( file_ctx, protocol, compress, _show_progress, *args, **kwargs ) @@ -238,7 +237,6 @@ def save_binary( file_ctx = open(file, 'wb') - print(protocol, compress) self.to_bytes(protocol=protocol, compress=compress, _file_ctx=file_ctx) def to_bytes( From 03d3daa076743eddb94f34326edcfd62b1c7d1eb Mon Sep 17 00:00:00 2001 From: David Buchaca Date: Fri, 25 Feb 2022 12:59:46 +0100 Subject: [PATCH 13/16] docs: add serialization documentation --- .../documentarray/serialization.md | 48 ++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index ca3fe2a2728..a57ca9ae14c 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -1,7 +1,9 @@ (docarray-serialization)= # Serialization -DocArray is designed to be "ready-to-wire" at anytime. Serialization is important. DocumentArray provides multiple serialization methods that allows one transfer DocumentArray object over network and across different microservices. +DocArray is designed to be "ready-to-wire" at anytime. Serialization is important. +DocumentArray provides multiple serialization methods that allows one transfer DocumentArray object over network and across different microservices. +Moreover, there is the ability to store/load `DocumentArray` objects to disk. - JSON string: `.from_json()`/`.to_json()` - Pydantic model: `.from_pydantic_model()`/`.to_pydantic_model()` @@ -15,7 +17,6 @@ DocArray is designed to be "ready-to-wire" at anytime. Serialization is importan - ## From/to JSON @@ -161,6 +162,49 @@ Afterwards, `doc1_bytes` describes how many bytes are used to serialize `doc1`, The pattern `dock_bytes` and `dock.to_bytes` is repeated `len(docs)` times. +### From/to Disk + +If you want to store a `DocumentArray` to disk you can use `.save_binary(filename, protocol, compress)` +where `protocol` and `compress` refer to the protocol and compression methods used to serialize the data. +If you want to load a `DocumentArray` from disk you can use `.load_binary(filename, protocol, compress)`. + +For example, the following snippet shows how to save/load a `DocumentArray` in `my_docarray.bin`. + +```python +from docarray import DocumentArray, Document + +da = DocumentArray([Document(text='hello'), Document(text='world')]) + +da.save_binary('my_docarray.bin', protocol='protobuf', compress='lz4') +da_rec = DocumentArray.load_binary('my_docarray.bin', protocol='protobuf', compress='lz4') +da_rec == da +``` + +Note that in the previous code snippet the user needs to remember the protol and compression methods used to store the data +in order to load it back correctly. `DocArray` allows you to specify `protocol` and `compress` as file extensions. +By doing so you can forget later on which protocol and compression methods were used to serialize the data to disk. +This functionality assumes `.save_binary` and `.load_binary` are called with `filename` following the form +`file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a string interpolation of the respective `protocol` and `compress` methods. + +For example if `file=my_docarray.protobuf.lz4` then the binary data will be created using `protocol=protobuf` and `compress=lz4`. + +The previous code snippet can be simplified to + +```python +from docarray import DocumentArray, Document + +da = DocumentArray([Document(text='hello'), Document(text='world')]) + +da.save_binary('my_docarray.protobuf.lz4') +da_rec = DocumentArray.load_binary('my_docarray.protobuf.lz4') +da_rec == da +``` + +```{tip} +If you don't want to specify and remember `protocol` and `compress` to store to disk, save your `DocumentArray` `da` using +`da.save_binary('file_name.$protocol.$compress')` so that when loading you don't need to specify anything. +``` + ### Stream large binary serialization from disk In particular, if a serialization uses `protocol='pickle'` or `protocol='protobuf'`, then you can load it via streaming with a constant memory consumption by setting `streaming=True`: From 10b22fa57bb9e095c66d526cc2fc042452def3b0 Mon Sep 17 00:00:00 2001 From: David Buchaca Date: Fri, 25 Feb 2022 13:04:49 +0100 Subject: [PATCH 14/16] docs: rephrase explanation --- docs/fundamentals/documentarray/serialization.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index a57ca9ae14c..2c800e56256 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -164,8 +164,7 @@ The pattern `dock_bytes` and `dock.to_bytes` is repeated `len(docs)` times. ### From/to Disk -If you want to store a `DocumentArray` to disk you can use `.save_binary(filename, protocol, compress)` -where `protocol` and `compress` refer to the protocol and compression methods used to serialize the data. +If you want to store a `DocumentArray` to disk you can use `.save_binary(filename, protocol, compress)` where `protocol` and `compress` refer to the protocol and compression methods used to serialize the data. If you want to load a `DocumentArray` from disk you can use `.load_binary(filename, protocol, compress)`. For example, the following snippet shows how to save/load a `DocumentArray` in `my_docarray.bin`. @@ -180,11 +179,9 @@ da_rec = DocumentArray.load_binary('my_docarray.bin', protocol='protobuf', compr da_rec == da ``` -Note that in the previous code snippet the user needs to remember the protol and compression methods used to store the data -in order to load it back correctly. `DocArray` allows you to specify `protocol` and `compress` as file extensions. +Note that in the previous code snippet the user needs to remember the protol and compression methods used to store the data in order to load it back correctly. `DocArray` allows you to specify `protocol` and `compress` as file extensions. By doing so you can forget later on which protocol and compression methods were used to serialize the data to disk. -This functionality assumes `.save_binary` and `.load_binary` are called with `filename` following the form -`file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a string interpolation of the respective `protocol` and `compress` methods. +This functionality assumes `.save_binary` and `.load_binary` are called with `filename` following the form `file_name.$protocol.$compress`, where `$protocol` and `$compress` refer to a string interpolation of the respective `protocol` and `compress` methods. For example if `file=my_docarray.protobuf.lz4` then the binary data will be created using `protocol=protobuf` and `compress=lz4`. @@ -201,8 +198,8 @@ da_rec == da ``` ```{tip} -If you don't want to specify and remember `protocol` and `compress` to store to disk, save your `DocumentArray` `da` using -`da.save_binary('file_name.$protocol.$compress')` so that when loading you don't need to specify anything. +If you don't want to specify and remember `protocol` and `compress` to store/load to/from disk, save your `DocumentArray` `da` using +`da.save_binary('file_name.$protocol.$compress')` so that it can be loaded back with `DocumentArray.load_binary('file_name.$protocol.$compress')` ``` ### Stream large binary serialization from disk From 2352842d4d74f53ae7a7c45132672194cf92bf80 Mon Sep 17 00:00:00 2001 From: David Buchaca Date: Fri, 25 Feb 2022 13:28:48 +0100 Subject: [PATCH 15/16] docs: add save binary --- docs/fundamentals/documentarray/serialization.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index 2c800e56256..952fe6d58d4 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -8,6 +8,7 @@ Moreover, there is the ability to store/load `DocumentArray` objects to disk. - JSON string: `.from_json()`/`.to_json()` - Pydantic model: `.from_pydantic_model()`/`.to_pydantic_model()` - Bytes (compressed): `.from_bytes()`/`.to_bytes()` + - Disk serialization: `.save_binary()`/`.load_binary()` - Base64 (compressed): `.from_base64()`/`.to_base64()` - Protobuf Message: `.from_protobuf()`/`.to_protobuf()` - Python List: `.from_list()`/`.to_list()` From 2545ed7db1497f086b1fb2ed4417eb47e4ac69c5 Mon Sep 17 00:00:00 2001 From: David Buchaca Date: Fri, 25 Feb 2022 13:28:53 +0100 Subject: [PATCH 16/16] docs: add save binary --- docs/fundamentals/documentarray/serialization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index 952fe6d58d4..50bdd01c18f 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -3,7 +3,7 @@ DocArray is designed to be "ready-to-wire" at anytime. Serialization is important. DocumentArray provides multiple serialization methods that allows one transfer DocumentArray object over network and across different microservices. -Moreover, there is the ability to store/load `DocumentArray` objects to disk. +Moreover, there is the ability to store/load `DocumentArray` objects to/from disk. - JSON string: `.from_json()`/`.to_json()` - Pydantic model: `.from_pydantic_model()`/`.to_pydantic_model()`