From 48fe5cc11c0fdc3455db11c17a7c6805e14cb948 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Fri, 4 Mar 2022 11:30:00 +0100 Subject: [PATCH 1/2] feat(protobuf): add arg for compatible ndarray type --- docarray/array/mixins/io/binary.py | 10 ++++++++-- docarray/document/mixins/protobuf.py | 11 ++++++++--- docarray/proto/io/__init__.py | 6 +++--- docarray/proto/io/ndarray.py | 13 ++++++++++--- docs/fundamentals/document/serialization.md | 1 + tests/unit/math/test_ndarray.py | 14 +++++++++++++- 6 files changed, 43 insertions(+), 12 deletions(-) diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py index 8edcb05ef64..0e1e77e5c35 100644 --- a/docarray/array/mixins/io/binary.py +++ b/docarray/array/mixins/io/binary.py @@ -311,12 +311,18 @@ def to_bytes( if not _file_ctx: return bf.getvalue() - def to_protobuf(self) -> 'DocumentArrayProto': + def to_protobuf(self, ndarray_type: Optional[str] = None) -> 'DocumentArrayProto': + """Convert DocumentArray into a Protobuf message. + + :param ndarray_type: can be ``list`` or ``numpy``, if set it will force all ndarray-like object from all + Documents to ``List`` or ``numpy.ndarray``. 
+ :return: the protobuf message + """ from ....proto.docarray_pb2 import DocumentArrayProto dap = DocumentArrayProto() for d in self: - dap.docs.append(d.to_protobuf()) + dap.docs.append(d.to_protobuf(), ndarray_type=ndarray_type) return dap @classmethod diff --git a/docarray/document/mixins/protobuf.py b/docarray/document/mixins/protobuf.py index d5154bdfac7..1f38e8378bc 100644 --- a/docarray/document/mixins/protobuf.py +++ b/docarray/document/mixins/protobuf.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Type +from typing import TYPE_CHECKING, Type, Optional if TYPE_CHECKING: from ...types import T @@ -12,7 +12,12 @@ def from_protobuf(cls: Type['T'], pb_msg: 'DocumentProto') -> 'T': return parse_proto(pb_msg) - def to_protobuf(self) -> 'DocumentProto': + def to_protobuf(self, ndarray_type: Optional[str] = None) -> 'DocumentProto': + """Convert Document into a Protobuf message. + + :param ndarray_type: can be ``list`` or ``numpy``, if set it will force all ndarray-like object to be ``List`` or ``numpy.ndarray``. 
+ :return: the protobuf message + """ from ...proto.io import flush_proto - return flush_proto(self) + return flush_proto(self, ndarray_type) diff --git a/docarray/proto/io/__init__.py b/docarray/proto/io/__init__.py index 39e94715bfc..7439f89b0c1 100644 --- a/docarray/proto/io/__init__.py +++ b/docarray/proto/io/__init__.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional from google.protobuf.json_format import MessageToDict from google.protobuf.struct_pb2 import Struct @@ -37,13 +37,13 @@ def parse_proto(pb_msg: 'DocumentProto') -> 'Document': return Document(**fields) -def flush_proto(doc: 'Document') -> 'DocumentProto': +def flush_proto(doc: 'Document', ndarray_type: Optional[str] = None) -> 'DocumentProto': pb_msg = DocumentProto() for key in doc.non_empty_fields: try: value = getattr(doc, key) if key in ('tensor', 'embedding'): - flush_ndarray(getattr(pb_msg, key), value) + flush_ndarray(getattr(pb_msg, key), value, ndarray_type=ndarray_type) elif key in ('chunks', 'matches'): for d in value: d: Document diff --git a/docarray/proto/io/ndarray.py b/docarray/proto/io/ndarray.py index 39614b2b07e..3c02470b82c 100644 --- a/docarray/proto/io/ndarray.py +++ b/docarray/proto/io/ndarray.py @@ -1,8 +1,8 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import numpy as np -from ...math.ndarray import get_array_type +from ...math.ndarray import get_array_type, to_numpy_array if TYPE_CHECKING: from ...types import ArrayType @@ -44,7 +44,14 @@ def read_ndarray(pb_msg: 'NdArrayProto') -> 'ArrayType': return _to_framework_array(x, framework) -def flush_ndarray(pb_msg: 'NdArrayProto', value: 'ArrayType'): +def flush_ndarray( + pb_msg: 'NdArrayProto', value: 'ArrayType', ndarray_type: Optional[str] = None +): + if ndarray_type == 'list': + value = to_numpy_array(value).tolist() + elif ndarray_type == 'numpy': + value = to_numpy_array(value) + framework, is_sparse = 
get_array_type(value) if framework == 'docarray': diff --git a/docs/fundamentals/document/serialization.md b/docs/fundamentals/document/serialization.md index 2f20650e71f..571a4c0ad49 100644 --- a/docs/fundamentals/document/serialization.md +++ b/docs/fundamentals/document/serialization.md @@ -240,6 +240,7 @@ mime_type: "image/jpeg" One can refer to the [Protobuf specification of `Document`](../../proto/index.md) for details. +When `.tensor` or `.embedding` contains a framework-specific ndarray-like object, you can use `.to_protobuf(..., ndarray_type='list')` or `.to_protobuf(..., ndarray_type='numpy')` to cast it into a `list` or `numpy.ndarray` automatically. This helps ensure maximum compatibility between different microservices. ## What's next? diff --git a/tests/unit/math/test_ndarray.py b/tests/unit/math/test_ndarray.py index 05b82a3124e..059fd265bc1 100644 --- a/tests/unit/math/test_ndarray.py +++ b/tests/unit/math/test_ndarray.py @@ -1,3 +1,4 @@ +import numpy as np import paddle import pytest import tensorflow as tf @@ -5,6 +6,8 @@ from scipy.sparse import csr_matrix, coo_matrix, bsr_matrix, csc_matrix, issparse from docarray.math.ndarray import get_array_rows +from docarray.proto.docarray_pb2 import NdArrayProto +from docarray.proto.io import flush_ndarray, read_ndarray @pytest.mark.parametrize( @@ -30,7 +33,8 @@ csc_matrix, ], ) -def test_get_array_rows(data, expected_result, arraytype): +@pytest.mark.parametrize('ndarray_type', ['list', 'numpy']) +def test_get_array_rows(data, expected_result, arraytype, ndarray_type): data_array = arraytype(data) num_rows, ndim = get_array_rows(data_array) @@ -39,3 +43,11 @@ def test_get_array_rows(data, expected_result, arraytype): assert expected_result[0] == num_rows else: assert expected_result == (num_rows, ndim) + + na_proto = NdArrayProto() + flush_ndarray(na_proto, value=data_array, ndarray_type=ndarray_type) + r_data_array = read_ndarray(na_proto) + if ndarray_type == 'list': + assert
isinstance(r_data_array, list) + elif ndarray_type == 'numpy': + assert isinstance(r_data_array, np.ndarray) From 7f50cec09a9c93e33265626e66947ca3e6bffd2c Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Fri, 4 Mar 2022 12:00:13 +0100 Subject: [PATCH 2/2] feat(protobuf): add arg for compatible ndarray type --- docarray/array/mixins/io/binary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py index 0e1e77e5c35..6c8881e726d 100644 --- a/docarray/array/mixins/io/binary.py +++ b/docarray/array/mixins/io/binary.py @@ -322,7 +322,7 @@ def to_protobuf(self, ndarray_type: Optional[str] = None) -> 'DocumentArrayProto dap = DocumentArrayProto() for d in self: - dap.docs.append(d.to_protobuf(), ndarray_type=ndarray_type) + dap.docs.append(d.to_protobuf(ndarray_type)) return dap @classmethod