From cf4f3bba967a9b0d5fad60807f00cc6f8edc3c8c Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 15 Mar 2022 11:56:34 +0100 Subject: [PATCH 01/14] fix: fix save and load json behaviour --- docarray/array/mixins/io/json.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index 43a9304e983..9c0407cdfa1 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -31,9 +31,7 @@ def save_json( file_ctx = open(file, 'w', encoding=encoding) with file_ctx as fp: - for d in self: - json.dump(d.to_dict(protocol=protocol, **kwargs), fp) - fp.write('\n') + fp.write(self.to_json(protocol=protocol, **kwargs)) @classmethod def load_json( @@ -51,30 +49,27 @@ def load_json( :return: a DocumentArrayLike object """ - - from .... import Document - - constructor = Document.from_json if hasattr(file, 'read'): file_ctx = nullcontext(file) - elif os.path.exists(file): - file_ctx = open(file, 'r', encoding=encoding) else: - file_ctx = nullcontext(json.loads(file)) - constructor = Document.from_dict + file_ctx = open(file, 'w', encoding=encoding) with file_ctx as fp: - return cls([constructor(v, protocol=protocol) for v in fp], **kwargs) + return cls.from_json(fp.read(), protocol=protocol, **kwargs) @classmethod def from_json( cls: Type['T'], file: Union[str, bytes, bytearray, TextIO], protocol: str = 'jsonschema', - encoding: str = 'utf-8', **kwargs ) -> 'T': - return cls.load_json(file, protocol=protocol, encoding=encoding, **kwargs) + from .... 
import Document + + json_docs = json.loads(file) + return cls( + [Document.from_dict(v, protocol=protocol) for v in json_docs], **kwargs + ) @classmethod def from_list( From ed02cb41c63e58a1c1e17059f2b8b15272274d00 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 15 Mar 2022 13:17:41 +0100 Subject: [PATCH 02/14] fix: use read protocol --- docarray/array/mixins/io/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index 9c0407cdfa1..0bdea618dfa 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -52,7 +52,7 @@ def load_json( if hasattr(file, 'read'): file_ctx = nullcontext(file) else: - file_ctx = open(file, 'w', encoding=encoding) + file_ctx = open(file, 'r', encoding=encoding) with file_ctx as fp: return cls.from_json(fp.read(), protocol=protocol, **kwargs) From 3d6987f08a5a9475c1e8073b6e96f9666319d25c Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 15 Mar 2022 14:19:55 +0100 Subject: [PATCH 03/14] feat: supported dynamic structure in serialization --- docarray/array/mixins/io/json.py | 8 ++++---- docarray/document/mixins/porting.py | 26 ++++++++++++++++++-------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index 0bdea618dfa..d3a9df4ff18 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -22,7 +22,7 @@ def save_json( Comparing to :meth:`save_binary`, it is human-readable but slower to save/load and the file size larger. :param file: File or filename to which the data is saved. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param encoding: encoding used to save data into a JSON file. By default, ``utf-8`` is used. """ if hasattr(file, 'write'): @@ -44,7 +44,7 @@ def load_json( """Load array elements from a JSON file. 
:param file: File or filename or a JSON string to which the data is saved. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param encoding: encoding used to load data from a JSON file. By default, ``utf-8`` is used. :return: a DocumentArrayLike object @@ -82,7 +82,7 @@ def from_list( def to_list(self, protocol: str = 'jsonschema', **kwargs) -> List: """Convert the object into a Python list. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :return: a Python list """ return [d.to_dict(protocol=protocol, **kwargs) for d in self] @@ -90,7 +90,7 @@ def to_list(self, protocol: str = 'jsonschema', **kwargs) -> List: def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: """Convert the object into a JSON string. Can be loaded via :meth:`.load_json`. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :return: a Python list """ return json.dumps(self.to_list(protocol=protocol, **kwargs)) diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py index 130f04cb45f..337da84c44d 100644 --- a/docarray/document/mixins/porting.py +++ b/docarray/document/mixins/porting.py @@ -1,5 +1,6 @@ import base64 import dataclasses +import json import pickle import warnings from typing import Optional, TYPE_CHECKING, Type, Dict, Any, Union @@ -18,7 +19,7 @@ def from_dict( """Convert a dict object into a Document. :param obj: a Python dict object - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param kwargs: extra key-value args pass to pydantic and protobuf parser. 
:return: the parsed Document object """ @@ -33,6 +34,8 @@ def from_dict( pb_msg = DocumentProto() json_format.ParseDict(obj, pb_msg, **kwargs) return cls.from_protobuf(pb_msg) + elif protocol == 'dynamic': + return cls(obj) else: raise ValueError(f'protocol=`{protocol}` is not supported') @@ -46,7 +49,7 @@ def from_json( """Convert a JSON string into a Document. :param obj: a valid JSON string - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param kwargs: extra key-value args pass to pydantic and protobuf parser. :return: the parsed Document object """ @@ -61,13 +64,15 @@ def from_json( pb_msg = DocumentProto() json_format.Parse(obj, pb_msg, **kwargs) return cls.from_protobuf(pb_msg) + elif protocol == 'dynamic': + return cls.from_dict(json.loads(obj), protocol=protocol) else: raise ValueError(f'protocol=`{protocol}` is not supported') def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]: """Convert itself into a Python dict object. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param kwargs: extra key-value args pass to pydantic and protobuf dumper. :return: the dumped Document as a dict object """ @@ -80,12 +85,15 @@ def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]: self.to_protobuf(), **kwargs, ) - else: + elif protocol == 'dynamic': warnings.warn( - f'protocol=`{protocol}` is not supported, ' - f'the result dict is a Python dynamic typing dict without any promise on the schema.' + 'The result dict is a Python dynamic typing dict without any promise on the schema.' 
) - return dataclasses.asdict(self._data) + res = dataclasses.asdict(self._data) + del res['_reference_doc'] + return res + else: + raise ValueError(f'protocol=`{protocol}` is not supported') def to_bytes( self, protocol: str = 'pickle', compress: Optional[str] = None @@ -131,7 +139,7 @@ def from_bytes( def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: """Convert itself into a JSON string. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param kwargs: extra key-value args pass to pydantic and protobuf dumper. :return: the dumped JSON string """ @@ -141,6 +149,8 @@ def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: from google.protobuf.json_format import MessageToJson return MessageToJson(self.to_protobuf(), **kwargs) + elif protocol == 'dynamic': + return json.dumps(self.to_dict(protocol=protocol)) else: raise ValueError(f'protocol={protocol} is not supported.') From 7f4e21319e1c9654ce7d3b678ee12286fe3a6d14 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 15 Mar 2022 14:20:30 +0100 Subject: [PATCH 04/14] test: cover protocol=dynamic --- tests/unit/array/test_from_to_bytes.py | 2 +- tests/unit/test_pydantic.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index a8b77d90327..ef04f29a50a 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -99,7 +99,7 @@ def test_non_existing_file_raises_file_not_found_error(): @pytest.mark.parametrize('target', [DocumentArray.empty(10), random_docs(10)]) -@pytest.mark.parametrize('protocol', ['jsonschema', 'protobuf']) +@pytest.mark.parametrize('protocol', ['jsonschema', 'protobuf', 'dynamic']) @pytest.mark.parametrize('to_fn', ['dict', 'json']) def test_from_to_safe_list(target, protocol, to_fn): da_r = getattr(DocumentArray, f'from_{to_fn}')( diff --git a/tests/unit/test_pydantic.py 
b/tests/unit/test_pydantic.py index 7bfebe366e5..b848501f39a 100644 --- a/tests/unit/test_pydantic.py +++ b/tests/unit/test_pydantic.py @@ -135,7 +135,7 @@ def test_with_embedding_no_tensor(): ({'x': 1}, dict), ], ) -@pytest.mark.parametrize('protocol', ['protobuf', 'jsonschema']) +@pytest.mark.parametrize('protocol', ['protobuf', 'jsonschema', 'dynamic']) def test_tags_int_float_str_bool(tag_type, tag_value, protocol): d = Document(tags={'hello': tag_value}) dd = d.to_dict(protocol=protocol)['tags']['hello'] @@ -159,7 +159,7 @@ def test_tags_int_float_str_bool(tag_type, tag_value, protocol): @pytest.mark.parametrize( 'blob', [None, b'123', bytes(Document()), bytes(bytearray(os.urandom(512 * 4)))] ) -@pytest.mark.parametrize('protocol', ['jsonschema', 'protobuf']) +@pytest.mark.parametrize('protocol', ['jsonschema', 'protobuf', 'dynamic']) @pytest.mark.parametrize('to_fn', ['dict', 'json']) def test_to_from_with_blob(protocol, to_fn, blob): d = Document(blob=blob) From 0f2e1de1034d45aecd4417baba0214580df2a961 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 15 Mar 2022 14:51:57 +0100 Subject: [PATCH 05/14] docs: document dynamic protocol --- docs/fundamentals/document/serialization.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/fundamentals/document/serialization.md b/docs/fundamentals/document/serialization.md index 571a4c0ad49..ae4930bba9d 100644 --- a/docs/fundamentals/document/serialization.md +++ b/docs/fundamentals/document/serialization.md @@ -43,7 +43,7 @@ print(d_as_json, d) ``` -By default, it uses {ref}`JSON Schema and pydantic model` for serialization, i.e. `protocol='jsonschema'`. You can switch the method to `protocol='protobuf'`, which leverages Protobuf as the JSON serialization backend. +By default, it uses {ref}`JSON Schema and pydantic model` for serialization, i.e. `protocol='jsonschema'`. 
You can switch the method to `protocol='protobuf'`, which leverages Protobuf as the JSON serialization backend or `protocol='dynamic'` which accepts schemaless Documents. ```python from docarray import Document @@ -91,6 +91,7 @@ It is easier to eyes. But when building REST API, you do not need to explicitly To find out what extra parameters you can pass to `to_json()`/`to_dict()`, please check out: - [`protocol='jsonschema', **kwargs`](https://pydantic-docs.helpmanual.io/usage/exporting_models/#modeljson) - [`protocol='protobuf', **kwargs`](https://googleapis.dev/python/protobuf/latest/google/protobuf/json_format.html#google.protobuf.json_format.MessageToJson) +- `protocol='dynamic': Accepts any json/dict schema and retrieves/puts all extra fields from/to `Document.tags`. ``` (doc-in-bytes)= From 8e2c36f0d69a9c0f1f74fcdf3387cb3418602542 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 08:27:55 +0100 Subject: [PATCH 06/14] chore: dynamic no more supported for json --- docarray/array/mixins/io/json.py | 6 +++--- docarray/base.py | 2 +- docarray/document/mixins/porting.py | 8 ++------ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index d3a9df4ff18..ad74f0eea72 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -22,7 +22,7 @@ def save_json( Comparing to :meth:`save_binary`, it is human-readable but slower to save/load and the file size larger. :param file: File or filename to which the data is saved. - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param encoding: encoding used to save data into a JSON file. By default, ``utf-8`` is used. """ if hasattr(file, 'write'): @@ -44,7 +44,7 @@ def load_json( """Load array elements from a JSON file. :param file: File or filename or a JSON string to which the data is saved. 
- :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param encoding: encoding used to load data from a JSON file. By default, ``utf-8`` is used. :return: a DocumentArrayLike object @@ -90,7 +90,7 @@ def to_list(self, protocol: str = 'jsonschema', **kwargs) -> List: def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: """Convert the object into a JSON string. Can be loaded via :meth:`.load_json`. - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :return: a Python list """ return json.dumps(self.to_list(protocol=protocol, **kwargs)) diff --git a/docarray/base.py b/docarray/base.py index 478c5d20c1f..91908949f83 100644 --- a/docarray/base.py +++ b/docarray/base.py @@ -32,7 +32,7 @@ def __init__( else: self._data = _obj._data elif isinstance(_obj, dict): - kwargs.update(_obj) + kwargs.update(dict(filter(lambda item: item[1], _obj.items()))) if kwargs: try: diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py index 337da84c44d..6ce1a35734f 100644 --- a/docarray/document/mixins/porting.py +++ b/docarray/document/mixins/porting.py @@ -49,7 +49,7 @@ def from_json( """Convert a JSON string into a Document. :param obj: a valid JSON string - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param kwargs: extra key-value args pass to pydantic and protobuf parser. :return: the parsed Document object """ @@ -64,8 +64,6 @@ def from_json( pb_msg = DocumentProto() json_format.Parse(obj, pb_msg, **kwargs) return cls.from_protobuf(pb_msg) - elif protocol == 'dynamic': - return cls.from_dict(json.loads(obj), protocol=protocol) else: raise ValueError(f'protocol=`{protocol}` is not supported') @@ -139,7 +137,7 @@ def from_bytes( def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: """Convert itself into a JSON string. 
- :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param kwargs: extra key-value args pass to pydantic and protobuf dumper. :return: the dumped JSON string """ @@ -149,8 +147,6 @@ def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: from google.protobuf.json_format import MessageToJson return MessageToJson(self.to_protobuf(), **kwargs) - elif protocol == 'dynamic': - return json.dumps(self.to_dict(protocol=protocol)) else: raise ValueError(f'protocol={protocol} is not supported.') From 226634dfefd23eca38f37302856ed8962a2ce8c9 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 08:31:07 +0100 Subject: [PATCH 07/14] test: dynamic only supported for dict --- tests/unit/test_pydantic.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_pydantic.py b/tests/unit/test_pydantic.py index b848501f39a..c5abc2c2c9a 100644 --- a/tests/unit/test_pydantic.py +++ b/tests/unit/test_pydantic.py @@ -159,8 +159,16 @@ def test_tags_int_float_str_bool(tag_type, tag_value, protocol): @pytest.mark.parametrize( 'blob', [None, b'123', bytes(Document()), bytes(bytearray(os.urandom(512 * 4)))] ) -@pytest.mark.parametrize('protocol', ['jsonschema', 'protobuf', 'dynamic']) -@pytest.mark.parametrize('to_fn', ['dict', 'json']) +@pytest.mark.parametrize( + 'to_fn,protocol', + [ + ('dict', 'jsonschema'), + ('json', 'jsonschema'), + ('dict', 'protobuf'), + ('json', 'protobuf'), + ('dict', 'dynamic'), + ], +) def test_to_from_with_blob(protocol, to_fn, blob): d = Document(blob=blob) r_d = getattr(Document, f'from_{to_fn}')( From fd83a482292c31997b6362db9edd306aa1296bb1 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 10:33:36 +0100 Subject: [PATCH 08/14] Revert "chore: dynamic no more supported for json" This reverts commit 8e2c36f0d69a9c0f1f74fcdf3387cb3418602542. 
--- docarray/array/mixins/io/json.py | 6 +++--- docarray/base.py | 2 +- docarray/document/mixins/porting.py | 8 ++++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index ad74f0eea72..d3a9df4ff18 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -22,7 +22,7 @@ def save_json( Comparing to :meth:`save_binary`, it is human-readable but slower to save/load and the file size larger. :param file: File or filename to which the data is saved. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param encoding: encoding used to save data into a JSON file. By default, ``utf-8`` is used. """ if hasattr(file, 'write'): @@ -44,7 +44,7 @@ def load_json( """Load array elements from a JSON file. :param file: File or filename or a JSON string to which the data is saved. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param encoding: encoding used to load data from a JSON file. By default, ``utf-8`` is used. :return: a DocumentArrayLike object @@ -90,7 +90,7 @@ def to_list(self, protocol: str = 'jsonschema', **kwargs) -> List: def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: """Convert the object into a JSON string. Can be loaded via :meth:`.load_json`. 
- :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :return: a Python list """ return json.dumps(self.to_list(protocol=protocol, **kwargs)) diff --git a/docarray/base.py b/docarray/base.py index 91908949f83..478c5d20c1f 100644 --- a/docarray/base.py +++ b/docarray/base.py @@ -32,7 +32,7 @@ def __init__( else: self._data = _obj._data elif isinstance(_obj, dict): - kwargs.update(dict(filter(lambda item: item[1], _obj.items()))) + kwargs.update(_obj) if kwargs: try: diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py index 6ce1a35734f..337da84c44d 100644 --- a/docarray/document/mixins/porting.py +++ b/docarray/document/mixins/porting.py @@ -49,7 +49,7 @@ def from_json( """Convert a JSON string into a Document. :param obj: a valid JSON string - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param kwargs: extra key-value args pass to pydantic and protobuf parser. :return: the parsed Document object """ @@ -64,6 +64,8 @@ def from_json( pb_msg = DocumentProto() json_format.Parse(obj, pb_msg, **kwargs) return cls.from_protobuf(pb_msg) + elif protocol == 'dynamic': + return cls.from_dict(json.loads(obj), protocol=protocol) else: raise ValueError(f'protocol=`{protocol}` is not supported') @@ -137,7 +139,7 @@ def from_bytes( def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: """Convert itself into a JSON string. - :param protocol: `jsonschema` or `protobuf` + :param protocol: `jsonschema`, `protobuf` or `dynamic` :param kwargs: extra key-value args pass to pydantic and protobuf dumper. 
:return: the dumped JSON string """ @@ -147,6 +149,8 @@ def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: from google.protobuf.json_format import MessageToJson return MessageToJson(self.to_protobuf(), **kwargs) + elif protocol == 'dynamic': + return json.dumps(self.to_dict(protocol=protocol)) else: raise ValueError(f'protocol={protocol} is not supported.') From f0d349162acb8dde2242ed72a32b49bc913b84d7 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 10:36:49 +0100 Subject: [PATCH 09/14] fix: schemaless docs are only allowed when loading --- docarray/array/mixins/io/json.py | 6 +++--- docarray/document/mixins/porting.py | 13 ++----------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index d3a9df4ff18..10a7255374a 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -22,7 +22,7 @@ def save_json( Comparing to :meth:`save_binary`, it is human-readable but slower to save/load and the file size larger. :param file: File or filename to which the data is saved. - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param encoding: encoding used to save data into a JSON file. By default, ``utf-8`` is used. """ if hasattr(file, 'write'): @@ -82,7 +82,7 @@ def from_list( def to_list(self, protocol: str = 'jsonschema', **kwargs) -> List: """Convert the object into a Python list. - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :return: a Python list """ return [d.to_dict(protocol=protocol, **kwargs) for d in self] @@ -90,7 +90,7 @@ def to_list(self, protocol: str = 'jsonschema', **kwargs) -> List: def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: """Convert the object into a JSON string. Can be loaded via :meth:`.load_json`. 
- :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :return: a Python list """ return json.dumps(self.to_list(protocol=protocol, **kwargs)) diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py index 337da84c44d..12b7ef4d681 100644 --- a/docarray/document/mixins/porting.py +++ b/docarray/document/mixins/porting.py @@ -72,7 +72,7 @@ def from_json( def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]: """Convert itself into a Python dict object. - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param kwargs: extra key-value args pass to pydantic and protobuf dumper. :return: the dumped Document as a dict object """ @@ -85,13 +85,6 @@ def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]: self.to_protobuf(), **kwargs, ) - elif protocol == 'dynamic': - warnings.warn( - 'The result dict is a Python dynamic typing dict without any promise on the schema.' - ) - res = dataclasses.asdict(self._data) - del res['_reference_doc'] - return res else: raise ValueError(f'protocol=`{protocol}` is not supported') @@ -139,7 +132,7 @@ def from_bytes( def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: """Convert itself into a JSON string. - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param kwargs: extra key-value args pass to pydantic and protobuf dumper. 
:return: the dumped JSON string """ @@ -149,8 +142,6 @@ def to_json(self, protocol: str = 'jsonschema', **kwargs) -> str: from google.protobuf.json_format import MessageToJson return MessageToJson(self.to_protobuf(), **kwargs) - elif protocol == 'dynamic': - return json.dumps(self.to_dict(protocol=protocol)) else: raise ValueError(f'protocol={protocol} is not supported.') From 5dbe44a908be5081dc073461d05e711385e16c15 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 10:55:57 +0100 Subject: [PATCH 10/14] test: test dynamic only on load/from methods --- tests/unit/array/test_from_to_bytes.py | 2 +- tests/unit/document/test_porting.py | 26 ++++++++++++++++++++++++++ tests/unit/test_pydantic.py | 14 +++----------- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index ef04f29a50a..a8b77d90327 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -99,7 +99,7 @@ def test_non_existing_file_raises_file_not_found_error(): @pytest.mark.parametrize('target', [DocumentArray.empty(10), random_docs(10)]) -@pytest.mark.parametrize('protocol', ['jsonschema', 'protobuf', 'dynamic']) +@pytest.mark.parametrize('protocol', ['jsonschema', 'protobuf']) @pytest.mark.parametrize('to_fn', ['dict', 'json']) def test_from_to_safe_list(target, protocol, to_fn): da_r = getattr(DocumentArray, f'from_{to_fn}')( diff --git a/tests/unit/document/test_porting.py b/tests/unit/document/test_porting.py index 42d78d2d9ee..4d87a9841b3 100644 --- a/tests/unit/document/test_porting.py +++ b/tests/unit/document/test_porting.py @@ -1,3 +1,5 @@ +import json + import pytest from docarray import Document, DocumentArray @@ -25,6 +27,30 @@ def test_dict_json(target, protocol, to_fn): assert d == d_r +@pytest.mark.parametrize('to_fn,preproc', [('dict', dict), ('json', json.dumps)]) +def test_schemaless(to_fn, preproc): + input = { + 'attr1': 123, + 
'attr2': 'abc', + 'attr3': [1, 2, 3], + 'attr4': ['a', 'b', 'c'], + 'attr5': { + 'attr6': 'a', + 'attr7': 1, + }, + } + doc = getattr(Document, f'from_{to_fn}')(preproc(input), protocol='dynamic') + assert doc.tags['attr1'] == 123 + assert doc.tags['attr2'] == 'abc' + assert doc.tags['attr3'] == [1, 2, 3] + assert doc.tags['attr4'] == ['a', 'b', 'c'] + + assert doc.tags['attr5'] == { + 'attr6': 'a', + 'attr7': 1, + } + + @pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) def test_to_from_base64(protocol, compress): diff --git a/tests/unit/test_pydantic.py b/tests/unit/test_pydantic.py index c5abc2c2c9a..7bfebe366e5 100644 --- a/tests/unit/test_pydantic.py +++ b/tests/unit/test_pydantic.py @@ -135,7 +135,7 @@ def test_with_embedding_no_tensor(): ({'x': 1}, dict), ], ) -@pytest.mark.parametrize('protocol', ['protobuf', 'jsonschema', 'dynamic']) +@pytest.mark.parametrize('protocol', ['protobuf', 'jsonschema']) def test_tags_int_float_str_bool(tag_type, tag_value, protocol): d = Document(tags={'hello': tag_value}) dd = d.to_dict(protocol=protocol)['tags']['hello'] @@ -159,16 +159,8 @@ def test_tags_int_float_str_bool(tag_type, tag_value, protocol): @pytest.mark.parametrize( 'blob', [None, b'123', bytes(Document()), bytes(bytearray(os.urandom(512 * 4)))] ) -@pytest.mark.parametrize( - 'to_fn,protocol', - [ - ('dict', 'jsonschema'), - ('json', 'jsonschema'), - ('dict', 'protobuf'), - ('json', 'protobuf'), - ('dict', 'dynamic'), - ], -) +@pytest.mark.parametrize('protocol', ['jsonschema', 'protobuf']) +@pytest.mark.parametrize('to_fn', ['dict', 'json']) def test_to_from_with_blob(protocol, to_fn, blob): d = Document(blob=blob) r_d = getattr(Document, f'from_{to_fn}')( From fa82eaee59c70aaad1d15cecefd73a1ce1e501c9 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 11:43:53 +0100 Subject: [PATCH 11/14] docs: clarify usage of protocol dynamic --- 
docs/fundamentals/document/serialization.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/fundamentals/document/serialization.md b/docs/fundamentals/document/serialization.md index ae4930bba9d..a3653bcde35 100644 --- a/docs/fundamentals/document/serialization.md +++ b/docs/fundamentals/document/serialization.md @@ -43,7 +43,7 @@ print(d_as_json, d) ``` -By default, it uses {ref}`JSON Schema and pydantic model` for serialization, i.e. `protocol='jsonschema'`. You can switch the method to `protocol='protobuf'`, which leverages Protobuf as the JSON serialization backend or `protocol='dynamic'` which accepts schemaless Documents. +By default, it uses {ref}`JSON Schema and pydantic model` for serialization, i.e. `protocol='jsonschema'`. You can switch the method to `protocol='protobuf'`, which leverages Protobuf as the JSON serialization backend. ```python from docarray import Document @@ -91,7 +91,7 @@ It is easier to eyes. But when building REST API, you do not need to explicitly To find out what extra parameters you can pass to `to_json()`/`to_dict()`, please check out: - [`protocol='jsonschema', **kwargs`](https://pydantic-docs.helpmanual.io/usage/exporting_models/#modeljson) - [`protocol='protobuf', **kwargs`](https://googleapis.dev/python/protobuf/latest/google/protobuf/json_format.html#google.protobuf.json_format.MessageToJson) -- `protocol='dynamic': Accepts any json/dict schema and retrieves/puts all extra fields from/to `Document.tags`. +- `protocol='dynamic': Used to load any json/dict schema and puts all fields to `Document.tags`. 
``` (doc-in-bytes)= From 3da3000dfc8f783323965f1da44828ac766a0fb6 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 14:58:15 +0100 Subject: [PATCH 12/14] fix: from_json interface --- docarray/array/mixins/io/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index 10a7255374a..a4b6783f2af 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -60,7 +60,7 @@ def load_json( @classmethod def from_json( cls: Type['T'], - file: Union[str, bytes, bytearray, TextIO], + file: Union[str, bytes, bytearray], protocol: str = 'jsonschema', **kwargs ) -> 'T': From ddf9b9c17ffcb17a85bebb5b63b280f696b34d1a Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 15:14:35 +0100 Subject: [PATCH 13/14] chore: remove dynamic option and use else --- docarray/array/mixins/io/json.py | 2 +- docarray/document/mixins/porting.py | 12 ++++-------- tests/unit/document/test_porting.py | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index a4b6783f2af..8d7146a574a 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -44,7 +44,7 @@ def load_json( """Load array elements from a JSON file. :param file: File or filename or a JSON string to which the data is saved. - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param encoding: encoding used to load data from a JSON file. By default, ``utf-8`` is used. :return: a DocumentArrayLike object diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py index 12b7ef4d681..e952e00ba2d 100644 --- a/docarray/document/mixins/porting.py +++ b/docarray/document/mixins/porting.py @@ -19,7 +19,7 @@ def from_dict( """Convert a dict object into a Document. 
:param obj: a Python dict object - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param kwargs: extra key-value args pass to pydantic and protobuf parser. :return: the parsed Document object """ @@ -34,10 +34,8 @@ def from_dict( pb_msg = DocumentProto() json_format.ParseDict(obj, pb_msg, **kwargs) return cls.from_protobuf(pb_msg) - elif protocol == 'dynamic': - return cls(obj) else: - raise ValueError(f'protocol=`{protocol}` is not supported') + return cls(obj) @classmethod def from_json( @@ -49,7 +47,7 @@ def from_json( """Convert a JSON string into a Document. :param obj: a valid JSON string - :param protocol: `jsonschema`, `protobuf` or `dynamic` + :param protocol: `jsonschema` or `protobuf` :param kwargs: extra key-value args pass to pydantic and protobuf parser. :return: the parsed Document object """ @@ -64,10 +62,8 @@ def from_json( pb_msg = DocumentProto() json_format.Parse(obj, pb_msg, **kwargs) return cls.from_protobuf(pb_msg) - elif protocol == 'dynamic': - return cls.from_dict(json.loads(obj), protocol=protocol) else: - raise ValueError(f'protocol=`{protocol}` is not supported') + return cls.from_dict(json.loads(obj), protocol=protocol) def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]: """Convert itself into a Python dict object. 
diff --git a/tests/unit/document/test_porting.py b/tests/unit/document/test_porting.py index 4d87a9841b3..0798523a0d4 100644 --- a/tests/unit/document/test_porting.py +++ b/tests/unit/document/test_porting.py @@ -39,7 +39,7 @@ def test_schemaless(to_fn, preproc): 'attr7': 1, }, } - doc = getattr(Document, f'from_{to_fn}')(preproc(input), protocol='dynamic') + doc = getattr(Document, f'from_{to_fn}')(preproc(input), protocol=None) assert doc.tags['attr1'] == 123 assert doc.tags['attr2'] == 'abc' assert doc.tags['attr3'] == [1, 2, 3] From b034960340017675404c0fbea56c4898fad6cec1 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 16 Mar 2022 15:22:14 +0100 Subject: [PATCH 14/14] docs: apply suggestions --- docs/fundamentals/document/serialization.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/fundamentals/document/serialization.md b/docs/fundamentals/document/serialization.md index a3653bcde35..9afe7750ef2 100644 --- a/docs/fundamentals/document/serialization.md +++ b/docs/fundamentals/document/serialization.md @@ -45,6 +45,8 @@ print(d_as_json, d) By default, it uses {ref}`JSON Schema and pydantic model` for serialization, i.e. `protocol='jsonschema'`. You can switch the method to `protocol='protobuf'`, which leverages Protobuf as the JSON serialization backend. +To load an arbitrary JSON file, please set `protocol=None`. But as it is "arbitrary", you should not expect that it can always be loaded successfully. DocArray makes a best effort by first loading this JSON into a `dict` and then loading it via `Document(dict)`. + ```python from docarray import Document @@ -91,7 +93,6 @@ It is easier to eyes. 
But when building REST API, you do not need to explicitly To find out what extra parameters you can pass to `to_json()`/`to_dict()`, please check out: - [`protocol='jsonschema', **kwargs`](https://pydantic-docs.helpmanual.io/usage/exporting_models/#modeljson) - [`protocol='protobuf', **kwargs`](https://googleapis.dev/python/protobuf/latest/google/protobuf/json_format.html#google.protobuf.json_format.MessageToJson) -- `protocol='dynamic': Used to load any json/dict schema and puts all fields to `Document.tags`. ``` (doc-in-bytes)=