diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py index 43a9304e983..8d7146a574a 100644 --- a/docarray/array/mixins/io/json.py +++ b/docarray/array/mixins/io/json.py @@ -31,9 +31,7 @@ def save_json( file_ctx = open(file, 'w', encoding=encoding) with file_ctx as fp: - for d in self: - json.dump(d.to_dict(protocol=protocol, **kwargs), fp) - fp.write('\n') + fp.write(self.to_json(protocol=protocol, **kwargs)) @classmethod def load_json( @@ -51,30 +49,27 @@ def load_json( :return: a DocumentArrayLike object """ - - from .... import Document - - constructor = Document.from_json if hasattr(file, 'read'): file_ctx = nullcontext(file) - elif os.path.exists(file): - file_ctx = open(file, 'r', encoding=encoding) else: - file_ctx = nullcontext(json.loads(file)) - constructor = Document.from_dict + file_ctx = open(file, 'r', encoding=encoding) with file_ctx as fp: - return cls([constructor(v, protocol=protocol) for v in fp], **kwargs) + return cls.from_json(fp.read(), protocol=protocol, **kwargs) @classmethod def from_json( cls: Type['T'], - file: Union[str, bytes, bytearray, TextIO], + file: Union[str, bytes, bytearray], protocol: str = 'jsonschema', - encoding: str = 'utf-8', **kwargs ) -> 'T': - return cls.load_json(file, protocol=protocol, encoding=encoding, **kwargs) + from .... 
import Document + + json_docs = json.loads(file) + return cls( + [Document.from_dict(v, protocol=protocol) for v in json_docs], **kwargs + ) @classmethod def from_list( diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py index 130f04cb45f..e952e00ba2d 100644 --- a/docarray/document/mixins/porting.py +++ b/docarray/document/mixins/porting.py @@ -1,5 +1,6 @@ import base64 import dataclasses +import json import pickle import warnings from typing import Optional, TYPE_CHECKING, Type, Dict, Any, Union @@ -34,7 +35,7 @@ def from_dict( json_format.ParseDict(obj, pb_msg, **kwargs) return cls.from_protobuf(pb_msg) else: - raise ValueError(f'protocol=`{protocol}` is not supported') + return cls(obj) @classmethod def from_json( @@ -62,7 +63,7 @@ def from_json( json_format.Parse(obj, pb_msg, **kwargs) return cls.from_protobuf(pb_msg) else: - raise ValueError(f'protocol=`{protocol}` is not supported') + return cls.from_dict(json.loads(obj), protocol=protocol) def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]: """Convert itself into a Python dict object. @@ -81,11 +82,7 @@ def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]: **kwargs, ) else: - warnings.warn( - f'protocol=`{protocol}` is not supported, ' - f'the result dict is a Python dynamic typing dict without any promise on the schema.' - ) - return dataclasses.asdict(self._data) + raise ValueError(f'protocol=`{protocol}` is not supported') def to_bytes( self, protocol: str = 'pickle', compress: Optional[str] = None diff --git a/docs/fundamentals/document/serialization.md b/docs/fundamentals/document/serialization.md index 571a4c0ad49..9afe7750ef2 100644 --- a/docs/fundamentals/document/serialization.md +++ b/docs/fundamentals/document/serialization.md @@ -45,6 +45,8 @@ print(d_as_json, d) By default, it uses {ref}`JSON Schema and pydantic model` for serialization, i.e. `protocol='jsonschema'`. 
You can switch the method to `protocol='protobuf'`, which leverages Protobuf as the JSON serialization backend. +To load an arbitrary JSON file, set `protocol=None`. Because the input is "arbitrary", there is no guarantee that it can be successfully loaded. DocArray makes a best-effort attempt by first loading the JSON into a `dict` and then constructing the Document via `Document(dict)`. + ```python from docarray import Document diff --git a/tests/unit/document/test_porting.py b/tests/unit/document/test_porting.py index 42d78d2d9ee..0798523a0d4 100644 --- a/tests/unit/document/test_porting.py +++ b/tests/unit/document/test_porting.py @@ -1,3 +1,5 @@ +import json + import pytest from docarray import Document, DocumentArray @@ -25,6 +27,30 @@ def test_dict_json(target, protocol, to_fn): assert d == d_r +@pytest.mark.parametrize('to_fn,preproc', [('dict', dict), ('json', json.dumps)]) +def test_schemaless(to_fn, preproc): + input = { + 'attr1': 123, + 'attr2': 'abc', + 'attr3': [1, 2, 3], + 'attr4': ['a', 'b', 'c'], + 'attr5': { + 'attr6': 'a', + 'attr7': 1, + }, + } + doc = getattr(Document, f'from_{to_fn}')(preproc(input), protocol=None) + assert doc.tags['attr1'] == 123 + assert doc.tags['attr2'] == 'abc' + assert doc.tags['attr3'] == [1, 2, 3] + assert doc.tags['attr4'] == ['a', 'b', 'c'] + + assert doc.tags['attr5'] == { + 'attr6': 'a', + 'attr7': 1, + } + + @pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) @pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None]) def test_to_from_base64(protocol, compress):