Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 10 additions & 15 deletions docarray/array/mixins/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@ def save_json(
file_ctx = open(file, 'w', encoding=encoding)

with file_ctx as fp:
for d in self:
json.dump(d.to_dict(protocol=protocol, **kwargs), fp)
fp.write('\n')
fp.write(self.to_json(protocol=protocol, **kwargs))

@classmethod
def load_json(
Expand All @@ -51,30 +49,27 @@ def load_json(

:return: a DocumentArrayLike object
"""

from .... import Document

constructor = Document.from_json
if hasattr(file, 'read'):
file_ctx = nullcontext(file)
elif os.path.exists(file):
file_ctx = open(file, 'r', encoding=encoding)
else:
file_ctx = nullcontext(json.loads(file))
constructor = Document.from_dict
file_ctx = open(file, 'r', encoding=encoding)

with file_ctx as fp:
return cls([constructor(v, protocol=protocol) for v in fp], **kwargs)
return cls.from_json(fp.read(), protocol=protocol, **kwargs)

@classmethod
def from_json(
    cls: Type['T'],
    file: Union[str, bytes, bytearray],
    protocol: str = 'jsonschema',
    encoding: str = 'utf-8',
    **kwargs
) -> 'T':
    """Construct the array from a JSON payload.

    :param file: the JSON content as ``str``, ``bytes`` or ``bytearray``;
        it must decode to a JSON array whose elements are per-Document dicts.
        (File paths and file objects are handled by :meth:`load_json`, not here.)
    :param protocol: per-Document deserialization protocol, forwarded to
        ``Document.from_dict``.
    :param encoding: kept for signature backward-compatibility.
        NOTE(review): unused by this body — ``json.loads`` decodes
        bytes/bytearray input itself; confirm whether it can be dropped.
    :param kwargs: extra keyword arguments forwarded to the class constructor.
    :return: a new instance with one Document per JSON array element.
    """
    # Local import avoids a circular dependency between the array and
    # Document modules at import time.
    from .... import Document

    json_docs = json.loads(file)
    return cls(
        [Document.from_dict(v, protocol=protocol) for v in json_docs], **kwargs
    )

@classmethod
def from_list(
Expand Down
11 changes: 4 additions & 7 deletions docarray/document/mixins/porting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import base64
import dataclasses
import json
import pickle
import warnings
from typing import Optional, TYPE_CHECKING, Type, Dict, Any, Union
Expand Down Expand Up @@ -34,7 +35,7 @@ def from_dict(
json_format.ParseDict(obj, pb_msg, **kwargs)
return cls.from_protobuf(pb_msg)
else:
raise ValueError(f'protocol=`{protocol}` is not supported')
return cls(obj)

@classmethod
def from_json(
Expand Down Expand Up @@ -62,7 +63,7 @@ def from_json(
json_format.Parse(obj, pb_msg, **kwargs)
return cls.from_protobuf(pb_msg)
else:
raise ValueError(f'protocol=`{protocol}` is not supported')
return cls.from_dict(json.loads(obj), protocol=protocol)

def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]:
"""Convert itself into a Python dict object.
Expand All @@ -81,11 +82,7 @@ def to_dict(self, protocol: str = 'jsonschema', **kwargs) -> Dict[str, Any]:
**kwargs,
)
else:
warnings.warn(
f'protocol=`{protocol}` is not supported, '
f'the result dict is a Python dynamic typing dict without any promise on the schema.'
)
return dataclasses.asdict(self._data)
raise ValueError(f'protocol=`{protocol}` is not supported')

def to_bytes(
self, protocol: str = 'pickle', compress: Optional[str] = None
Expand Down
2 changes: 2 additions & 0 deletions docs/fundamentals/document/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ print(d_as_json, d)

By default, it uses {ref}`JSON Schema and pydantic model<schema-gen>` for serialization, i.e. `protocol='jsonschema'`. You can switch the method to `protocol='protobuf'`, which leverages Protobuf as the JSON serialization backend.

To load an arbitrary JSON file, set `protocol=None`. But as the content is arbitrary, there is no guarantee it can be successfully loaded. DocArray makes a best-effort attempt by first loading the JSON into a `dict` and then loading it via `Document(dict)`.

```python
from docarray import Document

Expand Down
26 changes: 26 additions & 0 deletions tests/unit/document/test_porting.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

import pytest

from docarray import Document, DocumentArray
Expand Down Expand Up @@ -25,6 +27,30 @@ def test_dict_json(target, protocol, to_fn):
assert d == d_r


@pytest.mark.parametrize('to_fn,preproc', [('dict', dict), ('json', json.dumps)])
def test_schemaless(to_fn, preproc):
    """Loading an arbitrary (schemaless) dict or JSON string with
    ``protocol=None`` should land every top-level key in ``doc.tags``
    unchanged, including nested lists and dicts."""
    # Named `payload` (not `input`) to avoid shadowing the builtin.
    payload = {
        'attr1': 123,
        'attr2': 'abc',
        'attr3': [1, 2, 3],
        'attr4': ['a', 'b', 'c'],
        'attr5': {
            'attr6': 'a',
            'attr7': 1,
        },
    }
    # Dispatch to Document.from_dict or Document.from_json depending on
    # the parametrized variant; `preproc` prepares the matching input form.
    doc = getattr(Document, f'from_{to_fn}')(preproc(payload), protocol=None)
    assert doc.tags['attr1'] == 123
    assert doc.tags['attr2'] == 'abc'
    assert doc.tags['attr3'] == [1, 2, 3]
    assert doc.tags['attr4'] == ['a', 'b', 'c']

    assert doc.tags['attr5'] == {
        'attr6': 'a',
        'attr7': 1,
    }


@pytest.mark.parametrize('protocol', ['protobuf', 'pickle'])
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
def test_to_from_base64(protocol, compress):
Expand Down