From b66f5ea6a46ab53a80833504eb84b61eca82bf35 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Thu, 23 Mar 2023 13:48:03 +0100 Subject: [PATCH 1/3] feat: create documents from dict Signed-off-by: jupyterjazz --- docarray/documents/helper.py | 36 +++++++++++++ tests/integrations/document/test_document.py | 54 +++++++++++++++++++- 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/docarray/documents/helper.py b/docarray/documents/helper.py index 23fed9aa8e5..4096cea40e2 100644 --- a/docarray/documents/helper.py +++ b/docarray/documents/helper.py @@ -118,3 +118,39 @@ class MyAudio(TypedDict): doc = create_model_from_typeddict(typeddict_cls, **kwargs) return doc + + +def create_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_doc']: + """ + Create a subclass of BaseDocument based on example data given as a dictionary. + + In case the example contains None as a value, + corresponding field will be viewed as the type Any. + + :param model_name: Name of the new Document class + :param data_dict: Dictionary of field types to their corresponding values. + :return: the new Document class + + EXAMPLE USAGE + + .. code-block:: python + + import numpy as np + from docarray.documents import ImageDoc + from docarray.documents.helper import create_from_dict + + data_dict = {'image': ImageDoc(tensor=np.random.rand(3, 224, 224)), 'author': 'me'} + + MyDoc = create_from_dict(model_name='MyDoc', data_dict=data_dict) + + assert issubclass(MyDoc, BaseDocument) + + """ + if not data_dict: + raise ValueError('`data_dict` should contain at least one item') + + field_types: Dict[str, Tuple[Type, ...]] = { + field: (type(value) if value else Any, ...) + for field, value in data_dict.items() + } + return create_doc(__model_name=model_name, **field_types) diff --git a/tests/integrations/document/test_document.py b/tests/integrations/document/test_document.py index 2991d6cb8f0..043239bab61 100644 --- a/tests/integrations/document/test_document.py +++ b/tests/integrations/document/test_document.py @@ -2,12 +2,16 @@ import numpy as np import pytest -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError from typing_extensions import TypedDict from docarray import BaseDocument, DocumentArray from docarray.documents import AudioDoc, ImageDoc, TextDoc -from docarray.documents.helper import create_doc, create_from_typeddict +from docarray.documents.helper import ( + create_doc, + create_from_typeddict, + create_from_dict, +) from docarray.typing import AudioNdArray @@ -98,3 +102,49 @@ class MyAudio(TypedDict): assert issubclass(Doc, BaseDocument) assert issubclass(Doc, AudioDoc) + + +def test_create_from_dict(): + data_dict = { + 'image': ImageDoc(tensor=np.random.rand(3, 224, 224)), + 'text': TextDoc(text='hello'), + 'id': 123, + } + + MyDoc = create_from_dict(model_name='MyDoc', data_dict=data_dict) + + assert issubclass(MyDoc, BaseDocument) + + doc = MyDoc( + image=ImageDoc(tensor=np.random.rand(3, 224, 224)), + text=TextDoc(text='hey'), + id=111, + ) + + assert isinstance(doc, BaseDocument) + assert isinstance(doc.text, TextDoc) + assert isinstance(doc.image, ImageDoc) + assert isinstance(doc.id, int) + + # Create a doc with an incorrect type + with pytest.raises(ValidationError): + doc = MyDoc( + image=ImageDoc(tensor=np.random.rand(3, 224, 224)), + text=['some', 'text'], # should be TextDoc + id=111, + ) + + # Handle empty data_dict + with pytest.raises(ValueError): + MyDoc = create_from_dict(model_name='MyDoc', data_dict={}) + + # Data with a None value + data_dict = {'text': 'some text', 'other': None} + MyDoc = create_from_dict(model_name='MyDoc', data_dict=data_dict) + + assert issubclass(MyDoc, BaseDocument) + + doc1 = MyDoc(text='txt', other=10) + doc2 = MyDoc(text='txt', other='also text') + + assert isinstance(doc1, BaseDocument) and isinstance(doc2, BaseDocument) From 332a2281bbbe70ab16a4246626d5a873772d14cd Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Thu, 23 Mar 2023 14:16:58 +0100 Subject: [PATCH 2/3] fix: ignore type Signed-off-by: jupyterjazz --- docarray/documents/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docarray/documents/helper.py b/docarray/documents/helper.py index 4096cea40e2..d1a0b522334 100644 --- a/docarray/documents/helper.py +++ b/docarray/documents/helper.py @@ -149,8 +149,8 @@ def create_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_doc' if not data_dict: raise ValueError('`data_dict` should contain at least one item') - field_types: Dict[str, Tuple[Type, ...]] = { + field_types = { field: (type(value) if value else Any, ...) for field, value in data_dict.items() } - return create_doc(__model_name=model_name, **field_types) + return create_doc(__model_name=model_name, **field_types) # type: ignore From 34c4af5818a465d0ac37401487367f3227dd789a Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Thu, 23 Mar 2023 15:16:10 +0100 Subject: [PATCH 3/3] refactor: change fn names Signed-off-by: jupyterjazz --- docarray/documents/helper.py | 12 ++++++------ tests/integrations/document/test_document.py | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docarray/documents/helper.py b/docarray/documents/helper.py index d1a0b522334..a7f7cc35a5f 100644 --- a/docarray/documents/helper.py +++ b/docarray/documents/helper.py @@ -73,7 +73,7 @@ def create_doc( return doc -def create_from_typeddict( +def create_doc_from_typeddict( typeddict_cls: Type['TypedDict'], # type: ignore **kwargs: Any, ): @@ -91,7 +91,7 @@ def create_from_typeddict( from docarray import BaseDocument from docarray.documents import Audio - from docarray.documents.helper import create_from_typeddict + from docarray.documents.helper import create_doc_from_typeddict from docarray.typing.tensor.audio import AudioNdArray @@ -100,7 +100,7 @@ class MyAudio(TypedDict): tensor: AudioNdArray - Doc = create_from_typeddict(MyAudio, __base__=Audio) + Doc = create_doc_from_typeddict(MyAudio, __base__=Audio) assert issubclass(Doc, BaseDocument) assert issubclass(Doc, Audio) @@ -120,7 +120,7 @@ class MyAudio(TypedDict): return doc -def create_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_doc']: +def create_doc_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_doc']: """ Create a subclass of BaseDocument based on example data given as a dictionary. @@ -137,11 +137,11 @@ def create_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_doc' import numpy as np from docarray.documents import ImageDoc - from docarray.documents.helper import create_from_dict + from docarray.documents.helper import create_doc_from_dict data_dict = {'image': ImageDoc(tensor=np.random.rand(3, 224, 224)), 'author': 'me'} - MyDoc = create_from_dict(model_name='MyDoc', data_dict=data_dict) + MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict) assert issubclass(MyDoc, BaseDocument) diff --git a/tests/integrations/document/test_document.py b/tests/integrations/document/test_document.py index 043239bab61..35cbba24d53 100644 --- a/tests/integrations/document/test_document.py +++ b/tests/integrations/document/test_document.py @@ -9,8 +9,8 @@ from docarray.documents import AudioDoc, ImageDoc, TextDoc from docarray.documents.helper import ( create_doc, - create_from_typeddict, - create_from_dict, + create_doc_from_typeddict, + create_doc_from_dict, ) from docarray.typing import AudioNdArray @@ -82,15 +82,15 @@ def test_create_doc(): assert issubclass(MyAudio, AudioDoc) -def test_create_from_typeddict(): +def test_create_doc_from_typeddict(): class MyMultiModalDoc(TypedDict): image: ImageDoc text: TextDoc with pytest.raises(ValueError): - _ = create_from_typeddict(MyMultiModalDoc, __base__=BaseModel) + _ = create_doc_from_typeddict(MyMultiModalDoc, __base__=BaseModel) - Doc = create_from_typeddict(MyMultiModalDoc) + Doc = create_doc_from_typeddict(MyMultiModalDoc) assert issubclass(Doc, BaseDocument) @@ -98,20 +98,20 @@ class MyAudio(TypedDict): title: str tensor: Optional[AudioNdArray] - Doc = create_from_typeddict(MyAudio, __base__=AudioDoc) + Doc = create_doc_from_typeddict(MyAudio, __base__=AudioDoc) assert issubclass(Doc, BaseDocument) assert issubclass(Doc, AudioDoc) -def test_create_from_dict(): +def test_create_doc_from_dict(): data_dict = { 'image': ImageDoc(tensor=np.random.rand(3, 224, 224)), 'text': TextDoc(text='hello'), 'id': 123, } - MyDoc = create_from_dict(model_name='MyDoc', data_dict=data_dict) + MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict) assert issubclass(MyDoc, BaseDocument) @@ -136,11 +136,11 @@ def test_create_from_dict(): # Handle empty data_dict with pytest.raises(ValueError): - MyDoc = create_from_dict(model_name='MyDoc', data_dict={}) + MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict={}) # Data with a None value data_dict = {'text': 'some text', 'other': None} - MyDoc = create_from_dict(model_name='MyDoc', data_dict=data_dict) + MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict) assert issubclass(MyDoc, BaseDocument)