docarray · samsja · Jan 17, 2023 · Jan 12, 2023 · Jan 12, 2023 · Jan 12, 2023
diff --git a/README.md b/README.md
@@ -22,18 +22,24 @@ from typing import Optional
 class MyDocument(BaseDocument):
     description: str
     image_url: ImageUrl
-    image_tensor: Optional[TorchTensor[3, 224, 224]]
-    embedding: Optional[TorchTensor[768]]
+    image_tensor: Optional[TorchTensor[1704, 2272, 3]]
+    # The field above only works with tensors of shape (1704, 2272, 3)
+    embedding: Optional[TorchTensor]
 
 
 doc = MyDocument(
     description="This is a photo of a mountain",
     image_url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg",
 )
 doc.image_tensor = doc.image_url.load()  # load image tensor from URL
-doc.embedding = CLIPImageEncoder()(
+```
+
+```python
+doc.embedding = clip_image_encoder(
     doc.image_tensor
 )  # create and store embedding using model of your choice
+
+print(doc.embedding.shape)
 ```
 
 - **Model** data of any type (audio, video, text, images, 3D meshes, raw tensors, etc) as a single, unified data structure, the `Document`
@@ -47,9 +53,9 @@ from docarray.documents import Image
 doc = Image(
     url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg",
 )
-doc.image_tensor = doc.url.load()  # load image tensor from URL
-doc.embedding = CLIPImageEncoder()(
-    doc.image_tensor
+doc.tensor = doc.url.load()  # load image tensor from URL
+doc.embedding = clip_image_encoder(
+    doc.tensor
 )  # create and store embedding using model of your choice
 ```
 ### Compose nested Documents:
@@ -71,21 +77,50 @@ doc = MultiModalDocument(
 ```
 
 ### Collect multiple `Documents` into a `DocumentArray`:
+```python
+from docarray import DocumentArray, BaseDocument
+from docarray.typing import AnyTensor, ImageUrl
+import numpy as np
+
+
+class Image(BaseDocument):
+    url: ImageUrl
+    tensor: AnyTensor
+```
+
 ```python
 from docarray import DocumentArray
-from docarray.documents import Image
 
-da = DocumentArray(
+da = DocumentArray[Image](
     [
         Image(
             url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg",
+            tensor=np.zeros((3, 224, 224)),
         )
         for _ in range(100)
     ]
 )
 ```
 
 
+Access fields at the DocumentArray level:
+
+```python
+print(len(da.tensor))
+print(da.tensor[0].shape)
+```
+
+You can stack tensors if you want to perform batch processing:
+
+```python
+da = da.stack()
+```
+
+```python
+print(type(da.tensor))
+print(da.tensor.shape)
+```
+
 ## Send
 - **Serialize** any `Document` or `DocumentArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_
 - Use in **microservice** architecture: Send over **HTTP** or **gRPC**
@@ -101,7 +136,9 @@ doc = Image(tensor=np.zeros((3, 224, 224)))
 # JSON over HTTP
 async with AsyncClient(app=app, base_url="http://test") as ac:
     response = await ac.post("/doc/", data=input_doc.json())
+```
 
+```python
 # (de)serialize from/to protobuf
 Image.from_protobuf(doc.to_protobuf())
 ```
@@ -153,6 +190,94 @@ If you come from Pydantic, you can see Documents as juiced up models, and DocArr
 - Cloud ready: Serialization to **Protobuf** for use with microservices and **gRPC**
 - Support for **vector search functionalities**, such as `find()` and `embed()`
 
+## Coming from PyTorch
+
+DocArray can be used directly inside ML models to handle and represent multi-modal data. This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, and provides a (FastAPI compatible) schema that eases the transition between model training and model serving.
+
+To see the effect of this, let's first observe a vanilla PyTorch implementation of a tri-modal ML model:
+
+```python
+import torch
+from torch import nn
+
+
+class MyMultiModalModel(nn.Module):  # vanilla PyTorch: one positional argument per input
+    def __init__(self):
+        super().__init__()
+        self.audio_encoder = AudioEncoder()
+        self.image_encoder = ImageEncoder()
+        self.text_encoder = TextEncoder()
+
+    def forward(self, text_1, text_2, image_1, image_2, audio_1, audio_2):
+        embedding_text_1 = self.text_encoder(text_1)
+        embedding_text_2 = self.text_encoder(text_2)
+
+        embedding_image_1 = self.image_encoder(image_1)
+        embedding_image_2 = self.image_encoder(image_2)
+
+        embedding_audio_1 = self.audio_encoder(audio_1)  # audio uses its own encoder
+        embedding_audio_2 = self.audio_encoder(audio_2)
+
+        return (
+            embedding_text_1,
+            embedding_text_2,
+            embedding_image_1,
+            embedding_image_2,
+            embedding_audio_1,
+            embedding_audio_2,
+        )
+```
+
+Not very easy on the eyes if you ask us. And even worse, if you need to add one more modality you have to touch every part of your code base, changing the `forward()` return type and making a whole lot of changes downstream from that.
+
+So now let's see what the same code looks like with DocArray:
+
+```python
+from docarray import DocumentArray, BaseDocument
+from docarray.documents import Image, Text, Audio
+from docarray.typing import TorchTensor
+
+import torch
+
+
+class Podcast(BaseDocument):
+    text: Text
+    image: Image
+    audio: Audio
+
+
+class PairPodcast(BaseDocument):
+    left: Podcast
+    right: Podcast
+
+
+class MyPodcastModel(nn.Module):  # same three encoders, but inputs and outputs are DocumentArrays
+    def __init__(self):
+        super().__init__()
+        self.audio_encoder = AudioEncoder()
+        self.image_encoder = ImageEncoder()
+        self.text_encoder = TextEncoder()
+
+    def forward_podcast(self, da: DocumentArray[Podcast]) -> DocumentArray[Podcast]:
+        da.audio.embedding = self.audio_encoder(da.audio.tensor)
+        da.text.embedding = self.text_encoder(da.text.tensor)
+        da.image.embedding = self.image_encoder(da.image.tensor)
+
+        return da  # the same array, with the embedding fields assigned above
+
+    def forward(self, da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]:
+        da.left = self.forward_podcast(da.left)
+        da.right = self.forward_podcast(da.right)
+
+        return da
+```
+
+Looks much better, doesn't it?
+You instantly win in code readability and maintainability. And for the same price you can turn your PyTorch model into a FastAPI app and reuse your Document
+schema definition (see below). Everything is handled in a Pythonic manner by relying on type hints.
+
+
+
 ## Coming from FastAPI
 
 Documents are Pydantic Models (with a twist), and as such they are fully compatible with FastAPI:
@@ -165,6 +290,7 @@ from httpx import AsyncClient
 from docarray import BaseDocument
 from docarray.documents import Image
 from docarray.typing import NdArray
+from docarray.base_document import DocumentResponse
 
 
 class InputDoc(BaseDocument):
@@ -181,12 +307,13 @@ input_doc = InputDoc(img=Image(tensor=np.zeros((3, 224, 224))))
 app = FastAPI()
 
 
-@app.post("/doc/", response_model=OutputDoc)
+@app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse)
 async def create_item(doc: InputDoc) -> OutputDoc:
     ## call my fancy model to generate the embeddings
-    return OutputDoc(
+    doc = OutputDoc(
         embedding_clip=np.zeros((100, 1)), embedding_bert=np.zeros((100, 1))
     )
+    return doc
 
 
 async with AsyncClient(app=app, base_url="http://test") as ac:

diff --git a/docs/index_init.md b/docs/index_init.md
@@ -6,6 +6,7 @@
 
 api_public
 api/docarray
+tutorials/mutlimodal_training_and_serving.md
 ```
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,7 @@ @@
     api_public
     api/docarray
+    tutorials/mutlimodal_training_and_serving.md
     ```
@@ Expand Down @@