From 9fa3efebbae8b141139e15cac0b2d454ee6dac6c Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 12 Jan 2023 13:31:18 +0100 Subject: [PATCH 01/29] docs: add tutorials notebook as md Signed-off-by: Sami Jaghouar --- docs/index_init.md | 1 + .../mutlimodal_training_and_serving.md | 327 ++++++++++++++++++ 2 files changed, 328 insertions(+) create mode 100644 docs/tutorials/mutlimodal_training_and_serving.md diff --git a/docs/index_init.md b/docs/index_init.md index 17809b8333f..79711f6520e 100644 --- a/docs/index_init.md +++ b/docs/index_init.md @@ -6,6 +6,7 @@ api_public api/docarray +tutorials/mutlimodal_training_and_serving.md ``` diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md new file mode 100644 index 00000000000..cef2f8bf8f0 --- /dev/null +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -0,0 +1,327 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .md + format_name: markdown + format_version: '1.3' + jupytext_version: 1.14.0 + kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# MultiModal Deep learning with DocArray + + + The goal of this notebook is to showcase the usage of `docarray` with 'pytorch' to do multi-modal machine learning. + +We will train a [CLIP(https://arxiv.org/abs/2103.00020)-like model on a dataset compose of text and image. The goal is to train the model +that is able to understand both text and image and project them into a common embedding space. + +We train the CLIP-like model on the [flick8k](https://www.kaggle.com/datasets/adityajn105/flickr8k) dataset. To run this notebook you need to download and unzip the data into the same folder as the notebook. 
+ +In this notebook we don't aim at reproduce any CLIP results (our dataset is way to small anyway) but rather to show how DocArray datastructures help researcher to write beautiful and pythonic multi-modal pytorch code + +```python tags=[] +#!pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarray[torch,image]" +#!pip install torchvision +#!pip install transformers +#!pip install fastapi +``` + +```python +import itertools +from typing import Callable, Dict, List, Optional +``` + +```python +import docarray +import torch +``` + +```python +import torchvision +from torch import nn +from transformers import AutoTokenizer, DistilBertModel +``` + +```python +DEVICE = "cuda:2" +``` + + +## Create the Documents for handling the MutiModal data + + +At the heart of DocArray live the concept of `BaseDocument` that allow user to define a nested data schema to represent any kind of complex multi modal data. `BaseDocument` is a pythonic way to define a data schema, it is inspired by [Pydantic BaseModel](https://docs.pydantic.dev/usage/models/) (it is actually built on top of it) + +Lets start to define Document to handle the different modalities that we will use during our training + +```python +from docarray import BaseDocument, DocumentArray +from docarray.documents import Image +from docarray.documents import Text as BaseText +from docarray.typing import TorchTensor, ImageUrl +``` + +```python tags=[] +class Tokens(BaseDocument): + input_ids: TorchTensor[512] + attention_mask: TorchTensor +``` + +```python +class Text(BaseText): + tokens: Optional[Tokens] +``` + +the final document use for training here is the PairTextImage which combine the Text and Image modalities + +```python +class PairTextImage(BaseDocument): + text: Text + image: Image +``` + +## Create the Dataset + + +In this section we will create a multi modal pytorch dataset around the Flick8k dataset using docarray. 
+ +We will use DocArray data loading functionality to load the data and use torchvision and transformers to preprocess the data before fedding it to our deepl learning model + +```python +from torch.utils.data import DataLoader, Dataset +``` + +```python +class VisionPreprocess: + def __init__(self): + self.transform = torchvision.transforms.Compose( + [ + torchvision.transforms.ToTensor(), + torchvision.transforms.Resize(232), + torchvision.transforms.RandomCrop(224), + torchvision.transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + + def __call__(self, url: ImageUrl) -> TorchTensor[3, 224, 224]: + return self.transform(url.load()) +``` + +```python +class TextPreprocess: + def __init__(self): + self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") + + def __call__(self, text: str) -> Tokens: + return Tokens(**self.tokenizer(text, padding="max_length", truncation=True)) +``` + +```python +class PairDataset(Dataset): + def __init__( + self, + file: str, + vision_preprocess: VisionPreprocess, + text_preprocess: TextPreprocess, + N=None, + ): + self.docs = DocumentArray[PairTextImage]([]) + + with open("captions.txt", "r") as f: + lines = list(f.readlines()) + lines = lines[1:N] if N else lines[1:] + for line in lines: + line = line.split(",") + doc = PairTextImage( + text=Text(text=line[1]), image=Image(url=f"Images/{line[0]}") + ) + self.docs.append(doc) + + self.vision_preprocess = vision_preprocess + self.text_preprocess = text_preprocess + + def __len__(self): + return len(self.docs) + + def __getitem__(self, item): + doc = self.docs[item].copy() + doc.image.tensor = self.vision_preprocess(doc.image.url) + doc.text.tokens = self.text_preprocess(doc.text.text) + return doc + + @staticmethod + def collate_fn(batch: List[PairTextImage]): + batch = DocumentArray[PairTextImage](batch, tensor_type=TorchTensor) + batch = batch.stack() + + return batch +``` + +```python +vision_preprocess = VisionPreprocess() 
+text_preprocess = TextPreprocess() +``` + +```python +dataset = PairDataset("captions.txt", vision_preprocess, text_preprocess) +loader = DataLoader( + dataset, batch_size=64, collate_fn=PairDataset.collate_fn, shuffle=True +) +``` + +## Create the Pytorch model that work on DocumentArray + + +In this section we create two encoders one for each modalities (Text and Image). These encoders are nornal pytorch nn.Module. the Only difference is that they operate on DocumentArray directly rather that on tensor + +```python +class TextEncoder(nn.Module): + def __init__(self): + super().__init__() + self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased") + + def forward(self, texts: DocumentArray[Text]) -> TorchTensor: + last_hidden_state = self.bert( + input_ids=texts.tokens.input_ids, attention_mask=texts.tokens.attention_mask + ).last_hidden_state + + return self._mean_pool(last_hidden_state, texts.tokens.attention_mask) + + def _mean_pool( + self, last_hidden_state: TorchTensor, attention_mask: TorchTensor + ) -> TorchTensor: + masked_output = last_hidden_state * attention_mask.unsqueeze(-1) + return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True) +``` + +```python +class VisionEncoder(nn.Module): + def __init__(self): + super().__init__() + self.backbone = torchvision.models.resnet18(pretrained=True) + self.linear = nn.LazyLinear(out_features=768) + + def forward(self, images: DocumentArray[Image]) -> TorchTensor: + x = self.backbone(images.tensor) + return self.linear(x) +``` + +```python +vision_encoder = VisionEncoder().to(DEVICE) +text_encoder = TextEncoder().to(DEVICE) +``` + +## Train the model in a constrative way between Text and Image (CLIP) + + +Now that we have defined our dataloader and our models we can train the two encoder is a contrastive way. The goal is to match the representation of the text and the image for each pair in the dataset. 
+ +```python +optim = torch.optim.Adam( + itertools.chain(vision_encoder.parameters(), text_encoder.parameters()), lr=3e-4 +) +``` + +```python +def cosine_sim(x_mat: TorchTensor, y_mat: TorchTensor) -> TorchTensor: + a_n, b_n = x_mat.norm(dim=1)[:, None], y_mat.norm(dim=1)[:, None] + a_norm = x_mat / torch.clamp(a_n, min=1e-7) + b_norm = y_mat / torch.clamp(b_n, min=1e-7) + return torch.mm(a_norm, b_norm.transpose(0, 1)).squeeze() +``` + +```python +def clip_loss(image: DocumentArray[Image], text: DocumentArray[Text]) -> TorchTensor: + sims = cosine_sim(image.embedding, text.embedding) + return torch.norm(sims - torch.eye(sims.shape[0], device=DEVICE)) +``` + +```python +num_epoch = 1 # here you should do more epochs to really learn something +``` + +One things to notice here is that our dataloader does not return a torch.Tensor but a DocumentArray[PairTextImage] ! + +```python tags=[] +with torch.autocast(device_type="cuda", dtype=torch.float16): + for epoch in range(num_epoch): + for i, batch in enumerate(loader): + batch.to(DEVICE) + + optim.zero_grad() + batch.image.embedding = vision_encoder(batch.image) + batch.text.embedding = text_encoder(batch.text) + loss = clip_loss(batch.image, batch.text) + if i % 10 == 0: + print(f"{i+epoch} steps , loss : {loss}") + loss.backward() + optim.step() +``` +## From prototype to production in, well, almost no line of code + + +Now we have a ML clip model trained ! Let's see how we can serve this model with a RestAPI by reusing most of the code above. + +lets use our beloved [FastAPI](https://fastapi.tiangolo.com/) + + +FastAPI is powerfull because it allows you to define your RestAPI data schema only with python ! And DocArray is fully compatible with FastAPI that means that as long as you have a function that takes as input a Document FastAPI will be able to translate it into a fully fledge RestAPI with documentation, openAPI specification and more ! 
+ +```python +from fastapi import FastAPI +from docarray.base_document import DocumentResponse +``` + +```python +app = FastAPI() +``` + +```python +vision_encoder = vision_encoder.eval() +text_encoder = text_encoder.eval() +``` + +now we can test the API + +```python +@app.post("/embed_text/", response_model=Text, response_class=DocumentResponse) +async def embed_text(doc: Text) -> Text: + with torch.autocast(device_type="cuda", dtype=torch.float16): + with torch.inference_mode(): + doc.tokens = text_preprocess(doc.text) + da = DocumentArray[Text]([doc], tensor_type=TorchTensor).stack() + da.to(DEVICE) + doc.embedding = text_encoder(da)[0].to('cpu') + return doc +``` + +```python +from httpx import AsyncClient +``` + +```python +text_input = Text(text='a picture of a rainbow') +``` + +```python +async with AsyncClient( + app=app, + base_url="http://test", +) as ac: + response = await ac.post("/embed_text/", data=text_input.json()) +``` + +```python +doc_resp = Text.parse_raw(response.content.decode()) +``` + +```python +doc_resp.embedding.shape +``` From 117b4029076d7a91ef424f42c4b7999ae5778635 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 12 Jan 2023 15:01:35 +0100 Subject: [PATCH 02/29] chore: update readme Signed-off-by: Sami Jaghouar --- README.md | 57 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 8a8605d85b4..35d1357f8eb 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ from typing import Optional class MyDocument(BaseDocument): description: str image_url: ImageUrl - image_tensor: Optional[TorchTensor[3, 224, 224]] + image_tensor: Optional[TorchTensor] embedding: Optional[TorchTensor[768]] @@ -31,9 +31,11 @@ doc = MyDocument( image_url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", ) doc.image_tensor = doc.image_url.load() # load image tensor from URL -doc.embedding = CLIPImageEncoder()( +doc.embedding = 
clip_image_encoder( doc.image_tensor ) # create and store embedding using model of your choice + +print(doc.embedding.shape) ``` - **Model** data of any type (audio, video, text, images, 3D meshes, raw tensors, etc) as a single, unified data structure, the `Document` @@ -47,14 +49,14 @@ from docarray.documents import Image doc = Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", ) -doc.image_tensor = doc.url.load() # load image tensor from URL -doc.embedding = CLIPImageEncoder()( - doc.image_tensor +doc.tensor = doc.url.load() # load image tensor from URL +doc.embedding = clip_image_encoder( + doc.tensor ) # create and store embedding using model of your choice ``` ### Compose nested Documents: -```python + from docarray import BaseDocument from docarray.documents import Image, Text import numpy as np @@ -68,17 +70,28 @@ class MultiModalDocument(BaseDocument): doc = MultiModalDocument( image_doc=Image(tensor=np.zeros((3, 224, 224))), text_doc=Text(text='hi!') ) -``` + ### Collect multiple `Documents` into a `DocumentArray`: +```python +from docarray import DocumentArray, BaseDocument +from docarray.typing import AnyTensor, ImageUrl +import numpy as np + + +class Image(BaseDocument): + url: ImageUrl + tensor: AnyTensor +``` + ```python from docarray import DocumentArray -from docarray.documents import Image -da = DocumentArray( +da = DocumentArray[Image]( [ Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), ) for _ in range(100) ] @@ -86,6 +99,24 @@ da = DocumentArray( ``` +access field at the DocumentArray level + +```python +print(len(da.tensor)) +print(da.tensor[0].shape) +``` + +You can stack tensor if you want to perform in batch processing + +```python +da = da.stack() +``` + +```python +print(type(da.tensor)) +print(da.tensor.shape) +``` + ## Send - **Serialize** any `Document` or `DocumentArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ - Use in 
**microservice** architecture: Send over **HTTP** or **gRPC** @@ -101,7 +132,9 @@ doc = Image(tensor=np.zeros((3, 224, 224))) # JSON over HTTP async with AsyncClient(app=app, base_url="http://test") as ac: response = await ac.post("/doc/", data=input_doc.json()) +``` +```python # (de)serialize from/to protobuf Image.from_protobuf(doc.to_protobuf()) ``` @@ -165,6 +198,7 @@ from httpx import AsyncClient from docarray import BaseDocument from docarray.documents import Image from docarray.typing import NdArray +from docarray.base_document import DocumentResponse class InputDoc(BaseDocument): @@ -181,12 +215,13 @@ input_doc = InputDoc(img=Image(tensor=np.zeros((3, 224, 224)))) app = FastAPI() -@app.post("/doc/", response_model=OutputDoc) +@app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse) async def create_item(doc: InputDoc) -> OutputDoc: ## call my fancy model to generate the embeddings - return OutputDoc( + doc = OutputDoc( embedding_clip=np.zeros((100, 1)), embedding_bert=np.zeros((100, 1)) ) + return doc async with AsyncClient(app=app, base_url="http://test") as ac: From 7e5b73ff299ebe5cbb557320a2381b4a9d3baba3 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 12 Jan 2023 15:10:08 +0100 Subject: [PATCH 03/29] docs: aply j suggestion Signed-off-by: Sami Jaghouar --- docs/tutorials/mutlimodal_training_and_serving.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index cef2f8bf8f0..cc91274bd97 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -325,3 +325,5 @@ doc_resp = Text.parse_raw(response.content.decode()) ```python doc_resp.embedding.shape ``` + +And we're done! You have trained and served a mulit-modal ML model, with zero headache and a lot of DocArray! 
From 708eeff0d6f29307022678aa226f8c0d56966545 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:10:59 +0100 Subject: [PATCH 04/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index cc91274bd97..7c043fc7f28 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -15,9 +15,9 @@ jupyter: # MultiModal Deep learning with DocArray - The goal of this notebook is to showcase the usage of `docarray` with 'pytorch' to do multi-modal machine learning. + The goal of this notebook is to showcase the usage of **DocArray** with **PyTorch** to do multi-modal machine learning. -We will train a [CLIP(https://arxiv.org/abs/2103.00020)-like model on a dataset compose of text and image. The goal is to train the model +We will train a [CLIP](https://arxiv.org/abs/2103.00020)-like model on a dataset composed of text and image. The goal is to train the model that is able to understand both text and image and project them into a common embedding space. We train the CLIP-like model on the [flick8k](https://www.kaggle.com/datasets/adityajn105/flickr8k) dataset. To run this notebook you need to download and unzip the data into the same folder as the notebook. 
From dee036d404f672256b68d763ec757c116d6172ce Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:11:11 +0100 Subject: [PATCH 05/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 7c043fc7f28..5d90eb4a1b6 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -22,7 +22,7 @@ that is able to understand both text and image and project them into a common em We train the CLIP-like model on the [flick8k](https://www.kaggle.com/datasets/adityajn105/flickr8k) dataset. To run this notebook you need to download and unzip the data into the same folder as the notebook. -In this notebook we don't aim at reproduce any CLIP results (our dataset is way to small anyway) but rather to show how DocArray datastructures help researcher to write beautiful and pythonic multi-modal pytorch code +In this notebook we don't aim at reproducing any CLIP results (our dataset is way too small anyway) but rather to show how DocArray datastructures help researchers and practitioners to write beautiful and pythonic multi-modal PyTorch code. ```python tags=[] #!pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarray[torch,image]" @@ -55,9 +55,11 @@ DEVICE = "cuda:2" ## Create the Documents for handling the MutiModal data -At the heart of DocArray live the concept of `BaseDocument` that allow user to define a nested data schema to represent any kind of complex multi modal data. 
`BaseDocument` is a pythonic way to define a data schema, it is inspired by [Pydantic BaseModel](https://docs.pydantic.dev/usage/models/) (it is actually built on top of it) +At the heart of DocArray lives the concept of a `Document`, a collection of multi-modal data. The `BaseDocument` class allows users to define their own (nested, multi-modal) Document schema to represent any kind of complex data. -Lets start to define Document to handle the different modalities that we will use during our training +`BaseDocument` is a pythonic way to define a data schema, and is inspired by and built on top of [Pydantic BaseModel](https://docs.pydantic.dev/usage/models/). + +Let's start by defining a few Documents to handle the different modalities that we will use during our training: ```python from docarray import BaseDocument, DocumentArray From 8dbf46095bc844b9580814d49c5591e1b29a7823 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:11:50 +0100 Subject: [PATCH 06/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 5d90eb4a1b6..09823cd8f7d 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -63,12 +63,11 @@ Let's start by defining a few Documents to handle the different modalities that ```python from docarray import BaseDocument, DocumentArray -from docarray.documents import Image -from docarray.documents import Text as BaseText from docarray.typing import TorchTensor, ImageUrl -``` - +Let's first create a Document for our Text modality. 
It will contain a number of `Tokens`, which we also define: ```python tags=[] +from docarray.documents import Text as BaseText + class Tokens(BaseDocument): input_ids: TorchTensor[512] attention_mask: TorchTensor @@ -78,7 +77,7 @@ class Tokens(BaseDocument): class Text(BaseText): tokens: Optional[Tokens] ``` - +Notice the `TorchTensor` type. It is a thin wrapper around `torch.Tensor` that enables additional features like shape parametrization (`TorchTensor[512]`), but can be used like any other torch tensor. If you want, you can always get a raw `torch.Tensor` from a `TorchTensor`. the final document use for training here is the PairTextImage which combine the Text and Image modalities ```python From eb6ef638f7cd0c05680b658d17072aff3ede742a Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:12:01 +0100 Subject: [PATCH 07/29] Update docs/tutorials/mutlimodal_training_and_serving.md Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 09823cd8f7d..09dacf225cf 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -78,7 +78,7 @@ class Text(BaseText): tokens: Optional[Tokens] ``` Notice the `TorchTensor` type. It is a thin wrapper around `torch.Tensor` that enables additional features like shape parametrization (`TorchTensor[512]`), but can be used like any other torch tensor. If you want, you can always get a raw `torch.Tensor` from a `TorchTensor`. 
-the final document use for training here is the PairTextImage which combine the Text and Image modalities +The final Document used for training here is the `PairTextImage`, which combines the Text and Image modalities: ```python class PairTextImage(BaseDocument): From 63238c8aecc7d50eb75bbed10d29c5f99b98a850 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:12:14 +0100 Subject: [PATCH 08/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 09dacf225cf..1b6df3bad28 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -89,7 +89,7 @@ class PairTextImage(BaseDocument): ## Create the Dataset -In this section we will create a multi modal pytorch dataset around the Flick8k dataset using docarray. +In this section we will create a multi modal pytorch dataset around the Flick8k dataset using DocArray. 
We will use DocArray data loading functionality to load the data and use torchvision and transformers to preprocess the data before fedding it to our deepl learning model From e6ba8f1fbf8bf30b68e86a944f6d8df64dfec9b3 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:12:27 +0100 Subject: [PATCH 09/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 1b6df3bad28..d6dbd4fcd41 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -91,7 +91,7 @@ class PairTextImage(BaseDocument): In this section we will create a multi modal pytorch dataset around the Flick8k dataset using DocArray. 
-We will use DocArray data loading functionality to load the data and use torchvision and transformers to preprocess the data before fedding it to our deepl learning model +We will use DocArray data loading functionality to load the data and use Torchvision and Transformers to preprocess the data before feeding it to our deep learning model: ```python from torch.utils.data import DataLoader, Dataset From 3a4e0e72632193035ac70be5212f7713021ba457 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:12:38 +0100 Subject: [PATCH 10/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index d6dbd4fcd41..de41c1ac73e 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -180,7 +180,7 @@ loader = DataLoader( ## Create the Pytorch model that work on DocumentArray -In this section we create two encoders one for each modalities (Text and Image). These encoders are nornal pytorch nn.Module. the Only difference is that they operate on DocumentArray directly rather that on tensor +In this section we create two encoders one for each modalities (Text and Image). These encoders are normal PyTorch `nn.Module`s. 
The only difference is that they operate on DocumentArray rather that on torch.Tensor: ```python class TextEncoder(nn.Module): From 3bb9e1c4eff2ddf77e580aed1a1f9fc6609141a2 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:12:49 +0100 Subject: [PATCH 11/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index de41c1ac73e..892186dfd16 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -177,7 +177,7 @@ loader = DataLoader( ) ``` -## Create the Pytorch model that work on DocumentArray +## Create the Pytorch model that works on DocumentArray In this section we create two encoders one for each modalities (Text and Image). These encoders are normal PyTorch `nn.Module`s. 
The only difference is that they operate on DocumentArray rather that on torch.Tensor: From 770c54ef6944ba455a74aed89d3a2f8dfa847c41 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:13:11 +0100 Subject: [PATCH 12/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 892186dfd16..b136f43760d 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -218,7 +218,7 @@ class VisionEncoder(nn.Module): vision_encoder = VisionEncoder().to(DEVICE) text_encoder = TextEncoder().to(DEVICE) ``` - +As you can see, DocArray helps us to clearly convey what data is expected as input and output for each method, all through Python type hints. 
## Train the model in a constrative way between Text and Image (CLIP) From 3771c3bf43f1b53d451dbda0ef3e02482ee61f2f Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:13:21 +0100 Subject: [PATCH 13/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index b136f43760d..f8a8c18f72f 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -222,7 +222,7 @@ As you can see, DocArray helps us to clearly convey what data is expected as inp ## Train the model in a constrative way between Text and Image (CLIP) -Now that we have defined our dataloader and our models we can train the two encoder is a contrastive way. The goal is to match the representation of the text and the image for each pair in the dataset. +Now that we have defined our dataloader and our models we can train the two encoders in a contrastive way. The goal is to match the representation of the text and the image for each pair in the dataset. 
```python optim = torch.optim.Adam( From 7a7d9d51d52f3e130c2ca50016267dd50b9df8eb Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:14:24 +0100 Subject: [PATCH 14/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index f8a8c18f72f..08c7e5f1131 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -248,7 +248,7 @@ def clip_loss(image: DocumentArray[Image], text: DocumentArray[Text]) -> TorchTe num_epoch = 1 # here you should do more epochs to really learn something ``` -One things to notice here is that our dataloader does not return a torch.Tensor but a DocumentArray[PairTextImage] ! +One thing to notice here is that our dataloader does not return a `torch.Tensor` but a `DocumentArray[PairTextImage]` ! 
```python tags=[] with torch.autocast(device_type="cuda", dtype=torch.float16): From 1cb0093ff73f9b73525545b060c63fd3d40fc9d8 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:14:38 +0100 Subject: [PATCH 15/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 08c7e5f1131..99a9f67a4a8 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -264,8 +264,6 @@ with torch.autocast(device_type="cuda", dtype=torch.float16): print(f"{i+epoch} steps , loss : {loss}") loss.backward() optim.step() -``` -## From prototype to production in, well, almost no line of code Now we have a ML clip model trained ! Let's see how we can serve this model with a RestAPI by reusing most of the code above. 
From 7fa3da732e8e0e5d1e5a00d19d606132275f0999 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:14:50 +0100 Subject: [PATCH 16/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 99a9f67a4a8..9c92ab2084b 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -268,7 +268,7 @@ with torch.autocast(device_type="cuda", dtype=torch.float16): Now we have a ML clip model trained ! Let's see how we can serve this model with a RestAPI by reusing most of the code above. -lets use our beloved [FastAPI](https://fastapi.tiangolo.com/) +Let's use our beloved [FastAPI](https://fastapi.tiangolo.com/) for that! FastAPI is powerfull because it allows you to define your RestAPI data schema only with python ! And DocArray is fully compatible with FastAPI that means that as long as you have a function that takes as input a Document FastAPI will be able to translate it into a fully fledge RestAPI with documentation, openAPI specification and more ! 
From cae462878f23b6ff458311b69cd01a810dac9e6b Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:15:03 +0100 Subject: [PATCH 17/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 9c92ab2084b..1a8ebf48c99 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -287,7 +287,7 @@ vision_encoder = vision_encoder.eval() text_encoder = text_encoder.eval() ``` -now we can test the API +Now all we need to do is to tell FastAPI what methods it should use to serve the model: ```python @app.post("/embed_text/", response_model=Text, response_class=DocumentResponse) @@ -323,6 +323,5 @@ doc_resp = Text.parse_raw(response.content.decode()) ```python doc_resp.embedding.shape -``` And we're done! You have trained and served a mulit-modal ML model, with zero headache and a lot of DocArray! 
From ef3b320d978b5234a2246d4ceea5e16d6208b85f Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:15:19 +0100 Subject: [PATCH 18/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- docs/tutorials/mutlimodal_training_and_serving.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 1a8ebf48c99..a13a4f011e5 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -266,12 +266,13 @@ with torch.autocast(device_type="cuda", dtype=torch.float16): optim.step() -Now we have a ML clip model trained ! Let's see how we can serve this model with a RestAPI by reusing most of the code above. +Now we have a trained CLIP mode, let's see how we can serve this model with a Rest API by reusing most of the code above. Let's use our beloved [FastAPI](https://fastapi.tiangolo.com/) for that! -FastAPI is powerfull because it allows you to define your RestAPI data schema only with python ! And DocArray is fully compatible with FastAPI that means that as long as you have a function that takes as input a Document FastAPI will be able to translate it into a fully fledge RestAPI with documentation, openAPI specification and more ! +FastAPI is powerful because it allows you to define your Rest API data schema in pure Python. 
+And DocArray is fully compatible with FastAPI, which means that as long as you have a function that takes a Document as input, FastAPI will be able to automatically translate it into a fully fledged Rest API with documentation, openAPI specification and more: ```python from fastapi import FastAPI From e7d7ed9b1ac0d7945111c584ef262060f20b4afa Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 12 Jan 2023 15:50:28 +0100 Subject: [PATCH 19/29] chore: add from pytorch to the readme Signed-off-by: Sami Jaghouar --- README.md | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/README.md b/README.md index 35d1357f8eb..5e592735790 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,94 @@ If you come from Pydantic, you can see Documents as juiced up models, and DocArr - Cloud ready: Serialization to **Protobuf** for use with microservices and **gRPC** - Support for **vector search functionalities**, such as `find()` and `embed()` +## Coming from Pytorch + +DocArray is meant to be used directly inside machine learning model to handle an represent multi modal nested data. Not only It promises to reduce the friction between ML model training and ML model serving, using DocArray with pytorch allow to reason with the nested and multi modal abstraction deep inside the nn.Module part of pytorch. 
+ + + +```python +import torch +from torch import nn + + +class MyMultiModalModel(nn.Module): + def __init__(self): + super().__init__() + self.audio_encoder = AudioEncoder() + self.image_encoder = ImageEncoder() + self.text_encoder = TextEncoder() + + def forward(self, text_1, text_2, image_1, image_2, audio_1, audio_2): + emnedding_text_1 = self.text_encoder(text_1) + emnedding_text_2 = self.text_encoder(text_2) + + emnedding_image_1 = self.image_encoder(image_1) + emnedding_image_2 = self.image_encoder(image_2) + + emnedding_audio_1 = self.image_encoder(audio_1) + emnedding_audio_2 = self.image_encoder(audio_2) + + return ( + emnedding_text_1, + emnedding_text_2, + emnedding_image_1, + emnedding_image_2, + emnedding_audio_1, + emnedding_audio_2, + ) +``` + +It is not easy on the eye ..., even worse if you need to add one more modality you have to handle all of these tuples etcc + +Let see how it will loooks with DocArray + +```python +from docarray import DocumentArray, BaseDocument +from docarray.documents import Image, Text, Audio +from docarray.typing import TorchTensor + +import torch + + +class Podcast(BaseDocument): + text: Text + image: Image + audio: Audio + + +class PairPodcast(BaseDocument): + left: Podcast + right: Podcast + + +class MyPodcastModel(nn.Module): + def __init__(self): + super().__init__() + self.audio_encoder = AudioEncoder() + self.image_encoder = ImageEncoder() + self.text_encoder = TextEncoder() + + def forward_podcast(da: DocumentArray[Podcast]) -> DocumentArray[Podcast]: + da.audio.embedding = self.audio_encoder(da.audio.tensor) + da.text.embedding = self.text_encoder(da.text.tensor) + da.image.embedding = self.image_encoder(da.image.tensor) + + return da + + def forward(da: DocumentArray[PairPodcast]) -> TorchTensor: + da.left = self.forward_podcast(da.left) + da.right = self.forward_podcast(da.right) + + return da +``` + +You win in code readibility and maintainability. 
And for the same price you can turn your pytorch model into a FastAPI app and reuse the same +schema definition that you used during training as schema for your RestAPI. Everything handle in pythonic manner relying the the type hint for your +function + + + ## Coming from FastAPI Documents are Pydantic Models (with a twist), and as such they are fully compatible with FastAPI: From e67c022501d76c3eee6cf4f94a229404c67f8031 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 12 Jan 2023 15:53:49 +0100 Subject: [PATCH 20/29] chore: fix type gint readme Signed-off-by: Sami Jaghouar --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5e592735790..366907f554c 100644 --- a/README.md +++ b/README.md @@ -261,7 +261,7 @@ class MyPodcastModel(nn.Module): return da - def forward(da: DocumentArray[PairPodcast]) -> TorchTensor: + def forward(da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]: da.left = self.forward_podcast(da.left) da.right = self.forward_podcast(da.right) From 1f450844f8eb5e7fa97052ab042eef3205e18b61 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Mon, 16 Jan 2023 10:39:36 +0100 Subject: [PATCH 21/29] feat: update notebook Signed-off-by: Sami Jaghouar --- docs/tutorials/mutlimodal_training_and_serving.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index a13a4f011e5..92215099c93 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -65,7 +65,7 @@ Let's start by defining a few Documents to handle the different modalities that from docarray import BaseDocument, DocumentArray from docarray.typing import TorchTensor, ImageUrl Let's first create a Document for our Text modality. 
It will contain a number of `Tokens`, which we also define: -```python tags=[] +``` from docarray.documents import Text as BaseText class Tokens(BaseDocument): @@ -274,7 +274,7 @@ Let's use our beloved [FastAPI](https://fastapi.tiangolo.com/) for that! FastAPI is powerful because it allows you to define your Rest API data schema in pure Python. And DocArray is fully compatible with FastAPI, which means that as long as you have a function that takes a Document as input, FastAPI will be able to automatically translate it into a fully fledged Rest API with documentation, openAPI specification and more: -```python +``` from fastapi import FastAPI from docarray.base_document import DocumentResponse ``` @@ -326,3 +326,4 @@ doc_resp = Text.parse_raw(response.content.decode()) doc_resp.embedding.shape And we're done! You have trained and served a mulit-modal ML model, with zero headache and a lot of DocArray! +``` From 095cb2609a77d99d17897833ef2b93206a70fb98 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Mon, 16 Jan 2023 12:14:33 +0100 Subject: [PATCH 22/29] feat: apply johannes suggestion Co-authored-by: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 366907f554c..96b657790e2 100644 --- a/README.md +++ b/README.md @@ -99,14 +99,14 @@ da = DocumentArray[Image]( ``` -access field at the DocumentArray level +Access fields at the DocumentArray level: ```python print(len(da.tensor)) print(da.tensor[0].shape) ``` -You can stack tensor if you want to perform in batch processing +You can stack tensors if you want to perform in batch processing: ```python da = da.stack() @@ -186,9 +186,9 @@ If you come from Pydantic, you can see Documents as juiced up models, and DocArr - Cloud ready: Serialization to **Protobuf** for use with 
microservices and **gRPC** - Support for **vector search functionalities**, such as `find()` and `embed()` -## Coming from Pytorch +## Coming from PyTorch -DocArray is meant to be used directly inside machine learning model to handle an represent multi modal nested data. Not only It promises to reduce the friction between ML model training and ML model serving, using DocArray with pytorch allow to reason with the nested and multi modal abstraction deep inside the nn.Module part of pytorch. +DocArray can be used directly inside ML models to handle and represent multi-modal data. This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, and provides a (FastAPI compatible) schema that eases the transition between model training and model serving. @@ -224,9 +224,9 @@ class MyMultiModalModel(nn.Module): ) ``` -It is not easy on the eye ..., even worse if you need to add one more modality you have to handle all of these tuples etcc +Not very easy on the eyes if you ask us. And even worse, if you need to add one more modality you have to touch every part of your code base, changing the `forward()` return type and make a whole lot of changes downstream from that. -Let see how it will loooks with DocArray +So now let's see what the same code looks like with DocArray: ```python from docarray import DocumentArray, BaseDocument @@ -268,9 +268,9 @@ class MyPodcastModel(nn.Module): return da ``` -You win in code readibility and maintainability. And for the same price you can turn your pytorch model into a FastAPI app and reuse the same -schema definition that you used during training as schema for your RestAPI. Everything handle in pythonic manner relying the the type hint for your -function +Looks much better, doesn't it? +You instantly win in code readability and maintainability. And for the same price you can turn your PyTorch model into a FastAPI app and reuse your Document +schema definition (see below). 
Everything is handled in a pythonic manner by relying on type hints. From 0903d5ca2a53c1cd7879281f05df102832abb1b9 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Mon, 16 Jan 2023 12:15:53 +0100 Subject: [PATCH 23/29] feat: apply johanes suggestion Signed-off-by: Sami Jaghouar --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 96b657790e2..01020d940c2 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ If you come from Pydantic, you can see Documents as juiced up models, and DocArr DocArray can be used directly inside ML models to handle and represent multi-modal data. This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, and provides a (FastAPI compatible) schema that eases the transition between model training and model serving. - +To see the effect of this, let's first observe a vanilla PyTorch implementation of a tri-modal ML model: ```python import torch From bbf3a4fec652f47921556981caac065f5d446d0a Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 16 Jan 2023 15:26:38 +0100 Subject: [PATCH 24/29] docs: add more text to the notebook Signed-off-by: Johannes Messner --- .../mutlimodal_training_and_serving.md | 130 ++++++++++++++---- 1 file changed, 107 insertions(+), 23 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index 92215099c93..cfb308d1d7f 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -12,17 +12,35 @@ jupyter: name: python3 --- -# MultiModal Deep learning with DocArray +# Multi-Modal Deep learning with DocArray +DocArray is a library for representing, sending, and storing multi-modal data that can be used for a variety of different +use cases. - The goal of this notebook is to showcase the usage of **DocArray** with **PyTorch** to do multi-modal machine learning.
+Here we will focus on a workflow familiar to many ML Engineers: Building and training a model, and then serving it to +users. -We will train a [CLIP](https://arxiv.org/abs/2103.00020)-like model on a dataset compose of text and image. The goal is to train the model -that is able to understand both text and image and project them into a common embedding space. +This notebook contains two parts: -We train the CLIP-like model on the [flick8k](https://www.kaggle.com/datasets/adityajn105/flickr8k) dataset. To run this notebook you need to download and unzip the data into the same folder as the notebook. +1. **Representing**: We will use DocArray to represent multi-modal data while **building and training a PyTorch model**. +We will see how DocArray can help to organize and group your modalities and tensors and make clear what methods expect as inputs and return as outputs. +2. **Sending**: We will take the model that we built and trained in part 1, and **serve it using FastAPI**. +We will see how DocArray narrows the gap between model development and model deployment, and how the same data models can be +reused in both contexts. That part will be very short, but that's the point! -In this notebook we don't aim at reproduce any CLIP results (our dataset is way to small anyway) but rather to show how DocArray datastructures help researchers and practitioners to write beautiful and pythonic multi-modal PyTorch code. +So without further ado, let's dive into it! + +# 1. Representing: Build and train a PyTorch model + +We will train a [CLIP](https://arxiv.org/abs/2103.00020)-like model on a dataset composes of text-image-pairs. +The goal is to obtain a model that is able to understand both text and images and project them into a common embedding space. + +We train the CLIP-like model on the [flickr8k](https://www.kaggle.com/datasets/adityajn105/flickr8k) dataset. +To run this notebook you need to download and unzip the data into the same folder as the notebook. 
+ +Not that in this notebook by no means we aim at reproduce any CLIP results (our dataset is way too small anyways), +but rather we want to show how DocArray datastructures help researchers and practitioners to write beautiful and +pythonic multi-modal PyTorch code. ```python tags=[] #!pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarray[torch,image]" @@ -48,26 +66,32 @@ from transformers import AutoTokenizer, DistilBertModel ``` ```python -DEVICE = "cuda:2" +DEVICE = "cuda:2" # change to your favourite device ``` -## Create the Documents for handling the MutiModal data +## Create the Documents for handling the Muti-Modal data -At the heart of DocArray live the concept of a `Document`, a collection of mulit-modal data. The `BaseDocument` class allows users to define their own (nested, multi-modal) Document schema to represent any kind of complex data. +The first thing we are trying to achieve when using DocArray is to clearly model our data so that we never get confused +about which tensors are supposed to represent what. -`BaseDocument` is a pythonic way to define a data schema, and is inspired by and built on top of [Pydantic BaseModel](https://docs.pydantic.dev/usage/models/). +To do that we are using a concept that is at the core of DocArray. The `Document`, a collection of multi-modal data. +The `BaseDocument` class allows users to define their own (nested, multi-modal) Document schema to represent any kind of complex data. Let's start by defining a few Documents to handle the different modalities that we will use during our training: ```python from docarray import BaseDocument, DocumentArray from docarray.typing import TorchTensor, ImageUrl -Let's first create a Document for our Text modality. It will contain a number of `Tokens`, which we also define: ``` + +Let's first create a Document for our Text modality. 
It will contain a number of `Tokens`, which we also define: + +```python from docarray.documents import Text as BaseText + class Tokens(BaseDocument): input_ids: TorchTensor[512] attention_mask: TorchTensor @@ -77,8 +101,30 @@ class Tokens(BaseDocument): class Text(BaseText): tokens: Optional[Tokens] ``` -Notice the `TorchTensor` type. It is a thin wrapper around `torch.Tensor` that enables additional features like shape parametrization (`TorchTensor[512]`), but can be use like any other torch tensor. If you want, you can always get a raw `torch.Tensor` from a `TorchTensor`. -The final Document used for training here is the `PairTextImage`, which combines the Text and Image modalities: +Notice the `TorchTensor` type. It is a thin wrapper around `torch.Tensor` that can be use like any other torch tensor, +but also enables additional features. One such feature is shape parametrization (`TorchTensor[512]`), which lets you +hint and even enforce the desired shape of any tensor! + +To represent our image data, we use the `Image` Document that is included in DocArray: + +```python +from docarray.documents import Image +``` + +Under the hood, an `Image` looks something like this (with the only main difference that it can take tensors from any +supported ML framework): + +```python +# class Image(BaseDocument): +# url: Optional[ImageUrl] +# tensor: Optional[TorchTesor] +# embedding: Optional[TorchTensor] +``` + +Actually, the `BaseText` above also alredy includes `tensor`, `url` and `embedding` fields, so we can use those on our +`Text` Document as well. + +The final Document used for training here is the `PairTextImage`, which simply combines the Text and Image modalities: ```python class PairTextImage(BaseDocument): @@ -89,7 +135,7 @@ class PairTextImage(BaseDocument): ## Create the Dataset -In this section we will create a multi modal pytorch dataset around the Flick8k dataset using DocArray. 
+In this section we will create a multi-modal pytorch dataset around the Flick8k dataset using DocArray. We will use DocArray data loading functionality to load the data and use Torchvision and Transformers to preprocess the data before feeding it to our deep learning model: @@ -124,6 +170,8 @@ class TextPreprocess: return Tokens(**self.tokenizer(text, padding="max_length", truncation=True)) ``` +`VisionPreprocess` and `TextPreprocess` implement standard preprocessing steps for images and text, nothing special here. + ```python class PairDataset(Dataset): def __init__( @@ -165,6 +213,11 @@ class PairDataset(Dataset): return batch ``` +In the `PairDataset` class we can already see some of the beauty of DocArray. +The dataset will return Documents that contain the text and image data, accessible via `doc.text` and `doc.image`. + +Now let's instantiate this dataset: + ```python vision_preprocess = VisionPreprocess() text_preprocess = TextPreprocess() @@ -180,7 +233,8 @@ loader = DataLoader( ## Create the Pytorch model that works on DocumentArray -In this section we create two encoders one for each modalities (Text and Image). These encoders are normal PyTorch `nn.Module`s. The only difference is that they operate on DocumentArray rather that on torch.Tensor: +In this section we create two encoders, one per modality (Text and Image). These encoders are normal PyTorch `nn.Module`s. +The only difference is that they operate on DocumentArray rather that on torch.Tensor: ```python class TextEncoder(nn.Module): @@ -202,6 +256,10 @@ class TextEncoder(nn.Module): return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True) ``` +The `TextEncoder` takes a `DocumentArray` of `Text`s as input, and returns an embedding `TorchTensor` as output. +`DocumentArray` can be seen as a list of `Text` documents, and the encoder will treat it as one batch. 
+ + ```python class VisionEncoder(nn.Module): def __init__(self): @@ -214,15 +272,23 @@ class VisionEncoder(nn.Module): return self.linear(x) ``` +Similarly, the `VisionEncoder` also takes a `DocumentArray` of `Image`s as input, and returns an embedding `TorchTensor` as output. +However, it operates on the `image` attribute of each Document. + +Now we can instantiate our encoders: + ```python vision_encoder = VisionEncoder().to(DEVICE) text_encoder = TextEncoder().to(DEVICE) ``` + As you can see, DocArray helps us to clearly convey what data is expected as input and output for each method, all through Python type hints. -## Train the model in a constrative way between Text and Image (CLIP) +## Train the model in a contrastive way between Text and Image (CLIP) -Now that we have defined our dataloader and our models we can train the two encoders is a contrastive way. The goal is to match the representation of the text and the image for each pair in the dataset. + +Now that we have defined our dataloader and our models, we can train the two encoders in a contrastive way. +The goal is to match the representation of the text and the image for each pair in the dataset. ```python optim = torch.optim.Adam( @@ -244,19 +310,26 @@ def clip_loss(image: DocumentArray[Image], text: DocumentArray[Text]) -> TorchTe return torch.norm(sims - torch.eye(sims.shape[0], device=DEVICE)) ``` +In the type hints of `cosine_sim` and `clip_loss` you can again notice that we can treat a `TorchTensor` like any other +`torch.Tensor`, and how we can make explicit what kind of data and data modalities the different functions expect. + ```python num_epoch = 1 # here you should do more epochs to really learn something ``` -One things to notice here is that our dataloader does not return a `torch.Tensor` but a `DocumentArray[PairTextImage]` ! 
+One things to notice here is that our dataloader does not return a `torch.Tensor` but a `DocumentArray[PairTextImage]`, +which is exactly what our model can operate on. + +So let's write a training loop and train our encoders: ```python tags=[] with torch.autocast(device_type="cuda", dtype=torch.float16): for epoch in range(num_epoch): for i, batch in enumerate(loader): - batch.to(DEVICE) + batch.to(DEVICE) # DocumentArray can be moved to device optim.zero_grad() + # FORWARD PASS: batch.image.embedding = vision_encoder(batch.image) batch.text.embedding = text_encoder(batch.text) loss = clip_loss(batch.image, batch.text) @@ -264,17 +337,24 @@ with torch.autocast(device_type="cuda", dtype=torch.float16): print(f"{i+epoch} steps , loss : {loss}") loss.backward() optim.step() +``` + +Here we can see how we can immediately group the output of each encoder with the Document (and modality) it belong to. +And with all that, we've successfully trained a CLIP-like model without ever being confused the meaning of any tensors! -Now we have a trained CLIP mode, let's see how we can serve this model with a Rest API by reusing most of the code above. +# 1. Sending: Serve the model using FastAPI + +Now that we have a trained CLIP model, let's see how we can serve this model with a REST API by reusing most of the code above. Let's use our beloved [FastAPI](https://fastapi.tiangolo.com/) for that! FastAPI is powerful because it allows you to define your Rest API data schema in pure Python. 
-And DocArray is fully compatible with FastAPI, which means that as long as you have a function that takes a Document as input, FastAPI will be able to automatically translate it into a fully fledged Rest API with documentation, openAPI specification and more: +And DocArray is fully compatible with FastAPI and Pydantic, which means that as long as you have a function that takes a Document as input, +FastAPI will be able to automatically translate it into a fully fledged API with documentation, openAPI specification and more: -``` +```python from fastapi import FastAPI from docarray.base_document import DocumentResponse ``` @@ -302,6 +382,10 @@ async def embed_text(doc: Text) -> Text: return doc ``` +You can see that our earlier definition of the `Text` Document now doubles as the API schema for the `/embed_text` endpoint. + +With this running, we can query our model over the network: + ```python from httpx import AsyncClient ``` @@ -324,6 +408,6 @@ doc_resp = Text.parse_raw(response.content.decode()) ```python doc_resp.embedding.shape +``` And we're done! You have trained and served a mulit-modal ML model, with zero headache and a lot of DocArray! -``` From 3b78c976535f0ace5111c1f6264962277b97ecdc Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Mon, 16 Jan 2023 15:52:26 +0100 Subject: [PATCH 25/29] docs: update notebook Signed-off-by: Sami Jaghouar --- docs/tutorials/mutlimodal_training_and_serving.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index cfb308d1d7f..a9575268db3 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -38,8 +38,8 @@ The goal is to obtain a model that is able to understand both text and images an We train the CLIP-like model on the [flickr8k](https://www.kaggle.com/datasets/adityajn105/flickr8k) dataset. 
To run this notebook you need to download and unzip the data into the same folder as the notebook. -Not that in this notebook by no means we aim at reproduce any CLIP results (our dataset is way too small anyways), -but rather we want to show how DocArray datastructures help researchers and practitioners to write beautiful and +Note that in this notebook we by no means aim to reproduce any CLIP results (our dataset is way too small anyway), +but we rather want to show how DocArray datastructures help researchers and practitioners to write beautiful and pythonic multi-modal PyTorch code. ```python tags=[] @@ -343,7 +343,7 @@ Here we can see how we can immediately group the output of each encoder with the And with all that, we've successfully trained a CLIP-like model without ever being confused the meaning of any tensors! -# 1. Sending: Serve the model using FastAPI +# 2. Sending: Serve the model using FastAPI Now that we have a trained CLIP model, let's see how we can serve this model with a REST API by reusing most of the code above. 
From 732eeec5081a89f4ded405122cbf3ea78139fbc2 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Tue, 17 Jan 2023 15:22:33 +0100 Subject: [PATCH 26/29] chore: update readme with shape Signed-off-by: Sami Jaghouar --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 01020d940c2..be4055b0698 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,9 @@ class MyDocument(BaseDocument): description: str image_url: ImageUrl image_tensor: Optional[TorchTensor] - embedding: Optional[TorchTensor[768]] + embedding: Optional[ + TorchTensor[1704, 2272, 3] + ] # This field only work with tensor of shape (1704, 2272, 3) doc = MyDocument( From 58938d82ea255bfbcd0fbb6bb9ff355fb9675949 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Tue, 17 Jan 2023 15:27:11 +0100 Subject: [PATCH 27/29] docs: fix title tuto Signed-off-by: Sami Jaghouar --- docs/tutorials/mutlimodal_training_and_serving.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/mutlimodal_training_and_serving.md b/docs/tutorials/mutlimodal_training_and_serving.md index a9575268db3..eb2a8dfba86 100644 --- a/docs/tutorials/mutlimodal_training_and_serving.md +++ b/docs/tutorials/mutlimodal_training_and_serving.md @@ -30,7 +30,7 @@ reused in both contexts. That part will be very short, but that's the point! So without further ado, let's dive into it! -# 1. Representing: Build and train a PyTorch model +## 1. Representing: Build and train a PyTorch model We will train a [CLIP](https://arxiv.org/abs/2103.00020)-like model on a dataset composes of text-image-pairs. The goal is to obtain a model that is able to understand both text and images and project them into a common embedding space. @@ -343,7 +343,7 @@ Here we can see how we can immediately group the output of each encoder with the And with all that, we've successfully trained a CLIP-like model without ever being confused the meaning of any tensors! -# 2. 
Sending: Serve the model using FastAPI +## 2. Sending: Serve the model using FastAPI Now that we have a trained CLIP model, let's see how we can serve this model with a REST API by reusing most of the code above. From 7daceaf61753eeafbefc2601d1f7126b00b6cbae Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Tue, 17 Jan 2023 15:29:45 +0100 Subject: [PATCH 28/29] chore: reamde fix shape stuff Signed-off-by: Sami Jaghouar --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index be4055b0698..ad1ff7be484 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,9 @@ from typing import Optional class MyDocument(BaseDocument): description: str image_url: ImageUrl - image_tensor: Optional[TorchTensor] - embedding: Optional[ - TorchTensor[1704, 2272, 3] - ] # This field only work with tensor of shape (1704, 2272, 3) + image_tensor: Optional[TorchTensor[1704, 2272, 3]] + # The field above only works with tensors of shape (1704, 2272, 3) + embedding: Optional[TorchTensor] doc = MyDocument( @@ -33,6 +32,9 @@ doc = MyDocument( image_url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", ) doc.image_tensor = doc.image_url.load() # load image tensor from URL +``` + +```python doc.embedding = clip_image_encoder( doc.image_tensor ) # create and store embedding using model of your choice From 7d932fc0029e713162708f28d8fb2b8ab7ba152c Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Tue, 17 Jan 2023 15:31:28 +0100 Subject: [PATCH 29/29] chore: fix python block Signed-off-by: Sami Jaghouar --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ad1ff7be484..568b96649de 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ doc.embedding = clip_image_encoder( ``` ### Compose nested Documents: - +```python from docarray import BaseDocument from docarray.documents import Image, Text import numpy as np @@ -74,7 +74,7 @@ class MultiModalDocument(BaseDocument): 
doc = MultiModalDocument( image_doc=Image(tensor=np.zeros((3, 224, 224))), text_doc=Text(text='hi!') ) - +``` ### Collect multiple `Documents` into a `DocumentArray`: ```python