From 0b7fa701972217ae1e66435ef6baadd727208aa7 Mon Sep 17 00:00:00 2001 From: Alex C-G Date: Mon, 17 Apr 2023 12:27:34 +0200 Subject: [PATCH 1/4] docs(readme): remove alpha notice Signed-off-by: Alex C-G --- docs/README.md | 461 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 461 insertions(+) create mode 100644 docs/README.md diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000000..699a62955f4 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,461 @@ +# DocArray - Version 2 + +DocArray is a library for **representing, sending and storing multi-modal data**, with a focus on applications in **ML** and +**Neural Search**. + +This means that DocArray lets you do the following things: + +## Represent + +```python +from docarray import BaseDocument +from docarray.typing import TorchTensor, ImageUrl +from typing import Optional + + +class MyDocument(BaseDocument): + description: str + image_url: ImageUrl + image_tensor: Optional[TorchTensor[1704, 2272, 3]] + # The field above only work with tensor of shape (1704, 2272, 3) + embedding: Optional[TorchTensor] + + +doc = MyDocument( + description="This is a photo of a mountain", + image_url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", +) +doc.image_tensor = doc.image_url.load() # load image tensor from URL +``` + +```python +doc.embedding = clip_image_encoder( + doc.image_tensor +) # create and store embedding using model of your choice + +print(doc.embedding.shape) +``` + +- **Model** data of any type (audio, video, text, images, 3D meshes, raw tensors, etc) as a single, unified data structure, the `Document` + - A `Document` is a juiced-up [Pydantic Model](https://pydantic-docs.helpmanual.io/usage/models/), inheriting all the benefits, while extending it with ML focussed features + +### Use pre-defined `Document`s for common use cases: + +```python +from docarray.documents import ImageDoc + +doc = ImageDoc( + 
url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", +) +doc.tensor = doc.url.load() # load image tensor from URL +doc.embedding = clip_image_encoder( + doc.tensor +) # create and store embedding using model of your choice +``` +### Compose nested Documents: + +```python +from docarray import BaseDocument +from docarray.documents import ImageDoc, TextDoc +import numpy as np + + +class MultiModalDocument(BaseDocument): + image_doc: ImageDoc + text_doc: TextDoc + + +doc = MultiModalDocument( + image_doc=ImageDoc(tensor=np.zeros((3, 224, 224))), text_doc=TextDoc(text='hi!') +) +``` + +### Collect multiple `Documents` into a `DocumentArray`: +```python +from docarray import DocumentArray, BaseDocument +from docarray.typing import AnyTensor, ImageUrl +import numpy as np + + +class Image(BaseDocument): + url: ImageUrl + tensor: AnyTensor +``` + +```python +from docarray import DocumentArray + +da = DocumentArray[Image]( + [ + Image( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ) + for _ in range(100) + ] +) +``` + + +Access fields at the DocumentArray level: + +```python +print(len(da.tensor)) +print(da.tensor[0].shape) +``` + +You can stack tensors if you want to perform in batch processing: + +```python +da = da.stack() +``` + +```python +print(type(da.tensor)) +print(da.tensor.shape) +``` + +## Send +- **Serialize** any `Document` or `DocumentArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ +- Use in **microservice** architecture: Send over **HTTP** or **gRPC** +- Integrate seamlessly with **FastAPI** and **Jina** + +```python +from docarray.documents import ImageDoc +from httpx import AsyncClient +import numpy as np + +doc = ImageDoc(tensor=np.zeros((3, 224, 224))) + +# JSON over HTTP +async with AsyncClient(app=app, base_url="http://test") as ac: + response = await ac.post("/doc/", data=input_doc.json()) +``` + +```python +# (de)serialize from/to protobuf 
+Image.from_protobuf(doc.to_protobuf()) +``` + +## Store +- Persist and `DocumentArray` using a **`DocumentStore`** +- Store your Documents in any supported (vector) database: **Elasticsearch**, **Qdrant**, **Weaviate**, **Redis**, **Milvus**, **ANNLite** or **SQLite** +- Leverage DocumentStores to **perform vector search on your multi-modal data** + +```python +# NOTE: DocumentStores are not yet implemented in version 2 +from docarray import DocumentArray +from docarray.documents import ImageDoc +from docarray.stores import DocumentStore +import numpy as np + +da = DocumentArray([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) +store = DocumentStore[ImageDoc]( + storage='qdrant' +) # create a DocumentStore with Qdrant as backend +store.insert(da) # insert the DocumentArray into the DocumentStore +# find the 10 most similar images based on the 'embedding' field +match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10) +``` + +If you want to get a deeper understanding of DocArray v2, it is best to do so on the basis of your +use case and background: + +## Coming from DocArray + +If you are already using DocArray, you will be familiar with its [dataclass API](https://docarray.jina.ai/fundamentals/dataclass/). + +_DocArray v2 is that idea, taken seriously._ Every `Document` is created through dataclass-like interface, +courtesy of [Pydantic](https://pydantic-docs.helpmanual.io/usage/models/). + +This gives the following advantages: +- **Flexibility:** No need to conform to a fixed set of fields, your data defines the schema +- **Multi-modality:** Easily store multiple modalities and multiple embeddings in the same Document +- **Language agnostic:** At its core, Documents are just dictionaries. This makes it easy to create and send them from any language, not just Python. + +## Coming from Pydantic + +If you come from Pydantic, you can see Documents as juiced up models, and DocArray as a collection of goodies around them. 
+ +- **ML focussed types**: Tensor, TorchTensor, TFTensor, Embedding, ... +- **Types that are alive**: ImageUrl can `.load()` a URL to image tensor, TextUrl can load and tokenize text documents, etc. +- **Pre-built Documents** for different data modalities: Image, Text, 3DMesh, Video, Audio, ... Note that all of these will be valid Pydantic models! +- The concepts of **DocumentArray and DocumentStore** +- Cloud ready: Serialization to **Protobuf** for use with microservices and **gRPC** +- Support for **vector search functionalities**, such as `find()` and `embed()` + +## Coming from PyTorch + +DocArray can be used directly inside ML models to handle and represent multi-modal data. This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, and provides a (FastAPI compatible) schema that eases the transition between model training and model serving. + +To see the effect of this, let's first observe a vanilla PyTorch implementation of a tri-modal ML model: + +```python +import torch +from torch import nn + + +class MyMultiModalModel(nn.Module): + def __init__(self): + super().__init__() + self.audio_encoder = AudioEncoder() + self.image_encoder = ImageEncoder() + self.text_encoder = TextEncoder() + + def forward(self, text_1, text_2, image_1, image_2, audio_1, audio_2): + embedding_text_1 = self.text_encoder(text_1) + embedding_text_2 = self.text_encoder(text_2) + + embedding_image_1 = self.image_encoder(image_1) + embedding_image_2 = self.image_encoder(image_2) + + embedding_audio_1 = self.image_encoder(audio_1) + embedding_audio_2 = self.image_encoder(audio_2) + + return ( + embedding_text_1, + embedding_text_2, + embedding_image_1, + embedding_image_2, + embedding_audio_1, + embedding_audio_2, + ) +``` + +Not very easy on the eyes if you ask us. 
And even worse, if you need to add one more modality you have to touch every part of your code base, changing the `forward()` return type and make a whole lot of changes downstream from that. + +So now let's see what the same code looks like with DocArray: + +```python +from docarray import DocumentArray, BaseDocument +from docarray.documents import ImageDoc, TextDoc, AudioDoc +from docarray.typing import TorchTensor + +import torch + + +class Podcast(BaseDocument): + text: TextDoc + image: ImageDoc + audio: AudioDoc + + +class PairPodcast(BaseDocument): + left: Podcast + right: Podcast + + +class MyPodcastModel(nn.Module): + def __init__(self): + super().__init__() + self.audio_encoder = AudioEncoder() + self.image_encoder = ImageEncoder() + self.text_encoder = TextEncoder() + + def forward_podcast(self, da: DocumentArray[Podcast]) -> DocumentArray[Podcast]: + da.audio.embedding = self.audio_encoder(da.audio.tensor) + da.text.embedding = self.text_encoder(da.text.tensor) + da.image.embedding = self.image_encoder(da.image.tensor) + + return da + + def forward(self, da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]: + da.left = self.forward_podcast(da.left) + da.right = self.forward_podcast(da.right) + + return da +``` + +Looks much better, doesn't it? +You instantly win in code readability and maintainability. And for the same price you can turn your PyTorch model into a FastAPI app and reuse your Document +schema definition (see below). Everything handles in a pythonic manner by relying on type hints. + + +## Coming from TensorFlow + +Similar to the PyTorch approach, you can also use DocArray with TensorFlow to handle and represent multi-modal data inside your ML model. 
+ +First off, to use DocArray with TensorFlow we first need to install it as follows: +``` +pip install tensorflow==2.11.0 +pip install protobuf==3.19.0 +``` + +Compared to using DocArray with PyTorch, there is one main difference when using it with TensorFlow:\ +While DocArray's `TorchTensor` is a subclass of `torch.Tensor`, this is not the case for the `TensorFlowTensor`: Due to technical limitations on `tf.Tensor`, docarray's `TensorFlowTensor` is not a subclass of `tf.Tensor` but instead stores a `tf.Tensor` in its `.tensor` attribute. + +How does this effect you? Whenever you want to access the tensor data to e.g. do operations with it or hand it to your ML model, instead of handing over your `TensorFlowTensor` instance, you need to access its `.tensor` attribute. + +This would look like the following: + +```python +from typing import Optional + +from docarray import DocumentArray, BaseDocument + +import tensorflow as tf + + +class Podcast(BaseDocument): + audio_tensor: Optional[AudioTensorFlowTensor] + embedding: Optional[AudioTensorFlowTensor] + + +class MyPodcastModel(tf.keras.Model): + def __init__(self): + super().__init__() + self.audio_encoder = AudioEncoder() + + def call(self, inputs: DocumentArray[Podcast]) -> DocumentArray[Podcast]: + inputs.audio_tensor.embedding = self.audio_encoder( + inputs.audio_tensor.tensor + ) # access audio_tensor's .tensor attribute + return inputs +``` + + + +## Coming from FastAPI + +Documents are Pydantic Models (with a twist), and as such they are fully compatible with FastAPI: + +```python +import numpy as np +from fastapi import FastAPI +from httpx import AsyncClient + +from docarray import BaseDocument +from docarray.documents import ImageDoc +from docarray.typing import NdArray +from docarray.base_document import DocumentResponse + + +class InputDoc(BaseDocument): + img: ImageDoc + + +class OutputDoc(BaseDocument): + embedding_clip: NdArray + embedding_bert: NdArray + + +input_doc = 
InputDoc(img=ImageDoc(tensor=np.zeros((3, 224, 224)))) + +app = FastAPI() + + +@app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse) +async def create_item(doc: InputDoc) -> OutputDoc: + ## call my fancy model to generate the embeddings + doc = OutputDoc( + embedding_clip=np.zeros((100, 1)), embedding_bert=np.zeros((100, 1)) + ) + return doc + + +async with AsyncClient(app=app, base_url="http://test") as ac: + response = await ac.post("/doc/", data=input_doc.json()) + resp_doc = await ac.get("/docs") + resp_redoc = await ac.get("/redoc") +``` + +The big advantage here is **first-class support for ML centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. + +This includes handy features such as validating the shape of a tensor: +```python +from docarray import BaseDocument +from docarray.typing import TorchTensor +import torch + + +class MyDoc(BaseDocument): + tensor: TorchTensor[3, 224, 224] + + +doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works +doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping +doc = MyDoc(tensor=torch.zeros(224)) # fails validation + + +class Image(BaseDocument): + tensor: TorchTensor[3, 'x', 'x'] + + +Image(tensor=torch.zeros(3, 224, 224)) # works +Image( + tensor=torch.zeros(3, 64, 128) +) # fails validation because second dimension does not match third +Image( + tensor=torch.zeros(4, 224, 224) +) # fails validation because of the first dimension +Image( + tensor=torch.zeros(3, 64) +) # fails validation because it does not have enough dimensions +``` + +## Coming from a vector database + +If you came across docarray as a universal vector DB client, you can best think of it as **a new kind of ORM for vector databases**. 
+ +DocArray's job is to take multi-modal, nested and domain-specific data and to map it to a vector database, +store it there, and thus make it searchable: + +```python +# NOTE: DocumentStores are not yet implemented in version 2 +from docarray import DocumentArray, BaseDocument +from docarray.stores import DocumentStore +from docarray.documents import ImageDoc, TextDoc +import numpy as np + + +class MyDoc(BaseDocument): + image: ImageDoc + text: TextDoc + description: str + + +def _random_my_doc(): + return MyDoc( + image=ImageDoc(embedding=np.random.random((256,))), + text=TextDoc(embedding=np.random.random((128,))), + description='this is a random document', + ) + + +da = DocumentArray([_random_my_doc() for _ in range(1000)]) # create some data +store = DocumentStore[MyDoc]( + storage='qdrant' +) # create a DocumentStore with Qdrant as backend +store.insert(da) # insert the DocumentArray into the DocumentStore + +# find the 10 most similar images based on the image embedding field +match = store.find( + ImageDoc(embedding=np.zeros((256,))), field='image__embedding', top_k=10 +) +# find the 10 most similar images based on the image embedding field +match = store.find( + ImageDoc(embedding=np.zeros((128,))), field='text__embedding', top_k=10 +) +``` + +## Install the alpha + +to try out the alpha you can install it via git: +```shell +pip install "git+https://github.com/docarray/docarray@2023.01.18.alpha#egg=docarray[common,torch,image]" +``` +or from the latest development branch +```shell +pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarray[common,torch,image]" +``` + +## Further reading + +- [Join our Discord server](https://discord.gg/WaMp6PVPgR) +- [V2 announcement blog post](https://github.com/docarray/notes/blob/main/blog/01-announcement.md) +- [Donation to Linux Foundation AI&Data blog post](https://jina.ai/news/donate-docarray-lf-for-inclusive-standard-multimodal-data-model/) +- [Submit ideas, feature requests, and 
discussions](https://github.com/docarray/docarray/discussions) +- [v2 Documentation](https://docarray-v2--jina-docs.netlify.app/) +- ["Legacy" DocArray github page](https://github.com/docarray/docarray) +- ["Legacy" DocArray documentation](https://docarray.jina.ai/) + From dcda662406fef6ebcaefaa2a37c96fb16909c178 Mon Sep 17 00:00:00 2001 From: Alex C-G Date: Mon, 17 Apr 2023 12:28:13 +0200 Subject: [PATCH 2/4] docs(readme): clearer headings Signed-off-by: Alex C-G --- docs/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/README.md b/docs/README.md index 699a62955f4..3f9f8669ca6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,7 +5,7 @@ DocArray is a library for **representing, sending and storing multi-modal data** This means that DocArray lets you do the following things: -## Represent +## Represent data ```python from docarray import BaseDocument @@ -115,7 +115,7 @@ print(type(da.tensor)) print(da.tensor.shape) ``` -## Send +## Send data - **Serialize** any `Document` or `DocumentArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ - Use in **microservice** architecture: Send over **HTTP** or **gRPC** - Integrate seamlessly with **FastAPI** and **Jina** @@ -137,7 +137,7 @@ async with AsyncClient(app=app, base_url="http://test") as ac: Image.from_protobuf(doc.to_protobuf()) ``` -## Store +## Store data - Persist and `DocumentArray` using a **`DocumentStore`** - Store your Documents in any supported (vector) database: **Elasticsearch**, **Qdrant**, **Weaviate**, **Redis**, **Milvus**, **ANNLite** or **SQLite** - Leverage DocumentStores to **perform vector search on your multi-modal data** From b88ca7d4384f1c452840aede01161fe26d655f5b Mon Sep 17 00:00:00 2001 From: Alex C-G Date: Mon, 17 Apr 2023 12:48:00 +0200 Subject: [PATCH 3/4] docs: first round of fixes for represent-first step and intro Signed-off-by: Alex C-G --- docs/user_guide/intro.md | 13 +++--- docs/user_guide/representing/first_step.md | 51 
++++++++++------------ 2 files changed, 28 insertions(+), 36 deletions(-) diff --git a/docs/user_guide/intro.md b/docs/user_guide/intro.md index 94bb730fdb0..c1dee5acdd4 100644 --- a/docs/user_guide/intro.md +++ b/docs/user_guide/intro.md @@ -4,11 +4,11 @@ This user guide shows you how to use `DocArray` with most of its features. There are three main sections: -- [Representing Data](representing/first_step.md): This section will show you how to use `DocArray` to represent your data. This is a great starting point if you want to better organize the data in your ML models, or if you are looking for a "pydantic for ML". -- [Sending Data](sending/first_step.md): This section will show you how to use `DocArray` to send your data. This is a great starting point if you want to serve your ML model, for example through FastAPI. -- [Storing Data](storing/first_step.md): This section will show you how to use `DocArray` to store your data. This is a great starting point if you are looking for an "ORM for vector databases". +- [Representing data](representing/first_step.md): This section will show you how to represent your data. This is a great starting point if you want to better organize the data in your ML models, or if you are looking for a "Pydantic for ML". +- [Sending data](sending/first_step.md): This section will show you how to send your data. This is a great starting point if you want to serve your ML model, for example through FastAPI. +- [Storing data](storing/first_step.md): This section will show you how to store your data. This is a great starting point if you are looking for an "ORM for vector databases". -You should start by reading the [Representing Data](representing/first_step.md) section, and then the [Sending Data](sending/first_step.md) and [Storing Data](storing/first_step.md) sections can be read in any order. 
+You should start by reading the [Representing data](representing/first_step.md) section, and then the [Sending data](sending/first_step.md) and [Storing data](storing/first_step.md) sections can be read in any order. You will first need to install `DocArray` in your Python environment. @@ -35,7 +35,7 @@ This will install the main dependencies of `DocArray` and will work with all the ``` Depending on your usage you might want to use `DocArray` with only a couple of specific modalities and their dependencies. -For instance, let's say you only want to work with images, you can install `DocArray` using the following command: +For instance, if you only want to work with images, you can install `DocArray` using the following command: ``` pip install "docarray[image]" @@ -46,6 +46,3 @@ pip install "docarray[image]" ``` pip install "docarray[image, audio]" ``` - -!!! warning - This way of installing `DocArray` is only valid starting with version `0.30` diff --git a/docs/user_guide/representing/first_step.md b/docs/user_guide/representing/first_step.md index 08ae1d52f00..3ec5db9e866 100644 --- a/docs/user_guide/representing/first_step.md +++ b/docs/user_guide/representing/first_step.md @@ -3,12 +3,11 @@ At the heart of `DocArray` lies the concept of [`BaseDoc`][docarray.base_doc.doc.BaseDoc]. A [BaseDoc][docarray.base_doc.doc.BaseDoc] is very similar to a [Pydantic](https://docs.pydantic.dev/) -[`BaseModel`](https://docs.Pydantic.dev/usage/models) - in fact it _is_ a specialized Pydantic `BaseModel`. It allows you to define custom `Document` schemas (or `Model` in +[`BaseModel`](https://docs.Pydantic.dev/usage/models) -- in fact it _is_ a specialized Pydantic `BaseModel`. It allows you to define custom `Document` schemas (or `Model`s in the Pydantic world) to represent your data. - !!! note - Naming convention: When we refer to a `BaseDoc` we refer to a class that inherits from [BaseDoc][docarray.base_doc.doc.BaseDoc]. 
+ Naming convention: When we refer to a `BaseDoc`, we refer to a class that inherits from [BaseDoc][docarray.base_doc.doc.BaseDoc]. When we refer to a `Document` we refer to an instance of a `BaseDoc` class. ## Basic `Doc` usage. @@ -16,7 +15,7 @@ the Pydantic world) to represent your data. Before going into detail about what we can do with [BaseDoc][docarray.base_doc.doc.BaseDoc] and how to use it, let's see what it looks like in practice. -The following Python code defines a `BannerDoc` class that can be used to represent the data of a website banner. +The following Python code defines a `BannerDoc` class that can be used to represent the data of a website banner: ```python from docarray import BaseDoc @@ -29,7 +28,7 @@ class BannerDoc(BaseDoc): description: str ``` -You can then instantiate a `BannerDoc` object and access its attributes. +You can then instantiate a `BannerDoc` object and access its attributes: ```python banner = BannerDoc( @@ -45,39 +44,36 @@ assert banner.description == 'This is a banner' ## `BaseDoc` is a Pydantic `BaseModel` -The class [BaseDoc][docarray.base_doc.doc.BaseDoc] inherits from Pydantic [BaseModel](https://docs.pydantic.dev/usage/models). So you can use -all the features of `BaseModel` in your `Doc` class. - -This means that `BaseDoc`: +The [BaseDoc][docarray.base_doc.doc.BaseDoc] class inherits from Pydantic [BaseModel](https://docs.pydantic.dev/usage/models). This means you can use +all the features of `BaseModel` in your `Doc` class. `BaseDoc`: * Will perform data validation: `BaseDoc` will check that the data you pass to it is valid. If not, it will raise an error. Data being "valid" is actually defined by the type used in the type hint itself, but we will come back to this concept later. (TODO add typing section) -* Can be configured using a nested `Config` class, see Pydantic [documentation](https://docs.pydantic.dev/usage/model_config/) for more detail on what kind of config pydantic offers. 
-* Can be used as a drop-in replacement for `BaseModel` in your code and is compatible with tools that use Pydantic like [FastAPI]('https://fastapi.tiangolo.com/'). +* Can be configured using a nested `Config` class, see Pydantic [documentation](https://docs.pydantic.dev/usage/model_config/) for more detail on what kind of config Pydantic offers. +* Can be used as a drop-in replacement for `BaseModel` in your code and is compatible with tools that use Pydantic, like [FastAPI]('https://fastapi.tiangolo.com/'). -### What is the difference with Pydantic `BaseModel`? (INCOMPLETE) +### How is `BaseDoc` different to Pydantic's `BaseModel`? (INCOMPLETE) LINK TO THE VERSUS (not ready) -[BaseDoc][docarray.base_doc.doc.BaseDoc] is not only a [BaseModel](https://docs.pydantic.dev/usage/models), +[BaseDoc][docarray.base_doc.doc.BaseDoc] is not just a [BaseModel](https://docs.pydantic.dev/usage/models): -* You can use it with DocArray [Typed](docarray.typing) that are oriented toward MultiModal (image, audio, ...) data and for +* You can use it with DocArray [Typed](docarray.typing) that are oriented toward Multimodal (image, audio, etc) data and for Machine Learning use case TODO link the type section. +* [BaseDoc][docarray.base_doc.doc.BaseDoc] has an `id` field (generated by default) to uniquely identify a Document. -Another difference is that [BaseDoc][docarray.base_doc.doc.BaseDoc] has an `id` field that is generated by default that is used to uniquely identify a Document. - -## `BaseDoc` allows representing multimodal and nested data +## Representing multimodal and nested data Let's say you want to represent a YouTube video in your application, perhaps to build a search system for YouTube videos. A YouTube video is not only composed of a video, but also has a title, description, thumbnail (and more, but let's keep it simple). 
-All of these elements are from different [`modalities`](../../data_types/first_steps.md): the title and description are text, the thumbnail is an image, and the video in itself is, well, a video. +All of these elements are from different [`modalities`](../../data_types/first_steps.md): the title and description are text, the thumbnail is an image, and the video itself is, well, a video. -DocArray allows to represent all of this multimodal data in a single object. +DocArray lets you represent all of this multimodal data in a single object. -Let's first create an `BaseDoc` for each of the elements that compose the YouTube video. +Let's first create a `BaseDoc` for each of the elements that compose the YouTube video. -First for the thumbnail which is an image: +First for the thumbnail image: ```python from docarray import BaseDoc @@ -105,7 +101,7 @@ class VideoDoc(BaseDoc): ) ``` -Then for the title and description (which are text) we will just use a `str` type. +Then for the title and description (which are text) we'll just use a `str` type. All the elements that compose a YouTube video are ready: @@ -120,21 +116,20 @@ class YouTubeVideoDoc(BaseDoc): video: VideoDoc ``` -You now have `YouTubeVideoDoc` which is a pythonic representation of a YouTube video. +We now have `YouTubeVideoDoc` which is a pythonic representation of a YouTube video. -This representation can now be used to [send](../sending/first_step.md) or to [store](../storing/first_step.md) data. You can even use it directly to [train a machine learning](../../how_to/multimodal_training_and_serving.md) [Pytorch](https://pytorch.org/docs/stable/index.html) model on this representation. +This representation can be used to [send](../sending/first_step.md) or [store](../storing/first_step.md) data. You can even use it directly to [train a machine learning](../../how_to/multimodal_training_and_serving.md) [Pytorch](https://pytorch.org/docs/stable/index.html) model on this representation. !!! 
note - You see here that `ImageDoc` and `VideoDoc` are also [BaseDoc][docarray.base_doc.doc.BaseDoc], and they later used inside another [BaseDoc][docarray.base_doc.doc.BaseDoc]`. + You see here that `ImageDoc` and `VideoDoc` are also [BaseDoc][docarray.base_doc.doc.BaseDoc], and they are later used inside another [BaseDoc][docarray.base_doc.doc.BaseDoc]`. This is what we call nested data representation. [BaseDoc][docarray.base_doc.doc.BaseDoc] can be nested to represent any kind of data hierarchy. See also: -* The [next section](./array.md) of the representing section -* API Reference for the [BaseDoc][docarray.base_doc.doc.BaseDoc] class +* The [next part](./array.md) of the representing section +* API reference for the [BaseDoc][docarray.base_doc.doc.BaseDoc] class * The [Storing](../storing/first_step.md) section on how to store your data * The [Sending](../sending/first_step.md) section on how to send your data - From a1d859167757033aa4c79e9f8bfb4f934da3c210 Mon Sep 17 00:00:00 2001 From: Alex C-G Date: Mon, 17 Apr 2023 12:52:40 +0200 Subject: [PATCH 4/4] docs: rm generated readme Signed-off-by: Alex C-G --- docs/README.md | 461 ------------------------------------------------- 1 file changed, 461 deletions(-) delete mode 100644 docs/README.md diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 3f9f8669ca6..00000000000 --- a/docs/README.md +++ /dev/null @@ -1,461 +0,0 @@ -# DocArray - Version 2 - -DocArray is a library for **representing, sending and storing multi-modal data**, with a focus on applications in **ML** and -**Neural Search**. 
- -This means that DocArray lets you do the following things: - -## Represent data - -```python -from docarray import BaseDocument -from docarray.typing import TorchTensor, ImageUrl -from typing import Optional - - -class MyDocument(BaseDocument): - description: str - image_url: ImageUrl - image_tensor: Optional[TorchTensor[1704, 2272, 3]] - # The field above only work with tensor of shape (1704, 2272, 3) - embedding: Optional[TorchTensor] - - -doc = MyDocument( - description="This is a photo of a mountain", - image_url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", -) -doc.image_tensor = doc.image_url.load() # load image tensor from URL -``` - -```python -doc.embedding = clip_image_encoder( - doc.image_tensor -) # create and store embedding using model of your choice - -print(doc.embedding.shape) -``` - -- **Model** data of any type (audio, video, text, images, 3D meshes, raw tensors, etc) as a single, unified data structure, the `Document` - - A `Document` is a juiced-up [Pydantic Model](https://pydantic-docs.helpmanual.io/usage/models/), inheriting all the benefits, while extending it with ML focussed features - -### Use pre-defined `Document`s for common use cases: - -```python -from docarray.documents import ImageDoc - -doc = ImageDoc( - url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", -) -doc.tensor = doc.url.load() # load image tensor from URL -doc.embedding = clip_image_encoder( - doc.tensor -) # create and store embedding using model of your choice -``` -### Compose nested Documents: - -```python -from docarray import BaseDocument -from docarray.documents import ImageDoc, TextDoc -import numpy as np - - -class MultiModalDocument(BaseDocument): - image_doc: ImageDoc - text_doc: TextDoc - - -doc = MultiModalDocument( - image_doc=ImageDoc(tensor=np.zeros((3, 224, 224))), text_doc=TextDoc(text='hi!') -) -``` - -### Collect multiple `Documents` into a `DocumentArray`: -```python -from docarray import DocumentArray, 
BaseDocument -from docarray.typing import AnyTensor, ImageUrl -import numpy as np - - -class Image(BaseDocument): - url: ImageUrl - tensor: AnyTensor -``` - -```python -from docarray import DocumentArray - -da = DocumentArray[Image]( - [ - Image( - url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", - tensor=np.zeros((3, 224, 224)), - ) - for _ in range(100) - ] -) -``` - - -Access fields at the DocumentArray level: - -```python -print(len(da.tensor)) -print(da.tensor[0].shape) -``` - -You can stack tensors if you want to perform in batch processing: - -```python -da = da.stack() -``` - -```python -print(type(da.tensor)) -print(da.tensor.shape) -``` - -## Send data -- **Serialize** any `Document` or `DocumentArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ -- Use in **microservice** architecture: Send over **HTTP** or **gRPC** -- Integrate seamlessly with **FastAPI** and **Jina** - -```python -from docarray.documents import ImageDoc -from httpx import AsyncClient -import numpy as np - -doc = ImageDoc(tensor=np.zeros((3, 224, 224))) - -# JSON over HTTP -async with AsyncClient(app=app, base_url="http://test") as ac: - response = await ac.post("/doc/", data=input_doc.json()) -``` - -```python -# (de)serialize from/to protobuf -Image.from_protobuf(doc.to_protobuf()) -``` - -## Store data -- Persist and `DocumentArray` using a **`DocumentStore`** -- Store your Documents in any supported (vector) database: **Elasticsearch**, **Qdrant**, **Weaviate**, **Redis**, **Milvus**, **ANNLite** or **SQLite** -- Leverage DocumentStores to **perform vector search on your multi-modal data** - -```python -# NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocumentArray -from docarray.documents import ImageDoc -from docarray.stores import DocumentStore -import numpy as np - -da = DocumentArray([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) -store = DocumentStore[ImageDoc]( - storage='qdrant' -) # create a 
DocumentStore with Qdrant as backend -store.insert(da) # insert the DocumentArray into the DocumentStore -# find the 10 most similar images based on the 'embedding' field -match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10) -``` - -If you want to get a deeper understanding of DocArray v2, it is best to do so on the basis of your -use case and background: - -## Coming from DocArray - -If you are already using DocArray, you will be familiar with its [dataclass API](https://docarray.jina.ai/fundamentals/dataclass/). - -_DocArray v2 is that idea, taken seriously._ Every `Document` is created through a dataclass-like interface, -courtesy of [Pydantic](https://pydantic-docs.helpmanual.io/usage/models/). - -This gives the following advantages: -- **Flexibility:** No need to conform to a fixed set of fields, your data defines the schema -- **Multi-modality:** Easily store multiple modalities and multiple embeddings in the same Document -- **Language agnostic:** At its core, Documents are just dictionaries. This makes it easy to create and send them from any language, not just Python. - -## Coming from Pydantic - -If you come from Pydantic, you can see Documents as juiced-up models, and DocArray as a collection of goodies around them. - -- **ML focussed types**: Tensor, TorchTensor, TFTensor, Embedding, ... -- **Types that are alive**: ImageUrl can `.load()` a URL to image tensor, TextUrl can load and tokenize text documents, etc. -- **Pre-built Documents** for different data modalities: Image, Text, 3DMesh, Video, Audio, ... Note that all of these will be valid Pydantic models! -- The concepts of **DocumentArray and DocumentStore** -- Cloud ready: Serialization to **Protobuf** for use with microservices and **gRPC** -- Support for **vector search functionalities**, such as `find()` and `embed()` - -## Coming from PyTorch - -DocArray can be used directly inside ML models to handle and represent multi-modal data. 
This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, and provides a (FastAPI compatible) schema that eases the transition between model training and model serving. - -To see the effect of this, let's first observe a vanilla PyTorch implementation of a tri-modal ML model: - -```python -import torch -from torch import nn - - -class MyMultiModalModel(nn.Module): - def __init__(self): - super().__init__() - self.audio_encoder = AudioEncoder() - self.image_encoder = ImageEncoder() - self.text_encoder = TextEncoder() - - def forward(self, text_1, text_2, image_1, image_2, audio_1, audio_2): - embedding_text_1 = self.text_encoder(text_1) - embedding_text_2 = self.text_encoder(text_2) - - embedding_image_1 = self.image_encoder(image_1) - embedding_image_2 = self.image_encoder(image_2) - - embedding_audio_1 = self.audio_encoder(audio_1) - embedding_audio_2 = self.audio_encoder(audio_2) - - return ( - embedding_text_1, - embedding_text_2, - embedding_image_1, - embedding_image_2, - embedding_audio_1, - embedding_audio_2, - ) -``` - -Not very easy on the eyes if you ask us. And even worse, if you need to add one more modality you have to touch every part of your code base, changing the `forward()` return type and making a whole lot of changes downstream from that. 
- -So now let's see what the same code looks like with DocArray: - -```python -from docarray import DocumentArray, BaseDocument -from docarray.documents import ImageDoc, TextDoc, AudioDoc -from docarray.typing import TorchTensor - -import torch -from torch import nn - - -class Podcast(BaseDocument): - text: TextDoc - image: ImageDoc - audio: AudioDoc - - -class PairPodcast(BaseDocument): - left: Podcast - right: Podcast - - -class MyPodcastModel(nn.Module): - def __init__(self): - super().__init__() - self.audio_encoder = AudioEncoder() - self.image_encoder = ImageEncoder() - self.text_encoder = TextEncoder() - - def forward_podcast(self, da: DocumentArray[Podcast]) -> DocumentArray[Podcast]: - da.audio.embedding = self.audio_encoder(da.audio.tensor) - da.text.embedding = self.text_encoder(da.text.tensor) - da.image.embedding = self.image_encoder(da.image.tensor) - - return da - - def forward(self, da: DocumentArray[PairPodcast]) -> DocumentArray[PairPodcast]: - da.left = self.forward_podcast(da.left) - da.right = self.forward_podcast(da.right) - - return da -``` - -Looks much better, doesn't it? -You instantly win in code readability and maintainability. And for the same price you can turn your PyTorch model into a FastAPI app and reuse your Document -schema definition (see below). Everything is handled in a Pythonic manner by relying on type hints. - - -## Coming from TensorFlow - -Similar to the PyTorch approach, you can also use DocArray with TensorFlow to handle and represent multi-modal data inside your ML model. 
- -First off, to use DocArray with TensorFlow we first need to install it as follows: -``` -pip install tensorflow==2.11.0 -pip install protobuf==3.19.0 -``` - -Compared to using DocArray with PyTorch, there is one main difference when using it with TensorFlow:\ -While DocArray's `TorchTensor` is a subclass of `torch.Tensor`, this is not the case for the `TensorFlowTensor`: Due to technical limitations on `tf.Tensor`, docarray's `TensorFlowTensor` is not a subclass of `tf.Tensor` but instead stores a `tf.Tensor` in its `.tensor` attribute. - -How does this affect you? Whenever you want to access the tensor data to e.g. do operations with it or hand it to your ML model, instead of handing over your `TensorFlowTensor` instance, you need to access its `.tensor` attribute. - -This would look like the following: - -```python -from typing import Optional - -from docarray import DocumentArray, BaseDocument -from docarray.typing import AudioTensorFlowTensor - -import tensorflow as tf - - -class Podcast(BaseDocument): - audio_tensor: Optional[AudioTensorFlowTensor] - embedding: Optional[AudioTensorFlowTensor] - - -class MyPodcastModel(tf.keras.Model): - def __init__(self): - super().__init__() - self.audio_encoder = AudioEncoder() - - def call(self, inputs: DocumentArray[Podcast]) -> DocumentArray[Podcast]: - inputs.embedding = self.audio_encoder( - inputs.audio_tensor.tensor - ) # access audio_tensor's .tensor attribute - return inputs -``` - - - -## Coming from FastAPI - -Documents are Pydantic Models (with a twist), and as such they are fully compatible with FastAPI: - -```python -import numpy as np -from fastapi import FastAPI -from httpx import AsyncClient - -from docarray import BaseDocument -from docarray.documents import ImageDoc -from docarray.typing import NdArray -from docarray.base_document import DocumentResponse - - -class InputDoc(BaseDocument): - img: ImageDoc - - -class OutputDoc(BaseDocument): - embedding_clip: NdArray - embedding_bert: NdArray - - -input_doc = 
InputDoc(img=ImageDoc(tensor=np.zeros((3, 224, 224)))) - -app = FastAPI() - - -@app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse) -async def create_item(doc: InputDoc) -> OutputDoc: - ## call my fancy model to generate the embeddings - doc = OutputDoc( - embedding_clip=np.zeros((100, 1)), embedding_bert=np.zeros((100, 1)) - ) - return doc - - -async with AsyncClient(app=app, base_url="http://test") as ac: - response = await ac.post("/doc/", data=input_doc.json()) - resp_doc = await ac.get("/docs") - resp_redoc = await ac.get("/redoc") -``` - -The big advantage here is **first-class support for ML centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. - -This includes handy features such as validating the shape of a tensor: -```python -from docarray import BaseDocument -from docarray.typing import TorchTensor -import torch - - -class MyDoc(BaseDocument): - tensor: TorchTensor[3, 224, 224] - - -doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works -doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping -doc = MyDoc(tensor=torch.zeros(224)) # fails validation - - -class Image(BaseDocument): - tensor: TorchTensor[3, 'x', 'x'] - - -Image(tensor=torch.zeros(3, 224, 224)) # works -Image( - tensor=torch.zeros(3, 64, 128) -) # fails validation because second dimension does not match third -Image( - tensor=torch.zeros(4, 224, 224) -) # fails validation because of the first dimension -Image( - tensor=torch.zeros(3, 64) -) # fails validation because it does not have enough dimensions -``` - -## Coming from a vector database - -If you came across docarray as a universal vector DB client, you can best think of it as **a new kind of ORM for vector databases**. 
- -DocArray's job is to take multi-modal, nested and domain-specific data and to map it to a vector database, -store it there, and thus make it searchable: - -```python -# NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocumentArray, BaseDocument -from docarray.stores import DocumentStore -from docarray.documents import ImageDoc, TextDoc -import numpy as np - - -class MyDoc(BaseDocument): - image: ImageDoc - text: TextDoc - description: str - - -def _random_my_doc(): - return MyDoc( - image=ImageDoc(embedding=np.random.random((256,))), - text=TextDoc(embedding=np.random.random((128,))), - description='this is a random document', - ) - - -da = DocumentArray([_random_my_doc() for _ in range(1000)]) # create some data -store = DocumentStore[MyDoc]( - storage='qdrant' -) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocumentArray into the DocumentStore - -# find the 10 most similar images based on the image embedding field -match = store.find( - ImageDoc(embedding=np.zeros((256,))), field='image__embedding', top_k=10 -) -# find the 10 most similar texts based on the text embedding field -match = store.find( - TextDoc(embedding=np.zeros((128,))), field='text__embedding', top_k=10 -) -``` - -## Install the alpha - -To try out the alpha, you can install it via Git: -```shell -pip install "git+https://github.com/docarray/docarray@2023.01.18.alpha#egg=docarray[common,torch,image]" -``` -or from the latest development branch -```shell -pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarray[common,torch,image]" -``` - -## Further reading - -- [Join our Discord server](https://discord.gg/WaMp6PVPgR) -- [V2 announcement blog post](https://github.com/docarray/notes/blob/main/blog/01-announcement.md) -- [Donation to Linux Foundation AI&Data blog post](https://jina.ai/news/donate-docarray-lf-for-inclusive-standard-multimodal-data-model/) -- [Submit ideas, feature requests, and 
discussions](https://github.com/docarray/docarray/discussions) -- [v2 Documentation](https://docarray-v2--jina-docs.netlify.app/) -- ["Legacy" DocArray github page](https://github.com/docarray/docarray) -- ["Legacy" DocArray documentation](https://docarray.jina.ai/) -