From 749693f08966c2081c9185051b4f0f6bbb47d256 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 15:36:25 +0200 Subject: [PATCH 01/18] doc: rewrite represent section in readme Signed-off-by: Johannes Messner --- README.md | 172 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 131 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index af497cb5b3e..2bd9bf17fdc 100644 --- a/README.md +++ b/README.md @@ -1,65 +1,98 @@ -# DocArray - Version 2 +

+DocArray logo: The data structure for unstructured data +
+The data structure for multimodal data +

+ +

+PyPI +Codecov branch + +PyPI - Downloads from official pypistats + +

> **Note** > This introduction refers to version 2 of DocArray, a rewrite that is currently at the alpha stage. > Not all features that are mentioned here are implemented yet. > If you are looking for the version 2 implementation roadmap see [here](https://github.com/docarray/docarray/issues/780), > for the (already released) version 1 of DocArray -> see [here](https://github.com/docarray/docarray)._ +> see [here](https://github.com/docarray/docarray). -DocArray is a library for **representing, sending and storing multi-modal data**, with a focus on applications in **ML** and -**Neural Search**. +DocArray is a library for **representing, sending and storing multi-modal data**, perfect for **Machine Learning applications**. -This means that DocArray lets you do the following things: +DocArray handles your data while integrating seamlessly with the rest of your **Python and ML ecosystem**: + +- DocArray has native compatibility for **NumPy**, **PyTorch** and **TensorFlow**, including for **model training use cases** +- DocArray is built on **Pydantic** and out-of-the-box compatible with **FastAPI** +- DocArray can store data in vector databases such as **Weaviate, Qdrant, ElasticSearch** as well as **HNSWLib** +- DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** + +With that said, let's dig into the three pillars of DocArray: +1. [Represent](#represent) +2. [Send](#send) +3. [Store](#store) + +> :bulb: **Where are you coming from?**: Depending on your use case and background, there are different was to "get" DocArray. +> You can navigate to the following section for an explanation that should fit your mindest: +> - [Coming from pure PyTorch or TensorFlow](#coming-from-torch-tf) +> - [Coming from Pydantic](#coming-from-pydantic) +> - [Coming from FastAPI](#coming-from-fastapi) +> - [Coming from a vector database](#coming-from-vector-database) ## Represent +DocArray allows you to **represent your data**, in a ML-native way. 
+This is useful for different use cases: +- You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them +- You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints +- You are **parsing data** for later use in your ML or DS applications + +> :bulb: **Coming from Pydantic?**: If you're currently using Pydantic for the use cases above, you should be happy to hear +> that DocArray is built on top of, and fully compatible with, Pydantic! +> Also, we have [dedicated section](#coming-from-pydantic) just for you! + +So let's see how you can represent your data with DocArray: + ```python from docarray import BaseDoc from docarray.typing import TorchTensor, ImageUrl from typing import Optional +# Define your data model class MyDocument(BaseDoc): description: str - image_url: ImageUrl - image_tensor: Optional[TorchTensor[1704, 2272, 3]] - # The field above only work with tensor of shape (1704, 2272, 3) + image_url: ImageUrl # could also be VideoUrl, AudioUrl, etc. + image_tensor: Optional[ + TorchTensor[1704, 2272, 3] + ] # could also be NdArray of TensorflowTensor embedding: Optional[TorchTensor] +``` + +So not only can you define the types of your data, you can even **specify the shape of your tensors!** +Once you have your model in form of a `Document`, you can work with it! 
+```python +# Create a document doc = MyDocument( description="This is a photo of a mountain", image_url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", ) -doc.image_tensor = doc.image_url.load() # load image tensor from URL -``` -```python -doc.embedding = clip_image_encoder( - doc.image_tensor -) # create and store embedding using model of your choice +# Load image tensor from URL +doc.image_tensor = doc.image_url.load() + +# Compute embedding with any model of your choice +doc.embedding = clip_image_encoder(doc.image_tensor) print(doc.embedding.shape) ``` -- **Model** data of any type (audio, video, text, images, 3D meshes, raw tensors, etc) as a Document, a single, unified data structure. - - A `Document` is a juiced-up [Pydantic Model](https://pydantic-docs.helpmanual.io/usage/models/), inheriting all the benefits, while extending it with ML focused features. +### Compose nested Documents -### Use pre-defined `Document`s for common use cases: - -```python -from docarray.documents import ImageDoc - -doc = ImageDoc( - url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", -) -doc.tensor = doc.url.load() # load image tensor from URL -doc.embedding = clip_image_encoder( - doc.tensor -) # create and store embedding using model of your choice -``` -### Compose nested Documents: +Of course you can compose Documents into a nested structure: ```python from docarray import BaseDoc @@ -77,23 +110,59 @@ doc = MultiModalDocument( ) ``` -### Collect multiple `Documents` into a `DocList`: +Of course, you rarely work with a single data point at a time, especially in Machine Learning applications. + +That's why you can easily collect multiple `Documents`: + +### Collect multiple `Documents` + +When building or interacting with an ML system, usually you want to process multiple Documents (data points) at once. + +DocArray offers two data structures for this: +- **`DocVec`**: A vector of `Documents`. 
All tensors in the `Documents` are stacked up into a single tensor. Perfect for batch processing and use inside of ML models. +- **`DocList`**: A list of `Documents`. All tensors in the `Documents` are kept as-is. Perfect for streaming, re-ranking, and shuffling of data. + +Let's take a look at them, starting with `DocVec`: ```python -from docarray import DocList, BaseDoc +from docarray import DocVec, BaseDoc from docarray.typing import AnyTensor, ImageUrl import numpy as np class Image(BaseDoc): url: ImageUrl - tensor: AnyTensor + tensor: AnyTensor # this allows torch, numpy, and tensorflow tensors + + +vec = DocVec[Image]( # the DocVec is parametrized by your personal schema! + [ + Image( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ) + for _ in range(100) + ] +) ``` +As you can see in the code snippet above, `DocVec` is **parametrized by the type of Document** you want to use with it: `DocVec[Image]`. + +This may look slightly weird at first, but we're confident that you'll get used to it quickly! +Besides, it allows us to do some cool things, like giving you **bulk access to the fields that you defined** in your `Document`: + +```python +tensor = vec.tensor # gets all the tensors in the DocVec +print(tensor.shape) # which are stacked up into a single tensor! +print(vec.url) # you can bulk access any other field, too +``` + +The second data structure, `DocList`, works in a similar way: + ```python from docarray import DocList -da = DocList[Image]( +dl = DocList[Image]( # the DocList is parametrized by your personal schema! 
[ Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", @@ -104,25 +173,46 @@ da = DocList[Image]( ) ``` -Access fields at the DocArray level: +You can still bulk access the fields of your `Document`: ```python -print(len(da.tensor)) -print(da.tensor[0].shape) +tensors = dl.tensor # gets all the tensors in the DocVec +print(type(tensors)) # as a list of tensors +print(dl.url) # you can bulk access any other field, too ``` -You can stack tensors if you want to perform in batch processing: +And you can insert, remove, and append `Documents` to your `DocList`: ```python -da = da.stack() +dl.append( + Image( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ) +) +del dl[0] +dl.insert( + 0, + Image( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ), +) ``` +And you can seamlessly switch between `DocVec` and `DocList`: + ```python -print(type(da.tensor)) -print(da.tensor.shape) +vec_2 = dl.unstack() +assert isinstance(vec_2, DocVec) + +dl_2 = vec_2.stack() +assert isinstance(dl_2, DocList) ``` + ## Send + - **Serialize** any `Document` or `DocArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ - Use in **microservice** architecture: Send over **HTTP** or **gRPC** - Integrate seamlessly with **[FastAPI](https://github.com/tiangolo/fastapi/)** and **[Jina](https://github.com/jina-ai/jina/)** From 435075b0386c7e0935ed64c38a2c08be2fc96bf9 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 15:48:30 +0200 Subject: [PATCH 02/18] docs: emojify Signed-off-by: Johannes Messner --- README.md | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 2bd9bf17fdc..deed27c99c4 100644 --- a/README.md +++ b/README.md @@ -12,21 +12,14 @@

-> **Note** -> This introduction refers to version 2 of DocArray, a rewrite that is currently at the alpha stage. -> Not all features that are mentioned here are implemented yet. -> If you are looking for the version 2 implementation roadmap see [here](https://github.com/docarray/docarray/issues/780), -> for the (already released) version 1 of DocArray -> see [here](https://github.com/docarray/docarray). - DocArray is a library for **representing, sending and storing multi-modal data**, perfect for **Machine Learning applications**. DocArray handles your data while integrating seamlessly with the rest of your **Python and ML ecosystem**: -- DocArray has native compatibility for **NumPy**, **PyTorch** and **TensorFlow**, including for **model training use cases** -- DocArray is built on **Pydantic** and out-of-the-box compatible with **FastAPI** -- DocArray can store data in vector databases such as **Weaviate, Qdrant, ElasticSearch** as well as **HNSWLib** -- DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** +- :fire: DocArray has native compatibility for **NumPy**, **PyTorch** and **TensorFlow**, including for **model training use cases** +- :zap: DocArray is built on **Pydantic** and out-of-the-box compatible with **FastAPI** +- :package: DocArray can store data in vector databases such as **Weaviate, Qdrant, ElasticSearch** as well as **HNSWLib** +- :chains: DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** With that said, let's dig into the three pillars of DocArray: 1. [Represent](#represent) @@ -44,9 +37,9 @@ With that said, let's dig into the three pillars of DocArray: DocArray allows you to **represent your data**, in a ML-native way. 
This is useful for different use cases: -- You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them -- You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints -- You are **parsing data** for later use in your ML or DS applications +- :running_woman: You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them +- :cloud: You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints +- :card_index_dividers: You are **parsing data** for later use in your ML or DS applications > :bulb: **Coming from Pydantic?**: If you're currently using Pydantic for the use cases above, you should be happy to hear > that DocArray is built on top of, and fully compatible with, Pydantic! @@ -66,7 +59,7 @@ class MyDocument(BaseDoc): image_url: ImageUrl # could also be VideoUrl, AudioUrl, etc. image_tensor: Optional[ TorchTensor[1704, 2272, 3] - ] # could also be NdArray of TensorflowTensor + ] # could also be NdArray or TensorflowTensor embedding: Optional[TorchTensor] ``` @@ -119,8 +112,8 @@ That's why you can easily collect multiple `Documents`: When building or interacting with an ML system, usually you want to process multiple Documents (data points) at once. DocArray offers two data structures for this: -- **`DocVec`**: A vector of `Documents`. All tensors in the `Documents` are stacked up into a single tensor. Perfect for batch processing and use inside of ML models. -- **`DocList`**: A list of `Documents`. All tensors in the `Documents` are kept as-is. Perfect for streaming, re-ranking, and shuffling of data. +- **`DocVec`**: A vector of `Documents`. All tensors in the `Documents` are stacked up into a single tensor. 
**Perfect for batch processing and use inside of ML models**. +- **`DocList`**: A list of `Documents`. All tensors in the `Documents` are kept as-is. **Perfect for streaming, re-ranking, and shuffling of data**. Let's take a look at them, starting with `DocVec`: @@ -184,13 +177,16 @@ print(dl.url) # you can bulk access any other field, too And you can insert, remove, and append `Documents` to your `DocList`: ```python +# append dl.append( Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", tensor=np.zeros((3, 224, 224)), ) ) +# delete del dl[0] +# insert dl.insert( 0, Image( From 812bb541bdb07b790b391285df2399874e32d8fe Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 15:50:25 +0200 Subject: [PATCH 03/18] docs: make collapsible Signed-off-by: Johannes Messner --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index deed27c99c4..d407ce785bd 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,10 @@ With that said, let's dig into the three pillars of DocArray: > - [Coming from FastAPI](#coming-from-fastapi) > - [Coming from a vector database](#coming-from-vector-database) + +
+ Click me + ## Represent DocArray allows you to **represent your data**, in a ML-native way. @@ -206,6 +210,7 @@ dl_2 = vec_2.stack() assert isinstance(dl_2, DocList) ``` +
## Send From c496f3db6633898aecdf8237d5a821f0cd751842 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 15:51:53 +0200 Subject: [PATCH 04/18] docs: fix collapsible Signed-off-by: Johannes Messner --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d407ce785bd..f9e8a7262e9 100644 --- a/README.md +++ b/README.md @@ -34,11 +34,11 @@ With that said, let's dig into the three pillars of DocArray: > - [Coming from a vector database](#coming-from-vector-database) -
- Click me - ## Represent +
+ Click to expand + DocArray allows you to **represent your data**, in a ML-native way. This is useful for different use cases: - :running_woman: You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them From 832c32dc331c549d01c3f5e213648df950423173 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 16:00:10 +0200 Subject: [PATCH 05/18] docs: moving stuff around Signed-off-by: Johannes Messner --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f9e8a7262e9..5a3bd30d4aa 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,6 @@ With that said, let's dig into the three pillars of DocArray: ## Represent -
- Click to expand - DocArray allows you to **represent your data**, in a ML-native way. This is useful for different use cases: - :running_woman: You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them @@ -49,7 +46,62 @@ This is useful for different use cases: > that DocArray is built on top of, and fully compatible with, Pydantic! > Also, we have [dedicated section](#coming-from-pydantic) just for you! -So let's see how you can represent your data with DocArray: +Put simply, DocArray lets you represent your data in a dataclass-like way, with ML as a first class citizen: + +```python +from docarray import BaseDoc +from docarray.typing import TorchTensor, ImageUrl + +# Define your data model +class MyDocument(BaseDoc): + description: str + image_url: ImageUrl # could also be VideoUrl, AudioUrl, etc. + image_tensor: TorchTensor[1704, 2272, 3] # you can express tensor shapes! + + +# Stack multiple documents, column-wise +from docarray import DocVec + +vec = DocVec[MyDocument]( + [ + MyDocument( + description="A cat", + image_url="https://example.com/cat.jpg", + image_tensor=torch.rand(1704, 2272, 3), + ), + MyDocument( + description="A dog", + image_url="https://example.com/dog.jpg", + image_tensor=torch.rand(1704, 2272, 3), + ), + ] +) +print(vec.image_tensor) + +# Or treat them like a list, row wise +from docarray import DocList + +dl = DocList[MyDocument]( + [ + MyDocument( + description="A cat", + image_url="https://example.com/cat.jpg", + image_tensor=torch.rand(1704, 2272, 3), + ), + MyDocument( + description="A dog", + image_url="https://example.com/dog.jpg", + image_tensor=torch.rand(1704, 2272, 3), + ), + ] +) +print(dl.image_tensor) +``` + +
+ Click for more details + +So let's take a closer look at how you can represent your data with DocArray: ```python from docarray import BaseDoc From e5c7ab6796ddfaf033db4d9066c46f2810393f54 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 16:00:17 +0200 Subject: [PATCH 06/18] docs: moving stuff around Signed-off-by: Johannes Messner --- README.md | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 5a3bd30d4aa..36f9115fa9c 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ class MyDocument(BaseDoc): image_tensor: TorchTensor[1704, 2272, 3] # you can express tensor shapes! -# Stack multiple documents, column-wise +# Stack multiple documents from docarray import DocVec vec = DocVec[MyDocument]( @@ -69,33 +69,10 @@ vec = DocVec[MyDocument]( image_url="https://example.com/cat.jpg", image_tensor=torch.rand(1704, 2272, 3), ), - MyDocument( - description="A dog", - image_url="https://example.com/dog.jpg", - image_tensor=torch.rand(1704, 2272, 3), - ), ] + * 1000 ) print(vec.image_tensor) - -# Or treat them like a list, row wise -from docarray import DocList - -dl = DocList[MyDocument]( - [ - MyDocument( - description="A cat", - image_url="https://example.com/cat.jpg", - image_tensor=torch.rand(1704, 2272, 3), - ), - MyDocument( - description="A dog", - image_url="https://example.com/dog.jpg", - image_tensor=torch.rand(1704, 2272, 3), - ), - ] -) -print(dl.image_tensor) ```
From 426d6ab85bc6dcf95548c6b4d5c41e7359093407 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 16:02:12 +0200 Subject: [PATCH 07/18] docs: small tweaks Signed-off-by: Johannes Messner --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 36f9115fa9c..b81d815f777 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,11 @@ vec = DocVec[MyDocument]( ] * 1000 ) -print(vec.image_tensor) +print(vec.image_tensor.shape) # (1000, 1704, 2272, 3) ```
- Click for more details + **Click for more details** So let's take a closer look at how you can represent your data with DocArray: From e26af0f5ff5d15ae9c660fde794fd2c105bc40e4 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 13:34:06 +0200 Subject: [PATCH 08/18] docs: readme section for send and store Signed-off-by: Johannes Messner --- README.md | 150 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index b81d815f777..c8ea1d2a2b8 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,8 @@ With that said, let's dig into the three pillars of DocArray: ## Represent -DocArray allows you to **represent your data**, in a ML-native way. +DocArray allows you to **represent your data**, in an ML-native way. + This is useful for different use cases: - :running_woman: You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them - :cloud: You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints @@ -76,7 +77,7 @@ print(vec.image_tensor.shape) # (1000, 1704, 2272, 3) ```
- **Click for more details** + Click for more details So let's take a closer look at how you can represent your data with DocArray: @@ -243,50 +244,139 @@ assert isinstance(dl_2, DocList) ## Send -- **Serialize** any `Document` or `DocArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ -- Use in **microservice** architecture: Send over **HTTP** or **gRPC** -- Integrate seamlessly with **[FastAPI](https://github.com/tiangolo/fastapi/)** and **[Jina](https://github.com/jina-ai/jina/)** +DocArray allows you to **send your data**, in an ML-native way. + +This means there is native support for **Protobuf and gRPC**, in top of **HTTP** and serialization to JSON, JSONSchema, Base64, and Bytes. + +This is useful for different use cases: +- :cloud: You are **serving a model**, for example through **[Jina](https://github.com/jina-ai/jina/)** or **[FastAPI](https://github.com/tiangolo/fastapi/)** +- :spider_web: You **distribute your model** across machines and need to send your data between nodes +- :gear: You are building a **microservice** architecture and need to send your data between microservices + +> :bulb: **Coming from FatAPI?**: If you're currently using FatAPI for the use cases above, you should be happy to hear +> that DocArray is fully compatible with FatAPI! +> Also, we have [dedicated section](#coming-from-fastapi) just for you! 
+ +Whenever you want to send your data you need to serialize it, so let's take a look at how that works with DocArray: ```python -from docarray.documents import ImageDoc -from httpx import AsyncClient -import numpy as np +from docarray import BaseDoc +from docarray.typing import ImageTorchTensor -doc = ImageDoc(tensor=np.zeros((3, 224, 224))) +# model your data +class MyDocument(BaseDoc): + description: str + image: ImageTorchTensor[3, 224, 224] -# JSON over HTTP -async with AsyncClient(app=app, base_url="http://test") as ac: - response = await ac.post("/doc/", data=input_doc.json()) + +# create a Document +doc = MyDocument( + description="This is a description", + image=torch.zeros((3, 224, 224)), +) + +# serialize it! +proto = doc.to_protobuf() +base64 = doc.to_base64() +bytes_ = doc.to_bytes() +json = doc.json() +jsonschema = doc.jsonschema() + +# deserialize it! +doc_2 = MyDocument.from_protobuf(proto) +doc_3 = MyDocument.from_base64(base64) +doc_4 = MyDocument.from_bytes(bytes_) +doc_5 = MyDocument.parse_raw(json) ``` +Of course, serialization is not all you need. +So check out how DocArray integrates with FatAPI and Jina. TODO link to doc sections + + +## Store + +Once you've modelled your data, and maybe sent it around, usually you want to **store it** somewhere. +But fret not! DocArray has you covered! + +**Document Stores** let you, well, store your Documents, locally or remotely, all with the same user interface: +- :cd: **On disk** as a file in your local file system +- :bucket: On **[AWS S3](https://aws.amazon.com/de/s3/)** +- :cloud: On **[Jina AI Cloud](https://cloud.jina.ai/)** + +
+ See Document Store usage + +The Document Store interface lets you push and pull Documents to and from multiple data sources, all with the same user interface. + +As an example, let's take a look at how that would work with AWS S3 storage: + ```python -# (de)serialize from/to protobuf -Image.from_protobuf(doc.to_protobuf()) +from docarray import DocList +from docarray.documents import ImageDoc + +dl = DocList[ImageDoc]( + [ + ImageDoc( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ) + for _ in range(100) + ] +) + +# push the DocList to S3 +dl.push('s3://my-bucket/my-documents', show_progress=True) + +# pull the DocList from S3 +dl_2 = DocList[ImageDoc].pull('s3://my-bucket/my-documents', show_progress=True) ``` +
-## Store -- Persist a `DocArray` using a **`DocumentStore`** -- Store your Documents in any supported (vector) database: **Elasticsearch**, **Qdrant**, **Weaviate**, **Redis**, **Milvus**, **ANNLite** or **SQLite** -- Leverage DocumentStores to **perform vector search on your multi-modal data** +**Document Indexes** let you index your Documents into a **vector database**, for efficient similarity-based retrieval. + +This is useful for: +- :left_speech_bubble: Augmenting **LLMs and Chatbots** with domain knowledge ([Retrieval Augmented Generation](https://arxiv.org/abs/2005.11401)) +- :mag: **Neural search** applications +- :bulb: **Recommender systems** + +Currently, DocArray Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[ElasticSearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come!. + +
+ See Document Index usage + +The Document Index interface lets you index and retrieve Documents from multiple vector databases, all with the same user interface. + +It supports ANN vector search, text search, filtering, and hybrid search. ```python -# NOTE: DocumentStores are not yet implemented in version 2 from docarray import DocList from docarray.documents import ImageDoc -from docarray.stores import DocumentStore -import numpy as np +from docarray.index import HnswDocumentIndex -da = DocList([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) -store = DocumentStore[ImageDoc]( - storage='qdrant' -) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocList into the DocumentStore -# find the 10 most similar images based on the 'embedding' field -match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10) +# create some data +dl = DocList[ImageDoc]( + [ + ImageDoc( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + embedding=np.random.random((128,)), + ) + for _ in range(100) + ] +) + +# create a Document Index +index = HnswDocumentIndex(work_dir='.') + +# index your data +index.index(dl) + +# find similar Document +query = dl[0] +results, scores = index.find(query, top_k=10, search_field='embedding') ``` -If you want to get a deeper understanding of DocArray v2, it is best to do so on the basis of your -use case and background: +
## Coming from DocArray From 6ce22a4bd60ce7ce9822ebc59380ffe71a797a4b Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 14:04:31 +0200 Subject: [PATCH 09/18] docs: re-arrange some stuff Signed-off-by: Johannes Messner --- README.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c8ea1d2a2b8..a62152b8f93 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,12 @@ DocArray is a library for **representing, sending and storing multi-modal data**, perfect for **Machine Learning applications**. +Those are the three pillars of DocArray, and you can check them out individually: + +1. [**Represent**](#represent) +2. [**Send**](#send) +3. [**Store**](#store) + DocArray handles your data while integrating seamlessly with the rest of your **Python and ML ecosystem**: - :fire: DocArray has native compatibility for **NumPy**, **PyTorch** and **TensorFlow**, including for **model training use cases** @@ -21,10 +27,6 @@ DocArray handles your data while integrating seamlessly with the rest of your ** - :package: DocArray can store data in vector databases such as **Weaviate, Qdrant, ElasticSearch** as well as **HNSWLib** - :chains: DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** -With that said, let's dig into the three pillars of DocArray: -1. [Represent](#represent) -2. [Send](#send) -3. [Store](#store) > :bulb: **Where are you coming from?**: Depending on your use case and background, there are different was to "get" DocArray. > You can navigate to the following section for an explanation that should fit your mindest: @@ -60,7 +62,7 @@ class MyDocument(BaseDoc): image_tensor: TorchTensor[1704, 2272, 3] # you can express tensor shapes! 
-# Stack multiple documents +# Stack multiple documents in a Document Vector from docarray import DocVec vec = DocVec[MyDocument]( @@ -246,7 +248,7 @@ assert isinstance(dl_2, DocList) DocArray allows you to **send your data**, in an ML-native way. -This means there is native support for **Protobuf and gRPC**, in top of **HTTP** and serialization to JSON, JSONSchema, Base64, and Bytes. +This means there is native support for **Protobuf and gRPC**, on top of **HTTP** and serialization to JSON, JSONSchema, Base64, and Bytes. This is useful for different use cases: - :cloud: You are **serving a model**, for example through **[Jina](https://github.com/jina-ai/jina/)** or **[FastAPI](https://github.com/tiangolo/fastapi/)** @@ -339,7 +341,7 @@ This is useful for: - :mag: **Neural search** applications - :bulb: **Recommender systems** -Currently, DocArray Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[ElasticSearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come!. +Currently, DocArray Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[ElasticSearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come!
See Document Index usage @@ -371,7 +373,7 @@ index = HnswDocumentIndex(work_dir='.') # index your data index.index(dl) -# find similar Document +# find similar Documents query = dl[0] results, scores = index.find(query, top_k=10, search_field='embedding') ``` From 88a1de58353b7984eff580b074f19fb5f8830642 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 14:09:59 +0200 Subject: [PATCH 10/18] docs: typos Signed-off-by: Johannes Messner --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a62152b8f93..d1d37ad01db 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ DocArray handles your data while integrating seamlessly with the rest of your ** - :chains: DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** -> :bulb: **Where are you coming from?**: Depending on your use case and background, there are different was to "get" DocArray. +> :bulb: **Where are you coming from?** Depending on your use case and background, there are different was to "get" DocArray. > You can navigate to the following section for an explanation that should fit your mindest: > - [Coming from pure PyTorch or TensorFlow](#coming-from-torch-tf) > - [Coming from Pydantic](#coming-from-pydantic) @@ -45,7 +45,7 @@ This is useful for different use cases: - :cloud: You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints - :card_index_dividers: You are **parsing data** for later use in your ML or DS applications -> :bulb: **Coming from Pydantic?**: If you're currently using Pydantic for the use cases above, you should be happy to hear +> :bulb: **Coming from Pydantic?** If you're currently using Pydantic for the use cases above, you should be happy to hear > that DocArray is built on top of, and fully compatible with, Pydantic! > Also, we have [dedicated section](#coming-from-pydantic) just for you! 
@@ -255,7 +255,7 @@ This is useful for different use cases: - :spider_web: You **distribute your model** across machines and need to send your data between nodes - :gear: You are building a **microservice** architecture and need to send your data between microservices -> :bulb: **Coming from FatAPI?**: If you're currently using FatAPI for the use cases above, you should be happy to hear +> :bulb: **Coming from FastAPI?** If you're currently using FatAPI for the use cases above, you should be happy to hear > that DocArray is fully compatible with FatAPI! > Also, we have [dedicated section](#coming-from-fastapi) just for you! From 39f96f7409a57ec506220e3b9412e9825fde56dc Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 14:14:05 +0200 Subject: [PATCH 11/18] docs: more typos Signed-off-by: Johannes Messner --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d1d37ad01db..0b188ae4dbb 100644 --- a/README.md +++ b/README.md @@ -255,8 +255,8 @@ This is useful for different use cases: - :spider_web: You **distribute your model** across machines and need to send your data between nodes - :gear: You are building a **microservice** architecture and need to send your data between microservices -> :bulb: **Coming from FastAPI?** If you're currently using FatAPI for the use cases above, you should be happy to hear -> that DocArray is fully compatible with FatAPI! +> :bulb: **Coming from FastAPI?** If you're currently using FastAPI for the use cases above, you should be happy to hear +> that DocArray is fully compatible with FastAPI! > Also, we have [dedicated section](#coming-from-fastapi) just for you! 
Whenever you want to send your data you need to serialize it, so let's take a look at how that works with DocArray: From f157dba0195f1b2f3a2d0bb197749971d4d4bd75 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 16:28:46 +0200 Subject: [PATCH 12/18] docs: make section collapsible Signed-off-by: Johannes Messner --- README.md | 247 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 141 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index 0b188ae4dbb..592d5e4b74c 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ DocArray handles your data while integrating seamlessly with the rest of your ** > :bulb: **Where are you coming from?** Depending on your use case and background, there are different was to "get" DocArray. > You can navigate to the following section for an explanation that should fit your mindest: -> - [Coming from pure PyTorch or TensorFlow](#coming-from-torch-tf) +> - [Coming from pure PyTorch or TensorFlow](#coming-from-pytorch) > - [Coming from Pydantic](#coming-from-pydantic) > - [Coming from FastAPI](#coming-from-fastapi) > - [Coming from a vector database](#coming-from-vector-database) @@ -380,9 +380,15 @@ results, scores = index.find(query, top_k=10, search_field='embedding')
-## Coming from DocArray +Depending on your background and use case, there are different ways for you to _get_ DocArray. +Choose your own adventure! -If you are already using DocArray, you will be familiar with its [dataclass API](https://docarray.jina.ai/fundamentals/dataclass/). +## Coming from old DocArray + +
+ Click to expand + +If you are using DocArray v<0.30.0, you will be familiar with its [dataclass API](https://docarray.jina.ai/fundamentals/dataclass/). _DocArray v2 is that idea, taken seriously._ Every `Document` is created through dataclass-like interface, courtesy of [Pydantic](https://pydantic-docs.helpmanual.io/usage/models/). @@ -392,20 +398,88 @@ This gives the following advantages: - **Multi-modality:** Easily store multiple modalities and multiple embeddings in the same Document - **Language agnostic:** At its core, Documents are just dictionaries. This makes it easy to create and send them from any language, not just Python. +You may also be familiar with our old Document Stores for vector DB integration. +They are now called **Document Indexes** and offer the following improvements (see [here](#store) for the new API): +- **Hybrid search:** You can now combine vector search with text search, and even filter by arbitrary fields +- **Production-ready:** The new Document Indexes are a much thinner wrapper around the various vector DB libraries, making them more robust and easier to maintain +- **Increased flexibility:** We strive to support any configuration or setting that you could perform through the DB's first-party client + +For now, Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[ElasticSearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come. + +
+ ## Coming from Pydantic -If you come from Pydantic, you can see Documents as juiced up models, and DocArray as a collection of goodies around them. +
+ Click to expand + +If you come from Pydantic, you can see DocArray Documents as juiced up Pydantic models, and DocArray as a collection of goodies around them. -- **ML focused types**: Tensor, TorchTensor, TFTensor, Embedding, ... +More specifically, we set out to **make Pydantic fit for the ML world** - not by replacing it, but by building on top of it! + +This means that you get the following benefits: +- **ML focused types**: Tensor, TorchTensor, Embedding, ..., including **tensor shape validation** +- Full compatibility with **FastAPI** +- **DocList** and **DocVec** generalize the idea of a model to a _sequence_ or _batch_ of models. Perfect for **use in ML models** and other batch processing tasks. - **Types that are alive**: ImageUrl can `.load()` a URL to image tensor, TextUrl can load and tokenize text documents, etc. -- **Pre-built Documents** for different data modalities: Image, Text, 3DMesh, Video, Audio and more. Note that all of these will be valid Pydantic models! -- The concepts of **DocArray and DocumentStore** - Cloud-ready: Serialization to **Protobuf** for use with microservices and **gRPC** -- Support for **vector search functionalities**, such as `find()` and `embed()` +- **Pre-built multi-modal Documents** for different data modalities: Image, Text, 3DMesh, Video, Audio and more. Note that all of these are valid Pydantic models! +- **Document Stores** and **Document Indexes** let you store your data and retrieve it using **vector search** + +The most obvious advantage here is **first-class support for ML centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. 
+ +This includes handy features such as validating the shape of a tensor: + +```python +from docarray import BaseDoc +from docarray.typing import TorchTensor +import torch + + +class MyDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + +doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works +doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping +doc = MyDoc(tensor=torch.zeros(224)) # fails validation + + +class Image(BaseDoc): + tensor: TorchTensor[3, 'x', 'x'] + + +Image(tensor=torch.zeros(3, 224, 224)) # works +Image( + tensor=torch.zeros(3, 64, 128) +) # fails validation because second dimension does not match third +Image( + tensor=torch.zeros(4, 224, 224) +) # fails validation because of the first dimension +Image( + tensor=torch.zeros(3, 64) +) # fails validation because it does not have enough dimensions +``` + +
+ ## Coming from PyTorch -DocArray can be used directly inside ML models to handle and represent multi-modal data. This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, and provides a (FastAPI-compatible) schema that eases the transition between model training and model serving. +
+ Click to expand + +If you come from PyTorch, you can see DocArray mainly as a way of _organizing your data as it flows through your model_. + +It offers you several advantages: +- Express **tensors shapes in type hints** +- **Group tensors that belong to the same object**, e.g. an audio track and an image +- **Go directly to deployment**, by re-using your data model as a [FastAPI](https://fastapi.tiangolo.com/) or [Jina](https://github.com/jina-ai/jina) API schema +- Connect model components between **microservices**, using Protobuf and gRPC + +DocArray can be used directly inside ML models to handle and represent multi-modal data. +This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, +and provides a (FastAPI-compatible) schema that eases the transition between model training and model serving. To see the effect of this, let's first observe a vanilla PyTorch implementation of a tri-modal ML model: @@ -487,11 +561,17 @@ class MyPodcastModel(nn.Module): Looks much better, doesn't it? You instantly win in code readability and maintainability. And for the same price you can turn your PyTorch model into a FastAPI app and reuse your Document -schema definition (see below). Everything is handled in a pythonic manner by relying on type hints. +schema definition (see [below](#coming-from-fastapi)). Everything is handled in a pythonic manner by relying on type hints. + +
+ ## Coming from TensorFlow -Similar to the PyTorch approach, you can also use DocArray with TensorFlow to handle and represent multi-modal data inside your ML model. +
+ Click to expand + +Similar to the [PyTorch approach](#coming-from-pytorch), you can also use DocArray with TensorFlow to handle and represent multi-modal data inside your ML model. First off, to use DocArray with TensorFlow we first need to install it as follows: @@ -532,9 +612,22 @@ class MyPodcastModel(tf.keras.Model): return inputs ``` +
+ + ## Coming from FastAPI -Documents are Pydantic Models (with a twist), and as such they are fully compatible with FastAPI: +
+ Click to expand + +Documents are Pydantic Models (with a twist), and as such they are fully compatible with FastAPI! + +But why should you use them, and not the Pydantic models you already know and love? +Good question! +- Because of the ML-first features, types and validations, [here](#coming-from-pydantic) +- Because DocArray can act as an [ORM for vector databases](#coming-from-a-vector-database), similar to what SQLModel does for SQL databases + +And to seal the deal, let us show you how easily Documents slot into your FastAPI app: ```python import numpy as np @@ -576,119 +669,63 @@ async with AsyncClient(app=app, base_url="http://test") as ac: resp_redoc = await ac.get("/redoc") ``` -The big advantage here is **first-class support for ML centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. - -This includes handy features such as validating the shape of a tensor: - -```python -from docarray import BaseDoc -from docarray.typing import TorchTensor -import torch - +Just like a vanilla Pydantic model! -class MyDoc(BaseDoc): - tensor: TorchTensor[3, 224, 224] - - -doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works -doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping -doc = MyDoc(tensor=torch.zeros(224)) # fails validation - - -class Image(BaseDoc): - tensor: TorchTensor[3, 'x', 'x'] +
-Image(tensor=torch.zeros(3, 224, 224)) # works -Image( - tensor=torch.zeros(3, 64, 128) -) # fails validation because second dimension does not match third -Image( - tensor=torch.zeros(4, 224, 224) -) # fails validation because of the first dimension -Image( - tensor=torch.zeros(3, 64) -) # fails validation because it does not have enough dimensions -``` - ## Coming from a vector database +
+ Click to expand + If you came across DocArray as a universal vector database client, you can best think of it as **a new kind of ORM for vector databases**. DocArray's job is to take multi-modal, nested and domain-specific data and to map it to a vector database, store it there, and thus make it searchable: ```python -# NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocList, BaseDoc -from docarray.stores import DocumentStore -from docarray.documents import ImageDoc, TextDoc -import numpy as np - - -class MyDoc(BaseDoc): - image: ImageDoc - text: TextDoc - description: str - - -def _random_my_doc(): - return MyDoc( - image=ImageDoc(embedding=np.random.random((256,))), - text=TextDoc(embedding=np.random.random((128,))), - description='this is a random document', - ) - - -da = DocList([_random_my_doc() for _ in range(1000)]) # create some data -store = DocumentStore[MyDoc]( - storage='qdrant' -) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocArray into the DocumentStore +from docarray import DocList +from docarray.documents import ImageDoc +from docarray.index import HnswDocumentIndex -# find the 10 most similar images based on the image embedding field -match = store.find( - ImageDoc(embedding=np.zeros((256,))), field='image__embedding', top_k=10 -) -# find the 10 most similar images based on the image embedding field -match = store.find( - ImageDoc(embedding=np.zeros((128,))), field='text__embedding', top_k=10 +# create some data +dl = DocList[ImageDoc]( + [ + ImageDoc( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + embedding=np.random.random((128,)), + ) + for _ in range(100) + ] ) -``` -## Enable logging +# create a Document Index +index = HnswDocumentIndex(work_dir='.') -You can see more logs by setting the log level to `DEBUG` or `INFO`: +# index your data +index.index(dl) -```python -from pydantic import Field -from docarray 
import BaseDoc -from docarray.index import HnswDocumentIndex -from docarray.typing import NdArray -import logging +# find similar Documents +query = dl[0] +results, scores = index.find(query, top_k=10, search_field='embedding') +``` -# get the logger and set the log level to DEBUG -logging.getLogger('docarray').setLevel(logging.DEBUG) +Currently, DocArray supports the following vector databases: +- [Weaviate](https://www.weaviate.io/) +- [Qdrant](https://qdrant.tech/) +- [Elasticsearch](https://www.elastic.co/elasticsearch/) v8 and v7 +- [HNSWlib](https://github.com/nmslib/hnswlib) as a local-first alternative +An integration of [OpenSearch](https://opensearch.org/) is currently in progress. -# define a simple document and create a document index -class SimpleDoc(BaseDoc): - vector: NdArray = Field(dim=10) +Legacy versions of DocArray also support [Redis](https://redis.io/) and [Milvus](https://milvus.io/), but these are not yet supported in the current version. +Of course this is only one thing that DocArray can do, so we encourage you to check out the rest of this readme! -doc_store = HnswDocumentIndex[SimpleDoc](work_dir='temp_path/') -``` +
-```console -INFO - docarray - DB config created -INFO - docarray - Runtime config created -DEBUG - docarray - Working directory set to temp_path/ -WARNING - docarray - No index was created for `id` as it does not have a config -INFO - docarray - Created a new index for column `vector` -DEBUG - docarray - DB path set to temp_path/docs_sqlite.db -INFO - docarray - Connection to DB has been established -INFO - docarray - HnswDocumentIndex[SimpleDoc] has been initialized -``` ## Install the alpha @@ -706,10 +743,8 @@ pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarr ## See also +- [Documentation](https://docarray-v2--jina-docs.netlify.app/) - [Join our Discord server](https://discord.gg/WaMp6PVPgR) -- [V2 announcement blog post](https://github.com/docarray/notes/blob/main/blog/01-announcement.md) - [Donation to Linux Foundation AI&Data blog post](https://jina.ai/news/donate-docarray-lf-for-inclusive-standard-multimodal-data-model/) -- [Submit ideas, feature requests, and discussions](https://github.com/docarray/docarray/discussions) -- [v2 Documentation](https://docarray-v2--jina-docs.netlify.app/) - ["Legacy" DocArray github page](https://github.com/docarray/docarray) - ["Legacy" DocArray documentation](https://docarray.jina.ai/) From a420c536477de0568307618b63da8b584be3d724 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 16:45:47 +0200 Subject: [PATCH 13/18] docs: typo Signed-off-by: Johannes Messner --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 592d5e4b74c..7c68ffe06bc 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ dl = DocList[Image]( # the DocList is parametrized by your personal schema! 
You can still bulk access the fields of your `Document`: ```python -tensors = dl.tensor # gets all the tensors in the DocVec +tensors = dl.tensor # gets all the tensors in the DocList print(type(tensors)) # as a list of tensors print(dl.url) # you can bulk access any other field, too ``` From 13a54cfcaea8abf82f919c8bfca9ba742438187b Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 6 Apr 2023 17:14:46 +0200 Subject: [PATCH 14/18] feat: add readme testing Signed-off-by: samsja --- tests/documentation/test_docs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 4eceb252f89..8d3ba31cb41 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -11,3 +11,7 @@ ) def test_files_good(fpath): check_md_file(fpath=fpath, memory=True) + + +def test_readme(): + check_md_file(fpath='README.md', memory=True) From 4e9941e145a943d3e035caad0049eab70d33170e Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 11 Apr 2023 12:04:58 +0200 Subject: [PATCH 15/18] fix: fix most of readme pb Signed-off-by: samsja --- README.md | 101 ++++++++++++++++++++++--------- tests/documentation/test_docs.py | 2 - 2 files changed, 74 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 7c68ffe06bc..847131fefb7 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ Put simply, DocArray lets you represent your data in a dataclass-like way, with ```python from docarray import BaseDoc from docarray.typing import TorchTensor, ImageUrl +import torch # Define your data model class MyDocument(BaseDoc): @@ -73,9 +74,9 @@ vec = DocVec[MyDocument]( image_tensor=torch.rand(1704, 2272, 3), ), ] - * 1000 + * 10 ) -print(vec.image_tensor.shape) # (1000, 1704, 2272, 3) +print(vec.image_tensor.shape) # (10, 1704, 2272, 3) ```
@@ -87,7 +88,7 @@ So let's take a closer look at how you can represent your data with DocArray: from docarray import BaseDoc from docarray.typing import TorchTensor, ImageUrl from typing import Optional - +import torch # Define your data model class MyDocument(BaseDoc): @@ -114,9 +115,15 @@ doc = MyDocument( doc.image_tensor = doc.image_url.load() # Compute embedding with any model of your choice + + +def clip_image_encoder(image_tensor: TorchTensor) -> TorchTensor: # dummy function + return torch.rand(512) + + doc.embedding = clip_image_encoder(doc.image_tensor) -print(doc.embedding.shape) +print(doc.embedding.shape) # torch.Size([512]) ``` ### Compose nested Documents @@ -235,10 +242,10 @@ dl.insert( And you can seamlessly switch between `DocVec` and `DocList`: ```python -vec_2 = dl.unstack() +vec_2 = dl.stack() assert isinstance(vec_2, DocVec) -dl_2 = vec_2.stack() +dl_2 = vec_2.unstack() assert isinstance(dl_2, DocList) ``` @@ -264,6 +271,7 @@ Whenever you want to send your data you need to serialize it, so let's take a lo ```python from docarray import BaseDoc from docarray.typing import ImageTorchTensor +import torch # model your data class MyDocument(BaseDoc): @@ -279,14 +287,11 @@ doc = MyDocument( # serialize it! proto = doc.to_protobuf() -base64 = doc.to_base64() bytes_ = doc.to_bytes() json = doc.json() -jsonschema = doc.jsonschema() # deserialize it! doc_2 = MyDocument.from_protobuf(proto) -doc_3 = MyDocument.from_base64(base64) doc_4 = MyDocument.from_bytes(bytes_) doc_5 = MyDocument.parse_raw(json) ``` @@ -315,6 +320,7 @@ As an example, let's take a look at how that would work with AWS S3 storage: ```python from docarray import DocList from docarray.documents import ImageDoc +import numpy as np dl = DocList[ImageDoc]( [ @@ -351,9 +357,18 @@ The Document Index interface lets you index and retrieve Documents from multiple It supports ANN vector search, text search, filtering, and hybrid search. 
```python -from docarray import DocList -from docarray.documents import ImageDoc +from docarray import DocList, BaseDoc from docarray.index import HnswDocumentIndex +import numpy as np + +from docarray.typing import ImageUrl, ImageTensor, NdArray + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: ImageTensor + embedding: NdArray[128] + # create some data dl = DocList[ImageDoc]( @@ -368,14 +383,15 @@ dl = DocList[ImageDoc]( ) # create a Document Index -index = HnswDocumentIndex(work_dir='.') +index = HnswDocumentIndex[ImageDoc](work_dir='/tmp/test_index') + # index your data index.index(dl) # find similar Documents query = dl[0] -results, scores = index.find(query, top_k=10, search_field='embedding') +results, scores = index.find(query, limit=10, search_field='embedding') ```

@@ -442,7 +458,13 @@ class MyDoc(BaseDoc):
 
 doc = MyDoc(tensor=torch.zeros(3, 224, 224))  # works
 doc = MyDoc(tensor=torch.zeros(224, 224, 3))  # works by reshaping
-doc = MyDoc(tensor=torch.zeros(224))  # fails validation
+
+try:
+    doc = MyDoc(tensor=torch.zeros(224))  # fails validation
+except Exception as e:
+    print(e)
+    # tensor
+    # Cannot reshape tensor of shape (224,) to shape (3, 224, 224) (type=value_error)
 
 
 class Image(BaseDoc):
@@ -450,15 +472,30 @@ class Image(BaseDoc):
 
 
 Image(tensor=torch.zeros(3, 224, 224))  # works
-Image(
-    tensor=torch.zeros(3, 64, 128)
-)  # fails validation because second dimension does not match third
-Image(
-    tensor=torch.zeros(4, 224, 224)
-)  # fails validation because of the first dimension
-Image(
-    tensor=torch.zeros(3, 64)
-)  # fails validation because it does not have enough dimensions
+
+try:
+    Image(
+        tensor=torch.zeros(3, 64, 128)
+    )  # fails validation because second dimension does not match third
+except Exception as e:
+    print(e)
+
+
+try:
+    Image(
+        tensor=torch.zeros(4, 224, 224)
+    )  # fails validation because of the first dimension
+except Exception as e:
+    print(e)
+    # Tensor shape mismatch. Expected(3, 'x', 'x'), got(4, 224, 224)(type=value_error)
+
+try:
+    Image(
+        tensor=torch.zeros(3, 64)
+    )  # fails validation because it does not have enough dimensions
+except Exception as e:
+    print(e)
+    # Tensor shape mismatch. Expected (3, 'x', 'x'), got (3, 64) (type=value_error)
 ```
@@ -685,9 +722,18 @@ DocArray's job is to take multi-modal, nested and domain-specific data and to ma store it there, and thus make it searchable: ```python -from docarray import DocList -from docarray.documents import ImageDoc +from docarray import DocList, BaseDoc from docarray.index import HnswDocumentIndex +import numpy as np + +from docarray.typing import ImageUrl, ImageTensor, NdArray + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: ImageTensor + embedding: NdArray[128] + # create some data dl = DocList[ImageDoc]( @@ -702,14 +748,15 @@ dl = DocList[ImageDoc]( ) # create a Document Index -index = HnswDocumentIndex(work_dir='.') +index = HnswDocumentIndex[ImageDoc](work_dir='/tmp/test_index') + # index your data index.index(dl) # find similar Documents query = dl[0] -results, scores = index.find(query, top_k=10, search_field='embedding') +results, scores = index.find(query, limit=10, search_field='embedding') ``` Currently, DocArray supports the following vector databases: diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 8d3ba31cb41..6e2112db0fc 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -4,8 +4,6 @@ from mktestdocs import check_md_file -# @pytest.mark.parametrize('fpath', pathlib.Path("docs").glob("**/*.md"), ids=str) -# to use later @pytest.mark.parametrize( 'fpath', pathlib.Path('docs/user_guide').glob('**/*.md'), ids=str ) From c0f243fff94b2a07bc2666f478c18de6a0f9958c Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 11 Apr 2023 12:36:30 +0200 Subject: [PATCH 16/18] fix: fix most of readme pb Signed-off-by: samsja --- README.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 847131fefb7..24b21e53541 100644 --- a/README.md +++ b/README.md @@ -523,14 +523,19 @@ To see the effect of this, let's first observe a vanilla PyTorch implementation ```python import torch from torch import nn +import torch + 

+
+def encoder(x):
+    return torch.rand(512)
 
 
 class MyMultiModalModel(nn.Module):
     def __init__(self):
         super().__init__()
-        self.audio_encoder = AudioEncoder()
-        self.image_encoder = ImageEncoder()
-        self.text_encoder = TextEncoder()
+        self.audio_encoder = encoder
+        self.image_encoder = encoder
+        self.text_encoder = encoder
 
     def forward(self, text_1, text_2, image_1, image_2, audio_1, audio_2):
         embedding_text_1 = self.text_encoder(text_1)
@@ -560,10 +565,14 @@ So, now let's see what the same code looks like with DocArray:
 from docarray import DocList, BaseDoc
 from docarray.documents import ImageDoc, TextDoc, AudioDoc
 from docarray.typing import TorchTensor
-
+from torch import nn
 import torch
 
 
+def encoder(x):
+    return torch.rand(512)
+
+
 class Podcast(BaseDoc):
     text: TextDoc
     image: ImageDoc
@@ -578,9 +587,9 @@ class PairPodcast(BaseDoc):
 class MyPodcastModel(nn.Module):
     def __init__(self):
         super().__init__()
-        self.audio_encoder = AudioEncoder()
-        self.image_encoder = ImageEncoder()
-        self.text_encoder = TextEncoder()
+        self.audio_encoder = encoder
+        self.image_encoder = encoder
+        self.text_encoder = encoder
 
     def forward_podcast(self, docs: DocList[Podcast]) -> DocList[Podcast]:
         docs.audio.embedding = self.audio_encoder(docs.audio.tensor)
@@ -674,7 +683,7 @@ from httpx import AsyncClient
 from docarray import BaseDoc
 from docarray.documents import ImageDoc
 from docarray.typing import NdArray
-from docarray.base_doc import DocumentResponse
+from docarray.base_doc import DocArrayResponse
 
 
 class InputDoc(BaseDoc):
@@ -691,7 +700,7 @@ input_doc = InputDoc(img=ImageDoc(tensor=np.zeros((3, 224, 224))))
 app = FastAPI()
 
 
-@app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse)
+@app.post("/doc/", response_model=OutputDoc, response_class=DocArrayResponse)
 async def create_item(doc: InputDoc) -> OutputDoc:
     ## call my fancy model to generate the embeddings
     doc = OutputDoc(

From c05bfa5a206ec4f476e57257ab56d3c56574bfea Mon Sep 17 00:00:00 2001
From: 
samsja Date: Tue, 11 Apr 2023 12:57:46 +0200 Subject: [PATCH 17/18] fix: fix readme pb Signed-off-by: samsja --- README.md | 4 +-- tests/documentation/test_docs.py | 45 ++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 24b21e53541..9988b49f625 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ import numpy as np class Image(BaseDoc): url: ImageUrl - tensor: AnyTensor # this allows torch, numpy, and tensorflow tensors + tensor: AnyTensor # this allows torch, numpy, and tensor flow tensors vec = DocVec[Image]( # the DocVec is parametrized by your personal schema! @@ -757,7 +757,7 @@ dl = DocList[ImageDoc]( ) # create a Document Index -index = HnswDocumentIndex[ImageDoc](work_dir='/tmp/test_index') +index = HnswDocumentIndex[ImageDoc](work_dir='/tmp/test_index2') # index your data diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 6e2112db0fc..6ca32d7700f 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -1,7 +1,46 @@ import pathlib import pytest -from mktestdocs import check_md_file +from mktestdocs import grab_code_blocks +from mktestdocs.__main__ import _executors, check_raw_string + + +def check_raw_file_full(raw, lang="python", keyword_ignore=[]): + if lang not in _executors: + raise LookupError( + f"{lang} is not a supported language to check\n" + "\tHint: you can add support for any language by using register_executor" + ) + executor = _executors[lang] + all_code = "" + add_code_block = True + + for b in grab_code_blocks(raw, lang=lang): + add_code_block = True + for keyword in keyword_ignore: + if keyword in b: + add_code_block = False + break + if add_code_block: + all_code = f"{all_code}\n{b}" + executor(all_code) + + +def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): + """ + NOTE: copy paste from mktestdocs.__main__ and add the keyword ignore + Given a markdown file, parse the 
contents for python code blocks
+    and check that each independent block does not cause an error.
+
+    Arguments:
+        fpath: path to markdown file
+        memory: whether or not previous code-blocks should be remembered
+    """
+    text = pathlib.Path(fpath).read_text()
+    if not memory:
+        check_raw_string(text, lang=lang)
+    else:
+        check_raw_file_full(text, lang=lang, keyword_ignore=keyword_ignore)
 
 
 @pytest.mark.parametrize(
@@ -12,4 +51,6 @@ def test_files_good(fpath):
 
 
 def test_readme():
-    check_md_file(fpath='README.md', memory=True)
+    check_md_file(
+        fpath='README.md', memory=True, keyword_ignore=['tensorflow', 'fastapi', 'push']
+    )

From bdd33569e8254587a9057199112634dc0499d43e Mon Sep 17 00:00:00 2001
From: samsja
Date: Tue, 11 Apr 2023 13:03:17 +0200
Subject: [PATCH 18/18] fix: remove todo

Signed-off-by: samsja
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9988b49f625..c0d2cd5edee 100644
--- a/README.md
+++ b/README.md
@@ -297,7 +297,7 @@ doc_5 = MyDocument.parse_raw(json)
 ```
 
 Of course, serialization is not all you need.
-So check out how DocArray integrates with FatAPI and Jina. TODO link to doc sections
+So check out how DocArray integrates with FastAPI and Jina.
 
 ## Store