From 749693f08966c2081c9185051b4f0f6bbb47d256 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 15:36:25 +0200 Subject: [PATCH 01/18] doc: rewrite represent section in readme Signed-off-by: Johannes Messner --- README.md | 172 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 131 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index af497cb5b3e..2bd9bf17fdc 100644 --- a/README.md +++ b/README.md @@ -1,65 +1,98 @@ -# DocArray - Version 2 +

+DocArray logo: The data structure for unstructured data +
+The data structure for multimodal data +

+ +

+PyPI +Codecov branch + +PyPI - Downloads from official pypistats + +

> **Note** > This introduction refers to version 2 of DocArray, a rewrite that is currently at the alpha stage. > Not all features that are mentioned here are implemented yet. > If you are looking for the version 2 implementation roadmap see [here](https://github.com/docarray/docarray/issues/780), > for the (already released) version 1 of DocArray -> see [here](https://github.com/docarray/docarray)._ +> see [here](https://github.com/docarray/docarray). -DocArray is a library for **representing, sending and storing multi-modal data**, with a focus on applications in **ML** and -**Neural Search**. +DocArray is a library for **representing, sending and storing multi-modal data**, perfect for **Machine Learning applications**. -This means that DocArray lets you do the following things: +DocArray handles your data while integrating seamlessly with the rest of your **Python and ML ecosystem**: + +- DocArray has native compatibility for **NumPy**, **PyTorch** and **TensorFlow**, including for **model training use cases** +- DocArray is built on **Pydantic** and out-of-the-box compatible with **FastAPI** +- DocArray can store data in vector databases such as **Weaviate, Qdrant, ElasticSearch** as well as **HNSWLib** +- DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** + +With that said, let's dig into the three pillars of DocArray: +1. [Represent](#represent) +2. [Send](#send) +3. [Store](#store) + +> :bulb: **Where are you coming from?**: Depending on your use case and background, there are different was to "get" DocArray. +> You can navigate to the following section for an explanation that should fit your mindest: +> - [Coming from pure PyTorch or TensorFlow](#coming-from-torch-tf) +> - [Coming from Pydantic](#coming-from-pydantic) +> - [Coming from FastAPI](#coming-from-fastapi) +> - [Coming from a vector database](#coming-from-vector-database) ## Represent +DocArray allows you to **represent your data**, in a ML-native way. 
+This is useful for different use cases: +- You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them +- You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints +- You are **parsing data** for later use in your ML or DS applications + +> :bulb: **Coming from Pydantic?**: If you're currently using Pydantic for the use cases above, you should be happy to hear +> that DocArray is built on top of, and fully compatible with, Pydantic! +> Also, we have [dedicated section](#coming-from-pydantic) just for you! + +So let's see how you can represent your data with DocArray: + ```python from docarray import BaseDoc from docarray.typing import TorchTensor, ImageUrl from typing import Optional +# Define your data model class MyDocument(BaseDoc): description: str - image_url: ImageUrl - image_tensor: Optional[TorchTensor[1704, 2272, 3]] - # The field above only work with tensor of shape (1704, 2272, 3) + image_url: ImageUrl # could also be VideoUrl, AudioUrl, etc. + image_tensor: Optional[ + TorchTensor[1704, 2272, 3] + ] # could also be NdArray of TensorflowTensor embedding: Optional[TorchTensor] +``` + +So not only can you define the types of your data, you can even **specify the shape of your tensors!** +Once you have your model in form of a `Document`, you can work with it! 
+```python +# Create a document doc = MyDocument( description="This is a photo of a mountain", image_url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", ) -doc.image_tensor = doc.image_url.load() # load image tensor from URL -``` -```python -doc.embedding = clip_image_encoder( - doc.image_tensor -) # create and store embedding using model of your choice +# Load image tensor from URL +doc.image_tensor = doc.image_url.load() + +# Compute embedding with any model of your choice +doc.embedding = clip_image_encoder(doc.image_tensor) print(doc.embedding.shape) ``` -- **Model** data of any type (audio, video, text, images, 3D meshes, raw tensors, etc) as a Document, a single, unified data structure. - - A `Document` is a juiced-up [Pydantic Model](https://pydantic-docs.helpmanual.io/usage/models/), inheriting all the benefits, while extending it with ML focused features. +### Compose nested Documents -### Use pre-defined `Document`s for common use cases: - -```python -from docarray.documents import ImageDoc - -doc = ImageDoc( - url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", -) -doc.tensor = doc.url.load() # load image tensor from URL -doc.embedding = clip_image_encoder( - doc.tensor -) # create and store embedding using model of your choice -``` -### Compose nested Documents: +Of course you can compose Documents into a nested structure: ```python from docarray import BaseDoc @@ -77,23 +110,59 @@ doc = MultiModalDocument( ) ``` -### Collect multiple `Documents` into a `DocList`: +Of course, you rarely work with a single data point at a time, especially in Machine Learning applications. + +That's why you can easily collect multiple `Documents`: + +### Collect multiple `Documents` + +When building or interacting with an ML system, usually you want to process multiple Documents (data points) at once. + +DocArray offers two data structures for this: +- **`DocVec`**: A vector of `Documents`. 
All tensors in the `Documents` are stacked up into a single tensor. Perfect for batch processing and use inside of ML models. +- **`DocList`**: A list of `Documents`. All tensors in the `Documents` are kept as-is. Perfect for streaming, re-ranking, and shuffling of data. + +Let's take a look at them, starting with `DocVec`: ```python -from docarray import DocList, BaseDoc +from docarray import DocVec, BaseDoc from docarray.typing import AnyTensor, ImageUrl import numpy as np class Image(BaseDoc): url: ImageUrl - tensor: AnyTensor + tensor: AnyTensor # this allows torch, numpy, and tensorflow tensors + + +vec = DocVec[Image]( # the DocVec is parametrized by your personal schema! + [ + Image( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ) + for _ in range(100) + ] +) ``` +As you can see in the code snippet above, `DocVec` is **parametrized by the type of Document** you want to use with it: `DocVec[Image]`. + +This may look slightly weird at first, but we're confident that you'll get used to it quickly! +Besides, it allows us to do some cool things, like giving you **bulk access to the fields that you defined** in your `Document`: + +```python +tensor = vec.tensor # gets all the tensors in the DocVec +print(tensor.shape) # which are stacked up into a single tensor! +print(vec.url) # you can bulk access any other field, too +``` + +The second data structure, `DocList`, works in a similar way: + ```python from docarray import DocList -da = DocList[Image]( +dl = DocList[Image]( # the DocList is parametrized by your personal schema! 
[ Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", @@ -104,25 +173,46 @@ da = DocList[Image]( ) ``` -Access fields at the DocArray level: +You can still bulk access the fields of your `Document`: ```python -print(len(da.tensor)) -print(da.tensor[0].shape) +tensors = dl.tensor # gets all the tensors in the DocVec +print(type(tensors)) # as a list of tensors +print(dl.url) # you can bulk access any other field, too ``` -You can stack tensors if you want to perform in batch processing: +And you can insert, remove, and append `Documents` to your `DocList`: ```python -da = da.stack() +dl.append( + Image( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ) +) +del dl[0] +dl.insert( + 0, + Image( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ), +) ``` +And you can seamlessly switch between `DocVec` and `DocList`: + ```python -print(type(da.tensor)) -print(da.tensor.shape) +vec_2 = dl.unstack() +assert isinstance(vec_2, DocVec) + +dl_2 = vec_2.stack() +assert isinstance(dl_2, DocList) ``` + ## Send + - **Serialize** any `Document` or `DocArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ - Use in **microservice** architecture: Send over **HTTP** or **gRPC** - Integrate seamlessly with **[FastAPI](https://github.com/tiangolo/fastapi/)** and **[Jina](https://github.com/jina-ai/jina/)** From 435075b0386c7e0935ed64c38a2c08be2fc96bf9 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 15:48:30 +0200 Subject: [PATCH 02/18] docs: emojify Signed-off-by: Johannes Messner --- README.md | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 2bd9bf17fdc..deed27c99c4 100644 --- a/README.md +++ b/README.md @@ -12,21 +12,14 @@

-> **Note** -> This introduction refers to version 2 of DocArray, a rewrite that is currently at the alpha stage. -> Not all features that are mentioned here are implemented yet. -> If you are looking for the version 2 implementation roadmap see [here](https://github.com/docarray/docarray/issues/780), -> for the (already released) version 1 of DocArray -> see [here](https://github.com/docarray/docarray). - DocArray is a library for **representing, sending and storing multi-modal data**, perfect for **Machine Learning applications**. DocArray handles your data while integrating seamlessly with the rest of your **Python and ML ecosystem**: -- DocArray has native compatibility for **NumPy**, **PyTorch** and **TensorFlow**, including for **model training use cases** -- DocArray is built on **Pydantic** and out-of-the-box compatible with **FastAPI** -- DocArray can store data in vector databases such as **Weaviate, Qdrant, ElasticSearch** as well as **HNSWLib** -- DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** +- :fire: DocArray has native compatibility for **NumPy**, **PyTorch** and **TensorFlow**, including for **model training use cases** +- :zap: DocArray is built on **Pydantic** and out-of-the-box compatible with **FastAPI** +- :package: DocArray can store data in vector databases such as **Weaviate, Qdrant, ElasticSearch** as well as **HNSWLib** +- :chains: DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** With that said, let's dig into the three pillars of DocArray: 1. [Represent](#represent) @@ -44,9 +37,9 @@ With that said, let's dig into the three pillars of DocArray: DocArray allows you to **represent your data**, in a ML-native way. 
This is useful for different use cases: -- You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them -- You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints -- You are **parsing data** for later use in your ML or DS applications +- :running_woman: You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them +- :cloud: You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints +- :card_index_dividers: You are **parsing data** for later use in your ML or DS applications > :bulb: **Coming from Pydantic?**: If you're currently using Pydantic for the use cases above, you should be happy to hear > that DocArray is built on top of, and fully compatible with, Pydantic! @@ -66,7 +59,7 @@ class MyDocument(BaseDoc): image_url: ImageUrl # could also be VideoUrl, AudioUrl, etc. image_tensor: Optional[ TorchTensor[1704, 2272, 3] - ] # could also be NdArray of TensorflowTensor + ] # could also be NdArray or TensorflowTensor embedding: Optional[TorchTensor] ``` @@ -119,8 +112,8 @@ That's why you can easily collect multiple `Documents`: When building or interacting with an ML system, usually you want to process multiple Documents (data points) at once. DocArray offers two data structures for this: -- **`DocVec`**: A vector of `Documents`. All tensors in the `Documents` are stacked up into a single tensor. Perfect for batch processing and use inside of ML models. -- **`DocList`**: A list of `Documents`. All tensors in the `Documents` are kept as-is. Perfect for streaming, re-ranking, and shuffling of data. +- **`DocVec`**: A vector of `Documents`. All tensors in the `Documents` are stacked up into a single tensor. 
**Perfect for batch processing and use inside of ML models**. +- **`DocList`**: A list of `Documents`. All tensors in the `Documents` are kept as-is. **Perfect for streaming, re-ranking, and shuffling of data**. Let's take a look at them, starting with `DocVec`: @@ -184,13 +177,16 @@ print(dl.url) # you can bulk access any other field, too And you can insert, remove, and append `Documents` to your `DocList`: ```python +# append dl.append( Image( url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", tensor=np.zeros((3, 224, 224)), ) ) +# delete del dl[0] +# insert dl.insert( 0, Image( From 812bb541bdb07b790b391285df2399874e32d8fe Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 15:50:25 +0200 Subject: [PATCH 03/18] docs: make collapsible Signed-off-by: Johannes Messner --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index deed27c99c4..d407ce785bd 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,10 @@ With that said, let's dig into the three pillars of DocArray: > - [Coming from FastAPI](#coming-from-fastapi) > - [Coming from a vector database](#coming-from-vector-database) + +
+ Click me + ## Represent DocArray allows you to **represent your data**, in a ML-native way. @@ -206,6 +210,7 @@ dl_2 = vec_2.stack() assert isinstance(dl_2, DocList) ``` +
## Send From c496f3db6633898aecdf8237d5a821f0cd751842 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 15:51:53 +0200 Subject: [PATCH 04/18] docs: fix collapsible Signed-off-by: Johannes Messner --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d407ce785bd..f9e8a7262e9 100644 --- a/README.md +++ b/README.md @@ -34,11 +34,11 @@ With that said, let's dig into the three pillars of DocArray: > - [Coming from a vector database](#coming-from-vector-database) -
- Click me - ## Represent +
+ Click to expand + DocArray allows you to **represent your data**, in a ML-native way. This is useful for different use cases: - :running_woman: You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them From 832c32dc331c549d01c3f5e213648df950423173 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 16:00:10 +0200 Subject: [PATCH 05/18] docs: moving stuff around Signed-off-by: Johannes Messner --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f9e8a7262e9..5a3bd30d4aa 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,6 @@ With that said, let's dig into the three pillars of DocArray: ## Represent -
- Click to expand - DocArray allows you to **represent your data**, in a ML-native way. This is useful for different use cases: - :running_woman: You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them @@ -49,7 +46,62 @@ This is useful for different use cases: > that DocArray is built on top of, and fully compatible with, Pydantic! > Also, we have [dedicated section](#coming-from-pydantic) just for you! -So let's see how you can represent your data with DocArray: +Put simply, DocArray lets you represent your data in a dataclass-like way, with ML as a first class citizen: + +```python +from docarray import BaseDoc +from docarray.typing import TorchTensor, ImageUrl + +# Define your data model +class MyDocument(BaseDoc): + description: str + image_url: ImageUrl # could also be VideoUrl, AudioUrl, etc. + image_tensor: TorchTensor[1704, 2272, 3] # you can express tensor shapes! + + +# Stack multiple documents, column-wise +from docarray import DocVec + +vec = DocVec[MyDocument]( + [ + MyDocument( + description="A cat", + image_url="https://example.com/cat.jpg", + image_tensor=torch.rand(1704, 2272, 3), + ), + MyDocument( + description="A dog", + image_url="https://example.com/dog.jpg", + image_tensor=torch.rand(1704, 2272, 3), + ), + ] +) +print(vec.image_tensor) + +# Or treat them like a list, row wise +from docarray import DocList + +dl = DocList[MyDocument]( + [ + MyDocument( + description="A cat", + image_url="https://example.com/cat.jpg", + image_tensor=torch.rand(1704, 2272, 3), + ), + MyDocument( + description="A dog", + image_url="https://example.com/dog.jpg", + image_tensor=torch.rand(1704, 2272, 3), + ), + ] +) +print(dl.image_tensor) +``` + +
+ Click for more details + +So let's take a closer look at how you can represent your data with DocArray: ```python from docarray import BaseDoc From e5c7ab6796ddfaf033db4d9066c46f2810393f54 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 16:00:17 +0200 Subject: [PATCH 06/18] docs: moving stuff around Signed-off-by: Johannes Messner --- README.md | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 5a3bd30d4aa..36f9115fa9c 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ class MyDocument(BaseDoc): image_tensor: TorchTensor[1704, 2272, 3] # you can express tensor shapes! -# Stack multiple documents, column-wise +# Stack multiple documents from docarray import DocVec vec = DocVec[MyDocument]( @@ -69,33 +69,10 @@ vec = DocVec[MyDocument]( image_url="https://example.com/cat.jpg", image_tensor=torch.rand(1704, 2272, 3), ), - MyDocument( - description="A dog", - image_url="https://example.com/dog.jpg", - image_tensor=torch.rand(1704, 2272, 3), - ), ] + * 1000 ) print(vec.image_tensor) - -# Or treat them like a list, row wise -from docarray import DocList - -dl = DocList[MyDocument]( - [ - MyDocument( - description="A cat", - image_url="https://example.com/cat.jpg", - image_tensor=torch.rand(1704, 2272, 3), - ), - MyDocument( - description="A dog", - image_url="https://example.com/dog.jpg", - image_tensor=torch.rand(1704, 2272, 3), - ), - ] -) -print(dl.image_tensor) ```
From 426d6ab85bc6dcf95548c6b4d5c41e7359093407 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 5 Apr 2023 16:02:12 +0200 Subject: [PATCH 07/18] docs: small tweaks Signed-off-by: Johannes Messner --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 36f9115fa9c..b81d815f777 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,11 @@ vec = DocVec[MyDocument]( ] * 1000 ) -print(vec.image_tensor) +print(vec.image_tensor.shape) # (1000, 1704, 2272, 3) ```
- Click for more details + **Click for more details** So let's take a closer look at how you can represent your data with DocArray: From e26af0f5ff5d15ae9c660fde794fd2c105bc40e4 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 13:34:06 +0200 Subject: [PATCH 08/18] docs: readme section for send and store Signed-off-by: Johannes Messner --- README.md | 150 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index b81d815f777..c8ea1d2a2b8 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,8 @@ With that said, let's dig into the three pillars of DocArray: ## Represent -DocArray allows you to **represent your data**, in a ML-native way. +DocArray allows you to **represent your data**, in an ML-native way. + This is useful for different use cases: - :running_woman: You are **training a model**, there are myriads of tensors of different shapes and sizes flying around, representing different _things_, and you want to keep a straight head about them - :cloud: You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints @@ -76,7 +77,7 @@ print(vec.image_tensor.shape) # (1000, 1704, 2272, 3) ```
- **Click for more details** + Click for more details So let's take a closer look at how you can represent your data with DocArray: @@ -243,50 +244,139 @@ assert isinstance(dl_2, DocList) ## Send -- **Serialize** any `Document` or `DocArray` into _protobuf_, _json_, _jsonschema_, _bytes_ or _base64_ -- Use in **microservice** architecture: Send over **HTTP** or **gRPC** -- Integrate seamlessly with **[FastAPI](https://github.com/tiangolo/fastapi/)** and **[Jina](https://github.com/jina-ai/jina/)** +DocArray allows you to **send your data**, in an ML-native way. + +This means there is native support for **Protobuf and gRPC**, in top of **HTTP** and serialization to JSON, JSONSchema, Base64, and Bytes. + +This is useful for different use cases: +- :cloud: You are **serving a model**, for example through **[Jina](https://github.com/jina-ai/jina/)** or **[FastAPI](https://github.com/tiangolo/fastapi/)** +- :spider_web: You **distribute your model** across machines and need to send your data between nodes +- :gear: You are building a **microservice** architecture and need to send your data between microservices + +> :bulb: **Coming from FatAPI?**: If you're currently using FatAPI for the use cases above, you should be happy to hear +> that DocArray is fully compatible with FatAPI! +> Also, we have [dedicated section](#coming-from-fastapi) just for you! 
+ +Whenever you want to send your data you need to serialize it, so let's take a look at how that works with DocArray: ```python -from docarray.documents import ImageDoc -from httpx import AsyncClient -import numpy as np +from docarray import BaseDoc +from docarray.typing import ImageTorchTensor -doc = ImageDoc(tensor=np.zeros((3, 224, 224))) +# model your data +class MyDocument(BaseDoc): + description: str + image: ImageTorchTensor[3, 224, 224] -# JSON over HTTP -async with AsyncClient(app=app, base_url="http://test") as ac: - response = await ac.post("/doc/", data=input_doc.json()) + +# create a Document +doc = MyDocument( + description="This is a description", + image=torch.zeros((3, 224, 224)), +) + +# serialize it! +proto = doc.to_protobuf() +base64 = doc.to_base64() +bytes_ = doc.to_bytes() +json = doc.json() +jsonschema = doc.jsonschema() + +# deserialize it! +doc_2 = MyDocument.from_protobuf(proto) +doc_3 = MyDocument.from_base64(base64) +doc_4 = MyDocument.from_bytes(bytes_) +doc_5 = MyDocument.parse_raw(json) ``` +Of course, serialization is not all you need. +So check out how DocArray integrates with FatAPI and Jina. TODO link to doc sections + + +## Store + +Once you've modelled your data, and maybe sent it around, usually you want to **store it** somewhere. +But fret not! DocArray has you covered! + +**Document Stores** let you, well, store your Documents, locally or remotely, all with the same user interface: +- :cd: **On disk** as a file in your local file system +- :bucket: On **[AWS S3](https://aws.amazon.com/de/s3/)** +- :cloud: On **[Jina AI Cloud](https://cloud.jina.ai/)** + +
+ See Document Store usage + +The Document Store interface lets you push and pull Documents to and from multiple data sources, all with the same user interface. + +As an example, let's take a look at how that would work with AWS S3 storage: + ```python -# (de)serialize from/to protobuf -Image.from_protobuf(doc.to_protobuf()) +from docarray import DocList +from docarray.documents import ImageDoc + +dl = DocList[ImageDoc]( + [ + ImageDoc( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + ) + for _ in range(100) + ] +) + +# push the DocList to S3 +dl.push('s3://my-bucket/my-documents', show_progress=True) + +# pull the DocList from S3 +dl_2 = DocList[ImageDoc].pull('s3://my-bucket/my-documents', show_progress=True) ``` +
-## Store -- Persist a `DocArray` using a **`DocumentStore`** -- Store your Documents in any supported (vector) database: **Elasticsearch**, **Qdrant**, **Weaviate**, **Redis**, **Milvus**, **ANNLite** or **SQLite** -- Leverage DocumentStores to **perform vector search on your multi-modal data** +**Document Indexes** let you index your Documents into a **vector database**, for efficient similarity-based retrieval. + +This is useful for: +- :left_speech_bubble: Augmenting **LLMs and Chatbots** with domain knowledge ([Retrieval Augmented Generation](https://arxiv.org/abs/2005.11401)) +- :mag: **Neural search** applications +- :bulb: **Recommender systems** + +Currently, DocArray Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[ElasticSearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come!. + +
+ See Document Index usage + +The Document Index interface lets you index and retrieve Documents from multiple vector databases, all with the same user interface. + +It supports ANN vector search, text search, filtering, and hybrid search. ```python -# NOTE: DocumentStores are not yet implemented in version 2 from docarray import DocList from docarray.documents import ImageDoc -from docarray.stores import DocumentStore -import numpy as np +from docarray.index import HnswDocumentIndex -da = DocList([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)]) -store = DocumentStore[ImageDoc]( - storage='qdrant' -) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocList into the DocumentStore -# find the 10 most similar images based on the 'embedding' field -match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10) +# create some data +dl = DocList[ImageDoc]( + [ + ImageDoc( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + embedding=np.random.random((128,)), + ) + for _ in range(100) + ] +) + +# create a Document Index +index = HnswDocumentIndex(work_dir='.') + +# index your data +index.index(dl) + +# find similar Document +query = dl[0] +results, scores = index.find(query, top_k=10, search_field='embedding') ``` -If you want to get a deeper understanding of DocArray v2, it is best to do so on the basis of your -use case and background: +
## Coming from DocArray From 6ce22a4bd60ce7ce9822ebc59380ffe71a797a4b Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 14:04:31 +0200 Subject: [PATCH 09/18] docs: re-arrange some stuff Signed-off-by: Johannes Messner --- README.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c8ea1d2a2b8..a62152b8f93 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,12 @@ DocArray is a library for **representing, sending and storing multi-modal data**, perfect for **Machine Learning applications**. +Those are the three pillars of DocArray, and you can check them out individually: + +1. [**Represent**](#represent) +2. [**Send**](#send) +3. [**Store**](#store) + DocArray handles your data while integrating seamlessly with the rest of your **Python and ML ecosystem**: - :fire: DocArray has native compatibility for **NumPy**, **PyTorch** and **TensorFlow**, including for **model training use cases** @@ -21,10 +27,6 @@ DocArray handles your data while integrating seamlessly with the rest of your ** - :package: DocArray can store data in vector databases such as **Weaviate, Qdrant, ElasticSearch** as well as **HNSWLib** - :chains: DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** -With that said, let's dig into the three pillars of DocArray: -1. [Represent](#represent) -2. [Send](#send) -3. [Store](#store) > :bulb: **Where are you coming from?**: Depending on your use case and background, there are different was to "get" DocArray. > You can navigate to the following section for an explanation that should fit your mindest: @@ -60,7 +62,7 @@ class MyDocument(BaseDoc): image_tensor: TorchTensor[1704, 2272, 3] # you can express tensor shapes! 
-# Stack multiple documents +# Stack multiple documents in a Document Vector from docarray import DocVec vec = DocVec[MyDocument]( @@ -246,7 +248,7 @@ assert isinstance(dl_2, DocList) DocArray allows you to **send your data**, in an ML-native way. -This means there is native support for **Protobuf and gRPC**, in top of **HTTP** and serialization to JSON, JSONSchema, Base64, and Bytes. +This means there is native support for **Protobuf and gRPC**, on top of **HTTP** and serialization to JSON, JSONSchema, Base64, and Bytes. This is useful for different use cases: - :cloud: You are **serving a model**, for example through **[Jina](https://github.com/jina-ai/jina/)** or **[FastAPI](https://github.com/tiangolo/fastapi/)** @@ -339,7 +341,7 @@ This is useful for: - :mag: **Neural search** applications - :bulb: **Recommender systems** -Currently, DocArray Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[ElasticSearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come!. +Currently, DocArray Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[ElasticSearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come!
See Document Index usage @@ -371,7 +373,7 @@ index = HnswDocumentIndex(work_dir='.') # index your data index.index(dl) -# find similar Document +# find similar Documents query = dl[0] results, scores = index.find(query, top_k=10, search_field='embedding') ``` From 88a1de58353b7984eff580b074f19fb5f8830642 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 14:09:59 +0200 Subject: [PATCH 10/18] docs: typos Signed-off-by: Johannes Messner --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a62152b8f93..d1d37ad01db 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ DocArray handles your data while integrating seamlessly with the rest of your ** - :chains: DocArray data can be sent as JSON over **HTTP** or as **Protobuf** over **gRPC** -> :bulb: **Where are you coming from?**: Depending on your use case and background, there are different was to "get" DocArray. +> :bulb: **Where are you coming from?** Depending on your use case and background, there are different was to "get" DocArray. > You can navigate to the following section for an explanation that should fit your mindest: > - [Coming from pure PyTorch or TensorFlow](#coming-from-torch-tf) > - [Coming from Pydantic](#coming-from-pydantic) @@ -45,7 +45,7 @@ This is useful for different use cases: - :cloud: You are **serving a model**, for example through FastAPI, and you want to specify your API endpoints - :card_index_dividers: You are **parsing data** for later use in your ML or DS applications -> :bulb: **Coming from Pydantic?**: If you're currently using Pydantic for the use cases above, you should be happy to hear +> :bulb: **Coming from Pydantic?** If you're currently using Pydantic for the use cases above, you should be happy to hear > that DocArray is built on top of, and fully compatible with, Pydantic! > Also, we have [dedicated section](#coming-from-pydantic) just for you! 
@@ -255,7 +255,7 @@ This is useful for different use cases: - :spider_web: You **distribute your model** across machines and need to send your data between nodes - :gear: You are building a **microservice** architecture and need to send your data between microservices -> :bulb: **Coming from FatAPI?**: If you're currently using FatAPI for the use cases above, you should be happy to hear +> :bulb: **Coming from FastAPI?** If you're currently using FatAPI for the use cases above, you should be happy to hear > that DocArray is fully compatible with FatAPI! > Also, we have [dedicated section](#coming-from-fastapi) just for you! From 39f96f7409a57ec506220e3b9412e9825fde56dc Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 14:14:05 +0200 Subject: [PATCH 11/18] docs: more typos Signed-off-by: Johannes Messner --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d1d37ad01db..0b188ae4dbb 100644 --- a/README.md +++ b/README.md @@ -255,8 +255,8 @@ This is useful for different use cases: - :spider_web: You **distribute your model** across machines and need to send your data between nodes - :gear: You are building a **microservice** architecture and need to send your data between microservices -> :bulb: **Coming from FastAPI?** If you're currently using FatAPI for the use cases above, you should be happy to hear -> that DocArray is fully compatible with FatAPI! +> :bulb: **Coming from FastAPI?** If you're currently using FastAPI for the use cases above, you should be happy to hear +> that DocArray is fully compatible with FastAPI! > Also, we have [dedicated section](#coming-from-fastapi) just for you! 
Whenever you want to send your data you need to serialize it, so let's take a look at how that works with DocArray: From f157dba0195f1b2f3a2d0bb197749971d4d4bd75 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 16:28:46 +0200 Subject: [PATCH 12/18] docs: make section collapsible Signed-off-by: Johannes Messner --- README.md | 247 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 141 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index 0b188ae4dbb..592d5e4b74c 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ DocArray handles your data while integrating seamlessly with the rest of your ** > :bulb: **Where are you coming from?** Depending on your use case and background, there are different was to "get" DocArray. > You can navigate to the following section for an explanation that should fit your mindest: -> - [Coming from pure PyTorch or TensorFlow](#coming-from-torch-tf) +> - [Coming from pure PyTorch or TensorFlow](#coming-from-pytorch) > - [Coming from Pydantic](#coming-from-pydantic) > - [Coming from FastAPI](#coming-from-fastapi) > - [Coming from a vector database](#coming-from-vector-database) @@ -380,9 +380,15 @@ results, scores = index.find(query, top_k=10, search_field='embedding')
-## Coming from DocArray +Depending on your background and use case, there are different ways for you to _get_ DocArray. +Choose your own adventure! -If you are already using DocArray, you will be familiar with its [dataclass API](https://docarray.jina.ai/fundamentals/dataclass/). +## Coming from old DocArray + +
+ Click to expand + +If you are using DocArray v<0.30.0, you will be familiar with its [dataclass API](https://docarray.jina.ai/fundamentals/dataclass/). _DocArray v2 is that idea, taken seriously._ Every `Document` is created through dataclass-like interface, courtesy of [Pydantic](https://pydantic-docs.helpmanual.io/usage/models/). @@ -392,20 +398,88 @@ This gives the following advantages: - **Multi-modality:** Easily store multiple modalities and multiple embeddings in the same Document - **Language agnostic:** At its core, Documents are just dictionaries. This makes it easy to create and send them from any language, not just Python. +You may also be familiar with our old Document Stores for vector DB integration. +They are now called **Document Indexes** and offer the following improvements (see [here](#store) for the new API): +- **Hybrid search:** You can now combine vector search with text search, and even filter by arbitrary fields +- **Production-ready:** The new Document Indexes are a much thinner wrapper around the various vector DB libraries, making them more robust and easier to maintain +- **Increased flexibility:** We strive to support any configuration or setting that you could perform through the DB's first-party client + +For now, Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[ElasticSearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come. + +
+ ## Coming from Pydantic -If you come from Pydantic, you can see Documents as juiced up models, and DocArray as a collection of goodies around them. +
+ Click to expand + +If you come from Pydantic, you can see DocArray Documents as juiced up Pydantic models, and DocArray as a collection of goodies around them. -- **ML focused types**: Tensor, TorchTensor, TFTensor, Embedding, ... +More specifically, we set out to **make Pydantic fit for the ML world** - not by replacing it, but by building on top of it! + +This means that you get the following benefits: +- **ML focused types**: Tensor, TorchTensor, Embedding, ..., including **tensor shape validation** +- Full compatibility with **FastAPI** +- **DocList** and **DocVec** generalize the idea of a model to a _sequence_ or _batch_ of models. Perfect for **use in ML models** and other batch processing tasks. - **Types that are alive**: ImageUrl can `.load()` a URL to image tensor, TextUrl can load and tokenize text documents, etc. -- **Pre-built Documents** for different data modalities: Image, Text, 3DMesh, Video, Audio and more. Note that all of these will be valid Pydantic models! -- The concepts of **DocArray and DocumentStore** - Cloud-ready: Serialization to **Protobuf** for use with microservices and **gRPC** -- Support for **vector search functionalities**, such as `find()` and `embed()` +- **Pre-built multi-modal Documents** for different data modalities: Image, Text, 3DMesh, Video, Audio and more. Note that all of these are valid Pydantic models! +- **Document Stores** and **Document Indexes** let you store your data and retrieve it using **vector search** + +The most obvious advantage here is **first-class support for ML centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. 
+ +This includes handy features such as validating the shape of a tensor: + +```python +from docarray import BaseDoc +from docarray.typing import TorchTensor +import torch + + +class MyDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + +doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works +doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping +doc = MyDoc(tensor=torch.zeros(224)) # fails validation + + +class Image(BaseDoc): + tensor: TorchTensor[3, 'x', 'x'] + + +Image(tensor=torch.zeros(3, 224, 224)) # works +Image( + tensor=torch.zeros(3, 64, 128) +) # fails validation because second dimension does not match third +Image( + tensor=torch.zeros(4, 224, 224) +) # fails validation because of the first dimension +Image( + tensor=torch.zeros(3, 64) +) # fails validation because it does not have enough dimensions +``` + +
+ ## Coming from PyTorch -DocArray can be used directly inside ML models to handle and represent multi-modal data. This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, and provides a (FastAPI-compatible) schema that eases the transition between model training and model serving. +
+ Click to expand + +If you come from PyTorch, you can see DocArray mainly as a way of _organizing your data as it flows through your model_. + +It offers you several advantages: +- Express **tensors shapes in type hints** +- **Group tensors that belong to the same object**, e.g. an audio track and an image +- **Go directly to deployment**, by re-using your data model as a [FastAPI](https://fastapi.tiangolo.com/) or [Jina](https://github.com/jina-ai/jina) API schema +- Connect model components between **microservices**, using Protobuf and gRPC + +DocArray can be used directly inside ML models to handle and represent multi-modal data. +This allows you to reason about your data using DocArray's abstractions deep inside of `nn.Module`, +and provides a (FastAPI-compatible) schema that eases the transition between model training and model serving. To see the effect of this, let's first observe a vanilla PyTorch implementation of a tri-modal ML model: @@ -487,11 +561,17 @@ class MyPodcastModel(nn.Module): Looks much better, doesn't it? You instantly win in code readability and maintainability. And for the same price you can turn your PyTorch model into a FastAPI app and reuse your Document -schema definition (see below). Everything is handled in a pythonic manner by relying on type hints. +schema definition (see [below](#coming-from-fastapi)). Everything is handled in a pythonic manner by relying on type hints. + +
+ ## Coming from TensorFlow -Similar to the PyTorch approach, you can also use DocArray with TensorFlow to handle and represent multi-modal data inside your ML model. +
+ Click to expand + +Similar to the [PyTorch approach](#coming-from-pytorch), you can also use DocArray with TensorFlow to handle and represent multi-modal data inside your ML model. First off, to use DocArray with TensorFlow we first need to install it as follows: @@ -532,9 +612,22 @@ class MyPodcastModel(tf.keras.Model): return inputs ``` +
+ + ## Coming from FastAPI -Documents are Pydantic Models (with a twist), and as such they are fully compatible with FastAPI: +
+ Click to expand + +Documents are Pydantic Models (with a twist), and as such they are fully compatible with FastAPI! + +But why should you use them, and not the Pydantic models you already know and love? +Good question! +- Because of the ML-first features, types and validations, [here](#coming-from-pydantic) +- Because DocArray can act as an [ORM for vector databases](#coming-from-a-vector-database), similar to what SQLModel does for SQL databases + +And to seal the deal, let us show you how easily Documents slot into your FastAPI app: ```python import numpy as np @@ -576,119 +669,63 @@ async with AsyncClient(app=app, base_url="http://test") as ac: resp_redoc = await ac.get("/redoc") ``` -The big advantage here is **first-class support for ML centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. - -This includes handy features such as validating the shape of a tensor: - -```python -from docarray import BaseDoc -from docarray.typing import TorchTensor -import torch - +Just like a vanilla Pydantic model! -class MyDoc(BaseDoc): - tensor: TorchTensor[3, 224, 224] - - -doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works -doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping -doc = MyDoc(tensor=torch.zeros(224)) # fails validation - - -class Image(BaseDoc): - tensor: TorchTensor[3, 'x', 'x'] +
-Image(tensor=torch.zeros(3, 224, 224)) # works -Image( - tensor=torch.zeros(3, 64, 128) -) # fails validation because second dimension does not match third -Image( - tensor=torch.zeros(4, 224, 224) -) # fails validation because of the first dimension -Image( - tensor=torch.zeros(3, 64) -) # fails validation because it does not have enough dimensions -``` - ## Coming from a vector database +
+ Click to expand + If you came across DocArray as a universal vector database client, you can best think of it as **a new kind of ORM for vector databases**. DocArray's job is to take multi-modal, nested and domain-specific data and to map it to a vector database, store it there, and thus make it searchable: ```python -# NOTE: DocumentStores are not yet implemented in version 2 -from docarray import DocList, BaseDoc -from docarray.stores import DocumentStore -from docarray.documents import ImageDoc, TextDoc -import numpy as np - - -class MyDoc(BaseDoc): - image: ImageDoc - text: TextDoc - description: str - - -def _random_my_doc(): - return MyDoc( - image=ImageDoc(embedding=np.random.random((256,))), - text=TextDoc(embedding=np.random.random((128,))), - description='this is a random document', - ) - - -da = DocList([_random_my_doc() for _ in range(1000)]) # create some data -store = DocumentStore[MyDoc]( - storage='qdrant' -) # create a DocumentStore with Qdrant as backend -store.insert(da) # insert the DocArray into the DocumentStore +from docarray import DocList +from docarray.documents import ImageDoc +from docarray.index import HnswDocumentIndex -# find the 10 most similar images based on the image embedding field -match = store.find( - ImageDoc(embedding=np.zeros((256,))), field='image__embedding', top_k=10 -) -# find the 10 most similar images based on the image embedding field -match = store.find( - ImageDoc(embedding=np.zeros((128,))), field='text__embedding', top_k=10 +# create some data +dl = DocList[ImageDoc]( + [ + ImageDoc( + url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg", + tensor=np.zeros((3, 224, 224)), + embedding=np.random.random((128,)), + ) + for _ in range(100) + ] ) -``` -## Enable logging +# create a Document Index +index = HnswDocumentIndex(work_dir='.') -You can see more logs by setting the log level to `DEBUG` or `INFO`: +# index your data +index.index(dl) -```python -from pydantic import Field -from docarray 
import BaseDoc -from docarray.index import HnswDocumentIndex -from docarray.typing import NdArray -import logging +# find similar Documents +query = dl[0] +results, scores = index.find(query, top_k=10, search_field='embedding') +``` -# get the logger and set the log level to DEBUG -logging.getLogger('docarray').setLevel(logging.DEBUG) +Currently, DocArray supports the following vector databases: +- [Weaviate](https://www.weaviate.io/) +- [Qdrant](https://qdrant.tech/) +- [Elasticsearch](https://www.elastic.co/elasticsearch/) v8 and v7 +- [HNSWlib](https://github.com/nmslib/hnswlib) as a local-first alternative +An integration of [OpenSearch](https://opensearch.org/) is currently in progress. -# define a simple document and create a document index -class SimpleDoc(BaseDoc): - vector: NdArray = Field(dim=10) +Legacy versions of DocArray also support [Redis](https://redis.io/) and [Milvus](https://milvus.io/), but these are not yet supported in the current version. +Of course this is only one thing that DocArray can do, so we encourage you to check out the rest of this readme! -doc_store = HnswDocumentIndex[SimpleDoc](work_dir='temp_path/') -``` +
-```console -INFO - docarray - DB config created -INFO - docarray - Runtime config created -DEBUG - docarray - Working directory set to temp_path/ -WARNING - docarray - No index was created for `id` as it does not have a config -INFO - docarray - Created a new index for column `vector` -DEBUG - docarray - DB path set to temp_path/docs_sqlite.db -INFO - docarray - Connection to DB has been established -INFO - docarray - HnswDocumentIndex[SimpleDoc] has been initialized -``` ## Install the alpha @@ -706,10 +743,8 @@ pip install "git+https://github.com/docarray/docarray@feat-rewrite-v2#egg=docarr ## See also +- [Documentation](https://docarray-v2--jina-docs.netlify.app/) - [Join our Discord server](https://discord.gg/WaMp6PVPgR) -- [V2 announcement blog post](https://github.com/docarray/notes/blob/main/blog/01-announcement.md) - [Donation to Linux Foundation AI&Data blog post](https://jina.ai/news/donate-docarray-lf-for-inclusive-standard-multimodal-data-model/) -- [Submit ideas, feature requests, and discussions](https://github.com/docarray/docarray/discussions) -- [v2 Documentation](https://docarray-v2--jina-docs.netlify.app/) - ["Legacy" DocArray github page](https://github.com/docarray/docarray) - ["Legacy" DocArray documentation](https://docarray.jina.ai/) From a420c536477de0568307618b63da8b584be3d724 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 6 Apr 2023 16:45:47 +0200 Subject: [PATCH 13/18] docs: typo Signed-off-by: Johannes Messner --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 592d5e4b74c..7c68ffe06bc 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ dl = DocList[Image]( # the DocList is parametrized by your personal schema! 
You can still bulk access the fields of your `Document`: ```python -tensors = dl.tensor # gets all the tensors in the DocVec +tensors = dl.tensor # gets all the tensors in the DocList print(type(tensors)) # as a list of tensors print(dl.url) # you can bulk access any other field, too ``` From 13a54cfcaea8abf82f919c8bfca9ba742438187b Mon Sep 17 00:00:00 2001 From: samsja Date: Thu, 6 Apr 2023 17:14:46 +0200 Subject: [PATCH 14/18] feat: add readme testing Signed-off-by: samsja --- tests/documentation/test_docs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 4eceb252f89..8d3ba31cb41 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -11,3 +11,7 @@ ) def test_files_good(fpath): check_md_file(fpath=fpath, memory=True) + + +def test_readme(): + check_md_file(fpath='README.md', memory=True) From 4e9941e145a943d3e035caad0049eab70d33170e Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 11 Apr 2023 12:04:58 +0200 Subject: [PATCH 15/18] fix: fix most of readme pb Signed-off-by: samsja --- README.md | 101 ++++++++++++++++++++++--------- tests/documentation/test_docs.py | 2 - 2 files changed, 74 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 7c68ffe06bc..847131fefb7 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ Put simply, DocArray lets you represent your data in a dataclass-like way, with ```python from docarray import BaseDoc from docarray.typing import TorchTensor, ImageUrl +import torch # Define your data model class MyDocument(BaseDoc): @@ -73,9 +74,9 @@ vec = DocVec[MyDocument]( image_tensor=torch.rand(1704, 2272, 3), ), ] - * 1000 + * 10 ) -print(vec.image_tensor.shape) # (1000, 1704, 2272, 3) +print(vec.image_tensor.shape) # (10, 1704, 2272, 3) ```
@@ -87,7 +88,7 @@ So let's take a closer look at how you can represent your data with DocArray: from docarray import BaseDoc from docarray.typing import TorchTensor, ImageUrl from typing import Optional - +import torch # Define your data model class MyDocument(BaseDoc): @@ -114,9 +115,15 @@ doc = MyDocument( doc.image_tensor = doc.image_url.load() # Compute embedding with any model of your choice + + +def clip_image_encoder(image_tensor: TorchTensor) -> TorchTensor: # dummy function + return torch.rand(512) + + doc.embedding = clip_image_encoder(doc.image_tensor) -print(doc.embedding.shape) +print(doc.embedding.shape) # torch.Size([512]) ``` ### Compose nested Documents @@ -235,10 +242,10 @@ dl.insert( And you can seamlessly switch between `DocVec` and `DocList`: ```python -vec_2 = dl.unstack() +vec_2 = dl.stack() assert isinstance(vec_2, DocVec) -dl_2 = vec_2.stack() +dl_2 = vec_2.unstack() assert isinstance(dl_2, DocList) ``` @@ -264,6 +271,7 @@ Whenever you want to send your data you need to serialize it, so let's take a lo ```python from docarray import BaseDoc from docarray.typing import ImageTorchTensor +import torch # model your data class MyDocument(BaseDoc): @@ -279,14 +287,11 @@ doc = MyDocument( # serialize it! proto = doc.to_protobuf() -base64 = doc.to_base64() bytes_ = doc.to_bytes() json = doc.json() -jsonschema = doc.jsonschema() # deserialize it! doc_2 = MyDocument.from_protobuf(proto) -doc_3 = MyDocument.from_base64(base64) doc_4 = MyDocument.from_bytes(bytes_) doc_5 = MyDocument.parse_raw(json) ``` @@ -315,6 +320,7 @@ As an example, let's take a look at how that would work with AWS S3 storage: ```python from docarray import DocList from docarray.documents import ImageDoc +import numpy as np dl = DocList[ImageDoc]( [ @@ -351,9 +357,18 @@ The Document Index interface lets you index and retrieve Documents from multiple It supports ANN vector search, text search, filtering, and hybrid search. 
```python -from docarray import DocList -from docarray.documents import ImageDoc +from docarray import DocList, BaseDoc from docarray.index import HnswDocumentIndex +import numpy as np + +from docarray.typing import ImageUrl, ImageTensor, NdArray + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: ImageTensor + embedding: NdArray[128] + # create some data dl = DocList[ImageDoc]( @@ -368,14 +383,15 @@ dl = DocList[ImageDoc]( ) # create a Document Index -index = HnswDocumentIndex(work_dir='.') +index = HnswDocumentIndex[ImageDoc](work_dir='/tmp/test_index') + # index your data index.index(dl) # find similar Documents query = dl[0] -results, scores = index.find(query, top_k=10, search_field='embedding') +results, scores = index.find(query, limit=10, search_field='embedding') ```

@@ -442,7 +458,13 @@ class MyDoc(BaseDoc):
 
 doc = MyDoc(tensor=torch.zeros(3, 224, 224))  # works
 doc = MyDoc(tensor=torch.zeros(224, 224, 3))  # works by reshaping
-doc = MyDoc(tensor=torch.zeros(224))  # fails validation
+
+try:
+    doc = MyDoc(tensor=torch.zeros(224))  # fails validation
+except Exception as e:
+    print(e)
+    # tensor
+    # Cannot reshape tensor of shape (224,) to shape (3, 224, 224) (type=value_error)
 
 
 class Image(BaseDoc):
@@ -450,15 +472,30 @@ class Image(BaseDoc):
 
 
 Image(tensor=torch.zeros(3, 224, 224))  # works
-Image(
-    tensor=torch.zeros(3, 64, 128)
-)  # fails validation because second dimension does not match third
-Image(
-    tensor=torch.zeros(4, 224, 224)
-)  # fails validation because of the first dimension
-Image(
-    tensor=torch.zeros(3, 64)
-)  # fails validation because it does not have enough dimensions
+
+try:
+    Image(
+        tensor=torch.zeros(3, 64, 128)
+    )  # fails validation because second dimension does not match third
+except Exception as e:
+    print(e)
+
+
+try:
+    Image(
+        tensor=torch.zeros(4, 224, 224)
+    )  # fails validation because of the first dimension
+except Exception as e:
+    print(e)
+    # Tensor shape mismatch. Expected(3, 'x', 'x'), got(4, 224, 224)(type=value_error)
+
+try:
+    Image(
+        tensor=torch.zeros(3, 64)
+    )  # fails validation because it does not have enough dimensions
+except Exception as e:
+    print(e)
+    # Tensor shape mismatch. Expected (3, 'x', 'x'), got (3, 64) (type=value_error)
 ```
@@ -685,9 +722,18 @@ DocArray's job is to take multi-modal, nested and domain-specific data and to ma store it there, and thus make it searchable: ```python -from docarray import DocList -from docarray.documents import ImageDoc +from docarray import DocList, BaseDoc from docarray.index import HnswDocumentIndex +import numpy as np + +from docarray.typing import ImageUrl, ImageTensor, NdArray + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: ImageTensor + embedding: NdArray[128] + # create some data dl = DocList[ImageDoc]( @@ -702,14 +748,15 @@ dl = DocList[ImageDoc]( ) # create a Document Index -index = HnswDocumentIndex(work_dir='.') +index = HnswDocumentIndex[ImageDoc](work_dir='/tmp/test_index') + # index your data index.index(dl) # find similar Documents query = dl[0] -results, scores = index.find(query, top_k=10, search_field='embedding') +results, scores = index.find(query, limit=10, search_field='embedding') ``` Currently, DocArray supports the following vector databases: diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 8d3ba31cb41..6e2112db0fc 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -4,8 +4,6 @@ from mktestdocs import check_md_file -# @pytest.mark.parametrize('fpath', pathlib.Path("docs").glob("**/*.md"), ids=str) -# to use later @pytest.mark.parametrize( 'fpath', pathlib.Path('docs/user_guide').glob('**/*.md'), ids=str ) From c0f243fff94b2a07bc2666f478c18de6a0f9958c Mon Sep 17 00:00:00 2001 From: samsja Date: Tue, 11 Apr 2023 12:36:30 +0200 Subject: [PATCH 16/18] fix: fix most of readme pb Signed-off-by: samsja --- README.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 847131fefb7..24b21e53541 100644 --- a/README.md +++ b/README.md @@ -523,14 +523,19 @@ To see the effect of this, let's first observe a vanilla PyTorch implementation ```python import torch from torch import nn +import torch + 

+
+def encoder(x):
+    return torch.rand(512)
 
 
 class MyMultiModalModel(nn.Module):
     def __init__(self):
         super().__init__()
-        self.audio_encoder = AudioEncoder()
-        self.image_encoder = ImageEncoder()
-        self.text_encoder = TextEncoder()
+        self.audio_encoder = encoder
+        self.image_encoder = encoder
+        self.text_encoder = encoder
 
     def forward(self, text_1, text_2, image_1, image_2, audio_1, audio_2):
         embedding_text_1 = self.text_encoder(text_1)
@@ -560,10 +565,14 @@ So, now let's see what the same code looks like with DocArray:
 from docarray import DocList, BaseDoc
 from docarray.documents import ImageDoc, TextDoc, AudioDoc
 from docarray.typing import TorchTensor
-
+from torch import nn
 import torch
 
 
+def encoder(x):
+    return torch.rand(512)
+
+
 class Podcast(BaseDoc):
     text: TextDoc
     image: ImageDoc
@@ -578,9 +587,9 @@ class PairPodcast(BaseDoc):
 class MyPodcastModel(nn.Module):
     def __init__(self):
         super().__init__()
-        self.audio_encoder = AudioEncoder()
-        self.image_encoder = ImageEncoder()
-        self.text_encoder = TextEncoder()
+        self.audio_encoder = encoder
+        self.image_encoder = encoder
+        self.text_encoder = encoder
 
     def forward_podcast(self, docs: DocList[Podcast]) -> DocList[Podcast]:
         docs.audio.embedding = self.audio_encoder(docs.audio.tensor)
@@ -674,7 +683,7 @@ from httpx import AsyncClient
 from docarray import BaseDoc
 from docarray.documents import ImageDoc
 from docarray.typing import NdArray
-from docarray.base_doc import DocumentResponse
+from docarray.base_doc import DocArrayResponse
 
 
 class InputDoc(BaseDoc):
@@ -691,7 +700,7 @@ input_doc = InputDoc(img=ImageDoc(tensor=np.zeros((3, 224, 224))))
 app = FastAPI()
 
 
-@app.post("/doc/", response_model=OutputDoc, response_class=DocumentResponse)
+@app.post("/doc/", response_model=OutputDoc, response_class=DocArrayResponse)
 async def create_item(doc: InputDoc) -> OutputDoc:
     ## call my fancy model to generate the embeddings
     doc = OutputDoc(

From c05bfa5a206ec4f476e57257ab56d3c56574bfea Mon Sep 17 00:00:00 2001
From: 
samsja Date: Tue, 11 Apr 2023 12:57:46 +0200 Subject: [PATCH 17/18] fix: fix readme pb Signed-off-by: samsja --- README.md | 4 +-- tests/documentation/test_docs.py | 45 ++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 24b21e53541..9988b49f625 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ import numpy as np class Image(BaseDoc): url: ImageUrl - tensor: AnyTensor # this allows torch, numpy, and tensorflow tensors + tensor: AnyTensor # this allows torch, numpy, and tensor flow tensors vec = DocVec[Image]( # the DocVec is parametrized by your personal schema! @@ -757,7 +757,7 @@ dl = DocList[ImageDoc]( ) # create a Document Index -index = HnswDocumentIndex[ImageDoc](work_dir='/tmp/test_index') +index = HnswDocumentIndex[ImageDoc](work_dir='/tmp/test_index2') # index your data diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 6e2112db0fc..6ca32d7700f 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -1,7 +1,46 @@ import pathlib import pytest -from mktestdocs import check_md_file +from mktestdocs import grab_code_blocks +from mktestdocs.__main__ import _executors, check_raw_string + + +def check_raw_file_full(raw, lang="python", keyword_ignore=[]): + if lang not in _executors: + raise LookupError( + f"{lang} is not a supported language to check\n" + "\tHint: you can add support for any language by using register_executor" + ) + executor = _executors[lang] + all_code = "" + add_code_block = True + + for b in grab_code_blocks(raw, lang=lang): + add_code_block = True + for keyword in keyword_ignore: + if keyword in b: + add_code_block = False + break + if add_code_block: + all_code = f"{all_code}\n{b}" + executor(all_code) + + +def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): + """ + NOTE: copy paste from mktestdocs.__main__ and add the keyword ignore + Given a markdown file, parse the 
contents for python code blocks
+    and check that each independent block does not cause an error.
+
+    Arguments:
+        fpath: path to markdown file
+        memory: whether or not previous code-blocks should be remembered
+    """
+    text = pathlib.Path(fpath).read_text()
+    if not memory:
+        check_raw_string(text, lang=lang)
+    else:
+        check_raw_file_full(text, lang=lang, keyword_ignore=keyword_ignore)
 
 
 @pytest.mark.parametrize(
@@ -12,4 +51,6 @@ def test_files_good(fpath):
 
 
 def test_readme():
-    check_md_file(fpath='README.md', memory=True)
+    check_md_file(
+        fpath='README.md', memory=True, keyword_ignore=['tensorflow', 'fastapi', 'push']
+    )

From bdd33569e8254587a9057199112634dc0499d43e Mon Sep 17 00:00:00 2001
From: samsja
Date: Tue, 11 Apr 2023 13:03:17 +0200
Subject: [PATCH 18/18] fix: remove todo

Signed-off-by: samsja
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9988b49f625..c0d2cd5edee 100644
--- a/README.md
+++ b/README.md
@@ -297,7 +297,7 @@ doc_5 = MyDocument.parse_raw(json)
 ```
 
 Of course, serialization is not all you need.
-So check out how DocArray integrates with FatAPI and Jina. TODO link to doc sections
+So check out how DocArray integrates with FastAPI and Jina.
 
 ## Store