-The data structure for unstructured multimodal data
+The data structure for multimodal data
+print(doc.embedding.shape) # torch.Size([512])
+```
-
-
- |
-+### Compose nested Documents + +Of course you can compose Documents into a nested structure: ```python -from docarray import dataclass, Document -from docarray.typing import Image, Text, JSON +from docarray import BaseDoc +from docarray.documents import ImageDoc, TextDoc +import numpy as np -@dataclass -class WPArticle: - banner: Image - headline: Text - meta: JSON +class MultiModalDocument(BaseDoc): + image_doc: ImageDoc + text_doc: TextDoc -a = WPArticle( - banner='https://.../cat-dog-flight.png', - headline='Everything to know about flying with pets, ...', - meta={ - 'author': 'Nathan Diller', - 'Column': 'By the Way - A Post Travel Destination', - }, +doc = MultiModalDocument( + image_doc=ImageDoc(tensor=np.zeros((3, 224, 224))), text_doc=TextDoc(text='hi!') ) - -d = Document(a) ``` - | -
| left/00018.jpg | -right/00018.jpg | -left/00131.jpg | -right/00131.jpg | -
|---|---|---|---|
![]() |
- ![]() |
- ![]() |
- ![]() |
-
| Pull from Cloud | -Download, unzip, load from local | -
|---|---|
| +You may also be familiar with our old Document Stores for vector DB integration. +They are now called **Document Indexes** and offer the following improvements (see [here](#store) for the new API): +- **Hybrid search:** You can now combine vector search with text search, and even filter by arbitrary fields +- **Production-ready:** The new Document Indexes are a much thinner wrapper around the various vector DB libraries, making them more robust and easier to maintain +- **Increased flexibility:** We strive to support any configuration or setting that you could perform through the DB's first-party client -```python -right_da = ( - DocumentArray.pull('demo-rightda', show_progress=True) - .apply(preproc) - .embed(model, device='cuda')[:1000] -) -``` - - | -
+For now, Document Indexes support **[Weaviate](https://weaviate.io/)**, **[Qdrant](https://qdrant.tech/)**, **[Elasticsearch](https://www.elastic.co/)**, and **[HNSWLib](https://github.com/nmslib/hnswlib)**, with more to come.
+
+
+
+## Coming from Pydantic
+
+
 Click to expand+ +If you come from Pydantic, you can see DocArray Documents as juiced-up Pydantic models, and DocArray as a collection of goodies around them. + +More specifically, we set out to **make Pydantic fit for the ML world** - not by replacing it, but by building on top of it! + +This means that you get the following benefits: +- **ML-focused types**: Tensor, TorchTensor, Embedding, ..., including **tensor shape validation** +- Full compatibility with **FastAPI** +- **DocList** and **DocVec** generalize the idea of a model to a _sequence_ or _batch_ of models. Perfect for **use in ML models** and other batch processing tasks. +- **Types that are alive**: ImageUrl can `.load()` a URL to an image tensor, TextUrl can load and tokenize text documents, etc. +- Cloud-ready: Serialization to **Protobuf** for use with microservices and **gRPC** +- **Pre-built multi-modal Documents** for different data modalities: Image, Text, 3DMesh, Video, Audio and more. Note that all of these are valid Pydantic models! +- **Document Stores** and **Document Indexes** let you store your data and retrieve it using **vector search** + +The most obvious advantage here is **first-class support for ML-centric data**, such as {Torch, TF, ...}Tensor, Embedding, etc. 
+ +This includes handy features such as validating the shape of a tensor: ```python -right_da = ( - DocumentArray.from_files('right/*.jpg')[:1000] - .apply(preproc) - .embed(model, device='cuda') -) +from docarray import BaseDoc +from docarray.typing import TorchTensor +import torch + + +class MyDoc(BaseDoc): + tensor: TorchTensor[3, 224, 224] + + +doc = MyDoc(tensor=torch.zeros(3, 224, 224)) # works +doc = MyDoc(tensor=torch.zeros(224, 224, 3)) # works by reshaping + +try: + doc = MyDoc(tensor=torch.zeros(224)) # fails validation +except Exception as e: + print(e) + # tensor + # Cannot reshape tensor of shape (224,) to shape (3, 224, 224) (type=value_error) + + +class Image(BaseDoc): + tensor: TorchTensor[3, 'x', 'x'] + + +Image(tensor=torch.zeros(3, 224, 224)) # works + +try: + Image( + tensor=torch.zeros(3, 64, 128) + ) # fails validation because second dimension does not match third +except Exception as e: + print(e) + + +try: + Image( + tensor=torch.zeros(4, 224, 224) + ) # fails validation because of the first dimension +except Exception as e: + print(e) + # Tensor shape mismatch. Expected (3, 'x', 'x'), got (4, 224, 224) (type=value_error) + +try: + Image( + tensor=torch.zeros(3, 64) + ) # fails validation because it does not have enough dimensions +except Exception as e: + print(e) + # Tensor shape mismatch. Expected (3, 'x', 'x'), got (3, 64) (type=value_error) ``` - |
-
-{%- endblock %}
\ No newline at end of file
diff --git a/docs/_templates/sidebar/brand.html b/docs/_templates/sidebar/brand.html
deleted file mode 100644
index 4e9d09f841a..00000000000
--- a/docs/_templates/sidebar/brand.html
+++ /dev/null
@@ -1,48 +0,0 @@
-
- {% block brand_content %}
- {%- if logo_url %}
-
- {%- endif %}
- {%- if theme_light_logo and theme_dark_logo %}
-
- {%- endif %}
- {% if not theme_sidebar_hide_name %}
-
- {%- endif %}
- {% endblock brand_content %}
-
-
-
-Oops, we couldn't find that page.
-You can try "asking our docs" on the right corner of the page to find answer.
-Otherwise, please create a Github issue and one of our team will respond.
- -''', -} -notfound_no_urls_prefix = True - -apidoc_module_dir = repo_dir -apidoc_output_dir = 'api' -apidoc_excluded_paths = ['tests', 'legacy', 'hub', 'toy*', 'setup.py'] -apidoc_separate_modules = True -apidoc_extra_args = ['-t', 'template/'] -autodoc_member_order = 'bysource' -autodoc_mock_imports = ['argparse', 'numpy', 'np', 'tensorflow', 'torch', 'scipy'] -autoclass_content = 'both' -set_type_checking_flag = False -html_last_updated_fmt = '' -nitpicky = True -nitpick_ignore = [('py:class', 'type')] -linkcheck_ignore = [ - # Avoid link check on local uri - 'http://0.0.0.0:*', - 'pods/encode.yml', - 'https://github.com/jina-ai/docarray/commit/*', - '.github/*', - 'extra-requirements.txt', - 'fastentrypoints.py' '../../101', - '../../102', - 'http://www.twinsun.com/tz/tz-link.htm', # Broken link from pytz library - 'https://urllib3.readthedocs.io/en/latest/contrib.html#google-app-engine', # Broken link from urllib3 library - 'https://linuxize.com/post/how-to-add-swap-space-on-ubuntu-20-04/', - # This link works but gets 403 error on linkcheck -] -linkcheck_timeout = 20 -linkcheck_retries = 2 -linkcheck_anchors = False - -ogp_site_url = 'https://docarray.jina.ai/' -ogp_image = 'https://docarray.jina.ai/_static/banner.png' -ogp_use_first_image = True -ogp_description_length = 300 -ogp_type = 'website' -ogp_site_name = f'DocArray {os.environ.get("SPHINX_MULTIVERSION_VERSION", version)} Documentation' - -ogp_custom_meta_tags = [ - '', - '', - '', - '', - '', - ''' - - - - - - ''', -] - - -def add_server_address(app): - # This makes variable `server_address` available to docbot.js - server_address = app.config['server_address'] - js_text = "var server_address = '%s';" % server_address - app.add_js_file(None, body=js_text) - -def configure_qa_bot_ui(app): - # This sets the server address to| hello.wav | -olleh.wav | -
|---|---|
| - | - |