From 7edbb6b544ce1bf617ee952630657dfcef207775 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Sun, 9 Apr 2023 11:46:59 +0200 Subject: [PATCH 01/20] docs: add storing with file Signed-off-by: nan-wang Signed-off-by: anna-charlotte --- docs/user_guide/storing/store_file.md | 53 +++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 docs/user_guide/storing/store_file.md diff --git a/docs/user_guide/storing/store_file.md b/docs/user_guide/storing/store_file.md new file mode 100644 index 00000000000..03711342ce4 --- /dev/null +++ b/docs/user_guide/storing/store_file.md @@ -0,0 +1,53 @@ +# Store +[DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using `push()` and `pull()` functions. Under the hood, +[DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. You can store your `Doc` on-disk. Alternatively, you can upload to [AWS S3](https://aws.amazon.com/s3/) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). + +# Store on-disk +When you want to use your `DocList` in another place, you can use the `push()` function to push the `DocList` to one place and later use the `pull()` function to pull its content back. + +## Push & pull +To use the store locally, you need to pass a local file path to the function starting with `file://`. + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] + +dl = DocList[SimpleDoc]() +dl.extend([SimpleDoc(text=f'doc {i}') for i in range(8)]) +dl.push('file:///Users/docarray/tmp/simple_dl') + +dl_pull = DocList[SimpleDoc].pull('file:///Users/docarray/tmp/simple_dl') +``` + +Under `/Users/docarray/tmp/`, there is a file with the name of `simple_dl.docs` being created to store the `DocList`. +```output +tmp +└── simple_dl.docs +``` + +## Push & Pull with streaming +When you have a large amount of `Doc` to push and pull, you could use the streaming function. 
`push_stream()` and `pull_stream()` can help you to stream the `DocList` in order to save the memory usage. You set multiple `DocList` to pull from the same source as well. + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] + +DocList[SimpleDoc].push_stream(iter(store_docs), 'file:///Users/docarray/tmp/dl_stream') +dl_pull_stream_1 = DocList[SimpleDoc].pull_stream('file:///Users/docarray/tmp/dl_stream') +dl_pull_stream_2 = DocList[SimpleDoc].pull_stream('file:///Users/docarray/tmp/dl_stream') +for d1, d2 in zip(dl_pull_stream_1, dl_pull_stream_2): + print(f'get {d1}, get {d2}') +``` + From 80c3b2944d5717fd6c200a1946729276732a370e Mon Sep 17 00:00:00 2001 From: nan-wang Date: Mon, 10 Apr 2023 11:10:35 +0200 Subject: [PATCH 02/20] docs: add docs for the S3 store Signed-off-by: nan-wang Signed-off-by: anna-charlotte --- docs/user_guide/storing/store_file.md | 2 +- docs/user_guide/storing/store_s3.md | 87 +++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 docs/user_guide/storing/store_s3.md diff --git a/docs/user_guide/storing/store_file.md b/docs/user_guide/storing/store_file.md index 03711342ce4..b3337371558 100644 --- a/docs/user_guide/storing/store_file.md +++ b/docs/user_guide/storing/store_file.md @@ -1,6 +1,6 @@ # Store [DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using `push()` and `pull()` functions. Under the hood, -[DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. You can store your `Doc` on-disk. Alternatively, you can upload to [AWS S3](https://aws.amazon.com/s3/) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). +[DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. You can store your `Doc` on-disk. 
Alternatively, you can upload to [AWS S3](https://aws.amazon.com/s3/), [minio](https://min.io) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). # Store on-disk When you want to use your `DocList` in another place, you can use the `push()` function to push the `DocList` to one place and later use the `pull()` function to pull its content back. diff --git a/docs/user_guide/storing/store_s3.md b/docs/user_guide/storing/store_s3.md new file mode 100644 index 00000000000..ce849807720 --- /dev/null +++ b/docs/user_guide/storing/store_s3.md @@ -0,0 +1,87 @@ +# Store on S3 +When you want to use your `DocList` in another place, you can use the `push()` function to push the `DocList` to S3 and later use the `pull()` function to pull its content back. + +!!! note + To store on S3, you need to install the extra dependency with the following line + ```bash + pip install "docarray[aws]" + ``` + +## Push & pull +To use the store `DocList` on S3, you need to pass an S3 path to the function starting with `s3://`. + +In the following demo, we use `MinIO` as a local S3 service. You could use the following docker-compose file to start the service in a Docker container. 
+ +```yaml +version: "3" +services: + minio: + container_name: minio + image: "minio/minio:RELEASE.2023-03-13T19-46-17Z" + ports: + - "9005:9000" + command: server /data +``` +Save the above file as `dock-compose.yml` and run the following line in the same folder as the file, +```bash +docker-compose up +``` + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +if __name__ == '__main__': + import boto3 + from botocore.client import Config + + BUCKET = 'tmp_bucket' + my_session = boto3.session.Session() + s3 = my_session.resource( + service_name='s3', + region_name="us-east-1", + use_ssl=False, + endpoint_url="http://localhost:9005", + aws_access_key_id="minioadmin", + aws_secret_access_key="minioadmin", + config=Config(signature_version="s3v4"), + ) + # make a bucket + s3.create_bucket(Bucket=BUCKET) + + store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] + dl = DocList[SimpleDoc]() + dl.extend([SimpleDoc(text=f'doc {i}') for i in range(8)]) + + # .push() and .pull() use the default boto3 client + boto3.Session.client.__defaults__ = ( + "us-east-1", + None, + False, + None, + "http://localhost:9005", + "minioadmin", + "minioadmin", + None, + Config(signature_version="s3v4"), + ) + dl.push(f's3://{BUCKET}/simple_dl') + dl_pull = DocList[SimpleDoc].pull(f's3://{BUCKET}/simple_dl') + + # delete the bucket + s3.Bucket(BUCKET).objects.all().delete() + s3.Bucket(BUCKET).delete() +``` + +Under the bucket `tmp_bucket`, there is a file with the name of `simple_dl.docs` being created to store the `DocList`. + +!!! note + When using `.push()` and `.pull()`, `DocList` calls the default boto3 client. Be sure your default session is correctly set up. + + +## Push & Pull with streaming +When you have a large amount of `Doc` to push and pull, you could use the streaming function. `push_stream()` and `pull_stream()` can help you to stream the `DocList` in order to save the memory usage. 
You set multiple `DocList` to pull from the same source as well. The usage is the same as using streaming with local files. Please refer to [Push & Pull with streaming with local files][TODO_add_internal_link] From 0c8481058316bf8e26273e80b3abbe3bd1115016 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Mon, 10 Apr 2023 11:35:05 +0200 Subject: [PATCH 03/20] docs: add docs for jac store Signed-off-by: nan-wang Signed-off-by: anna-charlotte --- docs/user_guide/storing/store_file.md | 5 +-- docs/user_guide/storing/store_jac.md | 49 +++++++++++++++++++++++++++ docs/user_guide/storing/store_s3.md | 10 ++++++ 3 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 docs/user_guide/storing/store_jac.md diff --git a/docs/user_guide/storing/store_file.md b/docs/user_guide/storing/store_file.md index b3337371558..b8c899aeaba 100644 --- a/docs/user_guide/storing/store_file.md +++ b/docs/user_guide/storing/store_file.md @@ -16,10 +16,7 @@ class SimpleDoc(BaseDoc): text: str -store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] - -dl = DocList[SimpleDoc]() -dl.extend([SimpleDoc(text=f'doc {i}') for i in range(8)]) +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(8)]) dl.push('file:///Users/docarray/tmp/simple_dl') dl_pull = DocList[SimpleDoc].pull('file:///Users/docarray/tmp/simple_dl') diff --git a/docs/user_guide/storing/store_jac.md b/docs/user_guide/storing/store_jac.md new file mode 100644 index 00000000000..d44e8f22bdd --- /dev/null +++ b/docs/user_guide/storing/store_jac.md @@ -0,0 +1,49 @@ +# Store on Jina AI Cloud +When you want to use your `DocList` in another place, you can use the `push()` function to push the `DocList` to S3 and later use the `pull()` function to pull its content back. + +!!! 
note + To store on Jina AI Cloud, you need to install the extra dependency with the following line + ```bash + pip install "docarray[jac]" + ``` + +## Push & pull +To use the store `DocList` on Jina AI Cloud, you need to pass a Jina AI Cloud path to the function starting with `jac://`. + +Before getting started, you need to have an account at [Jina AI Cloud](http://cloud.jina.ai/) and created a [Personal Access Token (PAT)](https://cloud.jina.ai/settings/tokens). + +```python +from docarray import BaseDoc, DocList +import os + + +class SimpleDoc(BaseDoc): + text: str + + +os.environ['JINA_AUTH_TOKEN'] = 'YOUR_PAT' +DL_NAME = 'simple-dl' +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(8)]) +# push to Jina AI Cloud +dl.push(f'jac://{DL_NAME}') +# pull from Jina AI Cloud +dl_pull = DocList[SimpleDoc].pull(f'jac://{DL_NAME}') +``` + + +!!! note + When using `.push()` and `.pull()`, `DocList` calls the default boto3 client. Be sure your default session is correctly set up. + + +## Push & Pull with streaming +When you have a large amount of `Doc` to push and pull, you could use the streaming function. `push_stream()` and `pull_stream()` can help you to stream the `DocList` in order to save the memory usage. You set multiple `DocList` to pull from the same source as well. The usage is the same as using streaming with local files. Please refer to [Push & Pull with streaming with local files][TODO_add_internal_link] + + +## Delete +To delete the store, you need to use the static method `delete()` of `JACDocStore` class. 
+ +```python +from docarray.store import JACDocStore + +JACDocStore.delete(f'jac://{DL_NAME}') +``` \ No newline at end of file diff --git a/docs/user_guide/storing/store_s3.md b/docs/user_guide/storing/store_s3.md index ce849807720..68e21444eaa 100644 --- a/docs/user_guide/storing/store_s3.md +++ b/docs/user_guide/storing/store_s3.md @@ -85,3 +85,13 @@ Under the bucket `tmp_bucket`, there is a file with the name of `simple_dl.docs` ## Push & Pull with streaming When you have a large amount of `Doc` to push and pull, you could use the streaming function. `push_stream()` and `pull_stream()` can help you to stream the `DocList` in order to save the memory usage. You set multiple `DocList` to pull from the same source as well. The usage is the same as using streaming with local files. Please refer to [Push & Pull with streaming with local files][TODO_add_internal_link] + + +## Delete +To delete the store, you need to use the static method `delete()` of `JACDocStore` class. + +```python +from docarray.store import S3DocStore + +success = S3DocStore.delete(f's3://{BUCKET}/simple_dl') +``` From ed35d20a43a01f1f1dc63e6e9bfa1838f9f0ff06 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 12:46:06 +0200 Subject: [PATCH 04/20] docs: store section in user guide Signed-off-by: anna-charlotte --- docs/user_guide/storing/first_step.md | 11 +++++- docs/user_guide/storing/store_file.md | 54 +++++++++++++++++++++------ docs/user_guide/storing/store_jac.md | 22 ++++++++--- docs/user_guide/storing/store_s3.md | 19 ++++++---- mkdocs.yml | 6 ++- 5 files changed, 85 insertions(+), 27 deletions(-) diff --git a/docs/user_guide/storing/first_step.md b/docs/user_guide/storing/first_step.md index 5be8b39165b..f58c4ba4e36 100644 --- a/docs/user_guide/storing/first_step.md +++ b/docs/user_guide/storing/first_step.md @@ -1 +1,10 @@ -# Storing +# Intro + +In the previous sections we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], 
[`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] to represent multi-modal data and send it over the wire. +In this section we will see how to store and persist this data. + +This section is divided into three parts: + +- [Store](store_file.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] on-disk +- [Store on Jina AI Cloud](store_jac.md) +- [Store on S3](store_s3.md) \ No newline at end of file diff --git a/docs/user_guide/storing/store_file.md b/docs/user_guide/storing/store_file.md index b8c899aeaba..973e6999775 100644 --- a/docs/user_guide/storing/store_file.md +++ b/docs/user_guide/storing/store_file.md @@ -1,12 +1,18 @@ # Store -[DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using `push()` and `pull()` functions. Under the hood, -[DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. You can store your `Doc` on-disk. Alternatively, you can upload to [AWS S3](https://aws.amazon.com/s3/), [minio](https://min.io) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). - -# Store on-disk -When you want to use your `DocList` in another place, you can use the `push()` function to push the `DocList` to one place and later use the `pull()` function to pull its content back. +[DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using the +[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] and +[`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] methods. +Under the hood, [DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. +You can store your `Doc` on-disk. Alternatively, you can upload to [AWS S3](https://aws.amazon.com/s3/), +[minio](https://min.io) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). 
+ +## Store on-disk +When you want to use your [DocList][docarray.array.doc_list.doc_list.DocList] in another place, you can use the +[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] function to push the [DocList][docarray.array.doc_list.doc_list.DocList] +to one place and later use the [`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] function to pull its content back. ## Push & pull -To use the store locally, you need to pass a local file path to the function starting with `file://`. +To use the store locally, you need to pass a local file path to the function starting with `'file://'`. ```python from docarray import BaseDoc, DocList @@ -23,13 +29,16 @@ dl_pull = DocList[SimpleDoc].pull('file:///Users/docarray/tmp/simple_dl') ``` Under `/Users/docarray/tmp/`, there is a file with the name of `simple_dl.docs` being created to store the `DocList`. -```output +``` { .output .no-copy } tmp └── simple_dl.docs ``` -## Push & Pull with streaming -When you have a large amount of `Doc` to push and pull, you could use the streaming function. `push_stream()` and `pull_stream()` can help you to stream the `DocList` in order to save the memory usage. You set multiple `DocList` to pull from the same source as well. +## Push & pull with streaming +When you have a large amount of documents to push and pull, you could use the streaming function. +[`.push_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.push_stream] and +[`.pull_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.pull_stream] can help you to stream the `DocList` in +order to save the memory usage. You set multiple `DocList` to pull from the same source as well. 
```python from docarray import BaseDoc, DocList @@ -41,10 +50,31 @@ class SimpleDoc(BaseDoc): store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] -DocList[SimpleDoc].push_stream(iter(store_docs), 'file:///Users/docarray/tmp/dl_stream') -dl_pull_stream_1 = DocList[SimpleDoc].pull_stream('file:///Users/docarray/tmp/dl_stream') -dl_pull_stream_2 = DocList[SimpleDoc].pull_stream('file:///Users/docarray/tmp/dl_stream') +DocList[SimpleDoc].push_stream( + iter(store_docs), + 'file:///Users/docarray/tmp/dl_stream', +) +dl_pull_stream_1 = DocList[SimpleDoc].pull_stream( + 'file:///Users/docarray/tmp/dl_stream' +) +dl_pull_stream_2 = DocList[SimpleDoc].pull_stream( + 'file:///Users/docarray/tmp/dl_stream' +) + for d1, d2 in zip(dl_pull_stream_1, dl_pull_stream_2): print(f'get {d1}, get {d2}') ``` +
+ Output + ```text + get SimpleDoc(id='5a4b92af27aadbb852d636892506998b', text='doc 0'), get SimpleDoc(id='5a4b92af27aadbb852d636892506998b', text='doc 0') + get SimpleDoc(id='705e4f6acbab0a6ff10d11a07c03b24c', text='doc 1'), get SimpleDoc(id='705e4f6acbab0a6ff10d11a07c03b24c', text='doc 1') + get SimpleDoc(id='4fb5c01bd5f935bbe91cf73e271ad590', text='doc 2'), get SimpleDoc(id='4fb5c01bd5f935bbe91cf73e271ad590', text='doc 2') + get SimpleDoc(id='381498cef78f1d4f1d80415d67918940', text='doc 3'), get SimpleDoc(id='381498cef78f1d4f1d80415d67918940', text='doc 3') + get SimpleDoc(id='d968bc6fa235b1cfc69eded92926157e', text='doc 4'), get SimpleDoc(id='d968bc6fa235b1cfc69eded92926157e', text='doc 4') + get SimpleDoc(id='30bf347427a4bd50ce8ada1841320fe3', text='doc 5'), get SimpleDoc(id='30bf347427a4bd50ce8ada1841320fe3', text='doc 5') + get SimpleDoc(id='1389877ac97b3e6d0e8eb17568934708', text='doc 6'), get SimpleDoc(id='1389877ac97b3e6d0e8eb17568934708', text='doc 6') + get SimpleDoc(id='264b0eff2cd138d296f15c685e15bf23', text='doc 7'), get SimpleDoc(id='264b0eff2cd138d296f15c685e15bf23', text='doc 7') + ``` +
\ No newline at end of file diff --git a/docs/user_guide/storing/store_jac.md b/docs/user_guide/storing/store_jac.md index d44e8f22bdd..8e2b47c9959 100644 --- a/docs/user_guide/storing/store_jac.md +++ b/docs/user_guide/storing/store_jac.md @@ -1,14 +1,16 @@ # Store on Jina AI Cloud -When you want to use your `DocList` in another place, you can use the `push()` function to push the `DocList` to S3 and later use the `pull()` function to pull its content back. +When you want to use your [`DocList`][docarray.DocList] in another place, you can use the +[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] method to push the `DocList` to S3 and later use the +[`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] function to pull its content back. !!! note To store on Jina AI Cloud, you need to install the extra dependency with the following line - ```bash + ```cmd pip install "docarray[jac]" ``` ## Push & pull -To use the store `DocList` on Jina AI Cloud, you need to pass a Jina AI Cloud path to the function starting with `jac://`. +To use the store [`DocList`][docarray.DocList] on Jina AI Cloud, you need to pass a Jina AI Cloud path to the function starting with `'jac://'`. Before getting started, you need to have an account at [Jina AI Cloud](http://cloud.jina.ai/) and created a [Personal Access Token (PAT)](https://cloud.jina.ai/settings/tokens). @@ -24,8 +26,10 @@ class SimpleDoc(BaseDoc): os.environ['JINA_AUTH_TOKEN'] = 'YOUR_PAT' DL_NAME = 'simple-dl' dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(8)]) + # push to Jina AI Cloud dl.push(f'jac://{DL_NAME}') + # pull from Jina AI Cloud dl_pull = DocList[SimpleDoc].pull(f'jac://{DL_NAME}') ``` @@ -35,12 +39,18 @@ dl_pull = DocList[SimpleDoc].pull(f'jac://{DL_NAME}') When using `.push()` and `.pull()`, `DocList` calls the default boto3 client. Be sure your default session is correctly set up. 
-## Push & Pull with streaming -When you have a large amount of `Doc` to push and pull, you could use the streaming function. `push_stream()` and `pull_stream()` can help you to stream the `DocList` in order to save the memory usage. You set multiple `DocList` to pull from the same source as well. The usage is the same as using streaming with local files. Please refer to [Push & Pull with streaming with local files][TODO_add_internal_link] +## Push & pull with streaming +When you have a large amount of documents to push and pull, you could use the streaming function. +[`.push_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.push_stream] and +[`.pull_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.pull_stream] can help you to stream the +[`DocList`][docarray.DocList] in order to save the memory usage. +You set multiple `DocList` to pull from the same source as well. +The usage is the same as using streaming with local files. +Please refer to [Push & Pull with streaming with local files](store_file.md#push-pull-with-streaming). ## Delete -To delete the store, you need to use the static method `delete()` of `JACDocStore` class. +To delete the store, you need to use the static method [`.delete()`][docarray.store.jac.JACDocStore.delete] of [`JACDocStore`][docarray.store.jac.JACDocStore] class. ```python from docarray.store import JACDocStore diff --git a/docs/user_guide/storing/store_s3.md b/docs/user_guide/storing/store_s3.md index 68e21444eaa..9e63eb81e0a 100644 --- a/docs/user_guide/storing/store_s3.md +++ b/docs/user_guide/storing/store_s3.md @@ -1,14 +1,16 @@ # Store on S3 -When you want to use your `DocList` in another place, you can use the `push()` function to push the `DocList` to S3 and later use the `pull()` function to pull its content back. 
+When you want to use your [`DocList`][docarray.DocList] in another place, you can use the +[`.push`][docarray.array.doc_list.pushpull.PushPullMixin.push] method to push the `DocList` to S3 and later use the +[`.pull`][docarray.array.doc_list.pushpull.PushPullMixin.pull] function to pull its content back. !!! note To store on S3, you need to install the extra dependency with the following line - ```bash + ```cmd pip install "docarray[aws]" ``` ## Push & pull -To use the store `DocList` on S3, you need to pass an S3 path to the function starting with `s3://`. +To use the store [`DocList`][docarray.DocList] on S3, you need to pass an S3 path to the function starting with `'s3://'`. In the following demo, we use `MinIO` as a local S3 service. You could use the following docker-compose file to start the service in a Docker container. @@ -22,7 +24,7 @@ services: - "9005:9000" command: server /data ``` -Save the above file as `dock-compose.yml` and run the following line in the same folder as the file, +Save the above file as `docker-compose.yml` and run the following line in the same folder as the file, ```bash docker-compose up ``` @@ -83,12 +85,15 @@ Under the bucket `tmp_bucket`, there is a file with the name of `simple_dl.docs` When using `.push()` and `.pull()`, `DocList` calls the default boto3 client. Be sure your default session is correctly set up. -## Push & Pull with streaming -When you have a large amount of `Doc` to push and pull, you could use the streaming function. `push_stream()` and `pull_stream()` can help you to stream the `DocList` in order to save the memory usage. You set multiple `DocList` to pull from the same source as well. The usage is the same as using streaming with local files. Please refer to [Push & Pull with streaming with local files][TODO_add_internal_link] +## Push & pull with streaming +When you have a large amount of documents to push and pull, you could use the streaming function. 
+[`.push_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.push_stream] and +[`.pull_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.pull_stream] can help you to stream the +[`DocList`][docarray.DocList] in order to save the memory usage. You set multiple [`DocList`][docarray.DocList] to pull from the same source as well. The usage is the same as using streaming with local files. Please refer to [Push & Pull with streaming with local files](store_file.md#push-pull-with-streaming). ## Delete -To delete the store, you need to use the static method `delete()` of `JACDocStore` class. +To delete the store, you need to use the static method [`.delete()`][docarray.store.s3.S3DocStore.delete] of [`S3DocStore`][docarray.store.s3.S3DocStore] class. ```python from docarray.store import S3DocStore diff --git a/mkdocs.yml b/mkdocs.yml index 605b986393e..c3ace80d956 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,7 +82,11 @@ nav: - user_guide/representing/first_step.md - user_guide/representing/array.md - user_guide/sending/first_step.md - - user_guide/storing/first_step.md + - Storing: + - user_guide/storing/first_step.md + - user_guide/storing/store_file.md + - user_guide/storing/store_jac.md + - user_guide/storing/store_s3.md - How-to: - how_to/add_doc_index.md From 3d59e7aa9cfda4acca4268d43a69d0a355aaaf8c Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 12:47:30 +0200 Subject: [PATCH 05/20] docs: add doc stores to api reference section Signed-off-by: anna-charlotte --- docs/api_references/array/da.md | 2 +- docs/api_references/doc_store/doc_store.md | 3 +++ docs/api_references/doc_store/file_doc_store.md | 3 +++ docs/api_references/doc_store/jac_doc_store.md | 3 +++ docs/api_references/doc_store/s3_doc_store.md | 3 +++ 5 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 docs/api_references/doc_store/doc_store.md create mode 100644 docs/api_references/doc_store/file_doc_store.md create mode 100644 
docs/api_references/doc_store/jac_doc_store.md create mode 100644 docs/api_references/doc_store/s3_doc_store.md diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index eedcec827cd..28e1aa94efa 100644 --- a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -1,4 +1,4 @@ # DocList ::: docarray.array.doc_list.doc_list.DocList -::: docarray.array.doc_list.io.IOMixinArray +::: docarray.array.doc_list.pushpull.PushPullMixin \ No newline at end of file diff --git a/docs/api_references/doc_store/doc_store.md b/docs/api_references/doc_store/doc_store.md new file mode 100644 index 00000000000..eb6e65b9f4a --- /dev/null +++ b/docs/api_references/doc_store/doc_store.md @@ -0,0 +1,3 @@ +# AbstractDocStore + +::: docarray.store.abstract_doc_store.AbstractDocStore diff --git a/docs/api_references/doc_store/file_doc_store.md b/docs/api_references/doc_store/file_doc_store.md new file mode 100644 index 00000000000..b81dc3ee298 --- /dev/null +++ b/docs/api_references/doc_store/file_doc_store.md @@ -0,0 +1,3 @@ +# FileDocStore + +::: docarray.store.file.FileDocStore diff --git a/docs/api_references/doc_store/jac_doc_store.md b/docs/api_references/doc_store/jac_doc_store.md new file mode 100644 index 00000000000..1d4c0a28303 --- /dev/null +++ b/docs/api_references/doc_store/jac_doc_store.md @@ -0,0 +1,3 @@ +# JACDocStore + +::: docarray.store.jac.JACDocStore diff --git a/docs/api_references/doc_store/s3_doc_store.md b/docs/api_references/doc_store/s3_doc_store.md new file mode 100644 index 00000000000..6856c42f2ff --- /dev/null +++ b/docs/api_references/doc_store/s3_doc_store.md @@ -0,0 +1,3 @@ +# S3DocStore + +::: docarray.store.s3.S3DocStore From 52dcab2972b5c370d39f44ad7dcfa1acfc3947a1 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 12:47:59 +0200 Subject: [PATCH 06/20] docs: fix docstrings Signed-off-by: anna-charlotte --- docarray/array/any_array.py | 14 +++++++------- docarray/array/doc_list/pushpull.py | 26 
++++++++++++------------- docarray/store/file.py | 27 +++++++++++++++----------- docarray/store/jac.py | 30 +++++++++++++++-------------- docarray/store/s3.py | 18 ++++++++--------- 5 files changed, 61 insertions(+), 54 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index da718519682..31d1dedb067 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -121,7 +121,7 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this [`DocList`][docarray.typing.DocList] using the passed values + """Set all Documents in this [`DocList`][docarray.DocList] using the passed values :param field: name of the fields to extract :values: the values to set at the DocList level @@ -140,7 +140,7 @@ def to_protobuf(self) -> 'DocListProto': ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a [`DocList`][docarray.typing.DocList] into a NodeProto protobuf message. + """Convert a [`DocList`][docarray.DocList] into a NodeProto protobuf message. This function should be called when a DocList is nested into another Document that need to be converted into a protobuf @@ -157,7 +157,7 @@ def traverse_flat( ) -> Union[List[Any], 'AbstractTensor']: """ Return a List of the accessed objects when applying the `access_path`. If this - results in a nested list or list of [`DocList`s][docarray.typing.DocList], the list will be flattened + results in a nested list or list of [`DocList`s][docarray.DocList], the list will be flattened on the first level. The access path is a string that consists of attribute names, concatenated and `"__"`-separated. It describes the path from the first level to an arbitrary one, e.g. `'content__image__url'`. 
@@ -210,7 +210,7 @@ class Book(BaseDoc): chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings ``` - If your [`DocList`][docarray.typing.DocList] is in doc_vec mode, and you want to access a field of + If your [`DocList`][docarray.DocList] is in doc_vec mode, and you want to access a field of type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list: ```python @@ -263,7 +263,7 @@ def _flatten_one_level(sequence: List[Any]) -> List[Any]: def summary(self): """ - Print a summary of this [`DocList`][docarray.typing.DocList] object and a summary of the schema of its + Print a summary of this [`DocList`][docarray.DocList] object and a summary of the schema of its Document type. """ DocArraySummary(self).summary() @@ -275,13 +275,13 @@ def _batch( show_progress: bool = False, ) -> Generator[T, None, None]: """ - Creates a `Generator` that yields [`DocList`][docarray.typing.DocList] of size `batch_size`. + Creates a `Generator` that yields [`DocList`][docarray.DocList] of size `batch_size`. Note, that the last batch might be smaller than `batch_size`. :param batch_size: Size of each generated batch. :param shuffle: If set, shuffle the Documents before dividing into minibatches. :param show_progress: if set, show a progress bar when batching documents. - :yield: a Generator of [`DocList`][docarray.typing.DocList], each in the length of `batch_size` + :yield: a Generator of [`DocList`][docarray.DocList], each in the length of `batch_size` """ from rich.progress import track diff --git a/docarray/array/doc_list/pushpull.py b/docarray/array/doc_list/pushpull.py index baa9c0439da..0d0f9384758 100644 --- a/docarray/array/doc_list/pushpull.py +++ b/docarray/array/doc_list/pushpull.py @@ -86,10 +86,10 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocList object to the specified url. + """Push this `DocList` object to the specified url. 
- :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocList if they know its name. + :param url: url specifying the protocol and save name of the `DocList`. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a `DocList` if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. {"icon": "emoji", "background": "#fff"} @@ -112,8 +112,8 @@ def push_stream( """Push a stream of documents to the specified url. :param docs: a stream of documents - :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocList if they know its name. + :param url: url specifying the protocol and save name of the `DocList`. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a `DocList` if they know its name. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. 
{"icon": "emoji", "background": "#fff"} """ @@ -130,19 +130,19 @@ def pull( show_progress: bool = False, local_cache: bool = True, ) -> 'DocList': - """Pull a :class:`DocList` from the specified url. + """Pull a `DocList` from the specified url. - :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the `DocList`. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocList to local folder - :return: a :class:`DocList` object + :param local_cache: store the downloaded `DocList` to local folder + :return: a `DocList` object """ from docarray.base_doc import AnyDoc if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' + 'Please specify the `DocList`\'s Document type using `DocList[MyDoc]`.' ) logging.info(f'Pulling {url}') @@ -160,9 +160,9 @@ def pull_stream( ) -> Iterator['BaseDoc']: """Pull a stream of Documents from the specified url. - :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the `DocList`. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. 
- :param local_cache: store the downloaded DocList to local folder + :param local_cache: store the downloaded `DocList` to local folder :return: Iterator of Documents """ from docarray.base_doc import AnyDoc @@ -170,7 +170,7 @@ def pull_stream( if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' + 'Please specify the `DocList`\'s Document type using `DocList[MyDoc]`.' ) logging.info(f'Pulling Document stream from {url}') diff --git a/docarray/store/file.py b/docarray/store/file.py index b649864478a..6c46c3ab615 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -16,11 +16,15 @@ class FileDocStore(AbstractDocStore): + """Class to push and pull [`DocList`][docarray.DocList] on-disk.""" + @staticmethod def _abs_filepath(name: str) -> Path: """Resolve a name to an absolute path. - If it is not a path, the cache directoty is prepended. - If it is a path, it is resolved to an absolute path. + + :param name: If it is not a path, the cache directory is prepended. + If it is a path, it is resolved to an absolute path. + :return: Path """ if not (name.startswith('/') or name.startswith('~') or name.startswith('.')): name = str(_get_cache_path() / name) @@ -32,11 +36,11 @@ def _abs_filepath(name: str) -> Path: def list( cls: Type[SelfFileDocStore], namespace: str, show_table: bool ) -> List[str]: - """List all DocArrays in a directory. + """List all [`DocList`s][docarray.DocList] in a directory. :param namespace: The directory to list. :param show_table: If True, print a table of the files in the directory. - :return: A list of the names of the DocArrays in the directory. + :return: A list of the names of the `DocLists` in the directory. 
""" namespace_dir = cls._abs_filepath(namespace) if not namespace_dir.exists(): @@ -51,7 +55,7 @@ def list( from rich.table import Table table = Table( - title=f'You have {len(da_files)} DocArrays in file://{namespace_dir}', + title=f'You have {len(da_files)} DocLists in file://{namespace_dir}', box=box.SIMPLE, highlight=True, ) @@ -74,9 +78,9 @@ def list( def delete( cls: Type[SelfFileDocStore], name: str, missing_ok: bool = False ) -> bool: - """Delete a DocList from the local filesystem. + """Delete a [`DocList`][docarray.DocList] from the local filesystem. - :param name: The name of the DocList to delete. + :param name: The name of the `DocList` to delete. :param missing_ok: If True, do not raise an exception if the file does not exist. Defaults to False. :return: True if the file was deleted, False if it did not exist. """ @@ -98,8 +102,9 @@ def push( show_progress: bool, branding: Optional[Dict], ) -> Dict: - """Push this DocList object to the specified file path. + """Push this [`DocList`][docarray.DocList] object to the specified file path. + :param docs: The `DocList` to push. :param name: The file path to push to. :param public: Not used by the ``file`` protocol. :param show_progress: If true, a progress bar will be displayed. @@ -150,12 +155,12 @@ def pull( show_progress: bool, local_cache: bool, ) -> 'DocList': - """Pull a :class:`DocList` from the specified url. + """Pull a [`DocList`][docarray.DocList] from the specified url. :param name: The file path to pull from. :param show_progress: if true, display a progress bar. 
- :param local_cache: store the downloaded DocList to local folder - :return: a :class:`DocList` object + :param local_cache: store the downloaded `DocList` to local folder + :return: a `DocList` object """ return docs_cls( diff --git a/docarray/store/jac.py b/docarray/store/jac.py index 7838e3c26c8..6dafb49839a 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -82,7 +82,7 @@ def _get_raw_summary(self: 'DocList') -> List[Dict[str, Any]]: class JACDocStore(AbstractDocStore): - """Class to push and pull DocList to and from Jina AI Cloud.""" + """Class to push and pull [`DocList`][docarray.DocList] to and from Jina AI Cloud.""" @staticmethod @hubble.login_required @@ -135,7 +135,7 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: @hubble.login_required def delete(name: str, missing_ok: bool = True) -> bool: """ - Delete a DocList from the cloud. + Delete a [`DocList`][docarray.DocList] from the cloud. :param name: the name of the DocList to delete. :param missing_ok: if true, do not raise an error if the DocList does not exist. :return: True if the DocList was deleted, False if it did not exist. @@ -158,17 +158,18 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocList object to Jina AI Cloud + """Push this [`DocList`][docarray.DocList] object to Jina AI Cloud - .. note:: + !!! note - Push with the same ``name`` will override the existing content. - Kinda like a public clipboard where everyone can override anyone's content. So to make your content survive longer, you may want to use longer & more complicated name. - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocList`. - :param public: By default, anyone can pull a DocList if they know its name. 
+ :param docs: The `DocList` to push. + :param name: A name that can later be used to retrieve this `DocList`. + :param public: By default, anyone can pull a `DocList` if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. {"icon": "emoji", "background": "#fff"} @@ -245,15 +246,16 @@ def push_stream( ) -> Dict: """Push a stream of documents to Jina AI Cloud - .. note:: + !!! note - Push with the same ``name`` will override the existing content. - Kinda like a public clipboard where everyone can override anyone's content. So to make your content survive longer, you may want to use longer & more complicated name. - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocList`. - :param public: By default, anyone can pull a DocList if they know its name. + :param docs: a stream of documents + :param name: A name that can later be used to retrieve this `DocList`. + :param public: By default, anyone can pull a `DocList` if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. {"icon": "emoji", "background": "#fff"} @@ -278,12 +280,12 @@ def pull( show_progress: bool = False, local_cache: bool = True, ) -> 'DocList': - """Pull a :class:`DocList` from Jina AI Cloud to local. + """Pull a [`DocList`][docarray.DocList] from Jina AI Cloud to local. - :param name: the upload name set during :meth:`.push` + :param name: the upload name set during `.push` :param show_progress: if true, display a progress bar. 
:param local_cache: store the downloaded DocList to local folder - :return: a :class:`DocList` object + :return: a [`DocList`][docarray.DocList] object """ from docarray import DocList @@ -299,9 +301,9 @@ def pull_stream( show_progress: bool = False, local_cache: bool = False, ) -> Iterator['BaseDoc']: - """Pull a :class:`DocList` from Jina AI Cloud to local. + """Pull a [`DocList`][docarray.DocList] from Jina AI Cloud to local. - :param name: the upload name set during :meth:`.push` + :param name: the upload name set during `.push` :param show_progress: if true, display a progress bar. :param local_cache: store the downloaded DocList to local folder :return: An iterator of Documents diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 936a261396f..2ebb864fc8d 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -48,15 +48,15 @@ def close(self): class S3DocStore(AbstractDocStore): - """Class to push and pull DocList to and from S3.""" + """Class to push and pull [`DocList`][docarray.DocList] to and from S3.""" @staticmethod def list(namespace: str, show_table: bool = False) -> List[str]: - """List all DocArrays in the specified bucket and namespace. + """List all [`DocList`s][docarray.DocList] in the specified bucket and namespace. :param namespace: The bucket and namespace to list. e.g. my_bucket/my_namespace :param show_table: If true, a rich table will be printed to the console. - :return: A list of DocList names. + :return: A list of `DocList` names. 
""" bucket, namespace = namespace.split('/', 1) s3 = boto3.resource('s3') @@ -74,7 +74,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: from rich.table import Table table = Table( - title=f'You have {len(da_files)} DocArrays in bucket s3://{bucket} under the namespace "{namespace}"', + title=f'You have {len(da_files)} DocLists in bucket s3://{bucket} under the namespace "{namespace}"', box=box.SIMPLE, highlight=True, ) @@ -94,7 +94,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: @staticmethod def delete(name: str, missing_ok: bool = True) -> bool: - """Delete the DocList object at the specified bucket and key. + """Delete the [`DocList`][docarray.DocList] object at the specified bucket and key. :param name: The bucket and key to delete. e.g. my_bucket/my_key :param missing_ok: If true, no error will be raised if the object does not exist. @@ -125,9 +125,9 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocList object to the specified bucket and key. + """Push this [`DocList`][docarray.DocList] object to the specified bucket and key. - :param docs: The DocList to push. + :param docs: The `DocList` to push. :param name: The bucket and key to push to. e.g. my_bucket/my_key :param public: Not used by the ``s3`` protocol. :param show_progress: If true, a progress bar will be displayed. @@ -182,12 +182,12 @@ def pull( show_progress: bool = False, local_cache: bool = False, ) -> 'DocList': - """Pull a :class:`DocList` from the specified bucket and key. + """Pull a [`DocList`][docarray.DocList] from the specified bucket and key. :param name: The bucket and key to pull from. e.g. my_bucket/my_key :param show_progress: if true, display a progress bar. 
:param local_cache: store the downloaded DocList to local cache - :return: a :class:`DocList` object + :return: a `DocList` object """ docs = docs_cls( # type: ignore cls.pull_stream( From d592eb7bca18816837794ab4ba582a1808df4ffe Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 12:54:36 +0200 Subject: [PATCH 07/20] fix: clean up Signed-off-by: anna-charlotte --- docs/user_guide/storing/first_step.md | 9 ++++++++- docs/user_guide/storing/store_file.md | 11 ++--------- docs/user_guide/storing/store_s3.md | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/user_guide/storing/first_step.md b/docs/user_guide/storing/first_step.md index f58c4ba4e36..d821f5872fb 100644 --- a/docs/user_guide/storing/first_step.md +++ b/docs/user_guide/storing/first_step.md @@ -3,8 +3,15 @@ In the previous sections we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] to represent multi-modal data and send it over the wire. In this section we will see how to store and persist this data. +[DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using the +[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] and +[`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] methods. +Under the hood, [DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. +You can store your documents on-disk. Alternatively, you can upload them to [AWS S3](https://aws.amazon.com/s3/), +[minio](https://min.io) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). 
+ This section is divided into three parts: - [Store](store_file.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] on-disk - [Store on Jina AI Cloud](store_jac.md) -- [Store on S3](store_s3.md) \ No newline at end of file +- [Store on S3](store_s3.md) diff --git a/docs/user_guide/storing/store_file.md b/docs/user_guide/storing/store_file.md index 973e6999775..8e76fe6d676 100644 --- a/docs/user_guide/storing/store_file.md +++ b/docs/user_guide/storing/store_file.md @@ -1,12 +1,5 @@ -# Store -[DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using the -[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] and -[`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] methods. -Under the hood, [DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. -You can store your `Doc` on-disk. Alternatively, you can upload to [AWS S3](https://aws.amazon.com/s3/), -[minio](https://min.io) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). - -## Store on-disk +# Store on-disk + When you want to use your [DocList][docarray.array.doc_list.doc_list.DocList] in another place, you can use the [`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] function to push the [DocList][docarray.array.doc_list.doc_list.DocList] to one place and later use the [`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] function to pull its content back. 
diff --git a/docs/user_guide/storing/store_s3.md b/docs/user_guide/storing/store_s3.md index 9e63eb81e0a..fe712857349 100644 --- a/docs/user_guide/storing/store_s3.md +++ b/docs/user_guide/storing/store_s3.md @@ -24,8 +24,8 @@ services: - "9005:9000" command: server /data ``` -Save the above file as `docker-compose.yml` and run the following line in the same folder as the file, -```bash +Save the above file as `docker-compose.yml` and run the following line in the same folder as the file. +```cmd docker-compose up ``` @@ -56,8 +56,8 @@ if __name__ == '__main__': s3.create_bucket(Bucket=BUCKET) store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] - dl = DocList[SimpleDoc]() - dl.extend([SimpleDoc(text=f'doc {i}') for i in range(8)]) + docs = DocList[SimpleDoc]() + docs.extend([SimpleDoc(text=f'doc {i}') for i in range(8)]) # .push() and .pull() use the default boto3 client boto3.Session.client.__defaults__ = ( @@ -71,15 +71,15 @@ if __name__ == '__main__': None, Config(signature_version="s3v4"), ) - dl.push(f's3://{BUCKET}/simple_dl') - dl_pull = DocList[SimpleDoc].pull(f's3://{BUCKET}/simple_dl') + docs.push(f's3://{BUCKET}/simple_docs') + docs_pull = DocList[SimpleDoc].pull(f's3://{BUCKET}/simple_docs') # delete the bucket s3.Bucket(BUCKET).objects.all().delete() s3.Bucket(BUCKET).delete() ``` -Under the bucket `tmp_bucket`, there is a file with the name of `simple_dl.docs` being created to store the `DocList`. +Under the bucket `tmp_bucket`, there is a file with the name of `simple_docs.docs` being created to store the `DocList`. !!! note When using `.push()` and `.pull()`, `DocList` calls the default boto3 client. Be sure your default session is correctly set up. 
@@ -98,5 +98,5 @@ To delete the store, you need to use the static method [`.delete()`][docarray.st ```python from docarray.store import S3DocStore -success = S3DocStore.delete(f's3://{BUCKET}/simple_dl') +success = S3DocStore.delete(f's3://{BUCKET}/simple_docs') ``` From af80be36f804af6b7233c33052dc6ea362fc2771 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 13:56:54 +0200 Subject: [PATCH 08/20] fix: path in file doc store Signed-off-by: anna-charlotte --- docs/user_guide/storing/store_file.md | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/docs/user_guide/storing/store_file.md b/docs/user_guide/storing/store_file.md index 8e76fe6d676..8602eb71adb 100644 --- a/docs/user_guide/storing/store_file.md +++ b/docs/user_guide/storing/store_file.md @@ -16,16 +16,13 @@ class SimpleDoc(BaseDoc): dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(8)]) -dl.push('file:///Users/docarray/tmp/simple_dl') +dl.push('file://simple_dl') -dl_pull = DocList[SimpleDoc].pull('file:///Users/docarray/tmp/simple_dl') +dl_pull = DocList[SimpleDoc].pull('file://simple_dl') ``` -Under `/Users/docarray/tmp/`, there is a file with the name of `simple_dl.docs` being created to store the `DocList`. -``` { .output .no-copy } -tmp -└── simple_dl.docs -``` +A file with the name of `simple_dl.docs` being created to store the `DocList`. + ## Push & pull with streaming When you have a large amount of documents to push and pull, you could use the streaming function. 
@@ -45,14 +42,10 @@ store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] DocList[SimpleDoc].push_stream( iter(store_docs), - 'file:///Users/docarray/tmp/dl_stream', -) -dl_pull_stream_1 = DocList[SimpleDoc].pull_stream( - 'file:///Users/docarray/tmp/dl_stream' -) -dl_pull_stream_2 = DocList[SimpleDoc].pull_stream( - 'file:///Users/docarray/tmp/dl_stream' + 'file://dl_stream', ) +dl_pull_stream_1 = DocList[SimpleDoc].pull_stream('file://dl_stream') +dl_pull_stream_2 = DocList[SimpleDoc].pull_stream('file://dl_stream') for d1, d2 in zip(dl_pull_stream_1, dl_pull_stream_2): print(f'get {d1}, get {d2}') From 47debe43a31252729d2951cd7e9e2a6e9080fcb8 Mon Sep 17 00:00:00 2001 From: Alex Cureton-Griffiths Date: Thu, 13 Apr 2023 13:33:49 +0200 Subject: [PATCH 09/20] docs(menu): consistency, wording fixes (#1363) * docs(menu): consistency, wording fixes Signed-off-by: Alex C-G * docs(intro): remove redundancy in title Signed-off-by: Alex C-G --------- Signed-off-by: Alex C-G Signed-off-by: anna-charlotte --- docs/how_to/audio2text.md | 8 ++++---- docs/how_to/multimodal_training_and_serving.md | 2 +- docs/how_to/optimize_performance_with_id_generation.md | 2 +- docs/user_guide/intro.md | 2 +- docs/user_guide/sending/first_step.md | 2 +- mkdocs.yml | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/how_to/audio2text.md b/docs/how_to/audio2text.md index fcec869ce0f..d2f2507e08f 100644 --- a/docs/how_to/audio2text.md +++ b/docs/how_to/audio2text.md @@ -1,10 +1,10 @@ -# Creating an Audio to Text App with Jina and DocArray V2 +# Create an audio to text app with Jina and DocArray V2 -This is how you can build an Audio to Text app using Jina, Docarray and Whisper +This is how you can build an Audio to Text app using Jina, DocArray and Whisper. 
We will use: -* DocarrayV2: Helps us to load and preprocess multimodal data such as image, text and audio in our case +* DocArray V2: Helps us to load and preprocess multimodal data such as image, text and audio in our case * Jina: Helps us serve the model quickly and create a client First let's install requirements @@ -76,4 +76,4 @@ with Deployment( print(docs[0].text) ``` -And we get the transcribed result! \ No newline at end of file +And we get the transcribed result! diff --git a/docs/how_to/multimodal_training_and_serving.md b/docs/how_to/multimodal_training_and_serving.md index 9c30cbeffba..604545c7cd2 100644 --- a/docs/how_to/multimodal_training_and_serving.md +++ b/docs/how_to/multimodal_training_and_serving.md @@ -12,7 +12,7 @@ jupyter: name: python3 --- -# Multi-Modal Deep learning with DocList +# Multimodal deep learning with DocList DocList is a library for representing, sending, and storing multi-modal data that can be used for a variety of different use cases. diff --git a/docs/how_to/optimize_performance_with_id_generation.md b/docs/how_to/optimize_performance_with_id_generation.md index db46020faa2..5d0df78e776 100644 --- a/docs/how_to/optimize_performance_with_id_generation.md +++ b/docs/how_to/optimize_performance_with_id_generation.md @@ -1,4 +1,4 @@ -# How to optimize performance +# Optimize performance ### `BaseDoc`'s id diff --git a/docs/user_guide/intro.md b/docs/user_guide/intro.md index 5c9fbb14d1f..94bb730fdb0 100644 --- a/docs/user_guide/intro.md +++ b/docs/user_guide/intro.md @@ -1,4 +1,4 @@ -# User Guide - Introduction +# Introduction This user guide shows you how to use `DocArray` with most of its features. 
diff --git a/docs/user_guide/sending/first_step.md b/docs/user_guide/sending/first_step.md index a18433535b9..1079b9dd75b 100644 --- a/docs/user_guide/sending/first_step.md +++ b/docs/user_guide/sending/first_step.md @@ -1 +1 @@ -# Sending +# Sending data diff --git a/mkdocs.yml b/mkdocs.yml index c3ace80d956..bd1548a0a22 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -76,9 +76,9 @@ plugins: nav: - Home: README.md - - Tutorial - User Guide: + - Tutorial/User Guide: - user_guide/intro.md - - Representing: + - Representing data: - user_guide/representing/first_step.md - user_guide/representing/array.md - user_guide/sending/first_step.md From 65c5e88dfc55e534352056d9e8307880e1a6c146 Mon Sep 17 00:00:00 2001 From: Anne Yang Date: Thu, 13 Apr 2023 19:38:01 +0800 Subject: [PATCH 10/20] fix: default dims=-1 for elastic index (#1368) Signed-off-by: AnneY Signed-off-by: anna-charlotte --- docarray/index/backends/elastic.py | 11 ++++++++++- tests/index/elastic/fixture.py | 5 +++++ tests/index/elastic/v7/test_index_get_del.py | 7 ++++++- tests/index/elastic/v8/test_index_get_del.py | 9 +++++++-- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index c2c1c6646a2..52f60e1d098 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -88,8 +88,17 @@ def __init__(self, db_config=None, **kwargs): mappings.update(self._db_config.index_mappings) for col_name, col in self._column_infos.items(): + if col.db_type == 'dense_vector' and ( + not col.n_dim and col.config['dims'] < 0 + ): + self._logger.info( + f'Not indexing column {col_name}, the dimensionality is not specified' + ) + continue + mappings['properties'][col_name] = self._create_index_mapping(col) + # print(mappings['properties']) if self._client.indices.exists(index=self._index_name): self._client_put_mapping(mappings) else: @@ -231,8 +240,8 @@ def __post_init__(self): def dense_vector_config(self): config = 
{ + 'dims': -1, 'index': True, - 'dims': 128, 'similarity': 'cosine', # 'l2_norm', 'dot_product', 'cosine' 'm': 16, 'ef_construction': 100, diff --git a/tests/index/elastic/fixture.py b/tests/index/elastic/fixture.py index 315078d6269..812f0f09d51 100644 --- a/tests/index/elastic/fixture.py +++ b/tests/index/elastic/fixture.py @@ -6,6 +6,7 @@ from pydantic import Field from docarray import BaseDoc +from docarray.documents import ImageDoc from docarray.typing import NdArray pytestmark = [pytest.mark.slow, pytest.mark.index] @@ -58,6 +59,10 @@ class DeepNestedDoc(BaseDoc): d: NestedDoc +class MyImageDoc(ImageDoc): + embedding: NdArray = Field(dims=128) + + @pytest.fixture(scope='function') def ten_simple_docs(): return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] diff --git a/tests/index/elastic/v7/test_index_get_del.py b/tests/index/elastic/v7/test_index_get_del.py index 7124d5d61bd..d5ead493c03 100644 --- a/tests/index/elastic/v7/test_index_get_del.py +++ b/tests/index/elastic/v7/test_index_get_del.py @@ -10,6 +10,7 @@ from tests.index.elastic.fixture import ( # noqa: F401 DeepNestedDoc, FlatDoc, + MyImageDoc, NestedDoc, SimpleDoc, start_storage_v7, @@ -247,7 +248,7 @@ class MySchema(BaseDoc): def test_index_multi_modal_doc(): class MyMultiModalDoc(BaseDoc): - image: ImageDoc + image: MyImageDoc text: TextDoc store = ElasticV7DocIndex[MyMultiModalDoc]() @@ -263,3 +264,7 @@ class MyMultiModalDoc(BaseDoc): assert store[id_].id == id_ assert np.all(store[id_].image.embedding == doc[0].image.embedding) assert store[id_].text.text == doc[0].text.text + + query = doc[0] + docs, _ = store.find(query, limit=10, search_field='image__embedding') + assert len(docs) > 0 diff --git a/tests/index/elastic/v8/test_index_get_del.py b/tests/index/elastic/v8/test_index_get_del.py index db2df925ebb..03560caae7d 100644 --- a/tests/index/elastic/v8/test_index_get_del.py +++ b/tests/index/elastic/v8/test_index_get_del.py @@ -10,6 +10,7 @@ from tests.index.elastic.fixture 
import ( # noqa: F401 DeepNestedDoc, FlatDoc, + MyImageDoc, NestedDoc, SimpleDoc, start_storage_v8, @@ -234,7 +235,7 @@ class MyDoc(BaseDoc): tensor: Union[NdArray, str] class MySchema(BaseDoc): - tensor: NdArray + tensor: NdArray[128] store = ElasticDocIndex[MySchema]() doc = [MyDoc(tensor=np.random.randn(128))] @@ -247,7 +248,7 @@ class MySchema(BaseDoc): def test_index_multi_modal_doc(): class MyMultiModalDoc(BaseDoc): - image: ImageDoc + image: MyImageDoc text: TextDoc store = ElasticDocIndex[MyMultiModalDoc]() @@ -264,6 +265,10 @@ class MyMultiModalDoc(BaseDoc): assert np.all(store[id_].image.embedding == doc[0].image.embedding) assert store[id_].text.text == doc[0].text.text + query = doc[0] + docs, _ = store.find(query, limit=10, search_field='image__embedding') + assert len(docs) > 0 + def test_elasticv7_version_check(): with pytest.raises(ImportError): From 95c8c7029f0c39be0c7f82d8623213dddc764d80 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 14:27:45 +0200 Subject: [PATCH 11/20] fix: s3 bucket var Signed-off-by: anna-charlotte --- docs/user_guide/storing/store_s3.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user_guide/storing/store_s3.md b/docs/user_guide/storing/store_s3.md index fe712857349..bd896b107e6 100644 --- a/docs/user_guide/storing/store_s3.md +++ b/docs/user_guide/storing/store_s3.md @@ -98,5 +98,6 @@ To delete the store, you need to use the static method [`.delete()`][docarray.st ```python from docarray.store import S3DocStore +BUCKET = 'tmp_bucket' success = S3DocStore.delete(f's3://{BUCKET}/simple_docs') ``` From 96aa50f5bb0b0d07efd1116882b1d2c0c226329e Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 14:58:19 +0200 Subject: [PATCH 12/20] fix: typo and s3 delete code snippet Signed-off-by: anna-charlotte --- docs/user_guide/storing/store_jac.md | 2 +- docs/user_guide/storing/store_s3.md | 55 ++++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git 
a/docs/user_guide/storing/store_jac.md b/docs/user_guide/storing/store_jac.md index 8e2b47c9959..2975df7311f 100644 --- a/docs/user_guide/storing/store_jac.md +++ b/docs/user_guide/storing/store_jac.md @@ -1,6 +1,6 @@ # Store on Jina AI Cloud When you want to use your [`DocList`][docarray.DocList] in another place, you can use the -[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] method to push the `DocList` to S3 and later use the +[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] method to push the `DocList` to Jina AI Cloud and later use the [`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] function to pull its content back. !!! note diff --git a/docs/user_guide/storing/store_s3.md b/docs/user_guide/storing/store_s3.md index bd896b107e6..c4e0878133b 100644 --- a/docs/user_guide/storing/store_s3.md +++ b/docs/user_guide/storing/store_s3.md @@ -73,10 +73,6 @@ if __name__ == '__main__': ) docs.push(f's3://{BUCKET}/simple_docs') docs_pull = DocList[SimpleDoc].pull(f's3://{BUCKET}/simple_docs') - - # delete the bucket - s3.Bucket(BUCKET).objects.all().delete() - s3.Bucket(BUCKET).delete() ``` Under the bucket `tmp_bucket`, there is a file with the name of `simple_docs.docs` being created to store the `DocList`. @@ -95,9 +91,52 @@ When you have a large amount of documents to push and pull, you could use the st ## Delete To delete the store, you need to use the static method [`.delete()`][docarray.store.s3.S3DocStore.delete] of [`S3DocStore`][docarray.store.s3.S3DocStore] class. 
-```python -from docarray.store import S3DocStore +```python hl_lines="44-47" +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +if __name__ == '__main__': + import boto3 + from botocore.client import Config + + BUCKET = 'tmp_bucket' + my_session = boto3.session.Session() + s3 = my_session.resource( + service_name='s3', + region_name="us-east-1", + use_ssl=False, + endpoint_url="http://localhost:9005", + aws_access_key_id="minioadmin", + aws_secret_access_key="minioadmin", + config=Config(signature_version="s3v4"), + ) + # make a bucket + s3.create_bucket(Bucket=BUCKET) + + store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] + docs = DocList[SimpleDoc]() + docs.extend([SimpleDoc(text=f'doc {i}') for i in range(8)]) + + # .push() and .pull() use the default boto3 client + boto3.Session.client.__defaults__ = ( + "us-east-1", + None, + False, + None, + "http://localhost:9005", + "minioadmin", + "minioadmin", + None, + Config(signature_version="s3v4"), + ) + docs.push(f's3://{BUCKET}/simple_docs') + + # delete bucket + from docarray.store import S3DocStore -BUCKET = 'tmp_bucket' -success = S3DocStore.delete(f's3://{BUCKET}/simple_docs') + success = S3DocStore.delete(f'{BUCKET}/simple_docs') + ``` From 77c7ea23eeafea217e6f1282d2e46296e5c7d234 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 15:35:35 +0200 Subject: [PATCH 13/20] docs: exclude jacdocstore docs from test for now Signed-off-by: anna-charlotte --- tests/documentation/test_docs.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 085022b5a00..ce8c19b115e 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -43,15 +43,20 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): check_raw_file_full(text, lang=lang, keyword_ignore=keyword_ignore) -@pytest.mark.parametrize( - 'fpath', - [
- *list(pathlib.Path('docs/user_guide').glob('**/*.md')), - *list(pathlib.Path('docs/data_types').glob('**/*.md')), - ], - ids=str, -) +paths = [ + *list(pathlib.Path('docs/user_guide').glob('**/*.md')), + *list(pathlib.Path('docs/data_types').glob('**/*.md')), +] +exclude = [pathlib.Path('docs/user_guide/storing/store_jac.md')] + +for path in exclude: + if path in paths: + paths.remove(path) + + +@pytest.mark.parametrize('fpath', paths, ids=str) def test_files_good(fpath): + print(f"fpath = {fpath}") check_md_file(fpath=fpath, memory=True) From 8b118d3b723ab855e5df1535fcbc7b62431140c2 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 13 Apr 2023 15:39:57 +0200 Subject: [PATCH 14/20] Revert "docs: exclude jacdocstore docs from test for now" This reverts commit a52fed5430369e0306930ac27139cc64eb6456e9. Signed-off-by: anna-charlotte --- tests/documentation/test_docs.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index ce8c19b115e..085022b5a00 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -43,20 +43,15 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): check_raw_file_full(text, lang=lang, keyword_ignore=keyword_ignore) -paths = [ - *list(pathlib.Path('docs/user_guide').glob('**/*.md')), - *list(pathlib.Path('docs/data_types').glob('**/*.md')), -] -exclude = [pathlib.Path('docs/user_guide/storing/store_jac.md')] - -for path in exclude: - if path in paths: - paths.remove(path) - - -@pytest.mark.parametrize('fpath', paths, ids=str) +@pytest.mark.parametrize( + 'fpath', + [ + *list(pathlib.Path('docs/user_guide').glob('**/*.md')), + *list(pathlib.Path('docs/data_types').glob('**/*.md')), + ], + ids=str, +) def test_files_good(fpath): - print(f"fpath = {fpath}") check_md_file(fpath=fpath, memory=True) From ac7a567533ef0054f2634f1efdc8dd59f130ddf6 Mon Sep 17 00:00:00 2001 From: 
anna-charlotte Date: Thu, 13 Apr 2023 15:47:58 +0200 Subject: [PATCH 15/20] docs: exclude jac tests Signed-off-by: anna-charlotte --- tests/documentation/test_docs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 085022b5a00..1d8fe1679b3 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -52,7 +52,10 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): ids=str, ) def test_files_good(fpath): - check_md_file(fpath=fpath, memory=True) + keyword_ignore = [] + if 'store_jac.md' in str(fpath): + keyword_ignore = ['jac'] + check_md_file(fpath=fpath, memory=True, keyword_ignore=keyword_ignore) def test_readme(): From f15955def47efabb8cd959f3c4f44ca0535bc165 Mon Sep 17 00:00:00 2001 From: Shukri Date: Thu, 13 Apr 2023 16:35:15 +0200 Subject: [PATCH 16/20] feat: weaviate document index V2! (#1367) Signed-off-by: anna-charlotte --- .pre-commit-config.yaml | 2 +- docarray/index/abstract.py | 8 +- docarray/index/backends/weaviate.py | 833 ++++++++++++++++++ poetry.lock | 145 ++- pyproject.toml | 2 + .../doc_index/weaviate/docker-compose.yml | 27 + .../doc_index/weaviate/fixture_weaviate.py | 41 + .../weaviate/test_column_config_weaviate.py | 33 + .../doc_index/weaviate/test_find_weaviate.py | 66 ++ .../weaviate/test_index_get_del_weaviate.py | 452 ++++++++++ 10 files changed, 1589 insertions(+), 20 deletions(-) create mode 100644 docarray/index/backends/weaviate.py create mode 100644 tests/integrations/doc_index/weaviate/docker-compose.yml create mode 100644 tests/integrations/doc_index/weaviate/fixture_weaviate.py create mode 100644 tests/integrations/doc_index/weaviate/test_column_config_weaviate.py create mode 100644 tests/integrations/doc_index/weaviate/test_find_weaviate.py create mode 100644 tests/integrations/doc_index/weaviate/test_index_get_del_weaviate.py diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index bccbe2f206d..9df8e8a06d2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,4 +32,4 @@ repos: args: - -S additional_dependencies: - - black==22.3.0 \ No newline at end of file + - black==22.3.0 diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 4feb576ed76..3c423137259 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -49,12 +49,12 @@ class FindResultBatched(NamedTuple): documents: List[DocList] - scores: np.ndarray + scores: List[np.ndarray] class _FindResultBatched(NamedTuple): documents: Union[List[DocList], List[List[Dict[str, Any]]]] - scores: np.ndarray + scores: List[np.ndarray] def _raise_not_composable(name): @@ -571,7 +571,9 @@ def text_search_batched( if len(da_list) > 0 and isinstance(da_list[0], List): docs = [self._dict_list_to_docarray(docs) for docs in da_list] - return FindResultBatched(documents=docs, scores=scores) + return FindResultBatched(documents=docs, scores=scores) + + return FindResultBatched(documents=da_list, scores=scores) ########################################################## # Helper methods # diff --git a/docarray/index/backends/weaviate.py b/docarray/index/backends/weaviate.py new file mode 100644 index 00000000000..c54d3e76f47 --- /dev/null +++ b/docarray/index/backends/weaviate.py @@ -0,0 +1,833 @@ +import base64 +import copy +import logging +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import ( + Any, + Dict, + Generator, + Generic, + List, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, +) + +import numpy as np +import weaviate +from pydantic import parse_obj_as +from typing_extensions import Literal + +import docarray +from docarray import BaseDoc, DocList +from docarray.index.abstract import BaseDocIndex, FindResultBatched, _FindResultBatched +from docarray.typing import AnyTensor +from docarray.typing.tensor.abstract_tensor import AbstractTensor +from 
docarray.typing.tensor.ndarray import NdArray +from docarray.utils.find import FindResult, _FindResult + +TSchema = TypeVar('TSchema', bound=BaseDoc) +T = TypeVar('T', bound='WeaviateDocumentIndex') + + +DEFAULT_BATCH_CONFIG = { + "batch_size": 20, + "dynamic": False, + "timeout_retries": 3, + "num_workers": 1, +} + +DEFAULT_BINARY_PATH = str(Path.home() / ".cache/weaviate-embedded/") +DEFAULT_PERSISTENCE_DATA_PATH = str(Path.home() / ".local/share/weaviate") + + +@dataclass +class EmbeddedOptions: + persistence_data_path: str = os.environ.get( + "XDG_DATA_HOME", DEFAULT_PERSISTENCE_DATA_PATH + ) + binary_path: str = os.environ.get("XDG_CACHE_HOME", DEFAULT_BINARY_PATH) + version: str = "latest" + port: int = 6666 + hostname: str = "127.0.0.1" + additional_env_vars: Optional[Dict[str, str]] = None + + +# TODO: add more types and figure out how to handle text vs string type +# see https://weaviate.io/developers/weaviate/configuration/datatypes +WEAVIATE_PY_VEC_TYPES = [list, np.ndarray, AbstractTensor] +WEAVIATE_PY_TYPES = [bool, int, float, str, docarray.typing.ID] + +# "id" and "_id" are reserved names in weaviate so we need to use a different +# name for the id column in a BaseDocument +DOCUMENTID = "docarrayid" + + +class WeaviateDocumentIndex(BaseDocIndex, Generic[TSchema]): + def __init__(self, db_config=None, **kwargs) -> None: + self.embedding_column: Optional[str] = None + self.properties: Optional[List[str]] = None + # keep track of the column name that contains the bytes + # type because we will store them as a base64 encoded string + # in weaviate + self.bytes_columns: List[str] = [] + # keep track of the array columns that are not embeddings because we will + # convert them to python lists before uploading to weaviate + self.nonembedding_array_columns: List[str] = [] + super().__init__(db_config=db_config, **kwargs) + self._db_config: WeaviateDocumentIndex.DBConfig = cast( + WeaviateDocumentIndex.DBConfig, self._db_config + ) + self._runtime_config: 
WeaviateDocumentIndex.RuntimeConfig = cast( + WeaviateDocumentIndex.RuntimeConfig, self._runtime_config + ) + + if self._db_config.embedded_options: + self._client = weaviate.Client( + embedded_options=self._db_config.embedded_options + ) + else: + self._client = weaviate.Client( + self._db_config.host, auth_client_secret=self._build_auth_credentials() + ) + + self._configure_client() + self._validate_columns() + self._set_embedding_column() + self._set_properties() + self._create_schema() + + def _set_properties(self) -> None: + field_overwrites = {"id": DOCUMENTID} + + self.properties = [ + field_overwrites.get(k, k) + for k, v in self._column_infos.items() + if v.config.get('is_embedding', False) is False + ] + + def _validate_columns(self) -> None: + # must have at most one column with property is_embedding=True + # and that column must be of type WEAVIATE_PY_VEC_TYPES + # TODO: update when https://github.com/weaviate/weaviate/issues/2424 + # is implemented and discuss best interface to signal which column(s) + # should be used for embeddings + num_embedding_columns = 0 + + for column_name, column_info in self._column_infos.items(): + if column_info.config.get('is_embedding', False): + num_embedding_columns += 1 + # if db_type is not 'number[]', then that means the type of the column in + # the given schema is not one of WEAVIATE_PY_VEC_TYPES + # note: the mapping between a column's type in the schema to a weaviate type + # is handled by the python_type_to_db_type method + if column_info.db_type != 'number[]': + raise ValueError( + f'Column {column_name} is marked as embedding but is not of type {WEAVIATE_PY_VEC_TYPES}' + ) + + if num_embedding_columns > 1: + raise ValueError( + f'Only one column can be marked as embedding but found {num_embedding_columns} columns marked as embedding' + ) + + def _set_embedding_column(self) -> None: + for column_name, column_info in self._column_infos.items(): + if column_info.config.get('is_embedding', False): + 
self.embedding_column = column_name + break + + def _configure_client(self) -> None: + self._client.batch.configure(**self._runtime_config.batch_config) + + def _build_auth_credentials(self): + dbconfig = self._db_config + + if dbconfig.auth_api_key: + return weaviate.auth.AuthApiKey(api_key=dbconfig.auth_api_key) + elif dbconfig.username and dbconfig.password: + return weaviate.auth.AuthClientPassword( + dbconfig.username, dbconfig.password, dbconfig.scopes + ) + else: + return None + + def configure(self, runtime_config=None, **kwargs) -> None: + super().configure(runtime_config, **kwargs) + self._configure_client() + + def _create_schema(self) -> None: + schema: Dict[str, Any] = {} + + properties = [] + column_infos = self._column_infos + + for column_name, column_info in column_infos.items(): + # in weaviate, we do not create a property for the doc's embeddings + if column_name == self.embedding_column: + continue + if column_info.db_type == 'blob': + self.bytes_columns.append(column_name) + if column_info.db_type == 'number[]': + self.nonembedding_array_columns.append(column_name) + prop = { + "name": column_name + if column_name != 'id' + else DOCUMENTID, # in weaviate, id and _id is a reserved keyword + "dataType": [column_info.db_type], + } + properties.append(prop) + + # TODO: What is the best way to specify other config that is part of schema? + # e.g. invertedIndexConfig, shardingConfig, moduleConfig, vectorIndexConfig + # and configure replication + # we will update base on user feedback + schema["properties"] = properties + schema["class"] = self._db_config.index_name + + # TODO: Use exists() instead of contains() when available + # see https://github.com/weaviate/weaviate-python-client/issues/232 + if self._client.schema.contains(schema): + logging.warning( + f"Found index {self._db_config.index_name} with schema {schema}. Will reuse existing schema." 
+ ) + else: + self._client.schema.create_class(schema) + + @dataclass + class DBConfig(BaseDocIndex.DBConfig): + host: str = 'http://localhost:8080' + index_name: str = 'Document' + username: Optional[str] = None + password: Optional[str] = None + scopes: List[str] = field(default_factory=lambda: ["offline_access"]) + auth_api_key: Optional[str] = None + embedded_options: Optional[EmbeddedOptions] = None + + @dataclass + class RuntimeConfig(BaseDocIndex.RuntimeConfig): + default_column_config: Dict[Any, Dict[str, Any]] = field( + default_factory=lambda: { + np.ndarray: {}, + docarray.typing.ID: {}, + 'string': {}, + 'text': {}, + 'int': {}, + 'number': {}, + 'boolean': {}, + 'number[]': {}, + 'blob': {}, + } + ) + + batch_config: Dict[str, Any] = field( + default_factory=lambda: DEFAULT_BATCH_CONFIG + ) + + def _del_items(self, doc_ids: Sequence[str]): + has_matches = True + + operands = [ + {"path": [DOCUMENTID], "operator": "Equal", "valueString": doc_id} + for doc_id in doc_ids + ] + where_filter = { + "operator": "Or", + "operands": operands, + } + + # do a loop because there is a limit to how many objects can be deleted at + # in a single query + # see: https://weaviate.io/developers/weaviate/api/rest/batch#maximum-number-of-deletes-per-query + while has_matches: + results = self._client.batch.delete_objects( + class_name=self._db_config.index_name, + where=where_filter, + ) + + has_matches = results["results"]["matches"] + + def _filter(self, filter_query: Any, limit: int) -> Union[DocList, List[Dict]]: + self._overwrite_id(filter_query) + + results = ( + self._client.query.get(self._db_config.index_name, self.properties) + .with_additional("vector") + .with_where(filter_query) + .with_limit(limit) + .do() + ) + + docs = results["data"]["Get"][self._db_config.index_name] + + return [self._parse_weaviate_result(doc) for doc in docs] + + def _filter_batched( + self, filter_queries: Any, limit: int + ) -> Union[List[DocList], List[List[Dict]]]: + for 
filter_query in filter_queries: + self._overwrite_id(filter_query) + + qs = [ + self._client.query.get(self._db_config.index_name, self.properties) + .with_additional("vector") + .with_where(filter_query) + .with_limit(limit) + .with_alias(f'query_{i}') + for i, filter_query in enumerate(filter_queries) + ] + + batched_results = self._client.query.multi_get(qs).do() + + return [ + [self._parse_weaviate_result(doc) for doc in batched_result] + for batched_result in batched_results["data"]["Get"].values() + ] + + def find( + self, + query: Union[AnyTensor, BaseDoc], + search_field: str = '', + limit: int = 10, + **kwargs, + ): + self._logger.debug('Executing `find`') + if search_field != '': + raise ValueError( + 'Argument search_field is not supported for WeaviateDocumentIndex.\nSet search_field to an empty string to proceed.' + ) + embedding_field = self._get_embedding_field() + if isinstance(query, BaseDoc): + query_vec = self._get_values_by_column([query], embedding_field)[0] + else: + query_vec = query + query_vec_np = self._to_numpy(query_vec) + docs, scores = self._find( + query_vec_np, search_field=search_field, limit=limit, **kwargs + ) + + if isinstance(docs, List): + docs = self._dict_list_to_docarray(docs) + + return FindResult(documents=docs, scores=scores) + + def _overwrite_id(self, where_filter): + """ + Overwrite the id field in the where filter to DOCUMENTID + if the "id" field is present in the path + """ + for key, value in where_filter.items(): + if key == "path" and value == ["id"]: + where_filter[key] = [DOCUMENTID] + elif isinstance(value, dict): + self._overwrite_id(value) + elif isinstance(value, list): + for item in value: + if isinstance(item, dict): + self._overwrite_id(item) + + def _find( + self, + query: np.ndarray, + limit: int, + search_field: str = '', + score_name: Literal["certainty", "distance"] = "certainty", + score_threshold: Optional[float] = None, + ) -> _FindResult: + index_name = self._db_config.index_name + if 
search_field: + logging.warning( + 'Argument search_field is not supported for WeaviateDocumentIndex. Ignoring.' + ) + near_vector: Dict[str, Any] = { + "vector": query, + } + if score_threshold: + near_vector[score_name] = score_threshold + + results = ( + self._client.query.get(index_name, self.properties) + .with_near_vector( + near_vector, + ) + .with_limit(limit) + .with_additional([score_name, "vector"]) + .do() + ) + + docs, scores = self._format_response( + results["data"]["Get"][index_name], score_name + ) + return _FindResult(docs, parse_obj_as(NdArray, scores)) + + def _format_response( + self, results, score_name + ) -> Tuple[List[Dict[Any, Any]], List[Any]]: + """ + Format the response from Weaviate into a Tuple of DocList and scores + """ + + documents = [] + scores = [] + + for result in results: + score = result["_additional"][score_name] + scores.append(score) + + document = self._parse_weaviate_result(result) + documents.append(document) + + return documents, scores + + def find_batched( + self, + queries: Union[AnyTensor, DocList], + search_field: str = '', + limit: int = 10, + **kwargs, + ) -> FindResultBatched: + self._logger.debug('Executing `find_batched`') + if search_field != '': + raise ValueError( + 'Argument search_field is not supported for WeaviateDocumentIndex.\nSet search_field to an empty string to proceed.' 
+ ) + embedding_field = self._get_embedding_field() + + if isinstance(queries, Sequence): + query_vec_list = self._get_values_by_column(queries, embedding_field) + query_vec_np = np.stack( + tuple(self._to_numpy(query_vec) for query_vec in query_vec_list) + ) + else: + query_vec_np = self._to_numpy(queries) + + da_list, scores = self._find_batched( + query_vec_np, search_field=search_field, limit=limit, **kwargs + ) + + if len(da_list) > 0 and isinstance(da_list[0], List): + da_list = [self._dict_list_to_docarray(docs) for docs in da_list] + + return FindResultBatched(documents=da_list, scores=scores) # type: ignore + + def _find_batched( + self, + queries: np.ndarray, + limit: int, + search_field: str = '', + score_name: Literal["certainty", "distance"] = "certainty", + score_threshold: Optional[float] = None, + ) -> _FindResultBatched: + qs = [] + for i, query in enumerate(queries): + near_vector: Dict[str, Any] = {"vector": query} + + if score_threshold: + near_vector[score_name] = score_threshold + + q = ( + self._client.query.get(self._db_config.index_name, self.properties) + .with_near_vector(near_vector) + .with_limit(limit) + .with_additional([score_name, "vector"]) + .with_alias(f'query_{i}') + ) + + qs.append(q) + + results = self._client.query.multi_get(qs).do() + + docs_and_scores = [ + self._format_response(result, score_name) + for result in results["data"]["Get"].values() + ] + + docs, scores = zip(*docs_and_scores) + return _FindResultBatched(list(docs), list(scores)) + + def _get_items(self, doc_ids: Sequence[str]) -> List[Dict]: + # TODO: warn when doc_ids > QUERY_MAXIMUM_RESULTS after + # https://github.com/weaviate/weaviate/issues/2792 + # is implemented + operands = [ + {"path": [DOCUMENTID], "operator": "Equal", "valueString": doc_id} + for doc_id in doc_ids + ] + where_filter = { + "operator": "Or", + "operands": operands, + } + + results = ( + self._client.query.get(self._db_config.index_name, self.properties) + .with_where(where_filter) + 
.with_additional("vector") + .do() + ) + + docs = [ + self._parse_weaviate_result(doc) + for doc in results["data"]["Get"][self._db_config.index_name] + ] + + return docs + + def _rewrite_documentid(self, document: Dict): + doc = document.copy() + + # rewrite the id to DOCUMENTID + document_id = doc.pop('id') + doc[DOCUMENTID] = document_id + + return doc + + def _parse_weaviate_result(self, result: Dict) -> Dict: + """ + Parse the result from weaviate to a format that is compatible with the schema + that was used to initialize weaviate with. + """ + + result = result.copy() + + # rewrite the DOCUMENTID to id + if DOCUMENTID in result: + result['id'] = result.pop(DOCUMENTID) + + # take the vector from the _additional field + if '_additional' in result and self.embedding_column: + additional_fields = result.pop('_additional') + if 'vector' in additional_fields: + result[self.embedding_column] = additional_fields['vector'] + + # convert any base64 encoded bytes column to bytes + self._decode_base64_properties_to_bytes(result) + + return result + + def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]): + docs = self._transpose_col_value_dict(column_to_data) + index_name = self._db_config.index_name + + with self._client.batch as batch: + for doc in docs: + parsed_doc = self._rewrite_documentid(doc) + self._encode_bytes_columns_to_base64(parsed_doc) + self._convert_nonembedding_array_to_list(parsed_doc) + vector = ( + parsed_doc.pop(self.embedding_column) + if self.embedding_column + else None + ) + + batch.add_data_object( + uuid=weaviate.util.generate_uuid5(parsed_doc, index_name), + data_object=parsed_doc, + class_name=index_name, + vector=vector, + ) + + def _text_search( + self, query: str, limit: int, search_field: str = '' + ) -> _FindResult: + index_name = self._db_config.index_name + bm25 = {"query": query, "properties": [search_field]} + + results = ( + self._client.query.get(index_name, self.properties) + .with_bm25(bm25) + 
.with_limit(limit) + .with_additional(["score", "vector"]) + .do() + ) + + docs, scores = self._format_response( + results["data"]["Get"][index_name], "score" + ) + + return _FindResult(documents=docs, scores=parse_obj_as(NdArray, scores)) + + def _text_search_batched( + self, queries: Sequence[str], limit: int, search_field: str = '' + ) -> _FindResultBatched: + qs = [] + for i, query in enumerate(queries): + bm25 = {"query": query, "properties": [search_field]} + + q = ( + self._client.query.get(self._db_config.index_name, self.properties) + .with_bm25(bm25) + .with_limit(limit) + .with_additional(["score", "vector"]) + .with_alias(f'query_{i}') + ) + + qs.append(q) + + results = self._client.query.multi_get(qs).do() + + docs_and_scores = [ + self._format_response(result, "score") + for result in results["data"]["Get"].values() + ] + + docs, scores = zip(*docs_and_scores) + return _FindResultBatched(list(docs), list(scores)) + + def execute_query(self, query: Any, *args, **kwargs) -> Any: + da_class = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema)) + + if isinstance(query, self.QueryBuilder): + batched_results = self._client.query.multi_get(query._queries).do() + batched_docs = batched_results["data"]["Get"].values() + + def f(doc): + # TODO: use + # return self._schema(**self._parse_weaviate_result(doc)) + # when https://github.com/weaviate/weaviate/issues/2858 + # is fixed + return self._schema.from_view(self._parse_weaviate_result(doc)) # type: ignore + + results = [ + da_class([f(doc) for doc in batched_doc]) + for batched_doc in batched_docs + ] + return results if len(results) > 1 else results[0] + + # TODO: validate graphql query string before sending it to weaviate + if isinstance(query, str): + return self._client.query.raw(query) + + def num_docs(self) -> int: + index_name = self._db_config.index_name + result = self._client.query.aggregate(index_name).with_meta_count().do() + # TODO: decorator to check for errors + total_docs = 
result["data"]["Aggregate"][index_name][0]["meta"]["count"] + + return total_docs + + def python_type_to_db_type(self, python_type: Type) -> Any: + """Map python type to database type.""" + for allowed_type in WEAVIATE_PY_VEC_TYPES: + if issubclass(python_type, allowed_type): + return 'number[]' + + py_weaviate_type_map = { + docarray.typing.ID: 'string', + str: 'text', + int: 'int', + float: 'number', + bool: 'boolean', + np.ndarray: 'number[]', + bytes: 'blob', + } + + for py_type, weaviate_type in py_weaviate_type_map.items(): + if issubclass(python_type, py_type): + return weaviate_type + + raise ValueError(f'Unsupported column type for {type(self)}: {python_type}') + + def build_query(self) -> BaseDocIndex.QueryBuilder: + return self.QueryBuilder(self) + + def _get_embedding_field(self): + for colname, colinfo in self._column_infos.items(): + # no need to check for missing is_embedding attribute because this check + # is done when the index is created + if colinfo.config.get('is_embedding', None): + return colname + + # just to pass mypy + return "" + + def _encode_bytes_columns_to_base64(self, doc): + for column in self.bytes_columns: + if doc[column] is not None: + doc[column] = base64.b64encode(doc[column]).decode("utf-8") + + def _decode_base64_properties_to_bytes(self, doc): + for column in self.bytes_columns: + if doc[column] is not None: + doc[column] = base64.b64decode(doc[column]) + + def _convert_nonembedding_array_to_list(self, doc): + for column in self.nonembedding_array_columns: + if doc[column] is not None: + doc[column] = doc[column].tolist() + + class QueryBuilder(BaseDocIndex.QueryBuilder): + def __init__(self, document_index): + self._queries = [ + document_index._client.query.get( + document_index._db_config.index_name, document_index.properties + ) + ] + + def build(self) -> Any: + num_queries = len(self._queries) + + for i in range(num_queries): + q = self._queries[i] + if self._is_hybrid_query(q): + self._make_proper_hybrid_query(q) + 
q.with_additional(["vector"]).with_alias(f'query_{i}') + + return self + + def _is_hybrid_query(self, query: weaviate.gql.get.GetBuilder) -> bool: + """ + Checks if a query has been composed with both a with_bm25 and a with_near_vector verb + """ + if not query._near_ask: + return False + else: + return query._bm25 and query._near_ask._content.get("vector", None) + + def _make_proper_hybrid_query( + self, query: weaviate.gql.get.GetBuilder + ) -> weaviate.gql.get.GetBuilder: + """ + Modifies a query to be a proper hybrid query. + + In weaviate, a query with with_bm25 and with_near_vector verb is not a hybrid query. + We need to use the with_hybrid verb to make it a hybrid query. + """ + + text_query = query._bm25.query + vector_query = query._near_ask._content["vector"] + hybrid_query = weaviate.gql.get.Hybrid( + query=text_query, vector=vector_query, alpha=0.5 + ) + + query._bm25 = None + query._near_ask = None + query._hybrid = hybrid_query + + def _overwrite_id(self, where_filter): + """ + Overwrite the id field in the where filter to DOCUMENTID + if the "id" field is present in the path + """ + for key, value in where_filter.items(): + if key == "path" and value == ["id"]: + where_filter[key] = [DOCUMENTID] + elif isinstance(value, dict): + self._overwrite_id(value) + elif isinstance(value, list): + for item in value: + if isinstance(item, dict): + self._overwrite_id(item) + + def find( + self, + query, + score_name: Literal["certainty", "distance"] = "certainty", + score_threshold: Optional[float] = None, + ) -> Any: + near_vector = { + "vector": query, + } + if score_threshold: + near_vector[score_name] = score_threshold + + self._queries[0] = self._queries[0].with_near_vector(near_vector) + return self + + def find_batched( + self, + queries, + score_name: Literal["certainty", "distance"] = "certainty", + score_threshold: Optional[float] = None, + ) -> Any: + adj_queries, adj_clauses = self._resize_queries_and_clauses( + self._queries, queries + ) + 
new_queries = [] + + for query, clause in zip(adj_queries, adj_clauses): + near_vector = { + "vector": clause, + } + if score_threshold: + near_vector[score_name] = score_threshold + + new_queries.append(query.with_near_vector(near_vector)) + + self._queries = new_queries + + return self + + def filter(self, where_filter) -> Any: + where_filter = where_filter.copy() + self._overwrite_id(where_filter) + self._queries[0] = self._queries[0].with_where(where_filter) + return self + + def filter_batched(self, filters) -> Any: + adj_queries, adj_clauses = self._resize_queries_and_clauses( + self._queries, filters + ) + new_queries = [] + + for query, clause in zip(adj_queries, adj_clauses): + clause = clause.copy() + self._overwrite_id(clause) + new_queries.append(query.with_where(clause)) + + self._queries = new_queries + + return self + + def text_search(self, query, search_field) -> Any: + bm25 = {"query": query, "properties": [search_field]} + self._queries[0] = self._queries[0].with_bm25(**bm25) + return self + + def text_search_batched(self, queries, search_field) -> Any: + adj_queries, adj_clauses = self._resize_queries_and_clauses( + self._queries, queries + ) + new_queries = [] + + for query, clause in zip(adj_queries, adj_clauses): + bm25 = {"query": clause, "properties": [search_field]} + new_queries.append(query.with_bm25(**bm25)) + + self._queries = new_queries + + return self + + def limit(self, limit: int) -> Any: + self._queries = [query.with_limit(limit) for query in self._queries] + return self + + def _resize_queries_and_clauses(self, queries, clauses): + """ + Adjust the length and content of queries and clauses so that we can compose + them element-wise + """ + num_clauses = len(clauses) + num_queries = len(queries) + + # if there's only one clause, then we assume that it should be applied + # to every query + if num_clauses == 1: + return queries, clauses * num_queries + # if there's only one query, then we can lengthen it to match the number + # of 
clauses + elif num_queries == 1: + return [copy.deepcopy(queries[0]) for _ in range(num_clauses)], clauses + # if the number of queries and clauses is the same, then we can just + # return them as-is + elif num_clauses == num_queries: + return queries, clauses + else: + raise ValueError( + f"Can't compose {num_clauses} clauses with {num_queries} queries" + ) diff --git a/poetry.lock b/poetry.lock index cd46e05c897..398a9ec992d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -264,6 +264,21 @@ docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"] tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"] +[[package]] +name = "authlib" +version = "1.2.0" +description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients." +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "Authlib-1.2.0-py2.py3-none-any.whl", hash = "sha256:4ddf4fd6cfa75c9a460b361d4bd9dac71ffda0be879dbe4292a02e92349ad55a"}, + {file = "Authlib-1.2.0.tar.gz", hash = "sha256:4fa3e80883a5915ef9f5bc28630564bc4ed5b5af39812a3ff130ec76bd631e9d"}, +] + +[package.dependencies] +cryptography = ">=3.2" + [[package]] name = "av" version = "10.0.0" @@ -525,7 +540,7 @@ files = [ name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." 
-category = "dev" +category = "main" optional = false python-versions = "*" files = [ @@ -698,6 +713,48 @@ files = [ [package.extras] test = ["flake8 (==3.7.8)", "hypothesis (==3.55.3)"] +[[package]] +name = "cryptography" +version = "40.0.1" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +category = "main" +optional = false +python-versions = ">=3.6" +files = [ + {file = "cryptography-40.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:918cb89086c7d98b1b86b9fdb70c712e5a9325ba6f7d7cfb509e784e0cfc6917"}, + {file = "cryptography-40.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9618a87212cb5200500e304e43691111570e1f10ec3f35569fdfcd17e28fd797"}, + {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a4805a4ca729d65570a1b7cac84eac1e431085d40387b7d3bbaa47e39890b88"}, + {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63dac2d25c47f12a7b8aa60e528bfb3c51c5a6c5a9f7c86987909c6c79765554"}, + {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a4e3406cfed6b1f6d6e87ed243363652b2586b2d917b0609ca4f97072994405"}, + {file = "cryptography-40.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1e0af458515d5e4028aad75f3bb3fe7a31e46ad920648cd59b64d3da842e4356"}, + {file = "cryptography-40.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d8aa3609d337ad85e4eb9bb0f8bcf6e4409bfb86e706efa9a027912169e89122"}, + {file = "cryptography-40.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:cf91e428c51ef692b82ce786583e214f58392399cf65c341bc7301d096fa3ba2"}, + {file = "cryptography-40.0.1-cp36-abi3-win32.whl", hash = "sha256:650883cc064297ef3676b1db1b7b1df6081794c4ada96fa457253c4cc40f97db"}, + {file = "cryptography-40.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:a805a7bce4a77d51696410005b3e85ae2839bad9aa38894afc0aa99d8e0c3160"}, + {file = 
"cryptography-40.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cd033d74067d8928ef00a6b1327c8ea0452523967ca4463666eeba65ca350d4c"}, + {file = "cryptography-40.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d36bbeb99704aabefdca5aee4eba04455d7a27ceabd16f3b3ba9bdcc31da86c4"}, + {file = "cryptography-40.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:32057d3d0ab7d4453778367ca43e99ddb711770477c4f072a51b3ca69602780a"}, + {file = "cryptography-40.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:f5d7b79fa56bc29580faafc2ff736ce05ba31feaa9d4735048b0de7d9ceb2b94"}, + {file = "cryptography-40.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7c872413353c70e0263a9368c4993710070e70ab3e5318d85510cc91cce77e7c"}, + {file = "cryptography-40.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:28d63d75bf7ae4045b10de5413fb1d6338616e79015999ad9cf6fc538f772d41"}, + {file = "cryptography-40.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6f2bbd72f717ce33100e6467572abaedc61f1acb87b8d546001328d7f466b778"}, + {file = "cryptography-40.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cc3a621076d824d75ab1e1e530e66e7e8564e357dd723f2533225d40fe35c60c"}, + {file = "cryptography-40.0.1.tar.gz", hash = "sha256:2803f2f8b1e95f614419926c7e6f55d828afc614ca5ed61543877ae668cc3472"}, +] + +[package.dependencies] +cffi = ">=1.12" + +[package.extras] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] +docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] +pep8test = ["black", "check-manifest", "mypy", "ruff"] +sdist = ["setuptools-rust (>=0.11.4)"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist"] +test-randomorder = ["pytest-randomly"] +tox = ["tox"] + [[package]] name = "debugpy" version = "1.6.3" @@ -730,7 +787,7 @@ files = [ name = "decorator" 
version = "5.1.1" description = "Decorators for Humans" -category = "dev" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -3004,7 +3061,7 @@ validation = ["lxml"] name = "pycparser" version = "2.21" description = "C parser in Python" -category = "dev" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -3541,25 +3598,25 @@ files = [ [[package]] name = "requests" -version = "2.27.1" +version = "2.28.2" description = "Python HTTP for Humans." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = ">=3.7, <4" files = [ - {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, - {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, + {file = "requests-2.28.2-py3-none-any.whl", hash = "sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa"}, + {file = "requests-2.28.2.tar.gz", hash = "sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"}, ] [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} -idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" urllib3 = ">=1.21.1,<1.27" [package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<5)"] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rfc3986" @@ -4073,6 +4130,27 @@ files = [ {file = "tornado-6.2.tar.gz", hash = "sha256:9b630419bde84ec666bfd7ea0a4cb2a8a651c2d5cccdbdd1972a0c859dfc3c13"}, ] +[[package]] +name = "tqdm" +version = "4.65.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false 
+python-versions = ">=3.7" +files = [ + {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, + {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "traitlets" version = "5.5.0" @@ -4275,6 +4353,23 @@ typing-extensions = {version = "*", markers = "python_version < \"3.8\""} [package.extras] standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.0)"] +[[package]] +name = "validators" +version = "0.20.0" +description = "Python Data Validation for Humans™." +category = "main" +optional = false +python-versions = ">=3.4" +files = [ + {file = "validators-0.20.0.tar.gz", hash = "sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a"}, +] + +[package.dependencies] +decorator = ">=3.4.0" + +[package.extras] +test = ["flake8 (>=2.4.0)", "isort (>=4.2.2)", "pytest (>=2.2.3)"] + [[package]] name = "virtualenv" version = "20.16.7" @@ -4365,6 +4460,24 @@ files = [ {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, ] +[[package]] +name = "weaviate-client" +version = "3.15.5" +description = "A python native weaviate client" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "weaviate-client-3.15.5.tar.gz", hash = "sha256:6da7e5d08dc9bb8b7879661d1a457c50af7d73e621a5305efe131160e83da69e"}, + {file = "weaviate_client-3.15.5-py3-none-any.whl", hash = "sha256:24d0be614e5494534e758cc67a45e7e15f3929a89bf512afd642de53d08723c7"}, +] + +[package.dependencies] 
+authlib = ">=1.1.0" +requests = ">=2.28.0,<2.29.0" +tqdm = ">=4.59.0,<5.0.0" +validators = ">=0.18.2,<=0.21.0" + [[package]] name = "webencodings" version = "0.5.1" @@ -4625,14 +4738,14 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [extras] audio = ["pydub"] aws = ["smart-open"] -elasticsearch = ["elasticsearch", "elastic-transport"] -full = ["protobuf", "lz4", "pandas", "pillow", "types-pillow", "av", "pydub", "trimesh"] +elasticsearch = ["elastic-transport", "elasticsearch"] +full = ["av", "lz4", "pandas", "pillow", "protobuf", "pydub", "trimesh", "types-pillow"] hnswlib = ["hnswlib"] image = ["pillow", "types-pillow"] jac = ["jina-hubble-sdk"] mesh = ["trimesh"] pandas = ["pandas"] -proto = ["protobuf", "lz4"] +proto = ["lz4", "protobuf"] torch = ["torch"] video = ["av"] web = ["fastapi"] @@ -4640,4 +4753,4 @@ web = ["fastapi"] [metadata] lock-version = "2.0" python-versions = ">=3.7,<4.0" -content-hash = "a5bae8ca8239347d066e7566dfea56f08d42950f7037e50870cee226809f4b01" +content-hash = "5a07acb92ae45bc42e49e68af897444874d6facd4ed81af4bd9e8d37d7737037" diff --git a/pyproject.toml b/pyproject.toml index ecc72c74719..2b5bc301296 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ hnswlib = {version = ">=0.6.2", optional = true } lz4 = {version= ">=1.0.0", optional = true} pydub = {version = "^0.25.1", optional = true } pandas = {version = ">=1.1.0", optional = true } +weaviate-client = {version = ">=3.15", extras = ["weaviate"]} elasticsearch = {version = ">=7.10.1", optional = true } smart-open = {version = ">=6.3.0", extras = ["s3"], optional = true} jina-hubble-sdk = {version = ">=0.34.0", optional = true} @@ -92,6 +93,7 @@ module = [ "trimesh", "pandas", "av", + "weaviate" ] ignore_missing_imports = true diff --git a/tests/integrations/doc_index/weaviate/docker-compose.yml b/tests/integrations/doc_index/weaviate/docker-compose.yml new file mode 100644 index 00000000000..5cca1e722eb --- /dev/null +++ 
b/tests/integrations/doc_index/weaviate/docker-compose.yml @@ -0,0 +1,27 @@ +version: '3.8' + +services: + + weaviate: + command: + - --host + - 0.0.0.0 + - --port + - '8080' + - --scheme + - http + image: semitechnologies/weaviate:1.18.3 + ports: + - "8080:8080" + restart: on-failure:0 + environment: + QUERY_DEFAULTS_LIMIT: 25 + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' + PERSISTENCE_DATA_PATH: '/var/lib/weaviate' + DEFAULT_VECTORIZER_MODULE: 'none' + ENABLE_MODULES: '' + CLUSTER_HOSTNAME: 'node1' + LOG_LEVEL: debug # verbose + LOG_FORMAT: text + # LOG_LEVEL: trace # very verbose + GODEBUG: gctrace=1 # make go garbage collector verbose \ No newline at end of file diff --git a/tests/integrations/doc_index/weaviate/fixture_weaviate.py b/tests/integrations/doc_index/weaviate/fixture_weaviate.py new file mode 100644 index 00000000000..786a92b2a00 --- /dev/null +++ b/tests/integrations/doc_index/weaviate/fixture_weaviate.py @@ -0,0 +1,41 @@ +import os +import time + +import pytest +import requests +import weaviate + +HOST = "http://localhost:8080" + + +cur_dir = os.path.dirname(os.path.abspath(__file__)) +weaviate_yml = os.path.abspath(os.path.join(cur_dir, 'docker-compose.yml')) + + +@pytest.fixture(scope='session', autouse=True) +def start_storage(): + os.system(f"docker-compose -f {weaviate_yml} up -d --remove-orphans") + _wait_for_weaviate() + + yield + os.system(f"docker-compose -f {weaviate_yml} down --remove-orphans") + + +def _wait_for_weaviate(): + while True: + try: + response = requests.get(f"{HOST}/v1/.well-known/ready") + if response.status_code == 200: + return + else: + time.sleep(0.5) + except requests.exceptions.ConnectionError: + time.sleep(1) + + +@pytest.fixture +def weaviate_client(start_storage): + client = weaviate.Client(HOST) + client.schema.delete_all() + yield client + client.schema.delete_all() diff --git a/tests/integrations/doc_index/weaviate/test_column_config_weaviate.py 
b/tests/integrations/doc_index/weaviate/test_column_config_weaviate.py new file mode 100644 index 00000000000..a3050e9b9ba --- /dev/null +++ b/tests/integrations/doc_index/weaviate/test_column_config_weaviate.py @@ -0,0 +1,33 @@ +# TODO: enable ruff qa on this file when we figure out why it thinks weaviate_client is +# redefined at each test that fixture +# ruff: noqa +from pydantic import Field + +from docarray import BaseDoc +from docarray.index.backends.weaviate import WeaviateDocumentIndex +from tests.integrations.doc_index.weaviate.fixture_weaviate import ( # noqa: F401 + start_storage, + weaviate_client, +) + + +def test_column_config(weaviate_client): + def get_text_field_data_type(store, index_name): + props = store._client.schema.get(index_name)["properties"] + text_field = [p for p in props if p["name"] == "text"][0] + + return text_field["dataType"][0] + + class TextDoc(BaseDoc): + text: str = Field() + + class StringDoc(BaseDoc): + text: str = Field(col_type="string") + + dbconfig = WeaviateDocumentIndex.DBConfig(index_name="TextDoc") + store = WeaviateDocumentIndex[TextDoc](db_config=dbconfig) + assert get_text_field_data_type(store, "TextDoc") == "text" + + dbconfig = WeaviateDocumentIndex.DBConfig(index_name="StringDoc") + store = WeaviateDocumentIndex[StringDoc](db_config=dbconfig) + assert get_text_field_data_type(store, "StringDoc") == "string" diff --git a/tests/integrations/doc_index/weaviate/test_find_weaviate.py b/tests/integrations/doc_index/weaviate/test_find_weaviate.py new file mode 100644 index 00000000000..c54d167c634 --- /dev/null +++ b/tests/integrations/doc_index/weaviate/test_find_weaviate.py @@ -0,0 +1,66 @@ +# TODO: enable ruff qa on this file when we figure out why it thinks weaviate_client is +# redefined at each test that fixture +# ruff: noqa +import numpy as np +import pytest +import torch +from pydantic import Field + +from docarray import BaseDoc +from docarray.index.backends.weaviate import WeaviateDocumentIndex +from 
docarray.typing import TorchTensor +from tests.integrations.doc_index.weaviate.fixture_weaviate import ( # noqa: F401 + start_storage, + weaviate_client, +) + + +def test_find_torch(weaviate_client): + class TorchDoc(BaseDoc): + tens: TorchTensor[10] = Field(dims=10, is_embedding=True) + + store = WeaviateDocumentIndex[TorchDoc]() + + index_docs = [ + TorchDoc(tens=np.random.rand(10).astype(dtype=np.float32)) for _ in range(10) + ] + store.index(index_docs) + + query = index_docs[-1] + docs, scores = store.find(query, limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + for doc in docs: + assert isinstance(doc.tens, TorchTensor) + + assert docs[0].id == index_docs[-1].id + assert torch.allclose(docs[0].tens, index_docs[-1].tens) + + +@pytest.mark.tensorflow +def test_find_tensorflow(): + from docarray.typing import TensorFlowTensor + + class TfDoc(BaseDoc): + tens: TensorFlowTensor[10] = Field(dims=10, is_embedding=True) + + store = WeaviateDocumentIndex[TfDoc]() + + index_docs = [ + TfDoc(tens=np.random.rand(10).astype(dtype=np.float32)) for _ in range(10) + ] + store.index(index_docs) + + query = index_docs[-1] + docs, scores = store.find(query, limit=5) + + assert len(docs) == 5 + assert len(scores) == 5 + for doc in docs: + assert isinstance(doc.tens, TensorFlowTensor) + + assert docs[0].id == index_docs[-1].id + assert np.allclose( + docs[0].tens.unwrap().numpy(), index_docs[-1].tens.unwrap().numpy() + ) diff --git a/tests/integrations/doc_index/weaviate/test_index_get_del_weaviate.py b/tests/integrations/doc_index/weaviate/test_index_get_del_weaviate.py new file mode 100644 index 00000000000..e9c218d45a4 --- /dev/null +++ b/tests/integrations/doc_index/weaviate/test_index_get_del_weaviate.py @@ -0,0 +1,452 @@ +# TODO: enable ruff qa on this file when we figure out why it thinks weaviate_client is +# redefined at each test that fixture +# ruff: noqa +import logging + +import numpy as np +import pytest +from pydantic import Field + +from docarray 
import BaseDoc +from docarray.documents import ImageDoc, TextDoc +from docarray.index.backends.weaviate import ( + DOCUMENTID, + EmbeddedOptions, + WeaviateDocumentIndex, +) +from docarray.typing import NdArray +from tests.integrations.doc_index.weaviate.fixture_weaviate import ( # noqa: F401 + HOST, + start_storage, + weaviate_client, +) + + +class SimpleDoc(BaseDoc): + tens: NdArray[10] = Field(dim=1000, is_embedding=True) + + +class Document(BaseDoc): + embedding: NdArray[2] = Field(dim=2, is_embedding=True) + text: str = Field() + + +class NestedDocument(BaseDoc): + text: str = Field() + child: Document + + +@pytest.fixture +def ten_simple_docs(): + return [SimpleDoc(tens=np.random.randn(10)) for _ in range(10)] + + +@pytest.fixture +def documents(): + texts = ["lorem ipsum", "dolor sit amet", "consectetur adipiscing elit"] + embeddings = [[10, 10], [10.5, 10.5], [-100, -100]] + + # create the docs by enumerating from 1 and use that as the id + docs = [ + Document(id=str(i), embedding=embedding, text=text) + for i, (embedding, text) in enumerate(zip(embeddings, texts)) + ] + + yield docs + + +@pytest.fixture +def test_store(weaviate_client, documents): + store = WeaviateDocumentIndex[Document]() + store.index(documents) + yield store + + +def test_index_simple_schema(weaviate_client, ten_simple_docs): + store = WeaviateDocumentIndex[SimpleDoc]() + store.index(ten_simple_docs) + assert store.num_docs() == 10 + + for doc in ten_simple_docs: + doc_id = doc.id + doc_embedding = doc.tens + + result = ( + weaviate_client.query.get("Document", DOCUMENTID) + .with_additional("vector") + .with_where( + {"path": [DOCUMENTID], "operator": "Equal", "valueString": doc_id} + ) + .do() + ) + + result = result["data"]["Get"]["Document"][0] + assert result[DOCUMENTID] == doc_id + assert np.allclose(result["_additional"]["vector"], doc_embedding) + + +def test_validate_columns(weaviate_client): + dbconfig = WeaviateDocumentIndex.DBConfig(host=HOST) + + class 
InvalidDoc1(BaseDoc): + tens: NdArray[10] = Field(dim=1000, is_embedding=True) + tens2: NdArray[10] = Field(dim=1000, is_embedding=True) + + class InvalidDoc2(BaseDoc): + tens: int = Field(dim=1000, is_embedding=True) + + with pytest.raises(ValueError, match=r"Only one column can be marked as embedding"): + WeaviateDocumentIndex[InvalidDoc1](db_config=dbconfig) + + with pytest.raises(ValueError, match=r"marked as embedding but is not of type"): + WeaviateDocumentIndex[InvalidDoc2](db_config=dbconfig) + + +def test_find(weaviate_client, caplog): + class Document(BaseDoc): + embedding: NdArray[2] = Field(dim=2, is_embedding=True) + + vectors = [[10, 10], [10.5, 10.5], [-100, -100]] + docs = [Document(embedding=vector) for vector in vectors] + + store = WeaviateDocumentIndex[Document]() + store.index(docs) + + query = [10.1, 10.1] + + results = store.find( + query, search_field='', limit=3, score_name="distance", score_threshold=1e-2 + ) + assert len(results) == 2 + + results = store.find(query, search_field='', limit=3, score_threshold=0.99) + assert len(results) == 2 + + with pytest.raises( + ValueError, + match=r"Argument search_field is not supported for WeaviateDocumentIndex", + ): + store.find(query, search_field="foo", limit=10) + + +def test_find_batched(weaviate_client, caplog): + class Document(BaseDoc): + embedding: NdArray[2] = Field(dim=2, is_embedding=True) + + vectors = [[10, 10], [10.5, 10.5], [-100, -100]] + docs = [Document(embedding=vector) for vector in vectors] + + store = WeaviateDocumentIndex[Document]() + store.index(docs) + + queries = np.array([[10.1, 10.1], [-100, -100]]) + + results = store.find_batched( + queries, search_field='', limit=3, score_name="distance", score_threshold=1e-2 + ) + assert len(results) == 2 + assert len(results.documents[0]) == 2 + assert len(results.documents[1]) == 1 + + results = store.find_batched( + queries, search_field='', limit=3, score_name="certainty" + ) + assert len(results) == 2 + assert 
len(results.documents[0]) == 3 + assert len(results.documents[1]) == 3 + + with pytest.raises( + ValueError, + match=r"Argument search_field is not supported for WeaviateDocumentIndex", + ): + store.find_batched(queries, search_field="foo", limit=10) + + +@pytest.mark.parametrize( + "filter_query, expected_num_docs", + [ + ({"path": ["text"], "operator": "Equal", "valueText": "lorem ipsum"}, 1), + ({"path": ["text"], "operator": "Equal", "valueText": "foo"}, 0), + ({"path": ["id"], "operator": "Equal", "valueString": "1"}, 1), + ], +) +def test_filter(test_store, filter_query, expected_num_docs): + docs = test_store.filter(filter_query, limit=3) + actual_num_docs = len(docs) + + assert actual_num_docs == expected_num_docs + + +@pytest.mark.parametrize( + "filter_queries, expected_num_docs", + [ + ( + [ + {"path": ["text"], "operator": "Equal", "valueText": "lorem ipsum"}, + {"path": ["text"], "operator": "Equal", "valueText": "foo"}, + ], + [1, 0], + ), + ( + [ + {"path": ["id"], "operator": "Equal", "valueString": "1"}, + {"path": ["id"], "operator": "Equal", "valueString": "2"}, + ], + [1, 0], + ), + ], +) +def test_filter_batched(test_store, filter_queries, expected_num_docs): + filter_queries = [ + {"path": ["text"], "operator": "Equal", "valueText": "lorem ipsum"}, + {"path": ["text"], "operator": "Equal", "valueText": "foo"}, + ] + + results = test_store.filter_batched(filter_queries, limit=3) + actual_num_docs = [len(docs) for docs in results] + assert actual_num_docs == expected_num_docs + + +def test_text_search(test_store): + results = test_store.text_search(query="lorem", search_field="text", limit=3) + assert len(results.documents) == 1 + + +def test_text_search_batched(test_store): + text_queries = ["lorem", "foo"] + + results = test_store.text_search_batched( + queries=text_queries, search_field="text", limit=3 + ) + assert len(results.documents[0]) == 1 + assert len(results.documents[1]) == 0 + + +def test_del_items(test_store): + del 
test_store[["1", "2"]] + assert test_store.num_docs() == 1 + + +def test_get_items(test_store): + docs = test_store[["1", "2"]] + assert len(docs) == 2 + assert set(doc.id for doc in docs) == {'1', '2'} + + +def test_index_nested_documents(weaviate_client): + store = WeaviateDocumentIndex[NestedDocument]() + document = NestedDocument( + text="lorem ipsum", child=Document(embedding=[10, 10], text="dolor sit amet") + ) + store.index([document]) + assert store.num_docs() == 1 + + +@pytest.mark.parametrize( + "search_field, query, expected_num_docs", + [ + ("text", "lorem", 1), + ("child__text", "dolor", 1), + ("text", "foo", 0), + ("child__text", "bar", 0), + ], +) +def test_text_search_nested_documents( + weaviate_client, search_field, query, expected_num_docs +): + store = WeaviateDocumentIndex[NestedDocument]() + document = NestedDocument( + text="lorem ipsum", child=Document(embedding=[10, 10], text="dolor sit amet") + ) + store.index([document]) + + results = store.text_search(query=query, search_field=search_field, limit=3) + + assert len(results.documents) == expected_num_docs + + +def test_reuse_existing_schema(weaviate_client, caplog): + WeaviateDocumentIndex[SimpleDoc]() + + with caplog.at_level(logging.DEBUG): + WeaviateDocumentIndex[SimpleDoc]() + assert "Will reuse existing schema" in caplog.text + + +def test_query_builder(test_store): + query_embedding = [10.25, 10.25] + query_text = "ipsum" + where_filter = {"path": ["id"], "operator": "Equal", "valueString": "1"} + q = ( + test_store.build_query() + .find(query=query_embedding) + .filter(where_filter) + .build() + ) + + docs = test_store.execute_query(q) + assert len(docs) == 1 + + q = ( + test_store.build_query() + .text_search(query=query_text, search_field="text") + .build() + ) + + docs = test_store.execute_query(q) + assert len(docs) == 1 + + +def test_batched_query_builder(test_store): + query_embeddings = [[10.25, 10.25], [-100, -100]] + query_texts = ["ipsum", "foo"] + where_filters = 
[{"path": ["id"], "operator": "Equal", "valueString": "1"}] + + q = ( + test_store.build_query() + .find_batched( + queries=query_embeddings, score_name="certainty", score_threshold=0.99 + ) + .filter_batched(filters=where_filters) + .build() + ) + + docs = test_store.execute_query(q) + assert len(docs[0]) == 1 + assert len(docs[1]) == 0 + + q = ( + test_store.build_query() + .text_search_batched(queries=query_texts, search_field="text") + .build() + ) + + docs = test_store.execute_query(q) + assert len(docs[0]) == 1 + assert len(docs[1]) == 0 + + +def test_raw_graphql(test_store): + graphql_query = """ + { + Aggregate { + Document { + meta { + count + } + } + } + } + """ + + results = test_store.execute_query(graphql_query) + num_docs = results["data"]["Aggregate"]["Document"][0]["meta"]["count"] + + assert num_docs == 3 + + +def test_hybrid_query(test_store): + query_embedding = [10.25, 10.25] + query_text = "ipsum" + where_filter = {"path": ["id"], "operator": "Equal", "valueString": "1"} + + q = ( + test_store.build_query() + .find(query=query_embedding) + .text_search(query=query_text, search_field="text") + .filter(where_filter) + .build() + ) + + docs = test_store.execute_query(q) + assert len(docs) == 1 + + +def test_hybrid_query_batched(test_store): + query_embeddings = [[10.25, 10.25], [-100, -100]] + query_texts = ["dolor", "elit"] + + q = ( + test_store.build_query() + .find_batched( + queries=query_embeddings, score_name="certainty", score_threshold=0.99 + ) + .text_search_batched(queries=query_texts, search_field="text") + .build() + ) + + docs = test_store.execute_query(q) + assert docs[0][0].id == '1' + assert docs[1][0].id == '2' + + +def test_index_multi_modal_doc(): + class MyMultiModalDoc(BaseDoc): + image: ImageDoc + text: TextDoc + + store = WeaviateDocumentIndex[MyMultiModalDoc]() + + doc = [ + MyMultiModalDoc( + image=ImageDoc(embedding=np.random.randn(128)), text=TextDoc(text='hello') + ) + ] + store.index(doc) + + id_ = doc[0].id + assert 
store[id_].id == id_ + assert np.all(store[id_].image.embedding == doc[0].image.embedding) + assert store[id_].text.text == doc[0].text.text + + +def test_index_document_with_bytes(weaviate_client): + doc = ImageDoc(id="1", url="www.foo.com", bytes_=b"foo") + + store = WeaviateDocumentIndex[ImageDoc]() + store.index([doc]) + + results = store.filter( + filter_query={"path": ["id"], "operator": "Equal", "valueString": "1"} + ) + + assert doc == results[0] + + +def test_index_document_with_no_embeddings(weaviate_client): + # define a document that does not have any field where is_embedding=True + class Document(BaseDoc): + not_embedding: NdArray[2] = Field(dim=2) + text: str + + doc = Document(not_embedding=[2, 5], text="dolor sit amet", id="1") + + store = WeaviateDocumentIndex[Document]() + + store.index([doc]) + + results = store.filter( + filter_query={"path": ["id"], "operator": "Equal", "valueString": "1"} + ) + + assert doc == results[0] + + +def test_limit_query_builder(test_store): + query_vector = [10.25, 10.25] + q = test_store.build_query().find(query=query_vector).limit(2) + + docs = test_store.execute_query(q) + assert len(docs) == 2 + + +@pytest.mark.linux +def test_embedded_weaviate(): + class Document(BaseDoc): + text: str + + embedded_options = EmbeddedOptions() + db_config = WeaviateDocumentIndex.DBConfig(embedded_options=embedded_options) + store = WeaviateDocumentIndex[Document](db_config=db_config) + + assert store._client._connection.embedded_db From a077cc69ab806a091ddd345c116ddbd9fda9521c Mon Sep 17 00:00:00 2001 From: Nan Wang Date: Thu, 13 Apr 2023 16:59:34 +0200 Subject: [PATCH 17/20] docs: add sending section (#1350) * docs: add serialization for json Signed-off-by: nan-wang * docs: add serialization for binary and protobuf Signed-off-by: nan-wang * docs: add serialization for base64 and bytes Signed-off-by: nan-wang * docs: add serialization for csv Signed-off-by: nan-wang * docs: add serialization for dataframe Signed-off-by: nan-wang * 
fix: add doctring to documentaion basedoc Signed-off-by: samsja * fix: fix mypy Signed-off-by: samsja * fix: add docstring doc list Signed-off-by: samsja * fix: dic doc array docstring Signed-off-by: samsja * fix: fix page for doc list serilizaiton Signed-off-by: samsja * fix: fix docstring Signed-off-by: samsja * feat: add docvec Signed-off-by: samsja * docs: add send doc section Signed-off-by: samsja * docs: fix docstring Signed-off-by: samsja * refactor: better tree structure for sending Signed-off-by: samsja * fix: fix tests Signed-off-by: samsja * fix: fix python code snippet ods Signed-off-by: samsja * fix: fix remove breakpoint Signed-off-by: samsja * feat: add intro Signed-off-by: samsja * feat: add ref Signed-off-by: samsja * feat: move fastapi part Signed-off-by: samsja * fix: fix fastAPI Signed-off-by: samsja * fix: remove uselss mixin Signed-off-by: samsja * faet: add jina section Signed-off-by: samsja * fix: compress -> compression Signed-off-by: samsja * feat: apply suggestion Co-authored-by: Alex Cureton-Griffiths Co-authored-by: Charlotte Gerhaher Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> * fix: apply alex suggestion Signed-off-by: samsja * wip Signed-off-by: samsja * fix: fix all docstring Signed-off-by: samsja * fix: fix update docstring Signed-off-by: samsja * fix: fix ruff Signed-off-by: samsja * fix: fix smth Signed-off-by: samsja * feat: apply charllote suggestion Co-authored-by: Charlotte Gerhaher Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> --------- Signed-off-by: nan-wang Signed-off-by: samsja Signed-off-by: samsja <55492238+samsja@users.noreply.github.com> Co-authored-by: samsja Co-authored-by: samsja <55492238+samsja@users.noreply.github.com> Co-authored-by: Alex Cureton-Griffiths Co-authored-by: Charlotte Gerhaher --- docarray/array/any_array.py | 21 +-- docarray/array/doc_list/doc_list.py | 1 + docarray/array/doc_list/io.py | 29 +-- docarray/base_doc/doc.py | 81 ++++++++- 
docarray/base_doc/mixins/io.py | 2 +- docarray/base_doc/mixins/update.py | 38 ++-- docs/api_references/array/da.md | 3 +- docs/api_references/base_doc/base_doc.md | 3 + .../sending/api/fastAPI.md} | 12 +- .../sending/api/jina.md} | 2 + docs/user_guide/sending/first_step.md | 13 +- docs/user_guide/sending/ser/send_doc.md | 55 ++++++ docs/user_guide/sending/ser/send_doclist.md | 165 ++++++++++++++++++ docs/user_guide/sending/ser/send_docvec.md | 30 ++++ mkdocs.yml | 13 +- tests/documentation/test_docs.py | 32 ++-- 16 files changed, 437 insertions(+), 63 deletions(-) rename docs/{integrations/fastapi.md => user_guide/sending/api/fastAPI.md} (90%) rename docs/{how_to/audio2text.md => user_guide/sending/api/jina.md} (99%) create mode 100644 docs/user_guide/sending/ser/send_doc.md create mode 100644 docs/user_guide/sending/ser/send_doclist.md create mode 100644 docs/user_guide/sending/ser/send_docvec.md diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index 31d1dedb067..e3b46132ee6 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -121,7 +121,7 @@ def _set_data_column( field: str, values: Union[List, T, 'AbstractTensor'], ): - """Set all Documents in this [`DocList`][docarray.DocList] using the passed values + """Set all Documents in this [`DocList`][docarray.array.doc_list.doc_list.DocList] using the passed values :param field: name of the fields to extract :values: the values to set at the DocList level @@ -140,8 +140,8 @@ def to_protobuf(self) -> 'DocListProto': ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a [`DocList`][docarray.DocList] into a NodeProto protobuf message. - This function should be called when a DocList + """Convert a [`DocList`][docarray.array.doc_list.doc_list.DocList] into a NodeProto protobuf message. 
+ This function should be called when a DocList is nested into another Document that need to be converted into a protobuf :return: the nested item protobuf message @@ -157,13 +157,11 @@ def traverse_flat( ) -> Union[List[Any], 'AbstractTensor']: """ Return a List of the accessed objects when applying the `access_path`. If this - results in a nested list or list of [`DocList`s][docarray.DocList], the list will be flattened + results in a nested list or list of [`DocList`s][docarray.array.doc_list.doc_list.DocList], the list will be flattened on the first level. The access path is a string that consists of attribute names, concatenated and `"__"`-separated. It describes the path from the first level to an arbitrary one, e.g. `'content__image__url'`. - :param access_path: a string that represents the access path (`"__"`-separated). - :return: list of the accessed objects, flattened if nested. ```python from docarray import BaseDoc, DocList, Text @@ -210,7 +208,7 @@ class Book(BaseDoc): chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings ``` - If your [`DocList`][docarray.DocList] is in doc_vec mode, and you want to access a field of + If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list: ```python @@ -232,6 +230,9 @@ class Image(BaseDoc): access_path='tensor' ) # tensor of shape (2, 3, 224, 224) ``` + + :param access_path: a string that represents the access path ("__"-separated). + :return: list of the accessed objects, flattened if nested. """ ... @@ -263,7 +264,7 @@ def _flatten_one_level(sequence: List[Any]) -> List[Any]: def summary(self): """ - Print a summary of this [`DocList`][docarray.DocList] object and a summary of the schema of its + Print a summary of this [`DocList`][docarray.array.doc_list.doc_list.DocList] object and a summary of the schema of its Document type. 
""" DocArraySummary(self).summary() @@ -275,13 +276,13 @@ def _batch( show_progress: bool = False, ) -> Generator[T, None, None]: """ - Creates a `Generator` that yields [`DocList`][docarray.DocList] of size `batch_size`. + Creates a `Generator` that yields [`DocList`][docarray.array.doc_list.doc_list.DocList] of size `batch_size`. Note, that the last batch might be smaller than `batch_size`. :param batch_size: Size of each generated batch. :param shuffle: If set, shuffle the Documents before dividing into minibatches. :param show_progress: if set, show a progress bar when batching documents. - :yield: a Generator of [`DocList`][docarray.DocList], each in the length of `batch_size` + :yield: a Generator of [`DocList`][docarray.array.doc_list.doc_list.DocList], each in the length of `batch_size` """ from rich.progress import track diff --git a/docarray/array/doc_list/doc_list.py b/docarray/array/doc_list/doc_list.py index d01d7a31e0d..8eb1a822d59 100644 --- a/docarray/array/doc_list/doc_list.py +++ b/docarray/array/doc_list/doc_list.py @@ -96,6 +96,7 @@ class Image(BaseDoc): # You can also set fields, with `docs.tensor = np.random.random([10, 100])`: + import numpy as np docs.tensor = np.random.random([10, 100]) diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index e0814e89fa8..9f153e2f1bd 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -141,7 +141,7 @@ def from_bytes( :param data: Bytes from which to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize + :param compress: compression algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized `DocList` """ @@ -247,7 +247,7 @@ def to_bytes( For more Pythonic code, please use ``bytes(...)``. :param protocol: protocol to use. 
It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between : `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param file_ctx: File or filename or serialized bytes where the data is stored. :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the binary serialization in bytes or None if file_ctx is passed where to store @@ -277,7 +277,7 @@ def from_base64( :param data: Base64 string to deserialize :param protocol: protocol that was used to serialize - :param compress: compress algorithm that was used to serialize + :param compress: compress algorithm that was used to serialize between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the deserialized `DocList` """ @@ -297,7 +297,7 @@ def to_base64( """Serialize itself into base64 encoded string. :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: the binary serialization in bytes or None if file_ctx is passed where to store """ @@ -566,7 +566,7 @@ def _load_binary_all( ): """Read a `DocList` object from a binary file :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: a `DocList` """ @@ -646,7 +646,7 @@ def _load_binary_stream( """Yield `Document` objects from a binary file :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` :return: a generator of `Document` objects """ @@ -702,13 +702,7 @@ def load_binary( ) -> Union[T, Generator['T_doc', None, None]]: """Load doc_list elements from a compressed binary file. - :param file: File or filename or serialized bytes where the data is stored. - :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use - :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` - :param streaming: if `True` returns a generator over `Document` objects. In case protocol is pickle the `Documents` are streamed from disk to save memory usage - :return: a `DocList` object !!! note If `file` is `str` it can specify `protocol` and `compress` as file extensions. @@ -716,6 +710,15 @@ def load_binary( string interpolation of the respective `protocol` and `compress` methods. For example if `file=my_docarray.protobuf.lz4` then the binary data will be loaded assuming `protocol=protobuf` and `compress=lz4`. + + :param file: File or filename or serialized bytes where the data is stored. + :param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` + :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` + :param streaming: if `True` returns a generator over `Document` objects. + + :return: a `DocList` object + """ load_protocol: Optional[str] = protocol load_compress: Optional[str] = compress @@ -765,7 +768,7 @@ def save_binary( :param file: File or filename to which the data is saved. :param protocol: protocol to use. 
It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compress algorithm to use between `lz4`, `bz2`, `lzma`, `zlib`, `gzip` :param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf` !!! note diff --git a/docarray/base_doc/doc.py b/docarray/base_doc/doc.py index a5c42a82ee4..0ed39bd0d49 100644 --- a/docarray/base_doc/doc.py +++ b/docarray/base_doc/doc.py @@ -1,5 +1,15 @@ import os -from typing import TYPE_CHECKING, Any, Dict, Optional, Type, TypeVar +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Optional, + Type, + TypeVar, + Union, + no_type_check, +) import orjson from pydantic import BaseModel, Field @@ -12,11 +22,16 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor if TYPE_CHECKING: + from pydantic import Protocol + from pydantic.types import StrBytes + from pydantic.typing import AbstractSetIntStr, MappingIntStrAny + from docarray.array.doc_vec.column_storage import ColumnStorageView _console: Console = Console() T = TypeVar('T', bound='BaseDoc') +T_update = TypeVar('T_update', bound='UpdateMixin') class BaseDoc(BaseModel, IOMixin, UpdateMixin, BaseNode): @@ -141,3 +156,67 @@ def _docarray_to_json_compatible(self) -> Dict: :return: A dictionary of the BaseDoc object """ return self.dict() + + ######################################################################################################################################################## + ### this section is just for documentation purposes will be removed later once https://github.com/mkdocstrings/griffe/issues/138 is fixed ############## + ######################################################################################################################################################## + + def json( + self, + *, + include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None, + exclude: Optional[Union['AbstractSetIntStr', 
'MappingIntStrAny']] = None, + by_alias: bool = False, + skip_defaults: Optional[bool] = None, + exclude_unset: bool = False, + exclude_defaults: bool = False, + exclude_none: bool = False, + encoder: Optional[Callable[[Any], Any]] = None, + models_as_dict: bool = True, + **dumps_kwargs: Any, + ) -> str: + """ + Generate a JSON representation of the model, `include` and `exclude` arguments as per `dict()`. + + `encoder` is an optional function to supply as `default` to json.dumps(), other arguments as per `json.dumps()`. + """ + return super().json( + include=include, + exclude=exclude, + by_alias=by_alias, + skip_defaults=skip_defaults, + exclude_unset=exclude_unset, + exclude_defaults=exclude_defaults, + exclude_none=exclude_none, + encoder=encoder, + models_as_dict=models_as_dict, + **dumps_kwargs, + ) + + @no_type_check + @classmethod + def parse_raw( + cls: Type[T], + b: 'StrBytes', + *, + content_type: str = None, + encoding: str = 'utf8', + proto: 'Protocol' = None, + allow_pickle: bool = False, + ) -> T: + """ + Parse a raw string or bytes into a base doc + :param b: + :param content_type: + :param encoding: the encoding to use when parsing a string, defaults to 'utf8' + :param proto: protocol to use. + :param allow_pickle: allow pickle protocol + :return: a document + """ + return super(BaseDoc, cls).parse_raw( + b, + content_type=content_type, + encoding=encoding, + proto=proto, + allow_pickle=allow_pickle, + ) diff --git a/docarray/base_doc/mixins/io.py b/docarray/base_doc/mixins/io.py index b2a64e8082b..e50d9ac791d 100644 --- a/docarray/base_doc/mixins/io.py +++ b/docarray/base_doc/mixins/io.py @@ -138,7 +138,7 @@ def to_bytes( For more Pythonic code, please use ``bytes(...)``. :param protocol: protocol to use. 
It can be 'pickle' or 'protobuf' - :param compress: compress algorithm to use + :param compress: compression algorithm to use :return: the binary serialization in bytes """ import pickle diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index 99fdbc2bf8e..471e97483ba 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -25,7 +25,8 @@ def update(self, other: T): Updates self with the content of other. Changes are applied to self. Updating one Document with another consists in the following: - setting data properties of the second Document to the first Document - if they are not None + if they are not None: + - Concatenating lists and updating sets - Updating recursively Documents and DocArrays - Updating Dictionaries of the left with the right @@ -38,30 +39,33 @@ def update(self, other: T): so they behave as regular types and the value of `self` is updated with the value of `other` - EXAMPLE USAGE - .. code-block:: python + --- + + ```python + from typing import List, Optional - from docarray import BaseDoc - from docarray.documents import Text + from docarray import BaseDoc - class MyDocument(BaseDoc): - content: str - title: Optional[str] = None - tags_: List + class MyDocument(BaseDoc): + content: str + title: Optional[str] = None + tags_: List - doc1 = MyDocument( - content='Core content of the document', title='Title', tags_=['python', 'AI'] - ) - doc2 = MyDocument(content='Core content updated', tags_=['docarray']) + doc1 = MyDocument( + content='Core content of the document', title='Title', tags_=['python', 'AI'] + ) + doc2 = MyDocument(content='Core content updated', tags_=['docarray']) - doc1.update(doc2) - assert doc1.content == 'Core content updated' - assert doc1.title == 'Title' - assert doc1.tags_ == ['python', 'AI', 'docarray'] + doc1.update(doc2) + assert doc1.content == 'Core content updated' + assert doc1.title == 'Title' + assert doc1.tags_ == ['python', 'AI', 'docarray'] + 
``` + --- :param other: The Document with which to update the contents of this """ if type(self) != type(other): diff --git a/docs/api_references/array/da.md b/docs/api_references/array/da.md index 28e1aa94efa..e1f5b33f008 100644 --- a/docs/api_references/array/da.md +++ b/docs/api_references/array/da.md @@ -1,4 +1,5 @@ # DocList ::: docarray.array.doc_list.doc_list.DocList -::: docarray.array.doc_list.pushpull.PushPullMixin \ No newline at end of file +::: docarray.array.doc_list.io.IOMixinArray +::: docarray.array.doc_list.pushpull.PushPullMixin diff --git a/docs/api_references/base_doc/base_doc.md b/docs/api_references/base_doc/base_doc.md index 0fe2dc80891..abce654ee96 100644 --- a/docs/api_references/base_doc/base_doc.md +++ b/docs/api_references/base_doc/base_doc.md @@ -1,3 +1,6 @@ # BaseDoc ::: docarray.base_doc.doc.BaseDoc +::: docarray.base_doc.mixins.io.IOMixin +::: docarray.base_doc.mixins.update.UpdateMixin + diff --git a/docs/integrations/fastapi.md b/docs/user_guide/sending/api/fastAPI.md similarity index 90% rename from docs/integrations/fastapi.md rename to docs/user_guide/sending/api/fastAPI.md index e55b09fba9e..d35308fefce 100644 --- a/docs/integrations/fastapi.md +++ b/docs/user_guide/sending/api/fastAPI.md @@ -1,9 +1,15 @@ -# Use DocArray with FastAPI +# FastAPI -FastAPI is a high-performance web framework for building APIs with Python. It's designed to be easy to use and supports asynchronous programming. -Since [`DocArray` documents are Pydantic Models (with a twist)](../user_guide/representing/first_step.md) they can be easily integrated with FastAPI, +[FastAPI](https://fastapi.tiangolo.com/) is a high-performance web framework for building APIs with Python based on Python type hints. It's designed to be easy to use and supports asynchronous programming. 
+Since [`DocArray` documents are Pydantic Models (with a twist)](../../representing/first_step.md) they can be easily integrated with FastAPI, and provide a seamless and efficient way to work with multimodal data in FastAPI-powered APIs. +!!! note + you need to install FastAPI to follow this section + ``` + pip install fastapi + ``` + First, you should define schemas for your input and/or output Documents: ```python diff --git a/docs/how_to/audio2text.md b/docs/user_guide/sending/api/jina.md similarity index 99% rename from docs/how_to/audio2text.md rename to docs/user_guide/sending/api/jina.md index d2f2507e08f..cbdf50acd2a 100644 --- a/docs/how_to/audio2text.md +++ b/docs/user_guide/sending/api/jina.md @@ -1,3 +1,5 @@ +# Jina + # Create an audio to text app with Jina and DocArray V2 This is how you can build an Audio to Text app using Jina, DocArray and Whisper. diff --git a/docs/user_guide/sending/first_step.md b/docs/user_guide/sending/first_step.md index 1079b9dd75b..6e2d2608943 100644 --- a/docs/user_guide/sending/first_step.md +++ b/docs/user_guide/sending/first_step.md @@ -1 +1,12 @@ -# Sending data +# Intro + +In the representation section we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] +to represent multi-modal data. In this section we will see **how to send these data over the wire**. 
+ + +This section is divided into two: + +- [Serialization](./ser/send_doc.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] +- [Using DocArray with a web framework to build a multimodal API](./api/jina.md) + + diff --git a/docs/user_guide/sending/ser/send_doc.md b/docs/user_guide/sending/ser/send_doc.md new file mode 100644 index 00000000000..dd77557dbba --- /dev/null +++ b/docs/user_guide/sending/ser/send_doc.md @@ -0,0 +1,55 @@ +# BaseDoc + +You need to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] before you can store or send it. + +!!! note + [BaseDoc][docarray.base_doc.doc.BaseDoc] supports serialization to `protobuf` and `json` formats. + +## Serialization to protobuf + +You can use [`to_protobuf`][docarray.base_doc.mixins.io.IOMixin.to_protobuf] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a protobuf message object +and use [`from_protobuf`][docarray.base_doc.mixins.io.IOMixin.from_protobuf] to deserialize it. + +```python +from typing import List +from docarray import BaseDoc + + +class MyDoc(BaseDoc): + text: str + tags: List[str] + + +doc = MyDoc(text='hello world', tags=['hello', 'world']) +proto_message = doc.to_protobuf() +new_doc = MyDoc.from_protobuf(proto_message) +assert doc == new_doc # True +``` + +## Serialization to JSON + +You can use [`json`][docarray.base_doc.doc.BaseDoc.json] to serialize a [BaseDoc][docarray.base_doc.doc.BaseDoc] to a json string +and use [`parse_raw`][docarray.base_doc.doc.BaseDoc.parse_raw] to deserialize it. 
+ +```python +from typing import List +from docarray import BaseDoc + + +class MyDoc(BaseDoc): + text: str + tags: List[str] + + +doc = MyDoc(text='hello world', tags=['hello', 'world']) +json_str = doc.json() +new_doc = MyDoc.parse_raw(json_str) +assert doc == new_doc # True +``` + +See also: + +* The serializing [DocList](./send_doclist.md) section +* The serializing [DocVec](./send_docvec.md) section + + diff --git a/docs/user_guide/sending/ser/send_doclist.md b/docs/user_guide/sending/ser/send_doclist.md new file mode 100644 index 00000000000..70b1789ca5f --- /dev/null +++ b/docs/user_guide/sending/ser/send_doclist.md @@ -0,0 +1,165 @@ +# DocList +When sending or storing [`DocList`][docarray.array.doc_list.doc_list.DocList], you need to use serialization. [DocList][docarray.array.doc_list.doc_list.DocList] supports multiple ways to serialize the data. + +## JSON +You can use [`to_json()`][docarray.array.doc_list.io.IOMixinArray.to_json] and [`from_json()`][docarray.array.doc_list.io.IOMixinArray.from_json] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +with open('simple-dl.json', 'wb') as f: + json_dl = dl.to_json() + print(json_dl) + f.write(json_dl) + +with open('simple-dl.json', 'r') as f: + dl_load_from_json = DocList[SimpleDoc].from_json(f.read()) + print(dl_load_from_json) +``` + +[to_json()][docarray.array.doc_list.io.IOMixinArray.to_json] returns the binary representation of the json object. [from_json()][docarray.array.doc_list.io.IOMixinArray.from_json] can load from either `str` or `binary` representation of the json object. 
+ +```output +b'[{"id":"5540e72d407ae81abb2390e9249ed066","text":"doc 0"},{"id":"fbe9f80d2fa03571e899a2887af1ac1b","text":"doc 1"}]' + +## Protobuf +To serialize a DocList with `protobuf`, you can use [`to_protobuf()`][docarray.array.doc_list.io.IOMixinArray.to_protobuf] and [`from_protobuf()`][docarray.array.doc_list.io.IOMixinArray.from_protobuf] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): +    text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +proto_message_dl = dl.to_protobuf() +dl_from_proto = DocList[SimpleDoc].from_protobuf(proto_message_dl) +print(type(proto_message_dl)) +print(dl_from_proto) +``` + +[to_protobuf()][docarray.array.doc_list.io.IOMixinArray.to_protobuf] returns a protobuf object of `docarray_pb2.DocListProto` class. [from_protobuf()][docarray.array.doc_list.io.IOMixinArray.from_protobuf] accepts a protobuf message object to construct a [DocList][docarray.array.doc_list.doc_list.DocList]. + +## Base64 +When transferring over the network, you can choose `Base64` format to serialize the [`DocList`][docarray.array.doc_list.doc_list.DocList]. +Serializing a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64 supports both `pickle` and `protobuf` protocols. Besides, you can choose different compression methods. + +To serialize a [DocList][docarray.array.doc_list.doc_list.DocList] in Base64, you can use [`to_base64()`][docarray.array.doc_list.io.IOMixinArray.to_base64] and [`from_base64()`][docarray.array.doc_list.io.IOMixinArray.from_base64] to serialize and deserialize a [DocList][docarray.array.doc_list.doc_list.DocList]: + +We support multiple compression methods. 
(namely `lz4`, `bz2`, `lzma`, `zlib`, `gzip`) + + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): +    text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +base64_repr_dl = dl.to_base64(compress=None, protocol='pickle') + +dl_from_base64 = DocList[SimpleDoc].from_base64( +    base64_repr_dl, compress=None, protocol='pickle' +) +``` + +## Binary +Similar to `Base64` serialization, `Binary` serialization also supports different protocols and compression methods. + +To save a [DocList][docarray.array.doc_list.doc_list.DocList] into a binary file, you can use [`save_binary()`][docarray.array.doc_list.io.IOMixinArray.save_binary] and [`load_binary()`][docarray.array.doc_list.io.IOMixinArray.load_binary] to save and load a [DocList][docarray.array.doc_list.doc_list.DocList]: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): +    text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +dl.save_binary('simple-dl.pickle', compress=None, protocol='pickle') + +dl_from_binary = DocList[SimpleDoc].load_binary( +    'simple-dl.pickle', compress=None, protocol='pickle' +) +``` + +The [DocList][docarray.array.doc_list.doc_list.DocList] is stored in the `simple-dl.pickle` file. + +### Bytes +Under the hood, [save_binary()][docarray.array.doc_list.io.IOMixinArray.save_binary] prepares the file object and calls [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] function to convert the [DocList][docarray.array.doc_list.doc_list.DocList] into a byte object. You can use [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] function directly and use [from_bytes()][docarray.array.doc_list.io.IOMixinArray.from_bytes] to load the [DocList][docarray.array.doc_list.doc_list.DocList] from a byte object. You can use `protocol` to choose between `pickle` and `protobuf`. 
Besides, [to_bytes()][docarray.array.doc_list.io.IOMixinArray.to_bytes] and [save_binary()][docarray.array.doc_list.io.IOMixinArray.save_binary] support multiple options for `compress` as well. + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): +    text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +bytes_dl = dl.to_bytes(protocol='pickle', compress=None) + +dl_from_bytes = DocList[SimpleDoc].from_bytes( +    bytes_dl, compress=None, protocol='pickle' +) +``` + + +## CSV +You can use [`from_csv()`][docarray.array.doc_list.io.IOMixinArray.from_csv] and [`to_csv()`][docarray.array.doc_list.io.IOMixinArray.to_csv] to serialize and deserialize the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a CSV file. Use the `dialect` parameter to choose the dialect of the CSV format: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): +    text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +dl.to_csv('simple-dl.csv') +dl_from_csv = DocList[SimpleDoc].from_csv('simple-dl.csv') +print(dl_from_csv) +``` + + +## Pandas.Dataframe +You can use [`from_dataframe()`][docarray.array.doc_list.io.IOMixinArray.from_dataframe] and [`to_dataframe()`][docarray.array.doc_list.io.IOMixinArray.to_dataframe] to load/save the [DocList][docarray.array.doc_list.doc_list.DocList] from/to a pandas DataFrame: + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): +    text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(2)]) + +df = dl.to_dataframe() +dl_from_dataframe = DocList[SimpleDoc].from_dataframe(df) +print(dl_from_dataframe) +``` + +See also: + +* The serializing [BaseDoc](./send_doc.md) section +* The serializing [DocVec](./send_docvec.md) section diff --git a/docs/user_guide/sending/ser/send_docvec.md b/docs/user_guide/sending/ser/send_docvec.md new file mode 100644 index 
00000000000..3fbaf759075 --- /dev/null +++ b/docs/user_guide/sending/ser/send_docvec.md @@ -0,0 +1,30 @@ +# DocVec + +When sending or storing [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec], you need to use serialization. [DocVec][docarray.array.doc_vec.doc_vec.DocVec] only supports protobuf to serialize the data. +You can use [`to_protobuf`][docarray.array.doc_vec.doc_vec.DocVec.to_protobuf] and [`from_protobuf`][docarray.array.doc_vec.doc_vec.DocVec.from_protobuf] to serialize and deserialize a [DocVec][docarray.array.doc_vec.doc_vec.DocVec]. + +```python +import numpy as np + +from docarray import BaseDoc, DocVec +from docarray.typing import AnyTensor + + +class SimpleVecDoc(BaseDoc): +    tensor: AnyTensor + + +dv = DocVec[SimpleVecDoc]([SimpleVecDoc(tensor=np.ones(16)) for _ in range(8)]) + +proto_message_dv = dv.to_protobuf() + +dv_from_proto = DocVec[SimpleVecDoc].from_protobuf(proto_message_dv) +``` + +!!! note +    We are planning to add more serialization formats in the future, notably JSON. + +[`to_protobuf`][docarray.array.doc_vec.doc_vec.DocVec.to_protobuf] returns a protobuf object of `docarray_pb2.DocVecProto` class. [`from_protobuf`][docarray.array.doc_vec.doc_vec.DocVec.from_protobuf] accepts a protobuf message object to construct a [DocVec][docarray.array.doc_vec.doc_vec.DocVec]. 
+ +* The serializing [BaseDoc](./send_doc.md) section +* The serializing [DocList](./send_doclist.md) section diff --git a/mkdocs.yml b/mkdocs.yml index bd1548a0a22..f8a967fcec5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -81,7 +81,15 @@ nav: - Representing data: - user_guide/representing/first_step.md - user_guide/representing/array.md - - user_guide/sending/first_step.md + - Sending: + - user_guide/sending/first_step.md + - Serialization: + - user_guide/sending/ser/send_doc.md + - user_guide/sending/ser/send_doclist.md + - user_guide/sending/ser/send_docvec.md + - Building API: + - user_guide/sending/api/jina.md + - user_guide/sending/api/fastAPI.md - Storing: - user_guide/storing/first_step.md - user_guide/storing/store_file.md @@ -92,9 +100,6 @@ nav: - how_to/add_doc_index.md - how_to/multimodal_training_and_serving.md - how_to/optimize_performance_with_id_generation.md - - how_to/audio2text.md - - Integrations: - - integrations/fastapi.md - Data Types: - data_types/text/text.md - data_types/image/image.md diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 1d8fe1679b3..b071839c88c 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -4,6 +4,8 @@ from mktestdocs import grab_code_blocks from mktestdocs.__main__ import _executors, check_raw_string +file_to_skip = ['fastAPI', 'jina'] + def check_raw_file_full(raw, lang="python", keyword_ignore=[]): if lang not in _executors: @@ -43,19 +45,25 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): check_raw_file_full(text, lang=lang, keyword_ignore=keyword_ignore) -@pytest.mark.parametrize( - 'fpath', - [ - *list(pathlib.Path('docs/user_guide').glob('**/*.md')), - *list(pathlib.Path('docs/data_types').glob('**/*.md')), - ], - ids=str, -) +files_to_check = [ + *list(pathlib.Path('docs/user_guide').glob('**/*.md')), + *list(pathlib.Path('docs/data_types').glob('**/*.md')), +] + +file_to_remove = [] + +for file in 
files_to_check: + for fn in file_to_skip: + if fn in str(file): + file_to_remove.append(file) + +for file in file_to_remove: + files_to_check.remove(file) + + +@pytest.mark.parametrize('fpath', files_to_check, ids=str) def test_files_good(fpath): - keyword_ignore = [] - if 'store_jac.md' in str(fpath): - keyword_ignore = ['jac'] - check_md_file(fpath=fpath, memory=True, keyword_ignore=keyword_ignore) + check_md_file(fpath=fpath, memory=True, keyword_ignore=['pickle', 'jac']) def test_readme(): From b6c3b66d1afd1bb0e782dc16fb35e0d13e1779a3 Mon Sep 17 00:00:00 2001 From: Charlotte Gerhaher Date: Fri, 14 Apr 2023 09:02:38 +0200 Subject: [PATCH 18/20] fix: apply sami suggestion Co-authored-by: samsja <55492238+samsja@users.noreply.github.com> Signed-off-by: Charlotte Gerhaher Signed-off-by: anna-charlotte --- docs/api_references/doc_store/doc_store.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api_references/doc_store/doc_store.md b/docs/api_references/doc_store/doc_store.md index eb6e65b9f4a..275f3e8e3b0 100644 --- a/docs/api_references/doc_store/doc_store.md +++ b/docs/api_references/doc_store/doc_store.md @@ -1,3 +1,3 @@ -# AbstractDocStore +# DocStore ::: docarray.store.abstract_doc_store.AbstractDocStore From d897d06f1481cdd48969c984c98f5b14de6ca644 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 14 Apr 2023 09:37:55 +0200 Subject: [PATCH 19/20] fix: apply suggestions from samis code review Signed-off-by: anna-charlotte --- .../storing/{ => doc_store}/store_file.md | 0 .../storing/{ => doc_store}/store_jac.md | 0 .../storing/{ => doc_store}/store_s3.md | 0 docs/user_guide/storing/first_step.md | 17 +++++++++++++---- mkdocs.yml | 8 +++++--- 5 files changed, 18 insertions(+), 7 deletions(-) rename docs/user_guide/storing/{ => doc_store}/store_file.md (100%) rename docs/user_guide/storing/{ => doc_store}/store_jac.md (100%) rename docs/user_guide/storing/{ => doc_store}/store_s3.md (100%) diff --git 
a/docs/user_guide/storing/store_file.md b/docs/user_guide/storing/doc_store/store_file.md similarity index 100% rename from docs/user_guide/storing/store_file.md rename to docs/user_guide/storing/doc_store/store_file.md diff --git a/docs/user_guide/storing/store_jac.md b/docs/user_guide/storing/doc_store/store_jac.md similarity index 100% rename from docs/user_guide/storing/store_jac.md rename to docs/user_guide/storing/doc_store/store_jac.md diff --git a/docs/user_guide/storing/store_s3.md b/docs/user_guide/storing/doc_store/store_s3.md similarity index 100% rename from docs/user_guide/storing/store_s3.md rename to docs/user_guide/storing/doc_store/store_s3.md diff --git a/docs/user_guide/storing/first_step.md b/docs/user_guide/storing/first_step.md index d821f5872fb..13ecfe138c0 100644 --- a/docs/user_guide/storing/first_step.md +++ b/docs/user_guide/storing/first_step.md @@ -3,6 +3,13 @@ In the previous sections we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] to represent multi-modal data and send it over the wire. In this section we will see how to store and persist this data. +DocArray offers two ways of storing your data: + +1. In a **[Document Store](#document-store)** for simple long-term storage +2. In a **[Document Index](#document-index)** for fast retrieval using vector similarity + +## Document Store + [DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using the [`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] and [`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] methods. @@ -10,8 +17,10 @@ Under the hood, [DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. You can store your documents on-disk. Alternatively, you can upload them to [AWS S3](https://aws.amazon.com/s3/), [minio](https://min.io) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). 
-This section is divided into three parts: +This section covers the following three topics: -- [Store](store_file.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] on-disk -- [Store on Jina AI Cloud](store_jac.md) -- [Store on S3](store_s3.md) + - [Store](doc_store/store_file.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] on-disk + - [Store on Jina AI Cloud](doc_store/store_jac.md) + - [Store on S3](doc_store/store_s3.md) + +## Document Index diff --git a/mkdocs.yml b/mkdocs.yml index f8a967fcec5..255eeff4818 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -92,9 +92,11 @@ nav: - user_guide/sending/api/fastAPI.md - Storing: - user_guide/storing/first_step.md - - user_guide/storing/store_file.md - - user_guide/storing/store_jac.md - - user_guide/storing/store_s3.md + - DocStore: + - user_guide/storing/doc_store/store_file.md + - user_guide/storing/doc_store/store_jac.md + - user_guide/storing/doc_store/store_s3.md + - How-to: - how_to/add_doc_index.md From 597cd3a4b079706a3a168b0c0cc302355ce98a99 Mon Sep 17 00:00:00 2001 From: Charlotte Gerhaher Date: Fri, 14 Apr 2023 09:09:35 +0200 Subject: [PATCH 20/20] docs: add missing links and clean up (#1370) * docs: add links and clean up Signed-off-by: anna-charlotte * fix: Text to TextDoc Signed-off-by: anna-charlotte * fix: 3d urls Signed-off-by: anna-charlotte * fix: pc url Signed-off-by: anna-charlotte * fix: comment out display pc Signed-off-by: anna-charlotte --------- Signed-off-by: anna-charlotte --- docarray/array/any_array.py | 2 +- docarray/array/doc_list/io.py | 13 +++--- docarray/array/doc_vec/doc_vec.py | 46 ++++++++++--------- docarray/base_doc/mixins/update.py | 8 ++-- docarray/data/torch_dataset.py | 36 +++++++++------ .../tensor/audio/abstract_audio_tensor.py | 2 +- 
.../tensor/image/abstract_image_tensor.py | 2 +- .../tensor/image/image_tensorflow_tensor.py | 3 +- .../typing/tensor/image/image_torch_tensor.py | 3 +- .../typing/tensor/video/video_tensor_mixin.py | 2 +- docarray/typing/url/url_3d/mesh_url.py | 34 +++++++------- docarray/typing/url/url_3d/point_cloud_url.py | 26 ++++++----- docarray/typing/url/video_url.py | 3 +- 13 files changed, 97 insertions(+), 83 deletions(-) diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index e3b46132ee6..6457072cf88 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -209,7 +209,7 @@ class Book(BaseDoc): ``` If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of - type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list: + type `AnyTensor`, the doc_vec tensor will be returned instead of a list: ```python class Image(BaseDoc): diff --git a/docarray/array/doc_list/io.py b/docarray/array/doc_list/io.py index 9f153e2f1bd..16dca6a5bb0 100644 --- a/docarray/array/doc_list/io.py +++ b/docarray/array/doc_list/io.py @@ -358,10 +358,9 @@ def from_csv( :param dialect: defines separator and how to handle whitespaces etc. Can be a [`csv.Dialect`](https://docs.python.org/3/library/csv.html#csv.Dialect) instance or one string of: - - - 'excel' (for comma separated values), - - 'excel-tab' (for tab separated values), - - 'unix' (for csv file generated on UNIX systems). + `'excel'` (for comma separated values), + `'excel-tab'` (for tab separated values), + `'unix'` (for csv file generated on UNIX systems). :return: `DocList` object """ @@ -428,10 +427,10 @@ def to_csv( :param dialect: defines separator and how to handle whitespaces etc. 
Can be a [`csv.Dialect`](https://docs.python.org/3/library/csv.html#csv.Dialect) instance or one string of: + `'excel'` (for comma separated values), + `'excel-tab'` (for tab separated values), + `'unix'` (for csv file generated on UNIX systems). - - 'excel' (for comma seperated values), - - 'excel-tab' (for tab separated values), - - 'unix' (for csv file generated on UNIX systems). """ fields = self.doc_type._get_access_paths() diff --git a/docarray/array/doc_vec/doc_vec.py b/docarray/array/doc_vec/doc_vec.py index 7d692b31084..adb701d2a11 100644 --- a/docarray/array/doc_vec/doc_vec.py +++ b/docarray/array/doc_vec/doc_vec.py @@ -59,32 +59,34 @@ class DocVec(AnyDocArray[T_doc]): computation that require batches of data (ex: matrix multiplication, distance calculation, deep learning forward pass) - A DocVec has a similar interface as - {class}`~docarray.array.DocList` but with an underlying implementation that is - column based instead of row based. Each field - of the schema of the DocVec - (the :attr:`~docarray.array.doc_vec.DocVec.doc_type` which is a - `BaseDoc`) will be stored in a column. If the field is a tensor, the data from all Documents will be stored as a single, doc_vec (torch/np/tf) tensor. - If the tensor field - is `AnyTensor` or a Union of tensor types, the - :attr:`~docarray.array.doc_vec.DocVec.tensor_type` will be used to determine - the type of the doc_vec column. - - If the field is another `BasedDoc` the column will be another DocVec that follows the - schema of the nested Document. - If the field is a `DocList` or - `DocVec` then the column will be a list of `DocVec`. + A DocVec has a similar interface as [`DocList`][docarray.array.DocList] + but with an underlying implementation that is column based instead of row based. + Each field of the schema of the `DocVec` (the `.doc_type` which is a + [`BaseDoc`][docarray.BaseDoc]) will be stored in a column. 
+ + If the field is a tensor, the data from all Documents will be stored as a single + doc_vec (torch/np/tf) tensor. + + If the tensor field is `AnyTensor` or a Union of tensor types, the + `.tensor_type` will be used to determine the type of the doc_vec column. + + If the field is another [`BaseDoc`][docarray.BaseDoc] the column will be another + `DocVec` that follows the schema of the nested Document. + + If the field is a [`DocList`][docarray.DocList] or `DocVec` then the column will + be a list of `DocVec`. + For any other type the column is a Python list. - Every `Document` inside a `DocVec` is a view into the data columns stored at the `DocVec` level. The `BaseDoc` does - not hold any data itself. The behavior of - this Document "view" is similar to the behavior of `view = tensor[i]` in - numpy/PyTorch. + Every `Document` inside a `DocVec` is a view into the data columns stored at the + `DocVec` level. The `BaseDoc` does not hold any data itself. The behavior of + this Document "view" is similar to the behavior of `view = tensor[i]` in + numpy/PyTorch. - :param docs: a homogeneous sequence of BaseDoc + :param docs: a homogeneous sequence of `BaseDoc` :param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful - if the BaseDoc of this DocVec has some undefined tensor type like - AnyTensor or Union of NdArray and TorchTensor + if the BaseDoc of this DocVec has some undefined tensor type like + AnyTensor or Union of NdArray and TorchTensor """ doc_type: Type[T_doc] diff --git a/docarray/base_doc/mixins/update.py b/docarray/base_doc/mixins/update.py index 471e97483ba..754e6c9b789 100644 --- a/docarray/base_doc/mixins/update.py +++ b/docarray/base_doc/mixins/update.py @@ -24,9 +24,9 @@ def update(self, other: T): """ Updates self with the content of other. Changes are applied to self. 
Updating one Document with another consists in the following: - - setting data properties of the second Document to the first Document - if they are not None: + - Setting data properties of the second Document to the first Document + if they are not None - Concatenating lists and updating sets - Updating recursively Documents and DocArrays - Updating Dictionaries of the left with the right @@ -35,9 +35,9 @@ def update(self, other: T): it is applied to a static schema type, the presence of the field is given by the field not having a None value and that DocArrays, lists and sets are concatenated. It is worth mentioning that Tuples - are not merged together since they are meant to be inmutable, + are not merged together since they are meant to be immutable, so they behave as regular types and the value of `self` is updated - with the value of `other` + with the value of `other`. --- diff --git a/docarray/data/torch_dataset.py b/docarray/data/torch_dataset.py index 25fbb9a9a6a..f174326c2a1 100644 --- a/docarray/data/torch_dataset.py +++ b/docarray/data/torch_dataset.py @@ -14,30 +14,31 @@ class MultiModalDataset(Dataset, Generic[T_doc]): A dataset that can be used inside a PyTorch DataLoader. In other words, it implements the PyTorch Dataset interface. - :param docs: the DocList to be used as the dataset - :param preprocessing: a dictionary of field names and preprocessing functions - The preprocessing dictionary passed to the constructor consists of keys that are field names and values that are functions that take a single argument and return a single argument. - EXAMPLE USAGE - .. 
code-block:: python + --- + + ```python from torch.utils.data import DataLoader from docarray import DocList from docarray.data import MultiModalDataset - from docarray.documents import Text + from docarray.documents import TextDoc def prepend_number(text: str): return f"Number {text}" - docs = DocList[Text](Text(text=str(i)) for i in range(16)) - ds = MultiModalDataset[Text](docs, preprocessing={'text': prepend_number}) - loader = DataLoader(ds, batch_size=4, collate_fn=MultiModalDataset[Text].collate_fn) + docs = DocList[TextDoc](TextDoc(text=str(i)) for i in range(16)) + ds = MultiModalDataset[TextDoc](docs, preprocessing={'text': prepend_number}) + loader = DataLoader(ds, batch_size=4, collate_fn=MultiModalDataset[TextDoc].collate_fn) for batch in loader: print(batch.text) + ``` + + --- Nested fields can be accessed by using dot notation. The document itself can be accessed using the empty string as the key. @@ -47,24 +48,25 @@ def prepend_number(text: str): The transformations will be applied according to their order in the dictionary. - EXAMPLE USAGE - .. 
code-block:: python + --- + + ```python import torch from torch.utils.data import DataLoader from docarray import DocList, BaseDoc from docarray.data import MultiModalDataset - from docarray.documents import Text + from docarray.documents import TextDoc class Thesis(BaseDoc): - title: Text + title: TextDoc class Student(BaseDoc): thesis: Thesis - def embed_title(title: Text): + def embed_title(title: TextDoc): title.embedding = torch.ones(4) @@ -90,6 +92,12 @@ def add_nonsense(student: Student): loader = DataLoader(ds, batch_size=4, collate_fn=ds.collate_fn) for batch in loader: print(batch.thesis.title.embedding) + ``` + + --- + + :param docs: the `DocList` to be used as the dataset + :param preprocessing: a dictionary of field names and preprocessing functions """ doc_type: Optional[Type[BaseDoc]] = None diff --git a/docarray/typing/tensor/audio/abstract_audio_tensor.py b/docarray/typing/tensor/audio/abstract_audio_tensor.py index 56fdae6c05e..b987b2addfd 100644 --- a/docarray/typing/tensor/audio/abstract_audio_tensor.py +++ b/docarray/typing/tensor/audio/abstract_audio_tensor.py @@ -16,7 +16,7 @@ class AbstractAudioTensor(AbstractTensor, ABC): def to_bytes(self) -> 'AudioBytes': """ - Convert audio tensor to AudioBytes. + Convert audio tensor to [`AudioBytes`][docarray.typing.AudioBytes]. """ from docarray.typing.bytes.audio_bytes import AudioBytes diff --git a/docarray/typing/tensor/image/abstract_image_tensor.py b/docarray/typing/tensor/image/abstract_image_tensor.py index 0a880be9865..9566910781d 100644 --- a/docarray/typing/tensor/image/abstract_image_tensor.py +++ b/docarray/typing/tensor/image/abstract_image_tensor.py @@ -15,7 +15,7 @@ class AbstractImageTensor(AbstractTensor, ABC): def to_bytes(self, format: str = 'PNG') -> 'ImageBytes': """ - Convert image tensor to ImageBytes. + Convert image tensor to [`ImageBytes`][docarray.typing.ImageBytes]. :param format: the image format use to store the image, can be 'PNG' , 'JPG' ... 
:return: an ImageBytes object diff --git a/docarray/typing/tensor/image/image_tensorflow_tensor.py b/docarray/typing/tensor/image/image_tensorflow_tensor.py index c95b001e704..f373f45b30e 100644 --- a/docarray/typing/tensor/image/image_tensorflow_tensor.py +++ b/docarray/typing/tensor/image/image_tensorflow_tensor.py @@ -14,7 +14,8 @@ class ImageTensorFlowTensor( """ Subclass of [`TensorFlowTensor`][docarray.typing.TensorFlowTensor], to represent an image tensor. Adds image-specific features to the tensor. - For instance the ability convert the tensor back to image bytes which are + For instance the ability convert the tensor back to + [`ImageBytes`][docarray.typing.ImageBytes] which are optimized to send over the wire. diff --git a/docarray/typing/tensor/image/image_torch_tensor.py b/docarray/typing/tensor/image/image_torch_tensor.py index 249030c00f6..103a936d705 100644 --- a/docarray/typing/tensor/image/image_torch_tensor.py +++ b/docarray/typing/tensor/image/image_torch_tensor.py @@ -12,7 +12,8 @@ class ImageTorchTensor(AbstractImageTensor, TorchTensor, metaclass=metaTorchAndN """ Subclass of [`TorchTensor`][docarray.typing.TorchTensor], to represent an image tensor. Adds image-specific features to the tensor. - For instance the ability convert the tensor back to image bytes which are + For instance the ability convert the tensor back to + [`ImageBytes`][docarray.typing.ImageBytes] which are optimized to send over the wire. diff --git a/docarray/typing/tensor/video/video_tensor_mixin.py b/docarray/typing/tensor/video/video_tensor_mixin.py index d2ed61eacee..173daaacce8 100644 --- a/docarray/typing/tensor/video/video_tensor_mixin.py +++ b/docarray/typing/tensor/video/video_tensor_mixin.py @@ -135,7 +135,7 @@ def to_bytes( audio_format: str = 'fltp', ) -> 'VideoBytes': """ - Convert video tensor to VideoBytes. + Convert video tensor to [`VideoBytes`][docarray.typing.VideoBytes]. :param audio_tensor: AudioTensor containing the video's soundtrack. 
:param video_frame_rate: video frames per second. diff --git a/docarray/typing/url/url_3d/mesh_url.py b/docarray/typing/url/url_3d/mesh_url.py index 9ba5e330e6e..70f32eb5581 100644 --- a/docarray/typing/url/url_3d/mesh_url.py +++ b/docarray/typing/url/url_3d/mesh_url.py @@ -26,33 +26,33 @@ def load( trimesh_args: Optional[Dict[str, Any]] = None, ) -> 'VerticesAndFaces': """ - Load the data from the url into a VerticesAndFaces object containing - vertices and faces information. + Load the data from the url into a [`VerticesAndFaces`][docarray.documents.VerticesAndFaces] + object containing vertices and faces information. --- - ```python - from docarray import BaseDoc + ```python + from docarray import BaseDoc - from docarray.typing import Mesh3DUrl, NdArray + from docarray.typing import Mesh3DUrl, NdArray - class MyDoc(BaseDoc): - mesh_url: Mesh3DUrl + class MyDoc(BaseDoc): + mesh_url: Mesh3DUrl - doc = MyDoc(mesh_url="toydata/tetrahedron.obj") + doc = MyDoc(mesh_url="https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj") - tensors = doc.mesh_url.load() - assert isinstance(tensors.vertices, NdArray) - assert isinstance(tensors.faces, NdArray) - ``` + tensors = doc.mesh_url.load() + assert isinstance(tensors.vertices, NdArray) + assert isinstance(tensors.faces, NdArray) + ``` - --- - :param skip_materials: Skip materials if True, else skip. - :param trimesh_args: dictionary of additional arguments for `trimesh.load()` - or `trimesh.load_remote()`. - :return: VerticesAndFaces object containing vertices and faces information. + + :param skip_materials: Skip materials if True, else skip. + :param trimesh_args: dictionary of additional arguments for `trimesh.load()` + or `trimesh.load_remote()`. + :return: VerticesAndFaces object containing vertices and faces information. 
""" from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces diff --git a/docarray/typing/url/url_3d/point_cloud_url.py b/docarray/typing/url/url_3d/point_cloud_url.py index dd3f17be0df..efe6ce6ae0e 100644 --- a/docarray/typing/url/url_3d/point_cloud_url.py +++ b/docarray/typing/url/url_3d/point_cloud_url.py @@ -29,7 +29,7 @@ def load( trimesh_args: Optional[Dict[str, Any]] = None, ) -> 'PointsAndColors': """ - Load the data from the url into an NdArray containing point cloud information. + Load the data from the url into an `NdArray` containing point cloud information. --- @@ -45,7 +45,7 @@ class MyDoc(BaseDoc): point_cloud_url: PointCloud3DUrl - doc = MyDoc(point_cloud_url="toydata/tetrahedron.obj") + doc = MyDoc(point_cloud_url="https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj") # point_cloud = doc.point_cloud_url.load(samples=100) @@ -96,20 +96,24 @@ def display( First, it loads the point cloud into a `PointsAndColors` object, and then calls display on it. The following is therefore equivalent: - .. code-block:: python + --- - import numpy as np - from docarray import BaseDoc + ```python + import numpy as np + from docarray import BaseDoc - from docarray.documents import PointCloud3D + from docarray.documents import PointCloud3D - pc = PointCloud3D("toydata/tetrahedron.obj") + pc = PointCloud3D(url="https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj") - # option 1 - pc.url.display() + # option 1 + # pc.url.display() - # option 2 (equivalent) - pc.url.load(samples=10000).display() + # option 2 (equivalent) + # pc.url.load(samples=10000).display() + ``` + + --- :param samples: number of points to sample from the mesh. 
""" diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index db9dd4b5080..8c5f0e6d995 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -73,8 +73,7 @@ class MyDoc(BaseDoc): --- :param kwargs: supports all keyword arguments that are being supported by - av.open() as described in: - https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open + av.open() as described [here](https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open) :return: [`AudioNdArray`][docarray.typing.AudioNdArray] representing the audio content, [`VideoNdArray`][docarray.typing.VideoNdArray] representing the images of the video,