diff --git a/docarray/array/any_array.py b/docarray/array/any_array.py index cf1812194f1..4ba2d102cbb 100644 --- a/docarray/array/any_array.py +++ b/docarray/array/any_array.py @@ -140,9 +140,10 @@ def to_protobuf(self) -> 'DocListProto': ... def _to_node_protobuf(self) -> 'NodeProto': - """Convert a [`DocList`][docarray.array.doc_list.doc_list.DocList] into a NodeProto protobuf message. - This function should be called when a DocList - is nested into another Document that need to be converted into a protobuf + """Convert a [`DocList`][docarray.array.doc_list.doc_list.DocList] into a NodeProto + protobuf message. + This function should be called when a DocList is nested into + another Document that need to be converted into a protobuf. :return: the nested item protobuf message """ @@ -208,7 +209,6 @@ class Book(BaseDoc): chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings ``` - If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of type `AnyTensor`, the doc_vec tensor will be returned instead of a list: diff --git a/docarray/array/doc_list/pushpull.py b/docarray/array/doc_list/pushpull.py index baa9c0439da..0d0f9384758 100644 --- a/docarray/array/doc_list/pushpull.py +++ b/docarray/array/doc_list/pushpull.py @@ -86,10 +86,10 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocList object to the specified url. + """Push this `DocList` object to the specified url. - :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocList if they know its name. + :param url: url specifying the protocol and save name of the `DocList`. Should be of the form ``protocol://namespace/name``. e.g. 
``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a `DocList` if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. {"icon": "emoji", "background": "#fff"} @@ -112,8 +112,8 @@ def push_stream( """Push a stream of documents to the specified url. :param docs: a stream of documents - :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` - :param public: Only used by ``jac`` protocol. If true, anyone can pull a DocList if they know its name. + :param url: url specifying the protocol and save name of the `DocList`. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param public: Only used by ``jac`` protocol. If true, anyone can pull a `DocList` if they know its name. :param show_progress: If true, a progress bar will be displayed. :param branding: Only used by ``jac`` protocol. A dictionary of branding information to be sent to Jina AI Cloud. {"icon": "emoji", "background": "#fff"} """ @@ -130,19 +130,19 @@ def pull( show_progress: bool = False, local_cache: bool = True, ) -> 'DocList': - """Pull a :class:`DocList` from the specified url. + """Pull a `DocList` from the specified url. - :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the `DocList`. Should be of the form ``protocol://namespace/name``. e.g. 
``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocList to local folder - :return: a :class:`DocList` object + :param local_cache: store the downloaded `DocList` to local folder + :return: a `DocList` object """ from docarray.base_doc import AnyDoc if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' + 'Please specify the `DocList`\'s Document type using `DocList[MyDoc]`.' ) logging.info(f'Pulling {url}') @@ -160,9 +160,9 @@ def pull_stream( ) -> Iterator['BaseDoc']: """Pull a stream of Documents from the specified url. - :param url: url specifying the protocol and save name of the DocList. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` + :param url: url specifying the protocol and save name of the `DocList`. Should be of the form ``protocol://namespace/name``. e.g. ``s3://bucket/path/to/namespace/name``, ``file:///path/to/folder/name`` :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocList to local folder + :param local_cache: store the downloaded `DocList` to local folder :return: Iterator of Documents """ from docarray.base_doc import AnyDoc @@ -170,7 +170,7 @@ def pull_stream( if cls.doc_type == AnyDoc: raise TypeError( 'There is no document schema defined. ' - 'Please specify the DocList\'s Document type using `DocList[MyDoc]`.' + 'Please specify the `DocList`\'s Document type using `DocList[MyDoc]`.' 
) logging.info(f'Pulling Document stream from {url}') diff --git a/docarray/store/file.py b/docarray/store/file.py index b649864478a..6c46c3ab615 100644 --- a/docarray/store/file.py +++ b/docarray/store/file.py @@ -16,11 +16,15 @@ class FileDocStore(AbstractDocStore): + """Class to push and pull [`DocList`][docarray.DocList] on-disk.""" + @staticmethod def _abs_filepath(name: str) -> Path: """Resolve a name to an absolute path. - If it is not a path, the cache directoty is prepended. - If it is a path, it is resolved to an absolute path. + + :param name: If it is not a path, the cache directory is prepended. + If it is a path, it is resolved to an absolute path. + :return: Path """ if not (name.startswith('/') or name.startswith('~') or name.startswith('.')): name = str(_get_cache_path() / name) @@ -32,11 +36,11 @@ def _abs_filepath(name: str) -> Path: def list( cls: Type[SelfFileDocStore], namespace: str, show_table: bool ) -> List[str]: - """List all DocArrays in a directory. + """List all [`DocList`s][docarray.DocList] in a directory. :param namespace: The directory to list. :param show_table: If True, print a table of the files in the directory. - :return: A list of the names of the DocArrays in the directory. + :return: A list of the names of the `DocLists` in the directory. """ namespace_dir = cls._abs_filepath(namespace) if not namespace_dir.exists(): @@ -51,7 +55,7 @@ def list( from rich.table import Table table = Table( - title=f'You have {len(da_files)} DocArrays in file://{namespace_dir}', + title=f'You have {len(da_files)} DocLists in file://{namespace_dir}', box=box.SIMPLE, highlight=True, ) @@ -74,9 +78,9 @@ def list( def delete( cls: Type[SelfFileDocStore], name: str, missing_ok: bool = False ) -> bool: - """Delete a DocList from the local filesystem. + """Delete a [`DocList`][docarray.DocList] from the local filesystem. - :param name: The name of the DocList to delete. + :param name: The name of the `DocList` to delete. 
:param missing_ok: If True, do not raise an exception if the file does not exist. Defaults to False. :return: True if the file was deleted, False if it did not exist. """ @@ -98,8 +102,9 @@ def push( show_progress: bool, branding: Optional[Dict], ) -> Dict: - """Push this DocList object to the specified file path. + """Push this [`DocList`][docarray.DocList] object to the specified file path. + :param docs: The `DocList` to push. :param name: The file path to push to. :param public: Not used by the ``file`` protocol. :param show_progress: If true, a progress bar will be displayed. @@ -150,12 +155,12 @@ def pull( show_progress: bool, local_cache: bool, ) -> 'DocList': - """Pull a :class:`DocList` from the specified url. + """Pull a [`DocList`][docarray.DocList] from the specified url. :param name: The file path to pull from. :param show_progress: if true, display a progress bar. - :param local_cache: store the downloaded DocList to local folder - :return: a :class:`DocList` object + :param local_cache: store the downloaded `DocList` to local folder + :return: a `DocList` object """ return docs_cls( diff --git a/docarray/store/jac.py b/docarray/store/jac.py index 7838e3c26c8..6dafb49839a 100644 --- a/docarray/store/jac.py +++ b/docarray/store/jac.py @@ -82,7 +82,7 @@ def _get_raw_summary(self: 'DocList') -> List[Dict[str, Any]]: class JACDocStore(AbstractDocStore): - """Class to push and pull DocList to and from Jina AI Cloud.""" + """Class to push and pull [`DocList`][docarray.DocList] to and from Jina AI Cloud.""" @staticmethod @hubble.login_required @@ -135,7 +135,7 @@ def list(namespace: str = '', show_table: bool = False) -> List[str]: @hubble.login_required def delete(name: str, missing_ok: bool = True) -> bool: """ - Delete a DocList from the cloud. + Delete a [`DocList`][docarray.DocList] from the cloud. :param name: the name of the DocList to delete. :param missing_ok: if true, do not raise an error if the DocList does not exist. 
:return: True if the DocList was deleted, False if it did not exist. @@ -158,17 +158,18 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocList object to Jina AI Cloud + """Push this [`DocList`][docarray.DocList] object to Jina AI Cloud - .. note:: + !!! note - Push with the same ``name`` will override the existing content. - Kinda like a public clipboard where everyone can override anyone's content. So to make your content survive longer, you may want to use longer & more complicated name. - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocList`. - :param public: By default, anyone can pull a DocList if they know its name. + :param docs: The `DocList` to push. + :param name: A name that can later be used to retrieve this `DocList`. + :param public: By default, anyone can pull a `DocList` if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. {"icon": "emoji", "background": "#fff"} @@ -245,15 +246,16 @@ def push_stream( ) -> Dict: """Push a stream of documents to Jina AI Cloud - .. note:: + !!! note - Push with the same ``name`` will override the existing content. - Kinda like a public clipboard where everyone can override anyone's content. So to make your content survive longer, you may want to use longer & more complicated name. - The lifetime of the content is not promised atm, could be a day, could be a week. Do not use it for persistence. Only use this full temporary transmission/storage/clipboard. - :param name: A name that can later be used to retrieve this :class:`DocList`. 
- :param public: By default, anyone can pull a DocList if they know its name. + :param docs: a stream of documents + :param name: A name that can later be used to retrieve this `DocList`. + :param public: By default, anyone can pull a `DocList` if they know its name. Setting this to false will restrict access to only the creator. :param show_progress: If true, a progress bar will be displayed. :param branding: A dictionary of branding information to be sent to Jina Cloud. e.g. {"icon": "emoji", "background": "#fff"} @@ -278,12 +280,12 @@ def pull( show_progress: bool = False, local_cache: bool = True, ) -> 'DocList': - """Pull a :class:`DocList` from Jina AI Cloud to local. + """Pull a [`DocList`][docarray.DocList] from Jina AI Cloud to local. - :param name: the upload name set during :meth:`.push` + :param name: the upload name set during `.push` :param show_progress: if true, display a progress bar. :param local_cache: store the downloaded DocList to local folder - :return: a :class:`DocList` object + :return: a [`DocList`][docarray.DocList] object """ from docarray import DocList @@ -299,9 +301,9 @@ def pull_stream( show_progress: bool = False, local_cache: bool = False, ) -> Iterator['BaseDoc']: - """Pull a :class:`DocList` from Jina AI Cloud to local. + """Pull a [`DocList`][docarray.DocList] from Jina AI Cloud to local. - :param name: the upload name set during :meth:`.push` + :param name: the upload name set during `.push` :param show_progress: if true, display a progress bar. 
:param local_cache: store the downloaded DocList to local folder :return: An iterator of Documents diff --git a/docarray/store/s3.py b/docarray/store/s3.py index 936a261396f..2ebb864fc8d 100644 --- a/docarray/store/s3.py +++ b/docarray/store/s3.py @@ -48,15 +48,15 @@ def close(self): class S3DocStore(AbstractDocStore): - """Class to push and pull DocList to and from S3.""" + """Class to push and pull [`DocList`][docarray.DocList] to and from S3.""" @staticmethod def list(namespace: str, show_table: bool = False) -> List[str]: - """List all DocArrays in the specified bucket and namespace. + """List all [`DocList`s][docarray.DocList] in the specified bucket and namespace. :param namespace: The bucket and namespace to list. e.g. my_bucket/my_namespace :param show_table: If true, a rich table will be printed to the console. - :return: A list of DocList names. + :return: A list of `DocList` names. """ bucket, namespace = namespace.split('/', 1) s3 = boto3.resource('s3') @@ -74,7 +74,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: from rich.table import Table table = Table( - title=f'You have {len(da_files)} DocArrays in bucket s3://{bucket} under the namespace "{namespace}"', + title=f'You have {len(da_files)} DocLists in bucket s3://{bucket} under the namespace "{namespace}"', box=box.SIMPLE, highlight=True, ) @@ -94,7 +94,7 @@ def list(namespace: str, show_table: bool = False) -> List[str]: @staticmethod def delete(name: str, missing_ok: bool = True) -> bool: - """Delete the DocList object at the specified bucket and key. + """Delete the [`DocList`][docarray.DocList] object at the specified bucket and key. :param name: The bucket and key to delete. e.g. my_bucket/my_key :param missing_ok: If true, no error will be raised if the object does not exist. @@ -125,9 +125,9 @@ def push( show_progress: bool = False, branding: Optional[Dict] = None, ) -> Dict: - """Push this DocList object to the specified bucket and key. 
+ """Push this [`DocList`][docarray.DocList] object to the specified bucket and key. - :param docs: The DocList to push. + :param docs: The `DocList` to push. :param name: The bucket and key to push to. e.g. my_bucket/my_key :param public: Not used by the ``s3`` protocol. :param show_progress: If true, a progress bar will be displayed. @@ -182,12 +182,12 @@ def pull( show_progress: bool = False, local_cache: bool = False, ) -> 'DocList': - """Pull a :class:`DocList` from the specified bucket and key. + """Pull a [`DocList`][docarray.DocList] from the specified bucket and key. :param name: The bucket and key to pull from. e.g. my_bucket/my_key :param show_progress: if true, display a progress bar. :param local_cache: store the downloaded DocList to local cache - :return: a :class:`DocList` object + :return: a `DocList` object """ docs = docs_cls( # type: ignore cls.pull_stream( diff --git a/docs/api_references/doc_store/doc_store.md b/docs/api_references/doc_store/doc_store.md new file mode 100644 index 00000000000..275f3e8e3b0 --- /dev/null +++ b/docs/api_references/doc_store/doc_store.md @@ -0,0 +1,3 @@ +# DocStore + +::: docarray.store.abstract_doc_store.AbstractDocStore diff --git a/docs/api_references/doc_store/file_doc_store.md b/docs/api_references/doc_store/file_doc_store.md new file mode 100644 index 00000000000..b81dc3ee298 --- /dev/null +++ b/docs/api_references/doc_store/file_doc_store.md @@ -0,0 +1,3 @@ +# FileDocStore + +::: docarray.store.file.FileDocStore diff --git a/docs/api_references/doc_store/jac_doc_store.md b/docs/api_references/doc_store/jac_doc_store.md new file mode 100644 index 00000000000..1d4c0a28303 --- /dev/null +++ b/docs/api_references/doc_store/jac_doc_store.md @@ -0,0 +1,3 @@ +# JACDocStore + +::: docarray.store.jac.JACDocStore diff --git a/docs/api_references/doc_store/s3_doc_store.md b/docs/api_references/doc_store/s3_doc_store.md new file mode 100644 index 00000000000..6856c42f2ff --- /dev/null +++ 
b/docs/api_references/doc_store/s3_doc_store.md @@ -0,0 +1,3 @@ +# S3DocStore + +::: docarray.store.s3.S3DocStore diff --git a/docs/user_guide/storing/doc_store/store_file.md b/docs/user_guide/storing/doc_store/store_file.md new file mode 100644 index 00000000000..8602eb71adb --- /dev/null +++ b/docs/user_guide/storing/doc_store/store_file.md @@ -0,0 +1,66 @@ +# Store on-disk + +When you want to use your [DocList][docarray.array.doc_list.doc_list.DocList] in another place, you can use the +[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] function to push the [DocList][docarray.array.doc_list.doc_list.DocList] +to one place and later use the [`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] function to pull its content back. + +## Push & pull +To use the store locally, you need to pass a local file path to the function starting with `'file://'`. + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(8)]) +dl.push('file://simple_dl') + +dl_pull = DocList[SimpleDoc].pull('file://simple_dl') +``` + +A file named `simple_dl.docs` is created to store the `DocList`. + + +## Push & pull with streaming +When you have a large number of documents to push and pull, you can use the streaming functions. +[`.push_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.push_stream] and +[`.pull_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.pull_stream] can help you stream the `DocList` in +order to reduce memory usage. You can also set multiple `DocList`s to pull from the same source.
+ +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] + +DocList[SimpleDoc].push_stream( + iter(store_docs), + 'file://dl_stream', +) +dl_pull_stream_1 = DocList[SimpleDoc].pull_stream('file://dl_stream') +dl_pull_stream_2 = DocList[SimpleDoc].pull_stream('file://dl_stream') + +for d1, d2 in zip(dl_pull_stream_1, dl_pull_stream_2): + print(f'get {d1}, get {d2}') +``` + +
+ Output + ```text + get SimpleDoc(id='5a4b92af27aadbb852d636892506998b', text='doc 0'), get SimpleDoc(id='5a4b92af27aadbb852d636892506998b', text='doc 0') + get SimpleDoc(id='705e4f6acbab0a6ff10d11a07c03b24c', text='doc 1'), get SimpleDoc(id='705e4f6acbab0a6ff10d11a07c03b24c', text='doc 1') + get SimpleDoc(id='4fb5c01bd5f935bbe91cf73e271ad590', text='doc 2'), get SimpleDoc(id='4fb5c01bd5f935bbe91cf73e271ad590', text='doc 2') + get SimpleDoc(id='381498cef78f1d4f1d80415d67918940', text='doc 3'), get SimpleDoc(id='381498cef78f1d4f1d80415d67918940', text='doc 3') + get SimpleDoc(id='d968bc6fa235b1cfc69eded92926157e', text='doc 4'), get SimpleDoc(id='d968bc6fa235b1cfc69eded92926157e', text='doc 4') + get SimpleDoc(id='30bf347427a4bd50ce8ada1841320fe3', text='doc 5'), get SimpleDoc(id='30bf347427a4bd50ce8ada1841320fe3', text='doc 5') + get SimpleDoc(id='1389877ac97b3e6d0e8eb17568934708', text='doc 6'), get SimpleDoc(id='1389877ac97b3e6d0e8eb17568934708', text='doc 6') + get SimpleDoc(id='264b0eff2cd138d296f15c685e15bf23', text='doc 7'), get SimpleDoc(id='264b0eff2cd138d296f15c685e15bf23', text='doc 7') + ``` +
\ No newline at end of file diff --git a/docs/user_guide/storing/doc_store/store_jac.md b/docs/user_guide/storing/doc_store/store_jac.md new file mode 100644 index 00000000000..2975df7311f --- /dev/null +++ b/docs/user_guide/storing/doc_store/store_jac.md @@ -0,0 +1,59 @@ +# Store on Jina AI Cloud +When you want to use your [`DocList`][docarray.DocList] in another place, you can use the +[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] method to push the `DocList` to Jina AI Cloud and later use the +[`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] function to pull its content back. + +!!! note + To store on Jina AI Cloud, you need to install the extra dependency with the following line + ```cmd + pip install "docarray[jac]" + ``` + +## Push & pull +To store a [`DocList`][docarray.DocList] on Jina AI Cloud, you need to pass the function a Jina AI Cloud path starting with `'jac://'`. + +Before getting started, you need an account at [Jina AI Cloud](http://cloud.jina.ai/) and a [Personal Access Token (PAT)](https://cloud.jina.ai/settings/tokens). + +```python +from docarray import BaseDoc, DocList +import os + + +class SimpleDoc(BaseDoc): + text: str + + +os.environ['JINA_AUTH_TOKEN'] = 'YOUR_PAT' +DL_NAME = 'simple-dl' +dl = DocList[SimpleDoc]([SimpleDoc(text=f'doc {i}') for i in range(8)]) + +# push to Jina AI Cloud +dl.push(f'jac://{DL_NAME}') + +# pull from Jina AI Cloud +dl_pull = DocList[SimpleDoc].pull(f'jac://{DL_NAME}') +``` + + +!!! note + When using `.push()` and `.pull()`, `DocList` calls the default boto3 client. Be sure your default session is correctly set up. + + +## Push & pull with streaming +When you have a large number of documents to push and pull, you can use the streaming functions.
+[`.push_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.push_stream] and +[`.pull_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.pull_stream] can help you stream the +[`DocList`][docarray.DocList] in order to reduce memory usage. +You can also set multiple `DocList`s to pull from the same source. +The usage is the same as using streaming with local files. +Please refer to [Push & Pull with streaming with local files](store_file.md#push-pull-with-streaming). + + +## Delete +To delete the store, you need to use the static method [`.delete()`][docarray.store.jac.JACDocStore.delete] of the [`JACDocStore`][docarray.store.jac.JACDocStore] class. + +```python +from docarray.store import JACDocStore + +JACDocStore.delete(f'jac://{DL_NAME}') +``` \ No newline at end of file diff --git a/docs/user_guide/storing/doc_store/store_s3.md b/docs/user_guide/storing/doc_store/store_s3.md new file mode 100644 index 00000000000..c4e0878133b --- /dev/null +++ b/docs/user_guide/storing/doc_store/store_s3.md @@ -0,0 +1,142 @@ +# Store on S3 +When you want to use your [`DocList`][docarray.DocList] in another place, you can use the +[`.push`][docarray.array.doc_list.pushpull.PushPullMixin.push] method to push the `DocList` to S3 and later use the +[`.pull`][docarray.array.doc_list.pushpull.PushPullMixin.pull] function to pull its content back. + +!!! note + To store on S3, you need to install the extra dependency with the following line + ```cmd + pip install "docarray[aws]" + ``` + +## Push & pull +To store a [`DocList`][docarray.DocList] on S3, you need to pass the function an S3 path starting with `'s3://'`. + +In the following demo, we use `MinIO` as a local S3 service. You can use the following docker-compose file to start the service in a Docker container.
+ +```yaml +version: "3" +services: + minio: + container_name: minio + image: "minio/minio:RELEASE.2023-03-13T19-46-17Z" + ports: + - "9005:9000" + command: server /data +``` +Save the above file as `docker-compose.yml` and run the following line in the same folder as the file. +```cmd +docker-compose up +``` + +```python +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +if __name__ == '__main__': + import boto3 + from botocore.client import Config + + BUCKET = 'tmp_bucket' + my_session = boto3.session.Session() + s3 = my_session.resource( + service_name='s3', + region_name="us-east-1", + use_ssl=False, + endpoint_url="http://localhost:9005", + aws_access_key_id="minioadmin", + aws_secret_access_key="minioadmin", + config=Config(signature_version="s3v4"), + ) + # make a bucket + s3.create_bucket(Bucket=BUCKET) + + store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] + docs = DocList[SimpleDoc]() + docs.extend([SimpleDoc(text=f'doc {i}') for i in range(8)]) + + # .push() and .pull() use the default boto3 client + boto3.Session.client.__defaults__ = ( + "us-east-1", + None, + False, + None, + "http://localhost:9005", + "minioadmin", + "minioadmin", + None, + Config(signature_version="s3v4"), + ) + docs.push(f's3://{BUCKET}/simple_docs') + docs_pull = DocList[SimpleDoc].pull(f's3://{BUCKET}/simple_docs') +``` + +Under the bucket `tmp_bucket`, there is a file with the name of `simple_docs.docs` being created to store the `DocList`. + +!!! note + When using `.push()` and `.pull()`, `DocList` calls the default boto3 client. Be sure your default session is correctly set up. + + +## Push & pull with streaming +When you have a large amount of documents to push and pull, you could use the streaming function. 
+[`.push_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.push_stream] and +[`.pull_stream()`][docarray.array.doc_list.pushpull.PushPullMixin.pull_stream] can help you stream the +[`DocList`][docarray.DocList] in order to reduce memory usage. You can also set multiple [`DocList`s][docarray.DocList] to pull from the same source. The usage is the same as using streaming with local files. Please refer to [Push & Pull with streaming with local files](store_file.md#push-pull-with-streaming). + + +## Delete +To delete the store, you need to use the static method [`.delete()`][docarray.store.s3.S3DocStore.delete] of the [`S3DocStore`][docarray.store.s3.S3DocStore] class. + +```python hl_lines="44-47" +from docarray import BaseDoc, DocList + + +class SimpleDoc(BaseDoc): + text: str + + +if __name__ == '__main__': + import boto3 + from botocore.client import Config + + BUCKET = 'tmp_bucket' + my_session = boto3.session.Session() + s3 = my_session.resource( + service_name='s3', + region_name="us-east-1", + use_ssl=False, + endpoint_url="http://localhost:9005", + aws_access_key_id="minioadmin", + aws_secret_access_key="minioadmin", + config=Config(signature_version="s3v4"), + ) + # make a bucket + s3.create_bucket(Bucket=BUCKET) + + store_docs = [SimpleDoc(text=f'doc {i}') for i in range(8)] + docs = DocList[SimpleDoc]() + docs.extend([SimpleDoc(text=f'doc {i}') for i in range(8)]) + + # .push() and .pull() use the default boto3 client + boto3.Session.client.__defaults__ = ( + "us-east-1", + None, + False, + None, + "http://localhost:9005", + "minioadmin", + "minioadmin", + None, + Config(signature_version="s3v4"), + ) + docs.push(f's3://{BUCKET}/simple_docs') + + # delete the stored DocList + from docarray.store import S3DocStore + + success = S3DocStore.delete(f'{BUCKET}/simple_docs') +``` diff --git a/docs/user_guide/storing/first_step.md b/docs/user_guide/storing/first_step.md index e425d156e86..13ecfe138c0 100644 --- a/docs/user_guide/storing/first_step.md +++
b/docs/user_guide/storing/first_step.md @@ -1 +1,26 @@ -# Storing data +# Intro + +In the previous sections we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] to represent multi-modal data and send it over the wire. +In this section we will see how to store and persist this data. + +DocArray offers two ways of storing your data: + +1. In a **[Document Store](#document-store)** for simple long-term storage +2. In a **[Document Index](#document-index)** for fast retrieval using vector similarity + +## Document Store + +[DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using the +[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] and +[`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] methods. +Under the hood, [DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. +You can store your documents on-disk. Alternatively, you can upload them to [AWS S3](https://aws.amazon.com/s3/), +[minio](https://min.io) or [Jina AI Cloud](https://cloud.jina.ai/user/storage).
+ +This section covers the following three topics: + + - [Store](doc_store/store_file.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] on-disk + - [Store on Jina AI Cloud](doc_store/store_jac.md) + - [Store on S3](doc_store/store_s3.md) + +## Document Index diff --git a/mkdocs.yml b/mkdocs.yml index 1cdbdb86bd1..fe4f600a3e4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -90,8 +90,12 @@ nav: - Building API: - user_guide/sending/api/jina.md - user_guide/sending/api/fastAPI.md - - - user_guide/storing/first_step.md + - Storing: + - user_guide/storing/first_step.md + - DocStore: + - user_guide/storing/doc_store/store_file.md + - user_guide/storing/doc_store/store_jac.md + - user_guide/storing/doc_store/store_s3.md - How-to: - how_to/add_doc_index.md diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index ccda4714700..b071839c88c 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -63,7 +63,7 @@ def check_md_file(fpath, memory=False, lang="python", keyword_ignore=[]): @pytest.mark.parametrize('fpath', files_to_check, ids=str) def test_files_good(fpath): - check_md_file(fpath=fpath, memory=True, keyword_ignore=['pickle']) + check_md_file(fpath=fpath, memory=True, keyword_ignore=['pickle', 'jac']) def test_readme():