diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 9de76ea3867..c611e0f58b1 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -56,6 +56,8 @@ jobs: -v -s -m "not gpu" ${{ matrix.test-path }} echo "::set-output name=codecov_flag::docarray" timeout-minutes: 30 + env: + JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file id: check_files uses: andstor/file-existence-action@v1 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 78c20ebb3ac..256cfe224e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -145,6 +145,8 @@ jobs: -v -s -m "not gpu" ${{ matrix.test-path }} echo "::set-output name=codecov_flag::docarray" timeout-minutes: 30 + env: + JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file id: check_files uses: andstor/file-existence-action@v1 diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 5e9cbbad1bc..56999e26933 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -1,24 +1,159 @@ +import json import os +import os.path import warnings +from collections import Counter from pathlib import Path -from typing import Dict, Type, TYPE_CHECKING, Optional +from typing import Dict, Type, TYPE_CHECKING, List, Optional, Any -from docarray.helper import get_request_header, __cache_path__ +import hubble +from hubble import Client as HubbleClient +from hubble.client.endpoints import EndpointsV2 + + +from docarray.helper import get_request_header, __cache_path__, _get_array_info if TYPE_CHECKING: from docarray.typing import T +def _get_length_from_summary(summary: List[Dict]) -> Optional[int]: + """Get the length from summary.""" + for item in summary: + if 'Length' == item['name']: + return item['value'] + + class PushPullMixin: """Transmitting :class:`DocumentArray` via Jina Cloud Service""" _max_bytes = 4 * 1024 * 1024 * 1024 + @staticmethod + def 
cloud_list(show_table: bool = False) -> List[str]: + """List all available arrays in the cloud. + + :param show_table: if true, show the table of the arrays. + :returns: List of available DocumentArray's names. + """ + from rich import print + + result = [] + from rich.table import Table + from rich import box + + resp = HubbleClient(jsonify=True).list_artifacts( + filter={'type': 'documentArray'}, sort={'createdAt': 1} + ) + + table = Table( + title=f'You have {resp["meta"]["total"]} DocumentArray on the cloud', + box=box.SIMPLE, + highlight=True, + ) + table.add_column('Name') + table.add_column('Length') + table.add_column('Access') + table.add_column('Created at', justify='center') + table.add_column('Updated at', justify='center') + + for da in resp['data']: + result.append(da['name']) + + table.add_row( + da['name'], + str(_get_length_from_summary(da['metaData'].get('summary', []))), + da['visibility'], + da['createdAt'], + da['updatedAt'], + ) + + if show_table: + print(table) + return result + + @staticmethod + def cloud_delete(name: str) -> None: + """ + Delete a DocumentArray from the cloud. + :param name: the name of the DocumentArray to delete. 
+ """ + HubbleClient(jsonify=True).delete_artifact(name=name) + + def _get_raw_summary(self) -> List[Dict[str, Any]]: + ( + is_homo, + _nested_in, + _nested_items, + attr_counter, + all_attrs_names, + ) = _get_array_info(self) + + items = [ + dict( + name='Type', + value=self.__class__.__name__, + description='The type of the DocumentArray', + ), + dict( + name='Length', + value=len(self), + description='The length of the DocumentArray', + ), + dict( + name='Homogenous Documents', + value=is_homo, + description='Whether all documents are of the same structure, attributes', + ), + dict( + name='Common Attributes', + value=list(attr_counter.items())[0][0] if attr_counter else None, + description='The common attributes of all documents', + ), + dict( + name='Has nested Documents in', + value=tuple(_nested_in), + description='The field that contains nested Documents', + ), + dict( + name='Multimodal dataclass', + value=all(d.is_multimodal for d in self), + description='Whether all documents are multimodal', + ), + dict( + name='Subindices', value=tuple(getattr(self, '_subindices', {}).keys()) + ), + ] + + items.append( + dict( + name='Inspect attributes', + value=_nested_items, + description='Quick overview of attributes of all documents', + ) + ) + + storage_infos = self._get_storage_infos() + _nested_items = [] + if storage_infos: + for k, v in storage_infos.items(): + _nested_items.append(dict(name=k, value=v)) + items.append( + dict( + name='Storage backend', + value=_nested_items, + description='Quick overview of the Document Store', + ) + ) + + return items + def push( self, name: str, show_progress: bool = False, public: bool = True, + branding: Optional[Dict] = None, ) -> Dict: """Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.push` @@ -33,6 +168,7 @@ def push( :param show_progress: if to show a progress bar on pulling :param public: by default anyone can pull a DocumentArray if they know its name. 
Setting this to False will allow only the creator to pull it. This feature of course you to login first. + :param branding: a dict of branding information to be sent to Jina Cloud. {"icon": "emoji", "background": "#fff"} """ import requests @@ -47,11 +183,14 @@ def push( 'name': name, 'type': 'documentArray', 'public': public, + 'metaData': json.dumps( + {'summary': self._get_raw_summary(), 'branding': branding}, + sort_keys=True, + ), } ) headers = {'Content-Type': ctype, **get_request_header()} - import hubble auth_token = hubble.get_token() if auth_token: @@ -98,11 +237,9 @@ def _get_chunk(_batch): yield _tail with pbar: - from hubble import Client - from hubble.client.endpoints import EndpointsV2 response = requests.post( - Client()._base_url + EndpointsV2.upload_artifact, + HubbleClient()._base_url + EndpointsV2.upload_artifact, data=gen(), headers=headers, ) @@ -133,17 +270,12 @@ def pull( headers = {} - import hubble - auth_token = hubble.get_token() if auth_token: headers['Authorization'] = f'token {auth_token}' - from hubble import Client - from hubble.client.endpoints import EndpointsV2 - - url = Client()._base_url + EndpointsV2.download_artifact + f'?name={name}' + url = HubbleClient()._base_url + EndpointsV2.download_artifact + f'?name={name}' response = requests.get(url, headers=headers) if response.ok: @@ -183,3 +315,6 @@ def pull( fp.write(_source.content) return r + + cloud_push = push + cloud_pull = pull diff --git a/docarray/array/mixins/plot.py b/docarray/array/mixins/plot.py index 86c62d50498..9c5227e9776 100644 --- a/docarray/array/mixins/plot.py +++ b/docarray/array/mixins/plot.py @@ -11,6 +11,8 @@ import numpy as np +from docarray.helper import _get_array_info + class PlotMixin: """Helper functions for plotting the arrays.""" @@ -37,44 +39,28 @@ def summary(self): tables = [] console = Console() - all_attrs = self._get_attributes('non_empty_fields') - # remove underscore attribute - all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) 
for v in all_attrs] - attr_counter = Counter(all_attrs) + ( + is_homo, + _nested_in, + _nested_items, + attr_counter, + all_attrs_names, + ) = _get_array_info(self) table = Table(box=box.SIMPLE, highlight=True) table.show_header = False table.add_row('Type', self.__class__.__name__) table.add_row('Length', str(len(self))) - is_homo = len(attr_counter) == 1 table.add_row('Homogenous Documents', str(is_homo)) - all_attrs_names = set(v for k in all_attrs for v in k) - _nested_in = [] - if 'chunks' in all_attrs_names: - _nested_in.append('chunks') - - if 'matches' in all_attrs_names: - _nested_in.append('matches') - if _nested_in: table.add_row('Has nested Documents in', str(tuple(_nested_in))) if is_homo: table.add_row('Common Attributes', str(list(attr_counter.items())[0][0])) - else: - for _a, _n in attr_counter.most_common(): - if _n == 1: - _doc_text = f'{_n} Document has' - else: - _doc_text = f'{_n} Documents have' - if len(_a) == 1: - _text = f'{_doc_text} one attribute' - elif len(_a) == 0: - _text = f'{_doc_text} no attribute' - else: - _text = f'{_doc_text} attributes' - table.add_row(_text, str(_a)) + + for item in _nested_items: + table.add_row(item['name'], item['value']) is_multimodal = all(d.is_multimodal for d in self) table.add_row('Multimodal dataclass', str(is_multimodal)) diff --git a/docarray/helper.py b/docarray/helper.py index 00ce8011be4..12143334548 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -6,7 +6,11 @@ import uuid import warnings from os.path import expanduser -from typing import Any, Dict, Optional, Sequence, Tuple, Union +from typing import Any, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING +from collections import Counter + +if TYPE_CHECKING: + from docarray import DocumentArray __resources_path__ = os.path.join( os.path.dirname( @@ -455,3 +459,39 @@ def _safe_cast_int(value: Union[str, int, float]) -> int: if isinstance(value, float) and not value.is_integer(): raise ValueError(f"Can't safely cast {value} to an 
int") return int(value) + + +def _get_array_info(da: 'DocumentArray'): + all_attrs = da._get_attributes('non_empty_fields') + # remove underscore attribute + all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs] + attr_counter = Counter(all_attrs) + + all_attrs_names = set(v for k in all_attrs for v in k) + _nested_in = [] + if 'chunks' in all_attrs_names: + _nested_in.append('chunks') + + if 'matches' in all_attrs_names: + _nested_in.append('matches') + + is_homo = len(attr_counter) == 1 + + _nested_items = [] + if not is_homo: + for n_attributes, n_docs in attr_counter.most_common(): + if n_docs == 1: + _doc_text = f'{n_docs} Document has' + else: + _doc_text = f'{n_docs} Documents have' + if len(n_attributes) == 1: + _text = f'{_doc_text} one attribute' + elif len(n_attributes) == 0: + _text = f'{_doc_text} no attribute' + else: + _text = f'{_doc_text} attributes' + _nested_items.append( + dict(name=_text, value=str(n_attributes), description='') + ) + + return is_homo, _nested_in, _nested_items, attr_counter, all_attrs_names diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index 4069ad9ae81..d9dfb2a8a8a 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -393,3 +393,26 @@ The maximum size of an upload is 4GB under the `protocol='protobuf'` and `compre To avoid unnecessary download when upstream DocumentArray is unchanged, you can add `DocumentArray.pull(..., local_cache=True)`. 
+Furthermore, it is possible to list all `DocumentArray` objects stored on the cloud using: +```python +DocumentArray.cloud_list(show_table=True) +``` + +```text + You have 1 DocumentArray on the cloud + + Name Length Access Created at Updated at + ──────────────────────────────────────────────────────────────────────────────── + da123 10 public 2022-09-15T07:14:54.256Z 2022-09-15T07:14:54.256Z + +['da123'] +``` + +```{tip} +Use parameter `show_table` to show a table summarizing information about DocumentArrays in the cloud. +``` + +It is also possible to delete DocumentArray objects in the cloud using: +```python +DocumentArray.cloud_delete('da123') +``` diff --git a/setup.py b/setup.py index a9f72500dfc..47555bfbd42 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ long_description_content_type='text/markdown', zip_safe=False, setup_requires=['setuptools>=18.0', 'wheel'], - install_requires=['numpy', 'rich>=12.0.0'], + install_requires=['numpy', 'rich>=12.0.0', 'jina-hubble-sdk>=0.13.1'], extras_require={ # req usage, please see https://docarray.jina.ai/#install 'common': [ @@ -50,7 +50,6 @@ 'Pillow', 'fastapi', 'uvicorn', - 'jina-hubble-sdk>=0.11.0', ], 'full': [ 'protobuf>=3.13.0', @@ -60,7 +59,6 @@ 'Pillow', 'trimesh', 'scipy', - 'jina-hubble-sdk>=0.10.0', 'av', 'fastapi', 'uvicorn', diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index 56dcf746de3..5f77b030790 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -256,6 +256,16 @@ def test_push_pull_io(da_cls, config, show_progress, start_storage): assert len(da1) == len(da2) == 10 assert da1.texts == da2.texts == random_texts + all_names = DocumentArray.cloud_list() + + assert name in all_names + + DocumentArray.cloud_delete(name) + + all_names = DocumentArray.cloud_list() + + assert name not in all_names + @pytest.mark.parametrize( 'protocol', ['protobuf', 'pickle', 'protobuf-array', 'pickle-array'] ) diff --git
a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index e9d76fa97e9..65f776a709c 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -192,10 +192,13 @@ def test_summary_homo_hetero(da_cls, config, start_storage): da = da_cls.empty(100) da._get_attributes() da.summary() + da._get_raw_summary() da[0].pop('id') da.summary() + da._get_raw_summary() + @pytest.mark.parametrize( 'da_cls,config', diff --git a/tests/unit/array/mixins/test_pushpull.py b/tests/unit/array/mixins/test_pushpull.py index 7ca2d0a322e..af17a607a30 100644 --- a/tests/unit/array/mixins/test_pushpull.py +++ b/tests/unit/array/mixins/test_pushpull.py @@ -1,13 +1,13 @@ import cgi -import json import os +from io import BytesIO + import pytest import requests -from io import BytesIO -from docarray import DocumentArray +from docarray import DocumentArray, Document, dataclass from docarray.helper import random_identity - +from docarray.typing import Image, Text from tests import random_docs @@ -136,7 +136,6 @@ def test_push_fail(mocker, monkeypatch): def test_api_url_change(mocker, monkeypatch): - test_api_url = 'http://localhost:8080' os.environ['JINA_HUBBLE_REGISTRY'] = test_api_url @@ -158,3 +157,38 @@ def test_api_url_change(mocker, monkeypatch): assert push_kwargs['url'].startswith(test_api_url) assert pull_kwargs['url'].startswith(test_api_url) + + +@dataclass +class MyDocument: + image: Image + paragraph: Text + + +@pytest.mark.parametrize( + 'da', + [ + DocumentArray(), + DocumentArray.empty(10), + DocumentArray.empty(10, storage='annlite', config={'n_dim': 10}), + DocumentArray( + [ + Document( + MyDocument( + image='https://docarray.jina.ai/_images/apple.png', + paragraph='hello world', + ) + ) + for _ in range(10) + ], + config={'n_dim': 256}, + storage='annlite', + subindex_configs={ + '@.[image]': {'n_dim': 512}, + '@.[paragraph]': {'n_dim': 128}, + }, + ), + ], +) +def test_get_raw_summary(da: DocumentArray): + assert 
da._get_raw_summary() diff --git a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index a8b77d90327..fd500e8ce67 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -110,7 +110,7 @@ def test_from_to_safe_list(target, protocol, to_fn): @pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) @pytest.mark.parametrize('show_progress', [True, False]) -def test_push_pull_show_progress(show_progress, protocol): +def test_to_bytes_show_progress(show_progress, protocol): da = DocumentArray.empty(1000) r = da.to_bytes(_show_progress=show_progress, protocol=protocol) da_r = DocumentArray.from_bytes(r, _show_progress=show_progress, protocol=protocol)