From df3234544d2d499afb5ca1d4d097ccf1a0581ed6 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Wed, 17 Aug 2022 18:30:32 +0200 Subject: [PATCH 01/21] feat: push meta data along with docarray --- docarray/array/mixins/io/pushpull.py | 95 +++++++++++++++++++++++++++- tests/unit/array/mixins/test_plot.py | 3 + 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 5e9cbbad1bc..a5bbdc9f869 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -1,7 +1,10 @@ +import json import os +import os.path import warnings +from collections import Counter from pathlib import Path -from typing import Dict, Type, TYPE_CHECKING, Optional +from typing import Dict, Type, TYPE_CHECKING from docarray.helper import get_request_header, __cache_path__ @@ -14,6 +17,95 @@ class PushPullMixin: _max_bytes = 4 * 1024 * 1024 * 1024 + def _get_raw_summary(self) -> str: + all_attrs = self._get_attributes('non_empty_fields') + # remove underscore attribute + all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs] + attr_counter = Counter(all_attrs) + + all_attrs_names = set(v for k in all_attrs for v in k) + _nested_in = [] + if 'chunks' in all_attrs_names: + _nested_in.append('chunks') + + if 'matches' in all_attrs_names: + _nested_in.append('matches') + + is_homo = len(attr_counter) == 1 + + items = [ + dict( + name='Type', + value=self.__class__.__name__, + description='The type of the DocumentArray', + ), + dict( + name='Length', + value=len(self), + description='The length of the DocumentArray', + ), + dict( + name='Homogenous Documents', + value=is_homo, + description='Whether all documents are of the same structure, attributes', + ), + dict( + name='Common Attributes', + value=list(attr_counter.items())[0][0], + description='The common attributes of all documents', + ), + dict( + name='Has nested Documents in', + value=tuple(_nested_in), + 
description='The field that contains nested Documents', + ), + dict( + name='Multimodal dataclass', + value=all(d.is_multimodal for d in self), + description='Whether all documents are multimodal', + ), + dict( + name='Subindices', value=tuple(getattr(self, '_subindices', {}).keys()) + ), + ] + + _nested_items = [] + if not is_homo: + for _a, _n in attr_counter.most_common(): + if _n == 1: + _doc_text = f'{_n} Document has' + else: + _doc_text = f'{_n} Documents have' + if len(_a) == 1: + _text = f'{_doc_text} one attribute' + elif len(_a) == 0: + _text = f'{_doc_text} no attribute' + else: + _text = f'{_doc_text} attributes' + _nested_items.append(dict(name=_text, value=str(_a), description='')) + items.append( + dict( + name='Inspect attributes', + value=_nested_items, + description='Quick overview of attributes of all documents', + ) + ) + + storage_infos = self._get_storage_infos() + _nested_items = [] + if storage_infos: + for k, v in storage_infos.items(): + _nested_items.append(dict(name=k, value=v)) + items.append( + dict( + name='Storage backend', + value=_nested_items, + description='Quick overview of the Document Store', + ) + ) + + return json.dumps(items, sort_keys=True) + def push( self, name: str, @@ -47,6 +139,7 @@ def push( 'name': name, 'type': 'documentArray', 'public': public, + 'metaData': self._get_raw_summary(), } ) diff --git a/tests/unit/array/mixins/test_plot.py b/tests/unit/array/mixins/test_plot.py index e61d6a082fd..dd7212d61ee 100644 --- a/tests/unit/array/mixins/test_plot.py +++ b/tests/unit/array/mixins/test_plot.py @@ -187,10 +187,13 @@ def test_summary_homo_hetero(da_cls, config, start_storage): da = da_cls.empty(100) da._get_attributes() da.summary() + da._get_raw_summary() da[0].pop('id') da.summary() + da._get_raw_summary() + @pytest.mark.parametrize( 'da_cls,config', From 31a902df242202b94cb03a22bfbfd0f0cfc1bfad Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Wed, 17 Aug 2022 23:07:28 +0200 Subject: [PATCH 02/21] feat: push meta 
data along with docarray --- .github/workflows/cd.yml | 2 ++ .github/workflows/ci.yml | 2 ++ docarray/array/mixins/io/pushpull.py | 13 +++++++++---- setup.py | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 8a07429c661..aeea7f4d41a 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -49,6 +49,8 @@ jobs: -v -s -m "not gpu" ${{ matrix.test-path }} echo "::set-output name=codecov_flag::docarray" timeout-minutes: 30 + env: + JINA_AUTH_TOKEN.: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file id: check_files uses: andstor/file-existence-action@v1 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b9a15dfb96d..25144ed4590 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -138,6 +138,8 @@ jobs: -v -s -m "not gpu" ${{ matrix.test-path }} echo "::set-output name=codecov_flag::docarray" timeout-minutes: 30 + env: + JINA_AUTH_TOKEN.: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file id: check_files uses: andstor/file-existence-action@v1 diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index a5bbdc9f869..587ec6ac182 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -4,7 +4,7 @@ import warnings from collections import Counter from pathlib import Path -from typing import Dict, Type, TYPE_CHECKING +from typing import Dict, Type, TYPE_CHECKING, Any, List from docarray.helper import get_request_header, __cache_path__ @@ -17,7 +17,7 @@ class PushPullMixin: _max_bytes = 4 * 1024 * 1024 * 1024 - def _get_raw_summary(self) -> str: + def _get_raw_summary(self) -> List[Dict[str, Any]]: all_attrs = self._get_attributes('non_empty_fields') # remove underscore attribute all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs] @@ -104,13 +104,14 @@ def _get_raw_summary(self) -> str: ) ) - return json.dumps(items, 
sort_keys=True) + return items def push( self, name: str, show_progress: bool = False, public: bool = True, + branding: Dict = None, ) -> Dict: """Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.push` @@ -125,6 +126,7 @@ def push( :param show_progress: if to show a progress bar on pulling :param public: by default anyone can pull a DocumentArray if they know its name. Setting this to False will allow only the creator to pull it. This feature of course you to login first. + :param branding: a dict of branding information to be sent to Jina Cloud. {"icon": "emoji", "background": "#fff"} """ import requests @@ -139,7 +141,10 @@ def push( 'name': name, 'type': 'documentArray', 'public': public, - 'metaData': self._get_raw_summary(), + 'metaData': json.dumps( + {'preview': self._get_raw_summary(), 'branding': branding}, + sort_keys=True, + ), } ) diff --git a/setup.py b/setup.py index 21ddf977266..7bfe03c15fd 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ 'Pillow', 'fastapi', 'uvicorn', - 'jina-hubble-sdk>=0.11.0', + 'jina-hubble-sdk==0.12.4', ], 'full': [ 'protobuf>=3.13.0,<=3.20.1', From 8d60522a14b3b52493fe6b66470802ad74ae267d Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Wed, 17 Aug 2022 23:09:35 +0200 Subject: [PATCH 03/21] feat: push meta data along with docarray --- docarray/array/mixins/io/pushpull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 587ec6ac182..90c2d09f7be 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -142,7 +142,7 @@ def push( 'type': 'documentArray', 'public': public, 'metaData': json.dumps( - {'preview': self._get_raw_summary(), 'branding': branding}, + {'summary': self._get_raw_summary(), 'branding': branding}, sort_keys=True, ), } From aa1b3d957a8b016fa00e3d069cf42bd2c591ae55 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Wed, 17 Aug 2022 23:22:41 
+0200 Subject: [PATCH 04/21] feat: push meta data along with docarray --- .github/workflows/cd.yml | 2 +- .github/workflows/ci.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index aeea7f4d41a..3066dc27ca6 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -50,7 +50,7 @@ jobs: echo "::set-output name=codecov_flag::docarray" timeout-minutes: 30 env: - JINA_AUTH_TOKEN.: "${{ secrets.JINA_AUTH_TOKEN }}" + JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file id: check_files uses: andstor/file-existence-action@v1 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 25144ed4590..047343d3f9f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -139,7 +139,7 @@ jobs: echo "::set-output name=codecov_flag::docarray" timeout-minutes: 30 env: - JINA_AUTH_TOKEN.: "${{ secrets.JINA_AUTH_TOKEN }}" + JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file id: check_files uses: andstor/file-existence-action@v1 From 27bfb9fb27031c9243a65aa9c9c1963cc9eaf95f Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 18 Aug 2022 15:22:30 +0200 Subject: [PATCH 05/21] feat: push meta data along with docarray (#491) --- docarray/array/mixins/io/pushpull.py | 79 +++++++++++++++++++++++--- tests/unit/array/mixins/test_io.py | 10 ++++ tests/unit/array/test_from_to_bytes.py | 2 +- 3 files changed, 81 insertions(+), 10 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 90c2d09f7be..4cec5f4a14d 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -4,7 +4,12 @@ import warnings from collections import Counter from pathlib import Path -from typing import Dict, Type, TYPE_CHECKING, Any, List +from typing import Dict, Type, TYPE_CHECKING, List, Optional + +import hubble +from hubble import Client +from hubble.client.endpoints import 
EndpointsV2 + from docarray.helper import get_request_header, __cache_path__ @@ -12,11 +17,70 @@ from docarray.typing import T +def _get_length_from_summary(summary: List[Dict]) -> Optional[int]: + """Get the length from summary.""" + for item in summary: + if 'Length' == item['name']: + return item['value'] + + class PushPullMixin: """Transmitting :class:`DocumentArray` via Jina Cloud Service""" _max_bytes = 4 * 1024 * 1024 * 1024 + @classmethod + @hubble.login_required + def cloud_list(cls, show_table: bool = False) -> List[str]: + """List all available arrays in the cloud. + + :param show_table: if true, show the table of the arrays. + :returns: List of available DocumentArray's names. + """ + + result = [] + from rich.table import Table + from rich import box + + table = Table( + title='Your DocumentArray on the cloud', box=box.SIMPLE, highlight=True + ) + table.add_column('Name') + table.add_column('Length') + table.add_column('Visibility') + table.add_column('Create at', justify='center') + table.add_column('Updated at', justify='center') + + for da in Client(jsonify=True).list_artifacts( + filter={'type': 'documentArray'}, sort={'createdAt': 1} + )['data']: + if da['type'] == 'documentArray': + result.append(da['name']) + + table.add_row( + da['name'], + str(_get_length_from_summary(da['metaData'].get('summary', []))), + da['visibility'], + da['createdAt'], + da['updatedAt'], + ) + + if show_table: + from rich import print + + print(table) + return result + + @classmethod + @hubble.login_required + def cloud_delete(cls, name: str) -> None: + """ + Delete a DocumentArray from the cloud. + :param name: the name of the DocumentArray to delete. 
+ """ + Client(jsonify=True).delete_artifact(name) + + def _get_raw_summary(self) -> List[Dict[str, Any]]: all_attrs = self._get_attributes('non_empty_fields') # remove underscore attribute @@ -106,6 +170,7 @@ def _get_raw_summary(self) -> List[Dict[str, Any]]: return items + @hubble.login_required def push( self, name: str, @@ -149,7 +214,6 @@ def push( ) headers = {'Content-Type': ctype, **get_request_header()} - import hubble auth_token = hubble.get_token() if auth_token: @@ -196,8 +260,6 @@ def _get_chunk(_batch): yield _tail with pbar: - from hubble import Client - from hubble.client.endpoints import EndpointsV2 response = requests.post( Client()._base_url + EndpointsV2.upload_artifact, @@ -211,6 +273,7 @@ def _get_chunk(_batch): response.raise_for_status() @classmethod + @hubble.login_required def pull( cls: Type['T'], name: str, @@ -231,16 +294,11 @@ def pull( headers = {} - import hubble - auth_token = hubble.get_token() if auth_token: headers['Authorization'] = f'token {auth_token}' - from hubble import Client - from hubble.client.endpoints import EndpointsV2 - url = Client()._base_url + EndpointsV2.download_artifact + f'?name={name}' response = requests.get(url, headers=headers) @@ -281,3 +339,6 @@ def pull( fp.write(_source.content) return r + + cloud_push = push + cloud_pull = pull diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py index 383020fb3d2..026befb2fe3 100644 --- a/tests/unit/array/mixins/test_io.py +++ b/tests/unit/array/mixins/test_io.py @@ -237,6 +237,16 @@ def test_push_pull_io(da_cls, config, show_progress, start_storage): assert len(da1) == len(da2) == 10 assert da1.texts == da2.texts == random_texts + all_names = DocumentArray.cloud_list() + + assert name in all_names + + DocumentArray.cloud_delete(name) + + all_names = DocumentArray.cloud_list() + + assert name not in all_names + @pytest.mark.parametrize( 'protocol', ['protobuf', 'pickle', 'protobuf-array', 'pickle-array'] diff --git 
a/tests/unit/array/test_from_to_bytes.py b/tests/unit/array/test_from_to_bytes.py index a8b77d90327..fd500e8ce67 100644 --- a/tests/unit/array/test_from_to_bytes.py +++ b/tests/unit/array/test_from_to_bytes.py @@ -110,7 +110,7 @@ def test_from_to_safe_list(target, protocol, to_fn): @pytest.mark.parametrize('protocol', ['protobuf', 'pickle']) @pytest.mark.parametrize('show_progress', [True, False]) -def test_push_pull_show_progress(show_progress, protocol): +def test_to_bytes_show_progress(show_progress, protocol): da = DocumentArray.empty(1000) r = da.to_bytes(_show_progress=show_progress, protocol=protocol) da_r = DocumentArray.from_bytes(r, _show_progress=show_progress, protocol=protocol) From aa3d1bfb7aeede36b18814ad53c37cc533bfcd40 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 18 Aug 2022 15:26:55 +0200 Subject: [PATCH 06/21] feat: push meta data along with docarray --- docarray/array/mixins/io/pushpull.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 4cec5f4a14d..e5d9bea765b 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -4,7 +4,7 @@ import warnings from collections import Counter from pathlib import Path -from typing import Dict, Type, TYPE_CHECKING, List, Optional +from typing import Dict, Type, TYPE_CHECKING, List, Optional, Any import hubble from hubble import Client @@ -80,7 +80,6 @@ def cloud_delete(cls, name: str) -> None: """ Client(jsonify=True).delete_artifact(name) - def _get_raw_summary(self) -> List[Dict[str, Any]]: all_attrs = self._get_attributes('non_empty_fields') # remove underscore attribute From a5e0ba23b9cf8a7eedf5b08bd838e9405360122c Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 18 Aug 2022 15:28:49 +0200 Subject: [PATCH 07/21] feat: push meta data along with docarray --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 
7bfe03c15fd..dc631dc4440 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ 'Pillow', 'trimesh', 'scipy', - 'jina-hubble-sdk>=0.10.0', + 'jina-hubble-sdk==0.12.4', 'av', 'fastapi', 'uvicorn', From 1ff0d1f959ee3765f55ae7a821f51749f807ec53 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 18 Aug 2022 15:36:27 +0200 Subject: [PATCH 08/21] feat: push meta data along with docarray --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index dc631dc4440..b37b7030860 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ long_description_content_type='text/markdown', zip_safe=False, setup_requires=['setuptools>=18.0', 'wheel'], - install_requires=['numpy', 'rich>=12.0.0'], + install_requires=['numpy', 'rich>=12.0.0', 'jina-hubble-sdk>=0.13.0'], extras_require={ # req usage, please see https://docarray.jina.ai/#install 'common': [ @@ -50,7 +50,6 @@ 'Pillow', 'fastapi', 'uvicorn', - 'jina-hubble-sdk==0.12.4', ], 'full': [ 'protobuf>=3.13.0,<=3.20.1', @@ -60,7 +59,6 @@ 'Pillow', 'trimesh', 'scipy', - 'jina-hubble-sdk==0.12.4', 'av', 'fastapi', 'uvicorn', From f3891f4c4160cf242721d9b900a8dab50240bf76 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 18 Aug 2022 16:05:14 +0200 Subject: [PATCH 09/21] feat: push meta data along with docarray --- docarray/array/mixins/io/pushpull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index e5d9bea765b..8ddaffb5cea 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -78,7 +78,7 @@ def cloud_delete(cls, name: str) -> None: Delete a DocumentArray from the cloud. :param name: the name of the DocumentArray to delete. 
""" - Client(jsonify=True).delete_artifact(name) + Client(jsonify=True).delete_artifact(name=name) def _get_raw_summary(self) -> List[Dict[str, Any]]: all_attrs = self._get_attributes('non_empty_fields') From 410aae4d00cbc784a254b84b17084c60d966b1bf Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 18 Aug 2022 16:22:57 +0200 Subject: [PATCH 10/21] feat: push meta data along with docarray --- docarray/array/mixins/io/pushpull.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 8ddaffb5cea..fe0ed317491 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -37,23 +37,28 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: :param show_table: if true, show the table of the arrays. :returns: List of available DocumentArray's names. """ + from rich import print result = [] from rich.table import Table from rich import box + resp = Client(jsonify=True).list_artifacts( + filter={'type': 'documentArray'}, sort={'createdAt': 1} + ) + table = Table( - title='Your DocumentArray on the cloud', box=box.SIMPLE, highlight=True + title=f'Your {resp["meta"]["total"]} DocumentArray on the cloud', + box=box.SIMPLE, + highlight=True, ) table.add_column('Name') table.add_column('Length') - table.add_column('Visibility') - table.add_column('Create at', justify='center') + table.add_column('Access') + table.add_column('Created at', justify='center') table.add_column('Updated at', justify='center') - for da in Client(jsonify=True).list_artifacts( - filter={'type': 'documentArray'}, sort={'createdAt': 1} - )['data']: + for da in resp['data']: if da['type'] == 'documentArray': result.append(da['name']) @@ -66,8 +71,6 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: ) if show_table: - from rich import print - print(table) return result From 4c114441afcc1c4edc4ada3f436bc85bd002bc89 Mon Sep 17 00:00:00 2001 
From: Han Xiao Date: Thu, 18 Aug 2022 16:48:22 +0200 Subject: [PATCH 11/21] feat: push meta data along with docarray --- docarray/array/mixins/io/pushpull.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index fe0ed317491..bae527e2677 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -48,7 +48,7 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: ) table = Table( - title=f'Your {resp["meta"]["total"]} DocumentArray on the cloud', + title=f'You have {resp["meta"]["total"]} DocumentArray on the cloud', box=box.SIMPLE, highlight=True, ) diff --git a/setup.py b/setup.py index b37b7030860..728cbaec8d1 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ long_description_content_type='text/markdown', zip_safe=False, setup_requires=['setuptools>=18.0', 'wheel'], - install_requires=['numpy', 'rich>=12.0.0', 'jina-hubble-sdk>=0.13.0'], + install_requires=['numpy', 'rich>=12.0.0', 'jina-hubble-sdk>=0.13.1'], extras_require={ # req usage, please see https://docarray.jina.ai/#install 'common': [ From 115de5e2c11e0e1fe14554d01ed47d98353b1c47 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 18 Aug 2022 20:59:28 +0200 Subject: [PATCH 12/21] feat: push meta data along with docarray --- docarray/array/mixins/io/pushpull.py | 21 ++++++----- tests/unit/array/mixins/test_pushpull.py | 44 +++++++++++++++++++++--- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index bae527e2677..c75971c5013 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -59,16 +59,15 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: table.add_column('Updated at', justify='center') for da in resp['data']: - if da['type'] == 'documentArray': - result.append(da['name']) - - table.add_row( - 
da['name'], - str(_get_length_from_summary(da['metaData'].get('summary', []))), - da['visibility'], - da['createdAt'], - da['updatedAt'], - ) + result.append(da['name']) + + table.add_row( + da['name'], + str(_get_length_from_summary(da['metaData'].get('summary', []))), + da['visibility'], + da['createdAt'], + da['updatedAt'], + ) if show_table: print(table) @@ -117,7 +116,7 @@ def _get_raw_summary(self) -> List[Dict[str, Any]]: ), dict( name='Common Attributes', - value=list(attr_counter.items())[0][0], + value=list(attr_counter.items())[0][0] if attr_counter else None, description='The common attributes of all documents', ), dict( diff --git a/tests/unit/array/mixins/test_pushpull.py b/tests/unit/array/mixins/test_pushpull.py index 7ca2d0a322e..af17a607a30 100644 --- a/tests/unit/array/mixins/test_pushpull.py +++ b/tests/unit/array/mixins/test_pushpull.py @@ -1,13 +1,13 @@ import cgi -import json import os +from io import BytesIO + import pytest import requests -from io import BytesIO -from docarray import DocumentArray +from docarray import DocumentArray, Document, dataclass from docarray.helper import random_identity - +from docarray.typing import Image, Text from tests import random_docs @@ -136,7 +136,6 @@ def test_push_fail(mocker, monkeypatch): def test_api_url_change(mocker, monkeypatch): - test_api_url = 'http://localhost:8080' os.environ['JINA_HUBBLE_REGISTRY'] = test_api_url @@ -158,3 +157,38 @@ def test_api_url_change(mocker, monkeypatch): assert push_kwargs['url'].startswith(test_api_url) assert pull_kwargs['url'].startswith(test_api_url) + + +@dataclass +class MyDocument: + image: Image + paragraph: Text + + +@pytest.mark.parametrize( + 'da', + [ + DocumentArray(), + DocumentArray.empty(10), + DocumentArray.empty(10, storage='annlite', config={'n_dim': 10}), + DocumentArray( + [ + Document( + MyDocument( + image='https://docarray.jina.ai/_images/apple.png', + paragraph='hello world', + ) + ) + for _ in range(10) + ], + config={'n_dim': 256}, + 
storage='annlite', + subindex_configs={ + '@.[image]': {'n_dim': 512}, + '@.[paragraph]': {'n_dim': 128}, + }, + ), + ], +) +def test_get_raw_summary(da: DocumentArray): + assert da._get_raw_summary() From 7b008c1ad9587e242b3a7bd6df0676ce5b27c006 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 13 Sep 2022 14:13:00 +0100 Subject: [PATCH 13/21] chore: login not required --- docarray/array/mixins/io/pushpull.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index c75971c5013..3df22b52f3e 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -30,7 +30,6 @@ class PushPullMixin: _max_bytes = 4 * 1024 * 1024 * 1024 @classmethod - @hubble.login_required def cloud_list(cls, show_table: bool = False) -> List[str]: """List all available arrays in the cloud. @@ -74,7 +73,6 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: return result @classmethod - @hubble.login_required def cloud_delete(cls, name: str) -> None: """ Delete a DocumentArray from the cloud. @@ -171,7 +169,6 @@ def _get_raw_summary(self) -> List[Dict[str, Any]]: return items - @hubble.login_required def push( self, name: str, @@ -274,7 +271,6 @@ def _get_chunk(_batch): response.raise_for_status() @classmethod - @hubble.login_required def pull( cls: Type['T'], name: str, From a401d65529f1d38962835273c60ddedf26cf9754 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 14 Sep 2022 16:40:39 +0100 Subject: [PATCH 14/21] Revert "chore: login not required" This reverts commit 7b008c1ad9587e242b3a7bd6df0676ce5b27c006. 
--- docarray/array/mixins/io/pushpull.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 3df22b52f3e..c75971c5013 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -30,6 +30,7 @@ class PushPullMixin: _max_bytes = 4 * 1024 * 1024 * 1024 @classmethod + @hubble.login_required def cloud_list(cls, show_table: bool = False) -> List[str]: """List all available arrays in the cloud. @@ -73,6 +74,7 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: return result @classmethod + @hubble.login_required def cloud_delete(cls, name: str) -> None: """ Delete a DocumentArray from the cloud. @@ -169,6 +171,7 @@ def _get_raw_summary(self) -> List[Dict[str, Any]]: return items + @hubble.login_required def push( self, name: str, @@ -271,6 +274,7 @@ def _get_chunk(_batch): response.raise_for_status() @classmethod + @hubble.login_required def pull( cls: Type['T'], name: str, From 969ad903d304445a5caa6ebed3e09746139c7e8b Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Thu, 15 Sep 2022 08:03:47 +0100 Subject: [PATCH 15/21] refactor: reuse common code --- docarray/array/mixins/plot.py | 38 +++++++++++---------------------- docarray/helper.py | 40 ++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/docarray/array/mixins/plot.py b/docarray/array/mixins/plot.py index 86c62d50498..9c5227e9776 100644 --- a/docarray/array/mixins/plot.py +++ b/docarray/array/mixins/plot.py @@ -11,6 +11,8 @@ import numpy as np +from docarray.helper import _get_array_info + class PlotMixin: """Helper functions for plotting the arrays.""" @@ -37,44 +39,28 @@ def summary(self): tables = [] console = Console() - all_attrs = self._get_attributes('non_empty_fields') - # remove underscore attribute - all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs] - attr_counter = Counter(all_attrs) + ( + 
is_homo, + _nested_in, + _nested_items, + attr_counter, + all_attrs_names, + ) = _get_array_info(self) table = Table(box=box.SIMPLE, highlight=True) table.show_header = False table.add_row('Type', self.__class__.__name__) table.add_row('Length', str(len(self))) - is_homo = len(attr_counter) == 1 table.add_row('Homogenous Documents', str(is_homo)) - all_attrs_names = set(v for k in all_attrs for v in k) - _nested_in = [] - if 'chunks' in all_attrs_names: - _nested_in.append('chunks') - - if 'matches' in all_attrs_names: - _nested_in.append('matches') - if _nested_in: table.add_row('Has nested Documents in', str(tuple(_nested_in))) if is_homo: table.add_row('Common Attributes', str(list(attr_counter.items())[0][0])) - else: - for _a, _n in attr_counter.most_common(): - if _n == 1: - _doc_text = f'{_n} Document has' - else: - _doc_text = f'{_n} Documents have' - if len(_a) == 1: - _text = f'{_doc_text} one attribute' - elif len(_a) == 0: - _text = f'{_doc_text} no attribute' - else: - _text = f'{_doc_text} attributes' - table.add_row(_text, str(_a)) + + for item in _nested_items: + table.add_row(item['name'], item['value']) is_multimodal = all(d.is_multimodal for d in self) table.add_row('Multimodal dataclass', str(is_multimodal)) diff --git a/docarray/helper.py b/docarray/helper.py index 00ce8011be4..f65c7d6d9ac 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -6,7 +6,11 @@ import uuid import warnings from os.path import expanduser -from typing import Any, Dict, Optional, Sequence, Tuple, Union +from typing import Any, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING +from collections import Counter + +if TYPE_CHECKING: + from docarray import DocumentArray __resources_path__ = os.path.join( os.path.dirname( @@ -455,3 +459,37 @@ def _safe_cast_int(value: Union[str, int, float]) -> int: if isinstance(value, float) and not value.is_integer(): raise ValueError(f"Can't safely cast {value} to an int") return int(value) + + +def _get_array_info(da: 
'DocumentArray'): + all_attrs = da._get_attributes('non_empty_fields') + # remove underscore attribute + all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs] + attr_counter = Counter(all_attrs) + + all_attrs_names = set(v for k in all_attrs for v in k) + _nested_in = [] + if 'chunks' in all_attrs_names: + _nested_in.append('chunks') + + if 'matches' in all_attrs_names: + _nested_in.append('matches') + + is_homo = len(attr_counter) == 1 + + _nested_items = [] + if not is_homo: + for _a, _n in attr_counter.most_common(): + if _n == 1: + _doc_text = f'{_n} Document has' + else: + _doc_text = f'{_n} Documents have' + if len(_a) == 1: + _text = f'{_doc_text} one attribute' + elif len(_a) == 0: + _text = f'{_doc_text} no attribute' + else: + _text = f'{_doc_text} attributes' + _nested_items.append(dict(name=_text, value=str(_a), description='')) + + return is_homo, _nested_in, _nested_items, attr_counter, all_attrs_names From af93164e74ab63e6f40e56c736b6fad9c37bf524 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Thu, 15 Sep 2022 08:06:26 +0100 Subject: [PATCH 16/21] chore: rename client --- docarray/array/mixins/io/pushpull.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index c75971c5013..2839ced8d1f 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -7,7 +7,7 @@ from typing import Dict, Type, TYPE_CHECKING, List, Optional, Any import hubble -from hubble import Client +from hubble import Client as HubbleClient from hubble.client.endpoints import EndpointsV2 @@ -43,7 +43,7 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: from rich.table import Table from rich import box - resp = Client(jsonify=True).list_artifacts( + resp = HubbleClient(jsonify=True).list_artifacts( filter={'type': 'documentArray'}, sort={'createdAt': 1} ) @@ -80,7 +80,7 @@ def cloud_delete(cls, name: 
str) -> None: Delete a DocumentArray from the cloud. :param name: the name of the DocumentArray to delete. """ - Client(jsonify=True).delete_artifact(name=name) + HubbleClient(jsonify=True).delete_artifact(name=name) def _get_raw_summary(self) -> List[Dict[str, Any]]: all_attrs = self._get_attributes('non_empty_fields') @@ -263,7 +263,7 @@ def _get_chunk(_batch): with pbar: response = requests.post( - Client()._base_url + EndpointsV2.upload_artifact, + HubbleClient()._base_url + EndpointsV2.upload_artifact, data=gen(), headers=headers, ) @@ -300,7 +300,7 @@ def pull( if auth_token: headers['Authorization'] = f'token {auth_token}' - url = Client()._base_url + EndpointsV2.download_artifact + f'?name={name}' + url = HubbleClient()._base_url + EndpointsV2.download_artifact + f'?name={name}' response = requests.get(url, headers=headers) if response.ok: From cd22c00f68d62a4b095f325282901b558a99048a Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Thu, 15 Sep 2022 08:27:26 +0100 Subject: [PATCH 17/21] docs: document cloud operations --- .../documentarray/serialization.md | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index 4069ad9ae81..d027858eecb 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -365,6 +365,11 @@ da = DocumentArray.from_dataframe(df) This feature requires `rich` and `requests` dependency. You can do `pip install "docarray[full]"` to install it. ``` +```{important} +As of DocArray 0.16.6, it is required to login to Jina Ecosystem before using cloud resources. Make sure to do +`jina auth login` before using {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.push` and {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.pull`. 
+``` + {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.push` and {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.pull` allows you to serialize a DocumentArray object to Jina Cloud and share it across machines. Considering you are working on a GPU machine via Google Colab/Jupyter. After preprocessing and embedding, you got everything you need in a DocumentArray. You can easily store it to the cloud via: @@ -393,3 +398,26 @@ The maximum size of an upload is 4GB under the `protocol='protobuf'` and `compre To avoid unnecessary download when upstream DocumentArray is unchanged, you can add `DocumentArray.pull(..., local_cache=True)`. +Furthermore, it is possible to list all `DocumentArray` objects stored on the cloud using: +```python +DocumentArray.cloud_list(show_table=True) +``` + +```text + You have 1 DocumentArray on the cloud + + Name Length Access Created at Updated at + ──────────────────────────────────────────────────────────────────────────────── + da123 10 public 2022-09-15T07:14:54.256Z 2022-09-15T07:14:54.256Z + +['da123'] +``` + +```{tip} +Use the `show_table` parameter to show a table summarizing information about the DocumentArrays in the cloud. 
+``` + +It is also possible to delete DocumentArray objects in the cloud using: +```python +DocumentArray.cloud_delete('da123') +``` From df4daae9ded02e75251af50902e9c3b91ef407dd Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Thu, 15 Sep 2022 11:38:08 +0100 Subject: [PATCH 18/21] chore: login not required --- docarray/array/mixins/io/pushpull.py | 4 ---- docs/fundamentals/documentarray/serialization.md | 5 ----- 2 files changed, 9 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 2839ced8d1f..2a59f10547b 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -30,7 +30,6 @@ class PushPullMixin: _max_bytes = 4 * 1024 * 1024 * 1024 @classmethod - @hubble.login_required def cloud_list(cls, show_table: bool = False) -> List[str]: """List all available arrays in the cloud. @@ -74,7 +73,6 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: return result @classmethod - @hubble.login_required def cloud_delete(cls, name: str) -> None: """ Delete a DocumentArray from the cloud. @@ -171,7 +169,6 @@ def _get_raw_summary(self) -> List[Dict[str, Any]]: return items - @hubble.login_required def push( self, name: str, @@ -274,7 +271,6 @@ def _get_chunk(_batch): response.raise_for_status() @classmethod - @hubble.login_required def pull( cls: Type['T'], name: str, diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index d027858eecb..d9dfb2a8a8a 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -365,11 +365,6 @@ da = DocumentArray.from_dataframe(df) This feature requires `rich` and `requests` dependency. You can do `pip install "docarray[full]"` to install it. ``` -```{important} -As of DocArray 0.16.6, it is required to login to Jina Ecosystem before using cloud resources. 
Make sure to do -`jina auth login` before using {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.push` and {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.pull`. -``` - {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.push` and {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.pull` allows you to serialize a DocumentArray object to Jina Cloud and share it across machines. Considering you are working on a GPU machine via Google Colab/Jupyter. After preprocessing and embedding, you got everything you need in a DocumentArray. You can easily store it to the cloud via: From ef4946bc55462df3cd41b7e7330fabb8c78d1488 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Fri, 16 Sep 2022 09:39:39 +0100 Subject: [PATCH 19/21] chore: apply suggestions --- docarray/array/mixins/io/pushpull.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 2a59f10547b..47e355456d3 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -29,8 +29,8 @@ class PushPullMixin: _max_bytes = 4 * 1024 * 1024 * 1024 - @classmethod - def cloud_list(cls, show_table: bool = False) -> List[str]: + @staticmethod + def cloud_list(show_table: bool = False) -> List[str]: """List all available arrays in the cloud. :param show_table: if true, show the table of the arrays. @@ -72,8 +72,8 @@ def cloud_list(cls, show_table: bool = False) -> List[str]: print(table) return result - @classmethod - def cloud_delete(cls, name: str) -> None: + @staticmethod + def cloud_delete(name: str) -> None: """ Delete a DocumentArray from the cloud. :param name: the name of the DocumentArray to delete. 
@@ -174,7 +174,7 @@ def push( name: str, show_progress: bool = False, public: bool = True, - branding: Dict = None, + branding: Optional[Dict] = None, ) -> Dict: """Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.push` From 328057737ee1d8f532d985e0ef04032e7f53cbc3 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Fri, 16 Sep 2022 09:45:06 +0100 Subject: [PATCH 20/21] refactor: reuse code --- docarray/array/mixins/io/pushpull.py | 37 ++++++---------------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index 47e355456d3..56999e26933 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -11,7 +11,7 @@ from hubble.client.endpoints import EndpointsV2 -from docarray.helper import get_request_header, __cache_path__ +from docarray.helper import get_request_header, __cache_path__, _get_array_info if TYPE_CHECKING: from docarray.typing import T @@ -81,20 +81,13 @@ def cloud_delete(name: str) -> None: HubbleClient(jsonify=True).delete_artifact(name=name) def _get_raw_summary(self) -> List[Dict[str, Any]]: - all_attrs = self._get_attributes('non_empty_fields') - # remove underscore attribute - all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs] - attr_counter = Counter(all_attrs) - - all_attrs_names = set(v for k in all_attrs for v in k) - _nested_in = [] - if 'chunks' in all_attrs_names: - _nested_in.append('chunks') - - if 'matches' in all_attrs_names: - _nested_in.append('matches') - - is_homo = len(attr_counter) == 1 + ( + is_homo, + _nested_in, + _nested_items, + attr_counter, + all_attrs_names, + ) = _get_array_info(self) items = [ dict( @@ -132,20 +125,6 @@ def _get_raw_summary(self) -> List[Dict[str, Any]]: ), ] - _nested_items = [] - if not is_homo: - for _a, _n in attr_counter.most_common(): - if _n == 1: - _doc_text = f'{_n} Document has' - else: - 
_doc_text = f'{_n} Documents have' - if len(_a) == 1: - _text = f'{_doc_text} one attribute' - elif len(_a) == 0: - _text = f'{_doc_text} no attribute' - else: - _text = f'{_doc_text} attributes' - _nested_items.append(dict(name=_text, value=str(_a), description='')) items.append( dict( name='Inspect attributes', From 88406a1471f1d21e8dfae02b9f293c808d16b86b Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Fri, 16 Sep 2022 10:31:24 +0100 Subject: [PATCH 21/21] chore: better naming --- docarray/helper.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/docarray/helper.py b/docarray/helper.py index f65c7d6d9ac..12143334548 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -479,17 +479,19 @@ def _get_array_info(da: 'DocumentArray'): _nested_items = [] if not is_homo: - for _a, _n in attr_counter.most_common(): - if _n == 1: - _doc_text = f'{_n} Document has' + for n_attributes, n_docs in attr_counter.most_common(): + if n_docs == 1: + _doc_text = f'{n_docs} Document has' else: - _doc_text = f'{_n} Documents have' - if len(_a) == 1: + _doc_text = f'{n_docs} Documents have' + if len(n_attributes) == 1: _text = f'{_doc_text} one attribute' - elif len(_a) == 0: + elif len(n_attributes) == 0: _text = f'{_doc_text} no attribute' else: _text = f'{_doc_text} attributes' - _nested_items.append(dict(name=_text, value=str(_a), description='')) + _nested_items.append( + dict(name=_text, value=str(n_attributes), description='') + ) return is_homo, _nested_in, _nested_items, attr_counter, all_attrs_names