Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
df32345
feat: push meta data along with docarray
hanxiao Aug 17, 2022
31a902d
feat: push meta data along with docarray
hanxiao Aug 17, 2022
8d60522
feat: push meta data along with docarray
hanxiao Aug 17, 2022
aa1b3d9
feat: push meta data along with docarray
hanxiao Aug 17, 2022
27bfb9f
feat: push meta data along with docarray (#491)
hanxiao Aug 18, 2022
aa3d1bf
feat: push meta data along with docarray
hanxiao Aug 18, 2022
a5e0ba2
feat: push meta data along with docarray
hanxiao Aug 18, 2022
1ff0d1f
feat: push meta data along with docarray
hanxiao Aug 18, 2022
f3891f4
feat: push meta data along with docarray
hanxiao Aug 18, 2022
410aae4
feat: push meta data along with docarray
hanxiao Aug 18, 2022
4c11444
feat: push meta data along with docarray
hanxiao Aug 18, 2022
115de5e
feat: push meta data along with docarray
hanxiao Aug 18, 2022
5464342
Merge branch 'main' into feat-push-metadata
alaeddine-13 Sep 9, 2022
1610518
Merge branch 'main' into feat-push-metadata
alaeddine-13 Sep 12, 2022
7b008c1
chore: login not required
alaeddine-13 Sep 13, 2022
a401d65
Revert "chore: login not required"
alaeddine-13 Sep 14, 2022
37ab725
Merge branch 'main' into feat-push-metadata
alaeddine-13 Sep 14, 2022
969ad90
refactor: reuse common code
alaeddine-13 Sep 15, 2022
af93164
chore: rename client
alaeddine-13 Sep 15, 2022
cd22c00
docs: document cloud operations
alaeddine-13 Sep 15, 2022
df4daae
chore: login not required
alaeddine-13 Sep 15, 2022
ef4946b
chore: apply suggestions
alaeddine-13 Sep 16, 2022
3280577
refactor: reuse code
alaeddine-13 Sep 16, 2022
88406a1
chore: better naming
alaeddine-13 Sep 16, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ jobs:
-v -s -m "not gpu" ${{ matrix.test-path }}
echo "::set-output name=codecov_flag::docarray"
timeout-minutes: 30
env:
JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
- name: Check codecov file
id: check_files
uses: andstor/file-existence-action@v1
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ jobs:
-v -s -m "not gpu" ${{ matrix.test-path }}
echo "::set-output name=codecov_flag::docarray"
timeout-minutes: 30
env:
JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
- name: Check codecov file
id: check_files
uses: andstor/file-existence-action@v1
Expand Down
159 changes: 147 additions & 12 deletions docarray/array/mixins/io/pushpull.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,159 @@
import json
import os
import os.path
import warnings
from collections import Counter
from pathlib import Path
from typing import Dict, Type, TYPE_CHECKING, Optional
from typing import Dict, Type, TYPE_CHECKING, List, Optional, Any

from docarray.helper import get_request_header, __cache_path__
import hubble
from hubble import Client as HubbleClient
from hubble.client.endpoints import EndpointsV2


from docarray.helper import get_request_header, __cache_path__, _get_array_info

if TYPE_CHECKING:
from docarray.typing import T


def _get_length_from_summary(summary: List[Dict]) -> Optional[int]:
"""Get the length from summary."""
for item in summary:
if 'Length' == item['name']:
return item['value']


class PushPullMixin:
"""Transmitting :class:`DocumentArray` via Jina Cloud Service"""

_max_bytes = 4 * 1024 * 1024 * 1024

@staticmethod
def cloud_list(show_table: bool = False) -> List[str]:
    """List all available arrays in the cloud.

    :param show_table: if true, show the table of the arrays.
    :returns: List of available DocumentArray's names.
    """
    resp = HubbleClient(jsonify=True).list_artifacts(
        filter={'type': 'documentArray'}, sort={'createdAt': 1}
    )
    arrays = resp['data']
    names = [da['name'] for da in arrays]

    # The table is only needed for display, so skip building it otherwise.
    if not show_table:
        return names

    from rich import box
    from rich import print as rich_print
    from rich.table import Table

    table = Table(
        title=f'You have {resp["meta"]["total"]} DocumentArray on the cloud',
        box=box.SIMPLE,
        highlight=True,
    )
    table.add_column('Name')
    table.add_column('Length')
    table.add_column('Access')
    table.add_column('Created at', justify='center')
    table.add_column('Updated at', justify='center')

    for da in arrays:
        # An artifact may carry no (or null) metadata; treat that as an
        # empty summary instead of failing on ``None.get``.
        summary = (da.get('metaData') or {}).get('summary', [])
        table.add_row(
            da['name'],
            str(_get_length_from_summary(summary)),
            da['visibility'],
            da['createdAt'],
            da['updatedAt'],
        )

    rich_print(table)
    return names

@staticmethod
def cloud_delete(name: str) -> None:
    """
    Remove the DocumentArray stored under ``name`` from the cloud.

    :param name: the name of the DocumentArray to delete.
    """
    client = HubbleClient(jsonify=True)
    client.delete_artifact(name=name)

def _get_raw_summary(self) -> List[Dict[str, Any]]:
    """Build the summary of this DocumentArray as a list of plain dicts.

    Every item carries a ``name``, a ``value`` and a human-readable
    ``description``. The result is what :meth:`push` serializes into the
    ``summary`` field of the uploaded metadata.

    :return: the list of summary items, in display order.
    """
    is_homo, nested_in, attr_items, attr_counter, _ = _get_array_info(self)

    items = [
        dict(
            name='Type',
            value=self.__class__.__name__,
            description='The type of the DocumentArray',
        ),
        dict(
            name='Length',
            value=len(self),
            description='The length of the DocumentArray',
        ),
        dict(
            name='Homogenous Documents',
            value=is_homo,
            description='Whether all documents are of the same structure, attributes',
        ),
        dict(
            name='Common Attributes',
            # first attribute tuple recorded by the counter; None for an empty array
            value=next(iter(attr_counter), None),
            description='The common attributes of all documents',
        ),
        dict(
            name='Has nested Documents in',
            value=tuple(nested_in),
            description='The field that contains nested Documents',
        ),
        dict(
            name='Multimodal dataclass',
            value=all(d.is_multimodal for d in self),
            description='Whether all documents are multimodal',
        ),
        dict(
            name='Subindices',
            value=tuple(getattr(self, '_subindices', {}).keys()),
            description='The names of the subindices',
        ),
        dict(
            name='Inspect attributes',
            value=attr_items,
            description='Quick overview of attributes of all documents',
        ),
    ]

    # One ``{name, value}`` entry per storage info; an empty list when the
    # backend reports nothing.
    storage_infos = self._get_storage_infos()
    storage_items = (
        [dict(name=k, value=v) for k, v in storage_infos.items()]
        if storage_infos
        else []
    )
    items.append(
        dict(
            name='Storage backend',
            value=storage_items,
            description='Quick overview of the Document Store',
        )
    )

    return items

def push(
self,
name: str,
show_progress: bool = False,
public: bool = True,
branding: Optional[Dict] = None,
) -> Dict:
"""Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.push`

Expand All @@ -33,6 +168,7 @@ def push(
:param show_progress: if to show a progress bar on pulling
:param public: by default anyone can pull a DocumentArray if they know its name.
Setting this to False will allow only the creator to pull it. This feature of course requires you to log in first.
:param branding: a dict of branding information to be sent to Jina Cloud. {"icon": "emoji", "background": "#fff"}
"""
import requests

Expand All @@ -47,11 +183,14 @@ def push(
'name': name,
'type': 'documentArray',
'public': public,
'metaData': json.dumps(
{'summary': self._get_raw_summary(), 'branding': branding},
sort_keys=True,
),
}
)

headers = {'Content-Type': ctype, **get_request_header()}
import hubble

auth_token = hubble.get_token()
if auth_token:
Expand Down Expand Up @@ -98,11 +237,9 @@ def _get_chunk(_batch):
yield _tail

with pbar:
from hubble import Client
from hubble.client.endpoints import EndpointsV2

response = requests.post(
Client()._base_url + EndpointsV2.upload_artifact,
HubbleClient()._base_url + EndpointsV2.upload_artifact,
data=gen(),
headers=headers,
)
Expand Down Expand Up @@ -133,17 +270,12 @@ def pull(

headers = {}

import hubble

auth_token = hubble.get_token()

if auth_token:
headers['Authorization'] = f'token {auth_token}'

from hubble import Client
from hubble.client.endpoints import EndpointsV2

url = Client()._base_url + EndpointsV2.download_artifact + f'?name={name}'
url = HubbleClient()._base_url + EndpointsV2.download_artifact + f'?name={name}'
response = requests.get(url, headers=headers)

if response.ok:
Expand Down Expand Up @@ -183,3 +315,6 @@ def pull(
fp.write(_source.content)

return r

cloud_push = push
cloud_pull = pull
38 changes: 12 additions & 26 deletions docarray/array/mixins/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

import numpy as np

from docarray.helper import _get_array_info


class PlotMixin:
"""Helper functions for plotting the arrays."""
Expand All @@ -37,44 +39,28 @@ def summary(self):
tables = []
console = Console()

all_attrs = self._get_attributes('non_empty_fields')
# remove underscore attribute
all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs]
attr_counter = Counter(all_attrs)
(
is_homo,
_nested_in,
_nested_items,
attr_counter,
all_attrs_names,
) = _get_array_info(self)

table = Table(box=box.SIMPLE, highlight=True)
table.show_header = False
table.add_row('Type', self.__class__.__name__)
table.add_row('Length', str(len(self)))
is_homo = len(attr_counter) == 1
table.add_row('Homogenous Documents', str(is_homo))

all_attrs_names = set(v for k in all_attrs for v in k)
_nested_in = []
if 'chunks' in all_attrs_names:
_nested_in.append('chunks')

if 'matches' in all_attrs_names:
_nested_in.append('matches')

if _nested_in:
table.add_row('Has nested Documents in', str(tuple(_nested_in)))

if is_homo:
table.add_row('Common Attributes', str(list(attr_counter.items())[0][0]))
else:
for _a, _n in attr_counter.most_common():
if _n == 1:
_doc_text = f'{_n} Document has'
else:
_doc_text = f'{_n} Documents have'
if len(_a) == 1:
_text = f'{_doc_text} one attribute'
elif len(_a) == 0:
_text = f'{_doc_text} no attribute'
else:
_text = f'{_doc_text} attributes'
table.add_row(_text, str(_a))

for item in _nested_items:
table.add_row(item['name'], item['value'])

is_multimodal = all(d.is_multimodal for d in self)
table.add_row('Multimodal dataclass', str(is_multimodal))
Expand Down
42 changes: 41 additions & 1 deletion docarray/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import uuid
import warnings
from os.path import expanduser
from typing import Any, Dict, Optional, Sequence, Tuple, Union
from typing import Any, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
from collections import Counter

if TYPE_CHECKING:
from docarray import DocumentArray

__resources_path__ = os.path.join(
os.path.dirname(
Expand Down Expand Up @@ -455,3 +459,39 @@ def _safe_cast_int(value: Union[str, int, float]) -> int:
if isinstance(value, float) and not value.is_integer():
raise ValueError(f"Can't safely cast {value} to an int")
return int(value)


def _get_array_info(da: 'DocumentArray'):
all_attrs = da._get_attributes('non_empty_fields')
# remove underscore attribute
all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs]
attr_counter = Counter(all_attrs)

all_attrs_names = set(v for k in all_attrs for v in k)
_nested_in = []
if 'chunks' in all_attrs_names:
_nested_in.append('chunks')

if 'matches' in all_attrs_names:
_nested_in.append('matches')

is_homo = len(attr_counter) == 1

_nested_items = []
if not is_homo:
for n_attributes, n_docs in attr_counter.most_common():
if n_docs == 1:
_doc_text = f'{n_docs} Document has'
else:
_doc_text = f'{n_docs} Documents have'
if len(n_attributes) == 1:
_text = f'{_doc_text} one attribute'
elif len(n_attributes) == 0:
_text = f'{_doc_text} no attribute'
else:
_text = f'{_doc_text} attributes'
_nested_items.append(
dict(name=_text, value=str(n_attributes), description='')
)

return is_homo, _nested_in, _nested_items, attr_counter, all_attrs_names
23 changes: 23 additions & 0 deletions docs/fundamentals/documentarray/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -393,3 +393,26 @@ The maximum size of an upload is 4GB under the `protocol='protobuf'` and `compre

To avoid unnecessary download when upstream DocumentArray is unchanged, you can add `DocumentArray.pull(..., local_cache=True)`.

Furthermore, it is possible to list all `DocumentArray` objects stored on the cloud using:
```python
DocumentArray.cloud_list(show_table=True)
```

```text
You have 1 DocumentArray on the cloud

Name Length Access Created at Updated at
────────────────────────────────────────────────────────────────────────────────
da123 10 public 2022-09-15T07:14:54.256Z 2022-09-15T07:14:54.256Z

['da123']
```

```{tip}
Use the `show_table` parameter to print a table summarizing information about the DocumentArrays in the cloud.
```

It is also possible to delete DocumentArray objects in the cloud using:
```python
DocumentArray.cloud_delete('da123')
```
Loading