From 4b7ba3533310b8e05af780ff954d4e77b605898b Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 6 Jan 2022 16:25:47 +0100 Subject: [PATCH 1/6] docs(array): add doc for array serialization --- docarray/array/mixins/plot.py | 21 ++++- docarray/array/mixins/traverse.py | 47 +++++++--- docs/fundamentals/document/visualization.md | 2 +- .../documentarray/access-elements.md | 90 +++++++++++++++++-- .../images/docarray-index-example-full1.svg | 1 + .../images/docarray-index-example-full2.svg | 1 + .../images/docarray-index-example-full3.svg | 1 + .../images/docarray-index-example.svg | 1 + docs/index.md | 2 + tests/unit/array/test_advance_indexing.py | 21 +++++ 10 files changed, 167 insertions(+), 20 deletions(-) create mode 100644 docs/fundamentals/documentarray/images/docarray-index-example-full1.svg create mode 100644 docs/fundamentals/documentarray/images/docarray-index-example-full2.svg create mode 100644 docs/fundamentals/documentarray/images/docarray-index-example-full3.svg create mode 100644 docs/fundamentals/documentarray/images/docarray-index-example.svg diff --git a/docarray/array/mixins/plot.py b/docarray/array/mixins/plot.py index 9393b87fdae..2266dfa948e 100644 --- a/docarray/array/mixins/plot.py +++ b/docarray/array/mixins/plot.py @@ -28,6 +28,17 @@ def summary(self): is_homo = len(attr_counter) == 1 table.add_row('Homogenous Documents', str(is_homo)) + all_attrs_names = set(v for k in all_attrs for v in k) + _nested_in = [] + if 'chunks' in all_attrs_names: + _nested_in.append('chunks') + + if 'matches' in all_attrs_names: + _nested_in.append('matches') + + if _nested_in: + table.add_row('Has nested Documents in', str(tuple(_nested_in))) + if is_homo: table.add_row('Common Attributes', str(list(attr_counter.items())[0][0])) else: @@ -50,16 +61,18 @@ def summary(self): attr_table.add_column('#Unique values') attr_table.add_column('Has empty value') - all_attrs_names = tuple(sorted(set(v for k in all_attrs for v in k))) + all_attrs_names = 
tuple(sorted(all_attrs_names)) all_attrs_values = self.get_attributes(*all_attrs_names) if len(all_attrs_names) == 1: all_attrs_values = [all_attrs_values] for _a, _a_name in zip(all_attrs_values, all_attrs_names): - _counter_a = Counter(_a) - _set_a = set(_a) + try: + _a = set(_a) + except: + pass _set_type_a = set(type(_aa).__name__ for _aa in _a) attr_table.add_row( - _a_name, str(tuple(_set_type_a)), str(len(_set_a)), str(None in _set_a) + _a_name, str(tuple(_set_type_a)), str(len(_a)), str(None in _a) ) console = Console() diff --git a/docarray/array/mixins/traverse.py b/docarray/array/mixins/traverse.py index 426bbe06111..acf8059f4c4 100644 --- a/docarray/array/mixins/traverse.py +++ b/docarray/array/mixins/traverse.py @@ -1,10 +1,11 @@ import itertools +import re from typing import ( Iterable, TYPE_CHECKING, Optional, Callable, - Union, + Tuple, ) if TYPE_CHECKING: @@ -50,24 +51,26 @@ def _traverse( path: str, filter_fn: Optional[Callable[['Document'], bool]] = None, ): - path = path.strip() + path = re.sub(r'\s+', '', path) if path: - loc = path[0] - if loc == 'r': - yield from TraverseMixin._traverse(docs, path[1:], filter_fn=filter_fn) - elif loc == 'm': + cur_loc, cur_slice, _left = _parse_path_string(path) + if cur_loc == 'r': + yield from TraverseMixin._traverse( + docs[cur_slice], _left, filter_fn=filter_fn + ) + elif cur_loc == 'm': for d in docs: yield from TraverseMixin._traverse( - d.matches, path[1:], filter_fn=filter_fn + d.matches[cur_slice], _left, filter_fn=filter_fn ) - elif loc == 'c': + elif cur_loc == 'c': for d in docs: yield from TraverseMixin._traverse( - d.chunks, path[1:], filter_fn=filter_fn + d.chunks[cur_slice], _left, filter_fn=filter_fn ) else: raise ValueError( - f'`path`:{loc} is invalid, must be one of `c`, `r`, `m`' + f'`path`:{path} is invalid, please refer to https://docarray.jina.ai/fundamentals/documentarray/access-elements/#index-by-nested-structure' ) elif filter_fn is None: yield docs @@ -148,3 +151,27 @@ def 
_flatten(sequence) -> 'DocumentArray': from ... import DocumentArray return DocumentArray(list(itertools.chain.from_iterable(sequence))) + + +def _parse_path_string(p: str) -> Tuple[str, slice, str]: + g = re.match(r'^([rcm])([-\d:]+)?([rcm].*)?$', p) + _this = g.group(1) + slice_str = g.group(2) + _next = g.group(3) + return _this, _parse_slice(slice_str or ':'), _next or '' + + +def _parse_slice(value): + """ + Parses a `slice()` from string, like `start:stop:step`. + """ + if value: + parts = value.split(':') + if len(parts) == 1: + # slice(stop) + parts = [None, parts[0]] + # else: slice(start, stop[, step]) + else: + # slice() + parts = [] + return slice(*[int(p) if p else None for p in parts]) diff --git a/docs/fundamentals/document/visualization.md b/docs/fundamentals/document/visualization.md index 710a34a9684..a4e25a43e70 100644 --- a/docs/fundamentals/document/visualization.md +++ b/docs/fundamentals/document/visualization.md @@ -2,7 +2,7 @@ If you have an image Document (with possible image data in `.uri`/`.blob`), you can directly visualize it via {meth}`~docarray.document.mixins.plot.PlotMixin.plot`. -```{figure} images/doc-in-jupyter.png +```{figure} images/doc-plot-in-jupyter.png ``` diff --git a/docs/fundamentals/documentarray/access-elements.md b/docs/fundamentals/documentarray/access-elements.md index 233e0a67e50..d29ffedbb50 100644 --- a/docs/fundamentals/documentarray/access-elements.md +++ b/docs/fundamentals/documentarray/access-elements.md @@ -111,12 +111,89 @@ From early chapter, we already know {ref}`Document can be nested + └─ matches + ├─ + └─ + └─ chunks + ├─ + └─ +``` + +That's still too much information, let's minimize it. + +```{figure} images/docarray-index-example.svg +:width: 10% +``` + +Now let's use the red circle to depict our intended selection. 
Here is what you can do with the path-syntax: + +```{figure} images/docarray-index-example-full1.svg +``` + +```python +print(da['@m']) +print(da['@c']) +print(da['@c,m']) +print(da['@c,m,r']) +``` + +```text + + + + +``` + +Let's now consider a deeper nested structure and use the path syntax to select Documents. + +```{figure} images/docarray-index-example-full2.svg +``` + +Last but not least, you can use an integer or an integer slice to restrict the selection. +```{figure} images/docarray-index-example-full3.svg +:width: 60% +``` + +You can add space in the path-string for a better readability. Alternatively, you may leverage {meth}`~docarray.array.mixins.traverse.TraverseMixin.traverse_flat` to do this more explicitly. ## Index by flatten @@ -156,6 +233,9 @@ da[...].summary() Note that there is no `chunks` and `matches` in any of the Document from `da[...]` anymore. They are all flattened. +Documents in `da[...]` are in the chunks-and-depth-first order, i.e., a depth-first traversal of all chunks and then of all matches. + + ## Batching One can batch a large DocumentArray into small ones via {meth}`~docarray.array.mixins.group.GroupMixin.batch`. This is useful when a DocumentArray is too big to process at once. 
diff --git a/docs/fundamentals/documentarray/images/docarray-index-example-full1.svg b/docs/fundamentals/documentarray/images/docarray-index-example-full1.svg new file mode 100644 index 00000000000..97a6abb127b --- /dev/null +++ b/docs/fundamentals/documentarray/images/docarray-index-example-full1.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/fundamentals/documentarray/images/docarray-index-example-full2.svg b/docs/fundamentals/documentarray/images/docarray-index-example-full2.svg new file mode 100644 index 00000000000..bf8423fb838 --- /dev/null +++ b/docs/fundamentals/documentarray/images/docarray-index-example-full2.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/fundamentals/documentarray/images/docarray-index-example-full3.svg b/docs/fundamentals/documentarray/images/docarray-index-example-full3.svg new file mode 100644 index 00000000000..261842996b3 --- /dev/null +++ b/docs/fundamentals/documentarray/images/docarray-index-example-full3.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/fundamentals/documentarray/images/docarray-index-example.svg b/docs/fundamentals/documentarray/images/docarray-index-example.svg new file mode 100644 index 00000000000..427f9d2104f --- /dev/null +++ b/docs/fundamentals/documentarray/images/docarray-index-example.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 4bf86860323..0490b5e0b0a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,6 +7,8 @@ ## Install +The latest version of DocArray is {{ env.config.version }}. + ```{tip} Jina 3.x users do not need to install `docarray` separately, as it is shipped with Jina. To check your Jina version, type `jina -vf` in the console. 
``` diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index f9537bb06db..1f71fc77876 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -142,3 +142,24 @@ def test_sequence_str(docarray100): def test_docarray_list_tuple(docarray100): assert isinstance(docarray100[99, 98], DocumentArray) assert len(docarray100[99, 98]) == 2 + + +def test_path_syntax_indexing(): + da = DocumentArray().empty(3) + for d in da: + d.chunks = DocumentArray.empty(5) + d.matches = DocumentArray.empty(7) + for c in d.chunks: + c.chunks = DocumentArray.empty(3) + assert len(da['@c']) == 3 * 5 + assert len(da['@c:1']) == 3 + assert len(da['@c-1:']) == 3 + assert len(da['@c1']) == 3 + assert len(da['@c-2:']) == 3 * 2 + assert len(da['@c1:3']) == 3 * 2 + assert len(da['@c1:3c']) == (3 * 2) * 3 + assert len(da['@c1:3,c1:3c']) == (3 * 2) + (3 * 2) * 3 + assert len(da['@c 1:3 , c 1:3 c']) == (3 * 2) + (3 * 2) * 3 + assert len(da['@cc']) == 3 * 5 * 3 + assert len(da['@cc,m']) == 3 * 5 * 3 + 3 * 7 + assert len(da['@r:1cc,m']) == 1 * 5 * 3 + 3 * 7 From 868d477f14a45351559a522937a5e7c95bc61177 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 6 Jan 2022 16:29:59 +0100 Subject: [PATCH 2/6] docs(array): add doc for array serialization --- docs/fundamentals/documentarray/access-elements.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/fundamentals/documentarray/access-elements.md b/docs/fundamentals/documentarray/access-elements.md index d29ffedbb50..afbd541fc86 100644 --- a/docs/fundamentals/documentarray/access-elements.md +++ b/docs/fundamentals/documentarray/access-elements.md @@ -193,6 +193,12 @@ Last but not the least, you can use integer, or integer slice to restrict the se :width: 60% ``` +This can be useful when you want to get top matches of all matches from all Documents, e.g.: + +```python +da['@m:5'] +``` + You can add space in the path-string for a better readability. 
Alternatively, you may leverage {meth}`~docarray.array.mixins.traverse.TraverseMixin.traverse_flat` to do this more explicitly. ## Index by flatten From b9cdde4b1c941e35fc03196323a0b614b44e116f Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 6 Jan 2022 18:34:44 +0100 Subject: [PATCH 3/6] refactor(array): refactor getattr to exclude blob and embeddings --- docarray/array/document.py | 94 ++++++++- docarray/array/mixins/getattr.py | 70 +++---- docarray/array/mixins/plot.py | 9 +- docarray/document/mixins/attribute.py | 37 +--- docarray/types.py | 8 +- .../documentarray/access-attributes.md | 178 +++++++++++++----- .../documentarray/access-elements.md | 7 +- tests/unit/array/test_advance_indexing.py | 47 +++++ 8 files changed, 329 insertions(+), 121 deletions(-) diff --git a/docarray/array/document.py b/docarray/array/document.py index c779290f5b9..6425e5552a7 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -10,6 +10,8 @@ Sequence, Iterable, overload, + Any, + List, ) import numpy as np @@ -24,6 +26,8 @@ DocumentArrayIndexType, DocumentArraySingletonIndexType, DocumentArrayMultipleIndexType, + DocumentArrayMultipleAttributeType, + DocumentArraySingleAttributeType, ) @@ -109,7 +113,17 @@ def __getitem__(self, index: 'DocumentArraySingletonIndexType') -> 'Document': ... @overload - def __getitem__(self, index: 'DocumentArrayMultipleIndexType') -> 'Document': + def __getitem__(self, index: 'DocumentArrayMultipleIndexType') -> 'DocumentArray': + ... + + @overload + def __getitem__(self, index: 'DocumentArraySingleAttributeType') -> List[Any]: + ... + + @overload + def __getitem__( + self, index: 'DocumentArrayMultipleAttributeType' + ) -> List[List[Any]]: ... 
def __getitem__( @@ -127,7 +141,14 @@ def __getitem__( elif index is Ellipsis: return self.flatten() elif isinstance(index, Sequence): - if isinstance(index[0], bool): + if isinstance(index, tuple) and len(index) == 2: + _docs = self[index[0]] + _attrs = index[1] + if isinstance(_attrs, str): + _attrs = (index[1],) + + return _docs.get_attributes(*_attrs) + elif isinstance(index[0], bool): return DocumentArray(itertools.compress(self._data, index)) elif isinstance(index[0], int): return DocumentArray(self._data[t] for t in index) @@ -143,6 +164,38 @@ def __getitem__( ) raise IndexError(f'Unsupported index type {typename(index)}: {index}') + @overload + def __setitem__( + self, + index: 'DocumentArrayMultipleAttributeType', + value: List[List['Any']], + ): + ... + + @overload + def __setitem__( + self, + index: 'DocumentArraySingleAttributeType', + value: List['Any'], + ): + ... + + @overload + def __setitem__( + self, + index: 'DocumentArraySingletonIndexType', + value: 'Document', + ): + ... + + @overload + def __setitem__( + self, + index: 'DocumentArrayMultipleIndexType', + value: Sequence['Document'], + ): + ... 
+ def __setitem__( self, index: 'DocumentArrayIndexType', @@ -169,7 +222,33 @@ def __setitem__( _d._data = _v._data self._rebuild_id2offset() elif isinstance(index, Sequence): - if isinstance(index[0], bool): + if isinstance(index, tuple) and len(index) == 2: + _docs = self[index[0]] + _attrs = index[1] + + if isinstance(_attrs, str): + # a -> [a] + # [a, a] -> [a, a] + _attrs = (index[1],) + if isinstance(value, (list, tuple)) and not any( + isinstance(el, (tuple, list)) for el in value + ): + # [x] -> [[x]] + # [[x], [y]] -> [[x], [y]] + value = (value,) + if not isinstance(value, (list, tuple)): + # x -> [x] + value = (value,) + + for _a, _v in zip(_attrs, value): + if _a == 'blob': + _docs.blobs = _v + elif _a == 'embedding': + _docs.embeddings = _v + else: + for _d, _vv in zip(_docs, _v): + setattr(_d, _a, _vv) + elif isinstance(index[0], bool): if len(index) != len(self._data): raise IndexError( f'Boolean mask index is required to have the same length as {len(self._data)}, ' @@ -221,7 +300,14 @@ def __delitem__(self, index: 'DocumentArrayIndexType'): self._data.clear() self._id2offset.clear() elif isinstance(index, Sequence): - if isinstance(index[0], bool): + if isinstance(index, tuple) and len(index) == 2: + _docs = self[index[0]] + _attrs = index[1] + if isinstance(_attrs, str): + _attrs = (index[1],) + for _d in _docs: + _d.pop(*_attrs) + elif isinstance(index[0], bool): self._data = list( itertools.compress(self._data, (not _i for _i in index)) ) diff --git a/docarray/array/mixins/getattr.py b/docarray/array/mixins/getattr.py index 3e177a9d4ac..49f2cb52379 100644 --- a/docarray/array/mixins/getattr.py +++ b/docarray/array/mixins/getattr.py @@ -1,7 +1,4 @@ -from typing import Union, List, Tuple, TYPE_CHECKING - -if TYPE_CHECKING: - from ... import DocumentArray +from typing import List class GetAttributeMixin: @@ -14,34 +11,37 @@ def get_attributes(self, *fields: str) -> List: :return: Returns a list of the values for these fields. 
When `fields` has multiple values, then it returns a list of list. """ - contents = [doc.get_attributes(*fields) for doc in self] - - if len(fields) > 1: - contents = list(map(list, zip(*contents))) - - return contents - - def get_attributes_with_docs( - self, - *fields: str, - ) -> Tuple[List, 'DocumentArray']: - """Return all nonempty values of the fields together with their nonempty docs - - :param fields: Variable length argument with the name of the fields to extract - :return: Returns a tuple. The first element is a list of the values for these fields. - When `fields` has multiple values, then it returns a list of list. The second element is the non-empty docs. - """ - - contents = [] - docs_pts = [] - - for doc in self: - contents.append(doc.get_attributes(*fields)) - docs_pts.append(doc) - - if len(fields) > 1: - contents = list(map(list, zip(*contents))) - - from ... import DocumentArray - - return contents, DocumentArray(docs_pts) + e_index, b_index = None, None + fields = list(fields) + if 'embedding' in fields: + e_index = fields.index('embedding') + if 'blob' in fields: + b_index = fields.index('blob') + fields.remove('blob') + + if 'embedding' in fields: + fields.remove('embedding') + if 'blob' in fields: + fields.remove('blob') + + if fields: + contents = [doc.get_attributes(*fields) for doc in self] + if len(fields) > 1: + contents = list(map(list, zip(*contents))) + contents = [contents] + if b_index is not None: + contents.insert(b_index, self.blobs) + if e_index is not None: + contents.insert(e_index, self.embeddings) + return contents + + if b_index is not None and e_index is None: + return self.blobs + if b_index is None and e_index is not None: + return self.embeddings + if b_index is not None and e_index is not None: + return ( + [self.embeddings, self.blobs] + if b_index > e_index + else [self.blobs, self.embeddings] + ) diff --git a/docarray/array/mixins/plot.py b/docarray/array/mixins/plot.py index 2266dfa948e..c47f2fc7c65 100644 --- 
a/docarray/array/mixins/plot.py +++ b/docarray/array/mixins/plot.py @@ -55,13 +55,18 @@ def summary(self): _text = f'{_doc_text} attributes' table.add_row(_text, str(_a)) + console = Console() + all_attrs_names = tuple(sorted(all_attrs_names)) + if not all_attrs_names: + console.print(table) + return + attr_table = Table(box=box.SIMPLE, title='Attributes Summary') attr_table.add_column('Attribute') attr_table.add_column('Data type') attr_table.add_column('#Unique values') attr_table.add_column('Has empty value') - all_attrs_names = tuple(sorted(all_attrs_names)) all_attrs_values = self.get_attributes(*all_attrs_names) if len(all_attrs_names) == 1: all_attrs_values = [all_attrs_values] @@ -74,8 +79,6 @@ def summary(self): attr_table.add_row( _a_name, str(tuple(_set_type_a)), str(len(_a)), str(None in _a) ) - - console = Console() console.print(table, attr_table) def plot_embeddings( diff --git a/docarray/document/mixins/attribute.py b/docarray/document/mixins/attribute.py index 0db76a945a9..ce265c797c2 100644 --- a/docarray/document/mixins/attribute.py +++ b/docarray/document/mixins/attribute.py @@ -10,43 +10,18 @@ class GetAttributesMixin: def get_attributes(self, *fields: str) -> Union[Any, List[Any]]: """Bulk fetch Document fields and return a list of the values of these fields - .. note:: - Arguments will be extracted using `dunder_get` - .. highlight:: python - .. 
code-block:: python - - d = Document({'id': '123', 'hello': 'world', 'tags': {'id': 'external_id', 'good': 'bye'}}) - - assert d.id == '123' # true - assert d.tags['hello'] == 'world' # true - assert d.tags['good'] == 'bye' # true - assert d.tags['id'] == 'external_id' # true - - res = d.get_attrs_values(*['id', 'tags__hello', 'tags__good', 'tags__id']) - - assert res == ['123', 'world', 'bye', 'external_id'] - :param fields: the variable length values to extract from the document :return: a list with the attributes of this document ordered as the args """ ret = [] for k in fields: - try: - if '__' in k: - value = dunder_get(self, k) - else: - value = getattr(self, k) - - if value is None: - raise ValueError - - ret.append(value) - except (AttributeError, ValueError): - warnings.warn( - f'Could not get attribute `{typename(self)}.{k}`, returning `None`' - ) - ret.append(None) + if '__' in k: + value = dunder_get(self, k) + else: + value = getattr(self, k) + + ret.append(value) # unboxing if args is single if len(fields) == 1: diff --git a/docarray/types.py b/docarray/types.py index d5cf679c80e..086f955456c 100644 --- a/docarray/types.py +++ b/docarray/types.py @@ -8,6 +8,7 @@ Dict, Generator, Iterable, + Tuple, ) if TYPE_CHECKING: @@ -47,6 +48,11 @@ DocumentArrayMultipleIndexType = Union[ slice, Sequence[int], Sequence[str], Sequence[bool], Ellipsis ] + DocumentArraySingleAttributeType = Tuple[slice, str] + DocumentArrayMultipleAttributeType = Tuple[slice, Sequence[str]] DocumentArrayIndexType = Union[ - DocumentArraySingletonIndexType, DocumentArrayMultipleIndexType + DocumentArraySingletonIndexType, + DocumentArrayMultipleIndexType, + DocumentArraySingleAttributeType, + DocumentArrayMultipleAttributeType, ] diff --git a/docs/fundamentals/documentarray/access-attributes.md b/docs/fundamentals/documentarray/access-attributes.md index 9f063f6e4c1..b0592565cad 100644 --- a/docs/fundamentals/documentarray/access-attributes.md +++ 
b/docs/fundamentals/documentarray/access-attributes.md @@ -1,79 +1,165 @@ (bulk-access)= # Access Attributes -You can quickly access `.text`, `.blob`, `.buffer`, `.embedding` of all Documents in the DocumentArray without writing a for-loop. +In the last chapter, we get a taste of the powerful element selector of the DocumentArray. This chapter will continue talking about the attribute selector. -`DocumentArray` provides the plural counterparts, i.e. {attr}`~jina.types.arrays.mixins.content.ContentPropertyMixin.texts`, {attr}`~jina.types.arrays.mixins.content.ContentPropertyMixin.buffers`, {attr}`~jina.types.arrays.mixins.content.ContentPropertyMixin.blobs`, {attr}`~jina.types.arrays.mixins.content.ContentPropertyMixin.embeddings` that allows you to **get** and **set** these properties in one shot. It is much more efficient than looping. +## Attribute selector ```python -from jina import DocumentArray +da[element_selector, attribute_selector] +``` -da = DocumentArray.empty(2) -da.texts = ['hello', 'world'] +Here `element_selector` are the ones introduced {ref}`in the last chapter`. The attribute selector can be a string, or a list/tuple of string that represents the names of the attributes. + +As in element selector, one can use attribute selector to **get/set/delete** attributes in a DocumentArray. + +Let's see an example. + +```python +from docarray import DocumentArray + +da = DocumentArray().empty(3) +for d in da: + d.chunks = DocumentArray.empty(2) + d.matches = DocumentArray.empty(2) + +print(da[:, 'id']) +``` + +```text +['8d41ce5c6f0d11eca2181e008a366d49', '8d41cfa66f0d11eca2181e008a366d49', '8d41cff66f0d11eca2181e008a366d49'] +``` + +Of course you can use it with {ref}`the path-string selector`. 
+ +```python +print(da['@c', 'id']) +``` + +```text +['db60ab8a6f0d11ec99511e008a366d49', 'db60abda6f0d11ec99511e008a366d49', 'db60c12e6f0d11ec99511e008a366d49', 'db60c1886f0d11ec99511e008a366d49', 'db60c4266f0d11ec99511e008a366d49', 'db60c46c6f0d11ec99511e008a366d49'] +``` + +```python +print(da[..., 'id']) +``` + +```text +['285db6586f0e11ec99401e008a366d49', '285db6b26f0e11ec99401e008a366d49', '285dbff46f0e11ec99401e008a366d49', '285dc0586f0e11ec99401e008a366d49', '285db3606f0e11ec99401e008a366d49', '285dcc746f0e11ec99401e008a366d49', '285dccce6f0e11ec99401e008a366d49', '285dce0e6f0e11ec99401e008a366d49', '285dce5e6f0e11ec99401e008a366d49', '285db4fa6f0e11ec99401e008a366d49', '285dcf946f0e11ec99401e008a366d49', '285dcfda6f0e11ec99401e008a366d49', '285dd1066f0e11ec99401e008a366d49', '285dd16a6f0e11ec99401e008a366d49', '285db55e6f0e11ec99401e008a366d49'] +``` + +Let's set the field `mime_type` for top-level Documents. We have three top-level Documents, so: + +```python +da[:, 'mime_type'] = ['image/jpg', 'image/png', 'image/jpg'] + +da.summary() +``` + +```text + Documents Summary + + Length 3 + Homogenous Documents True + Has nested Documents in ('chunks', 'matches') + Common Attributes ('id', 'mime_type', 'chunks', 'matches') + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ──────────────────────────────────────────────────────────────── + chunks ('ChunkArray',) 3 False + id ('str',) 3 False + matches ('MatchArray',) 3 False + mime_type ('str',) 2 False +``` -print(da[0], da[1]) +We can see `mime_type` are set. One can also select multiple attributes in one shot: + +```python +da[:, ['mime_type', 'id']] ``` ```text - - +[['image/jpg', 'image/png', 'image/jpg'], ['095cd76a6f0f11ec82211e008a366d49', '095cd8d26f0f11ec82211e008a366d49', '095cd92c6f0f11ec82211e008a366d49']] ``` -When accessing `.blobs` or `.embeddings`, it automatically ravels/unravels the ndarray (can be Numpy/TensorFlow/PyTorch/SciPy/PaddlePaddle) for you. 
+Now let's remove them. + +```python +del da[:, 'mime_type'] + +da.summary() +``` + +```text + Documents Summary + + Length 3 + Homogenous Documents True + Has nested Documents in ('chunks', 'matches') + Common Attributes ('id', 'chunks', 'matches') + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ──────────────────────────────────────────────────────────────── + chunks ('ChunkArray',) 3 False + id ('str',) 3 False + matches ('MatchArray',) 3 False + +``` + +## Auto-ravel on NdArray + +Attribute selectors `blob` and `embedding` behave a bit differently. Instead of relying on Python List for input & return when get/set, they automatically ravel/unravel the NdArray-like object [^1] for you. + +[^1]: NdArray-like can be Numpy/TensorFlow/PyTorch/SciPy/PaddlePaddle sparse & dense array. + +Here is an example, ```python import numpy as np import scipy.sparse -from jina import DocumentArray +from docarray import DocumentArray -sp_embed = np.random.random([10, 256]) +# build sparse matrix +sp_embed = np.random.random([3, 10]) sp_embed[sp_embed > 0.1] = 0 -sp_embed = scipy.sparse.coo_matrix(sp_embed) - -da = DocumentArray.empty(10) +sp_embed = scipy.sparse.coo_matrix(sp_embed) -da.embeddings = scipy.sparse.coo_matrix(sp_embed) +da = DocumentArray.empty(3) -print('da.embeddings.shape=', da.embeddings.shape) +da[:, 'embedding'] = sp_embed +print('da.embeddings.shape=', da[:, 'embedding'].shape) for d in da: print('d.embedding.shape=', d.embedding.shape) ``` ```text -da.embeddings.shape= (10, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -d.embedding.shape= (1, 256) -``` - -### Bulk access to attributes - -{meth}`~jina.types.arrays.mixins.getattr.GetAttributeMixin.get_attributes` let you fetch multiple attributes from the 
`Document`s in -one shot: - -```{code-block} python ---- -emphasize-lines: 9 ---- -import numpy as np +da.embeddings.shape= (3, 10) +d.embedding.shape= (1, 10) +d.embedding.shape= (1, 10) +d.embedding.shape= (1, 10) +``` + +## Access content and embedding attributes -from jina import DocumentArray, Document +DocumentArray provides `.texts`, `.buffers`, `.blobs`, `.contents` and `.embeddings` attributes for quickly accessing the content and embedding of Documents. You can use them to get/set/delete attributes of all Documents at the top-level. -da = DocumentArray([Document(id=1, text='hello', embedding=np.array([1, 2, 3])), - Document(id=2, text='goodbye', embedding=np.array([4, 5, 6])), - Document(id=3, text='world', embedding=np.array([7, 8, 9]))]) +```python +from docarray import DocumentArray + +da = DocumentArray.empty(2) +da.texts = ['hello', 'world'] -da.get_attributes('id', 'text', 'embedding') +print(da.texts) ``` ```text -[('1', '2', '3'), ('hello', 'goodbye', 'world'), (array([1, 2, 3]), array([4, 5, 6]), array([7, 8, 9]))] -``` \ No newline at end of file +['hello', 'world'] +``` + +This is same as `da[:, 'text'] = ['hello', 'world']` and then `print(da[:, 'text'])` but more compact and probably more Pythonic. + diff --git a/docs/fundamentals/documentarray/access-elements.md b/docs/fundamentals/documentarray/access-elements.md index afbd541fc86..c802cb6734e 100644 --- a/docs/fundamentals/documentarray/access-elements.md +++ b/docs/fundamentals/documentarray/access-elements.md @@ -103,6 +103,7 @@ print(da) ``` +(path-string)= ## Index by nested structure From early chapter, we already know {ref}`Document can be nested`. DocumentArray provides very easy way to traverse over the nested structure and select Documents. All you need to do is following the syntax below: @@ -272,4 +273,8 @@ da = DocumentArray.empty(1000).sample(10) ```text -``` \ No newline at end of file +``` + +## What's next? 
+ +Now we know how to select Documents from DocumentArray, next we learn how to {ref}`select attributes from DocumentArray`. Spoiler alert, it follows the same syntax. \ No newline at end of file diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 1f71fc77876..a6106ec7bc7 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -163,3 +163,50 @@ def test_path_syntax_indexing(): assert len(da['@cc']) == 3 * 5 * 3 assert len(da['@cc,m']) == 3 * 5 * 3 + 3 * 7 assert len(da['@r:1cc,m']) == 1 * 5 * 3 + 3 * 7 + + +def test_attribute_indexing(): + da = DocumentArray.empty(10) + for v in da[:, 'id']: + assert v + da[:, 'mime_type'] = [f'type {j}' for j in range(10)] + for v in da[:, 'mime_type']: + assert v + del da[:, 'mime_type'] + for v in da[:, 'mime_type']: + assert not v + + da[:, ['text', 'mime_type']] = [ + [f'hello {j}' for j in range(10)], + [f'type {j}' for j in range(10)], + ] + da.summary() + + for v in da[:, ['mime_type', 'text']]: + for vv in v: + assert vv + + +def test_blob_attribute_selector(): + import scipy.sparse + + sp_embed = np.random.random([3, 10]) + sp_embed[sp_embed > 0.1] = 0 + sp_embed = scipy.sparse.coo_matrix(sp_embed) + + da = DocumentArray.empty(3) + + da[:, 'embedding'] = sp_embed + + assert da[:, 'embedding'].shape == (3, 10) + + for d in da: + assert d.embedding.shape == (1, 10) + + v1, v2 = da[:, ['embedding', 'id']] + assert isinstance(v1, scipy.sparse.coo_matrix) + assert isinstance(v2, list) + + v1, v2 = da[:, ['id', 'embedding']] + assert isinstance(v2, scipy.sparse.coo_matrix) + assert isinstance(v1, list) From c96001ef669328c8433f5c578e4e68d2aac09941 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 6 Jan 2022 18:38:15 +0100 Subject: [PATCH 4/6] refactor(array): refactor getattr to exclude blob and embeddings --- docs/fundamentals/documentarray/access-attributes.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 
deletions(-) diff --git a/docs/fundamentals/documentarray/access-attributes.md b/docs/fundamentals/documentarray/access-attributes.md index b0592565cad..d0d56d4cedb 100644 --- a/docs/fundamentals/documentarray/access-attributes.md +++ b/docs/fundamentals/documentarray/access-attributes.md @@ -132,16 +132,16 @@ da = DocumentArray.empty(3) da[:, 'embedding'] = sp_embed -print('da.embeddings.shape=', da[:, 'embedding'].shape) +print(type(da[:, 'embedding']), da[:, 'embedding'].shape) for d in da: - print('d.embedding.shape=', d.embedding.shape) + print(type(d.embedding), d.embedding.shape) ``` ```text -da.embeddings.shape= (3, 10) -d.embedding.shape= (1, 10) -d.embedding.shape= (1, 10) -d.embedding.shape= (1, 10) + (3, 10) + (1, 10) + (1, 10) + (1, 10) ``` ## Access content and embedding attributes From 508f9dc6787ee278f9e4e0099c6ed077b3e80f88 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 6 Jan 2022 18:40:18 +0100 Subject: [PATCH 5/6] refactor(array): refactor getattr to exclude blob and embeddings --- .../documentarray/access-attributes.md | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/fundamentals/documentarray/access-attributes.md b/docs/fundamentals/documentarray/access-attributes.md index d0d56d4cedb..d246045056c 100644 --- a/docs/fundamentals/documentarray/access-attributes.md +++ b/docs/fundamentals/documentarray/access-attributes.md @@ -144,7 +144,7 @@ for d in da: (1, 10) ``` -## Access content and embedding attributes +## Content and embedding attributes DocumentArray provides `.texts`, `.buffers`, `.blobs`, `.contents` and `.embeddings` attributes for quickly accessing the content and embedding of Documents. You can use them to get/set/delete attributes of all Documents at the top-level. @@ -163,3 +163,27 @@ print(da.texts) This is same as `da[:, 'text'] = ['hello', 'world']` and then `print(da[:, 'text'])` but more compact and probably more Pythonic. 
+Same for `.blobs` and `.embeddings`: + +```python +import numpy as np +from docarray import DocumentArray + +# build dense matrix +embed = np.random.random([3, 10]) + +da = DocumentArray.empty(3) + +da.embeddings = embed + +print(type(da.embeddings), da.embeddings.shape) +for d in da: + print(type(d.embedding), d.embedding.shape) +``` + +```text +<class 'numpy.ndarray'> (3, 10) +<class 'numpy.ndarray'> (10,) +<class 'numpy.ndarray'> (10,) +<class 'numpy.ndarray'> (10,) +``` \ No newline at end of file From 273f0986c4fc02572f5ac5422fac86c3558d2e4c Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Fri, 7 Jan 2022 00:16:49 +0100 Subject: [PATCH 6/6] refactor(array): refactor getattr to exclude blob and embeddings --- docarray/array/document.py | 18 +++++++++++++++--- docarray/array/mixins/getattr.py | 3 +++ docs/conf.py | 2 +- .../documentarray/access-attributes.md | 2 ++ docs/fundamentals/documentarray/embedding.md | 2 +- docs/fundamentals/documentarray/evaluation.md | 2 +- docs/fundamentals/documentarray/list-like.md | 3 +-- .../documentarray/parallelization.md | 2 +- docs/index.md | 4 ++-- tests/unit/document/test_docdata.py | 4 ---- 10 files changed, 27 insertions(+), 15 deletions(-) diff --git a/docarray/array/document.py b/docarray/array/document.py index 6425e5552a7..58b1abf2de4 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -141,7 +141,11 @@ def __getitem__( elif index is Ellipsis: return self.flatten() elif isinstance(index, Sequence): - if isinstance(index, tuple) and len(index) == 2: + if ( + isinstance(index, tuple) + and len(index) == 2 + and isinstance(index[0], (slice, Sequence)) + ): _docs = self[index[0]] _attrs = index[1] if isinstance(_attrs, str): @@ -222,7 +226,11 @@ def __setitem__( _d._data = _v._data self._rebuild_id2offset() elif isinstance(index, Sequence): - if isinstance(index, tuple) and len(index) == 2: + if ( + isinstance(index, tuple) + and len(index) == 2 + and isinstance(index[0], (slice, Sequence)) + ): _docs = self[index[0]] _attrs = index[1] @@ -300,7 +308,11 @@ def __delitem__(self, index:
'DocumentArrayIndexType'): self._data.clear() self._id2offset.clear() elif isinstance(index, Sequence): - if isinstance(index, tuple) and len(index) == 2: + if ( + isinstance(index, tuple) + and len(index) == 2 + and isinstance(index[0], (slice, Sequence)) + ): _docs = self[index[0]] _attrs = index[1] if isinstance(_attrs, str): diff --git a/docarray/array/mixins/getattr.py b/docarray/array/mixins/getattr.py index 49f2cb52379..ebbeafb36ad 100644 --- a/docarray/array/mixins/getattr.py +++ b/docarray/array/mixins/getattr.py @@ -28,6 +28,9 @@ def get_attributes(self, *fields: str) -> List: contents = [doc.get_attributes(*fields) for doc in self] if len(fields) > 1: contents = list(map(list, zip(*contents))) + if b_index is None and e_index is None: + return contents + contents = [contents] if b_index is not None: contents.insert(b_index, self.blobs) diff --git a/docs/conf.py b/docs/conf.py index 133946e470b..bf5834b18d5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -109,7 +109,7 @@ 'sphinx_inline_tabs', ] -myst_enable_extensions = ['colon_fence'] +myst_enable_extensions = ['colon_fence', 'substitution'] # -- Custom 404 page diff --git a/docs/fundamentals/documentarray/access-attributes.md b/docs/fundamentals/documentarray/access-attributes.md index d246045056c..543a00672f0 100644 --- a/docs/fundamentals/documentarray/access-attributes.md +++ b/docs/fundamentals/documentarray/access-attributes.md @@ -1,6 +1,8 @@ (bulk-access)= # Access Attributes +DocumentArray itself has no attributes. Accessing attributes in this context means accessing the attributes of the contained Documents in bulk. + In the last chapter, we get a taste of the powerful element selector of the DocumentArray. This chapter will continue talking about the attribute selector.
## Attribute selector diff --git a/docs/fundamentals/documentarray/embedding.md b/docs/fundamentals/documentarray/embedding.md index 536c6e9e0a8..5e2fd642bd0 100644 --- a/docs/fundamentals/documentarray/embedding.md +++ b/docs/fundamentals/documentarray/embedding.md @@ -1,5 +1,5 @@ (embed-via-model)= -# Embedding +# Embed via Deep Neural Network ```{important} diff --git a/docs/fundamentals/documentarray/evaluation.md b/docs/fundamentals/documentarray/evaluation.md index 5f5eb37b83a..ca7093f5527 100644 --- a/docs/fundamentals/documentarray/evaluation.md +++ b/docs/fundamentals/documentarray/evaluation.md @@ -1 +1 @@ -# Evaluation \ No newline at end of file +# Evaluate Matches \ No newline at end of file diff --git a/docs/fundamentals/documentarray/list-like.md b/docs/fundamentals/documentarray/list-like.md index 483b957b658..92b256baa40 100644 --- a/docs/fundamentals/documentarray/list-like.md +++ b/docs/fundamentals/documentarray/list-like.md @@ -1,6 +1,5 @@ -# Other List-like Features +# Other Handy Features -One can see `DocumentArray` as a Python list. Hence, many Python high-level iterator functions/tools can be used on `DocumentArray` as well. ## Shuffle diff --git a/docs/fundamentals/documentarray/parallelization.md b/docs/fundamentals/documentarray/parallelization.md index f9d33837499..a7c6fc80555 100644 --- a/docs/fundamentals/documentarray/parallelization.md +++ b/docs/fundamentals/documentarray/parallelization.md @@ -1,4 +1,4 @@ -# Parallelization +# Parallel Processing ```{seealso} - {meth}`~jina.types.arrays.mixins.parallel.ParallelMixin.map`: to parallel process element by element, return an interator of elements; diff --git a/docs/index.md b/docs/index.md index 0490b5e0b0a..2e698ee8535 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,8 +7,6 @@ ## Install -The latest version of DocArray is {{ env.config.version }}. - ```{tip} Jina 3.x users do not need to install `docarray` separately, as it is shipped with Jina. 
To check your Jina version, type `jina -vf` in the console. ``` @@ -53,6 +51,8 @@ Alternatively, you can first do basic installation and then install missing depe '0.1.0' ``` +The latest version of DocArray is {{ env.config.version }}. + ```{attention} If the printed version is smaller than `0.1.0`, say `0.0.x`, then you are not installing `docarray` correctly. You are probably still using an old `docarray` shipped with Jina 2.x. diff --git a/tests/unit/document/test_docdata.py b/tests/unit/document/test_docdata.py index 05d2c35bdcc..8750869f71c 100644 --- a/tests/unit/document/test_docdata.py +++ b/tests/unit/document/test_docdata.py @@ -162,9 +162,7 @@ def test_get_attr_values(): 'scores__metric__value', 'tags__c', 'tags__id', - 'tags__inexistant', 'tags__e__2__f', - 'inexistant', ] res = d.get_attributes(*required_keys) assert len(res) == len(required_keys) @@ -175,8 +173,6 @@ def test_get_attr_values(): assert res[required_keys.index('tags__c')] == 'd' assert res[required_keys.index('tags__id')] == 'identity' assert res[required_keys.index('scores__metric__value')] == 42 - assert res[required_keys.index('tags__inexistant')] is None - assert res[required_keys.index('inexistant')] is None assert res[required_keys.index('tags__e__2__f')] == 'g' required_keys_2 = ['tags', 'text']