Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
86 commits
Select commit Hold shift + click to select a range
e145948
docs: add multi modalities section
Mar 30, 2023
75202ba
fix: data types section add :
Mar 30, 2023
7e4537c
Merge branch 'feat-rewrite-v2' into docs-multi-modalities
Mar 30, 2023
5258594
docs: first draft image modality
Mar 31, 2023
5d3e6e3
fix: image display with mkdocs
Mar 31, 2023
a633b44
fix: image display with mkdocs
Mar 31, 2023
efe17ad
fix: image display with mkdocs
Mar 31, 2023
6aefe26
fix: fix second image
Mar 31, 2023
9f88f3f
fix: second image
Mar 31, 2023
81e61bb
docs: add empty sections and 3d mesh iframe
Mar 31, 2023
55996f2
fix: sections
Mar 31, 2023
ad14bb6
docs: add first draft of 3d mesh section
Mar 31, 2023
00c7d59
fix: update image display_notebook.jpg for image section
Mar 31, 2023
20575c5
fix: remove duplicate mesh display
Mar 31, 2023
f6823c7
fix: section header in mesh section
Mar 31, 2023
0214a75
docs: add first draft of audio section
Mar 31, 2023
e085b74
docs: update audio file
Mar 31, 2023
9d77146
docs: add first draft of video section
Apr 3, 2023
808fa04
docs: fix video display in video section
Apr 3, 2023
66d7495
docs: first draft table section
Apr 3, 2023
2d4b8e9
chore: add mkdocs-video
Apr 3, 2023
d0219dc
fix: move mkdocs-video from markdown-extensions to plugins section
Apr 3, 2023
4715a97
docs: add header to empty sections
Apr 3, 2023
3532c55
docs: fix video display
Apr 3, 2023
d4adf27
fix: video display
Apr 3, 2023
7ac569a
fix: video display
Apr 3, 2023
4d0ebf3
fix: video display
Apr 3, 2023
f7eae9e
fix: video display
Apr 3, 2023
713bb59
fix: video display
Apr 3, 2023
3adb013
fix: video display
Apr 3, 2023
a24649a
fix: video display
Apr 3, 2023
1743d58
fix: video display
Apr 3, 2023
29b7821
fix: use resized video
Apr 3, 2023
954598a
fix: video display
Apr 3, 2023
8d37328
fix: display video
Apr 3, 2023
1c5513b
feat: enable copy to clipboard in mkdocs for code snippets
Apr 3, 2023
f47cb0d
feat: add extra.css file to change highlight color in code blocks
Apr 4, 2023
5827dac
fix: image and other sections
Apr 4, 2023
f1aa66a
fix: apply samis suggestions from code review
Apr 4, 2023
c12a68d
fix: note with cmd instead of python field
Apr 4, 2023
0f2d153
docs: fix audio section
Apr 4, 2023
aa71e5d
docs: fix black docs
Apr 4, 2023
811b6bf
fix: audio tensor import in docarray.typing and audiodoc documentation
Apr 4, 2023
05935d8
docs: update video section
Apr 4, 2023
e2fe3bc
fix: video doc and audio docs
Apr 4, 2023
4a7a985
fix: mesh 3d section
Apr 5, 2023
15e8c83
fix: table section
Apr 5, 2023
83bd4b3
fix: remove duplicates in intro sections
Apr 5, 2023
b05820d
fix: move indexing part in video bytes to make more readable
Apr 5, 2023
6cf817a
refactor: change all DocArray to DocList
Apr 5, 2023
85d4625
fix: rebase missed dash
Apr 5, 2023
55b2eaa
fix: mypy, add type hints
Apr 5, 2023
39bbc08
docs: add emojis to headers
Apr 5, 2023
be44100
docs: text section
Apr 5, 2023
76a7fba
fix: getting started sections
Apr 5, 2023
322edf1
docs: multimodal section
Apr 5, 2023
479678b
fix: collapse output sections
Apr 5, 2023
6c0278a
fix: collapse sections
Apr 5, 2023
8e66a7f
fix: clean up data types section
Apr 5, 2023
9cee673
test: add data types section to tests
Apr 5, 2023
3ef25f3
Merge remote-tracking branch 'origin/feat-rewrite-v2' into docs-multi…
Apr 5, 2023
d487e72
fix: add books.csv to toydata
Apr 6, 2023
15dd77b
fix: move apple png to toydata dir
Apr 6, 2023
ebac011
fix: apply johannes' suggestions from code review
Apr 6, 2023
c36b111
fix: move apple.pngfix: fix docstrings for predefined docs, without t…
Apr 6, 2023
dbdd080
docs: mark missing links
Apr 6, 2023
15337cb
fix: adjust links
Apr 11, 2023
2f2fc13
fix: remove link placeholders
Apr 11, 2023
74a0c87
Merge remote-tracking branch 'origin/feat-rewrite-v2' into docs-multi…
Apr 11, 2023
0598aeb
docs: add missing links
Apr 11, 2023
8e45576
fix: clean up
Apr 11, 2023
55301db
fix: apply suggestions from code review
Apr 11, 2023
0ed8f0f
fix: apply suggestions
Apr 11, 2023
e3b1f52
Merge remote-tracking branch 'origin/feat-rewrite-v2' into docs-multi…
Apr 11, 2023
2ebd7d2
test: add csv and tsv file to toydata dir
Apr 11, 2023
fae8221
fix: docs tests
Apr 11, 2023
d42bfe3
docs: fix audio section
Apr 12, 2023
72ef7ac
Merge remote-tracking branch 'origin/feat-rewrite-v2' into docs-multi…
Apr 12, 2023
9e6ef37
fix: image section
Apr 12, 2023
675b0a3
docs: fix tests
Apr 12, 2023
ecc753b
test: adjust test_docs
Apr 12, 2023
65162ce
Merge remote-tracking branch 'origin/feat-rewrite-v2' into docs-multi…
Apr 12, 2023
cc7de45
fix: adjust paths to github files
Apr 12, 2023
5b4e575
fix: doc string test for documents
Apr 12, 2023
18be4cb
fix: swap docvec and anydocarray sections
Apr 12, 2023
a3c6332
fix: run grammarly on .md files
Apr 12, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 53 additions & 54 deletions docarray/array/any_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def _set_data_column(
field: str,
values: Union[List, T, 'AbstractTensor'],
):
"""Set all Documents in this DocList using the passed values
"""Set all Documents in this [`DocList`][docarray.typing.DocList] using the passed values

:param field: name of the field to set
:param values: the values to set at the DocList level
Expand All @@ -140,7 +140,7 @@ def to_protobuf(self) -> 'DocListProto':
...

def _to_node_protobuf(self) -> 'NodeProto':
"""Convert a DocList into a NodeProto protobuf message.
"""Convert a [`DocList`][docarray.typing.DocList] into a NodeProto protobuf message.
This function should be called when a DocList
is nested into another Document that needs to be converted into a protobuf

Expand All @@ -157,82 +157,81 @@ def traverse_flat(
) -> Union[List[Any], 'AbstractTensor']:
"""
Return a List of the accessed objects when applying the `access_path`. If this
results in a nested list or list of DocLists, the list will be flattened
results in a nested list or list of [`DocList`s][docarray.typing.DocList], the list will be flattened
on the first level. The access path is a string that consists of attribute
names, concatenated and "__"-separated. It describes the path from the first
level to an arbitrary one, e.g. 'content__image__url'.
names, concatenated and `"__"`-separated. It describes the path from the first
level to an arbitrary one, e.g. `'content__image__url'`.

:param access_path: a string that represents the access path ("__"-separated).
:param access_path: a string that represents the access path (`"__"`-separated).
:return: list of the accessed objects, flattened if nested.

EXAMPLE USAGE
.. code-block:: python
from docarray import BaseDoc, DocList, Text
```python
from docarray import BaseDoc, DocList, Text


class Author(BaseDoc):
name: str
class Author(BaseDoc):
name: str


class Book(BaseDoc):
author: Author
content: Text
class Book(BaseDoc):
author: Author
content: Text


docs = DocList[Book](
Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}'))
for i in range(10) # noqa: E501
)
docs = DocList[Book](
Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}'))
for i in range(10) # noqa: E501
)

books = docs.traverse_flat(access_path='content') # list of 10 Text objs
books = docs.traverse_flat(access_path='content') # list of 10 Text objs

authors = docs.traverse_flat(access_path='author__name') # list of 10 strings
authors = docs.traverse_flat(access_path='author__name') # list of 10 strings
```

If the resulting list is a nested list, it will be flattened:

EXAMPLE USAGE
.. code-block:: python
from docarray import BaseDoc, DocList

```python
from docarray import BaseDoc, DocList

class Chapter(BaseDoc):
content: str

class Chapter(BaseDoc):
content: str

class Book(BaseDoc):
chapters: DocList[Chapter]

class Book(BaseDoc):
chapters: DocList[Chapter]

docs = DocList[Book](
Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)]))
for _ in range(10)
)

chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings
docs = DocList[Book](
Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)]))
for _ in range(10)
)

If your DocList is in doc_vec mode, and you want to access a field of
type AnyTensor, the doc_vec tensor will be returned instead of a list:
chapters = docs.traverse_flat(access_path='chapters') # list of 30 Chapter objs
```

EXAMPLE USAGE
.. code-block:: python
class Image(BaseDoc):
tensor: TorchTensor[3, 224, 224]
If your [`DocList`][docarray.typing.DocList] is in doc_vec mode, and you want to access a field of
type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list:

```python
class Image(BaseDoc):
tensor: TorchTensor[3, 224, 224]

batch = DocList[Image](
[
Image(
tensor=torch.zeros(3, 224, 224),
)
for _ in range(2)
]
)

batch_stacked = batch.stack()
tensors = batch_stacked.traverse_flat(
access_path='tensor'
) # tensor of shape (2, 3, 224, 224)
batch = DocList[Image](
[
Image(
tensor=torch.zeros(3, 224, 224),
)
for _ in range(2)
]
)

batch_stacked = batch.stack()
tensors = batch_stacked.traverse_flat(
access_path='tensor'
) # tensor of shape (2, 3, 224, 224)
```
"""
...

Expand Down Expand Up @@ -264,7 +263,7 @@ def _flatten_one_level(sequence: List[Any]) -> List[Any]:

def summary(self):
"""
Print a summary of this DocList object and a summary of the schema of its
Print a summary of this [`DocList`][docarray.typing.DocList] object and a summary of the schema of its
Document type.
"""
DocArraySummary(self).summary()
Expand All @@ -276,13 +275,13 @@ def _batch(
show_progress: bool = False,
) -> Generator[T, None, None]:
"""
Creates a `Generator` that yields `DocList` of size `batch_size`.
Creates a `Generator` that yields [`DocList`][docarray.typing.DocList] of size `batch_size`.
Note that the last batch might be smaller than `batch_size`.

:param batch_size: Size of each generated batch.
:param shuffle: If set, shuffle the Documents before dividing into minibatches.
:param show_progress: if set, show a progress bar when batching documents.
:yield: a Generator of `DocList`, each in the length of `batch_size`
:yield: a Generator of [`DocList`][docarray.typing.DocList], each of length `batch_size`
"""
from rich.progress import track

Expand Down
76 changes: 39 additions & 37 deletions docarray/array/doc_list/doc_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,8 @@ class DocList(
homogeneous and follow the same schema. To specify this schema you can use
the `DocList[MyDocument]` syntax where MyDocument is a Document class
(i.e. schema). This creates a DocList that can only contain Documents of
the type 'MyDocument'.
the type `MyDocument`.

---

```python
from docarray import BaseDoc, DocList
Expand All @@ -86,36 +85,39 @@ class Image(BaseDoc):
docs = DocList[Image](
Image(url='http://url.com/foo.png') for _ in range(10)
) # noqa: E510
```

---

# If your DocList is homogeneous (i.e. follows the same schema), you can access
# fields at the DocList level (for example `docs.tensor` or `docs.url`).

print(docs.url)
# [ImageUrl('http://url.com/foo.png', host_type='domain'), ...]


If your DocList is homogeneous (i.e. follows the same schema), you can access
fields at the DocList level (for example `docs.tensor` or `docs.url`).
You can also set fields, with `docs.tensor = np.random.random([10, 100])`:
# You can also set fields, with `docs.tensor = np.random.random([10, 100])`:

print(docs.url)
# [ImageUrl('http://url.com/foo.png', host_type='domain'), ...]
import numpy as np
import numpy as np

docs.tensor = np.random.random([10, 100])
print(docs.tensor)
# [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037,
# 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...]
docs.tensor = np.random.random([10, 100])

You can index into a DocList like a numpy doc_list or torch tensor:
print(docs.tensor)
# [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037,
# 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...]


docs[0] # index by position
docs[0:5:2] # index by slice
docs[[0, 2, 3]] # index by list of indices
docs[True, False, True, True, ...] # index by boolean mask
# You can index into a DocList like a numpy array or torch tensor:

You can delete items from a DocList like a Python List
docs[0] # index by position
docs[0:5:2] # index by slice
docs[[0, 2, 3]] # index by list of indices
docs[True, False, True, True, ...] # index by boolean mask

del docs[0] # remove first element from DocList
del docs[0:5] # remove elements for 0 to 5 from DocList

# You can delete items from a DocList like a Python List

del docs[0] # remove first element from DocList
del docs[0:5] # remove the elements at positions 0 through 4 from DocList
```

:param docs: iterable of Document

Expand All @@ -135,10 +137,10 @@ def construct(
docs: Sequence[T_doc],
) -> T:
"""
Create a DocList without validation any data. The data must come from a
Create a `DocList` without validating any data. The data must come from a
trusted source
:param docs: a Sequence (list) of Document with the same schema
:return:
:return: a `DocList` object
"""
new_docs = cls.__new__(cls)
new_docs._data = docs if isinstance(docs, list) else list(docs)
Expand All @@ -154,13 +156,13 @@ def __eq__(self, other: Any) -> bool:

def _validate_docs(self, docs: Iterable[T_doc]) -> Iterable[T_doc]:
"""
Validate if an Iterable of Document are compatible with this DocList
Validate if an Iterable of Document are compatible with this `DocList`
"""
for doc in docs:
yield self._validate_one_doc(doc)

def _validate_one_doc(self, doc: T_doc) -> T_doc:
"""Validate if a Document is compatible with this DocList"""
"""Validate if a Document is compatible with this `DocList`"""
if not issubclass(self.doc_type, AnyDoc) and not isinstance(doc, self.doc_type):
raise ValueError(f'{doc} is not a {self.doc_type}')
return doc
Expand All @@ -178,25 +180,25 @@ def __bytes__(self) -> bytes:

def append(self, doc: T_doc):
"""
Append a Document to the DocList. The Document must be from the same class
as the doc_type of this DocList otherwise it will fail.
Append a Document to the `DocList`. The Document must be from the same class
as the `.doc_type` of this `DocList` otherwise it will fail.
:param doc: A Document
"""
self._data.append(self._validate_one_doc(doc))

def extend(self, docs: Iterable[T_doc]):
"""
Extend a DocList with an Iterable of Document. The Documents must be from
the same class as the doc_type of this DocList otherwise it will
Extend a `DocList` with an Iterable of Document. The Documents must be from
the same class as the `.doc_type` of this `DocList` otherwise it will
fail.
:param docs: Iterable of Documents
"""
self._data.extend(self._validate_docs(docs))

def insert(self, i: int, doc: T_doc):
"""
Insert a Document to the DocList. The Document must be from the same
class as the doc_type of this DocList otherwise it will fail.
Insert a Document to the `DocList`. The Document must be from the same
class as the `.doc_type` of this `DocList` otherwise it will fail.
:param i: index to insert
:param doc: A Document
"""
Expand Down Expand Up @@ -238,10 +240,10 @@ def _set_data_column(
field: str,
values: Union[List, T, 'AbstractTensor'],
):
"""Set all Documents in this DocList using the passed values
"""Set all Documents in this `DocList` using the passed values

:param field: name of the fields to set
:values: the values to set at the DocList level
:param values: the values to set at the `DocList` level
"""
...

Expand All @@ -253,11 +255,11 @@ def stack(
tensor_type: Type['AbstractTensor'] = NdArray,
) -> 'DocVec':
"""
Convert the DocList into a DocVec. `Self` cannot be used
Convert the `DocList` into a `DocVec`. `Self` cannot be used
afterwards
:param tensor_type: Tensor Class used to wrap the doc_vec tensors. This is useful
if the BaseDoc has some undefined tensor type like AnyTensor or Union of NdArray and TorchTensor
:return: A DocVec of the same document type as self
:return: A `DocVec` of the same document type as self
"""
from docarray.array.doc_vec.doc_vec import DocVec

Expand Down Expand Up @@ -291,7 +293,7 @@ def traverse_flat(
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
"""Create a `DocList` from a protobuf message
:param pb_msg: The protobuf message from where to construct the DocList
:param pb_msg: The protobuf message from where to construct the `DocList`
"""
return super().from_protobuf(pb_msg)

Expand Down
Loading