Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
f2c407f
refactor: rename DocArray to DocList
samsja Apr 3, 2023
d33e295
refactor: rename DocArray to DocList
samsja Apr 3, 2023
60a3050
fix: fix Ci
samsja Apr 3, 2023
a8b1087
refactor: rename DocArrayStack to DocVec
samsja Apr 3, 2023
f5b471a
refactor: rename DocArrayStack to DocVec
samsja Apr 3, 2023
6dd9265
refactor: rename namespace stacked to doc vec
samsja Apr 3, 2023
cccbc97
refactor: rename namespace stacked to doc vec
samsja Apr 3, 2023
8369957
fix: fix ci
samsja Apr 3, 2023
446a8f2
refactor: rename namespace
samsja Apr 3, 2023
fd159ab
fix: fix docstring
samsja Apr 3, 2023
d344b30
refactor: rename proto
samsja Apr 3, 2023
4a4fd1c
refactor: document_type to document
samsja Apr 3, 2023
cd11722
refactor: rename document and document_array key from proto to doc an…
samsja Apr 3, 2023
2af12d7
fix: add doc vec to init
samsja Apr 3, 2023
61a793f
refactor: rename da to docs
samsja Apr 4, 2023
99d9f5b
fix: fix jac
samsja Apr 4, 2023
3f983ef
refactor: rename docstring
samsja Apr 4, 2023
3a156f1
refactor: rename docstring
samsja Apr 4, 2023
9659e38
refactor: rename da to docs
samsja Apr 4, 2023
4e3a31b
refactor: rename da columns to docs_vec_column
samsja Apr 4, 2023
9b1a9a7
fix: fix readme
samsja Apr 4, 2023
a4cda60
fix: rename last da
samsja Apr 4, 2023
6b8f30d
fix: rename last da
samsja Apr 4, 2023
4c2d286
fix: fix docs nested da stack
samsja Apr 4, 2023
1852b70
fix: fix benchmark
samsja Apr 4, 2023
408736c
fix: fix docs map
samsja Apr 4, 2023
cbc1598
fix: fix docs map
samsja Apr 4, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
poetry install --without dev
poetry run pip install tensorflow==2.11.0
- name: Test basic import
run: poetry run python -c 'from docarray import DocArray, BaseDoc'
run: poetry run python -c 'from docarray import DocList, BaseDoc'


check-mypy:
Expand Down
43 changes: 22 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,10 @@ doc = MultiModalDocument(
)
```

### Collect multiple `Documents` into a `DocArray`:
### Collect multiple `Documents` into a `DocList`:

```python
from docarray import DocArray, BaseDoc
from docarray import DocList, BaseDoc
from docarray.typing import AnyTensor, ImageUrl
import numpy as np

Expand All @@ -90,9 +91,9 @@ class Image(BaseDoc):
```

```python
from docarray import DocArray
from docarray import DocList

da = DocArray[Image](
da = DocList[Image](
[
Image(
url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg",
Expand Down Expand Up @@ -150,16 +151,16 @@ Image.from_protobuf(doc.to_protobuf())

```python
# NOTE: DocumentStores are not yet implemented in version 2
from docarray import DocArray
from docarray import DocList
from docarray.documents import ImageDoc
from docarray.stores import DocumentStore
import numpy as np

da = DocArray([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)])
da = DocList([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)])
store = DocumentStore[ImageDoc](
storage='qdrant'
) # create a DocumentStore with Qdrant as backend
store.insert(da) # insert the DocArray into the DocumentStore
store.insert(da) # insert the DocList into the DocumentStore
# find the 10 most similar images based on the 'embedding' field
match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10)
```
Expand Down Expand Up @@ -233,7 +234,7 @@ Not very easy on the eyes if you ask us. And even worse, if you need to add one
So, now let's see what the same code looks like with DocArray:

```python
from docarray import DocArray, BaseDoc
from docarray import DocList, BaseDoc
from docarray.documents import ImageDoc, TextDoc, AudioDoc
from docarray.typing import TorchTensor

Expand All @@ -258,18 +259,18 @@ class MyPodcastModel(nn.Module):
self.image_encoder = ImageEncoder()
self.text_encoder = TextEncoder()

def forward_podcast(self, da: DocArray[Podcast]) -> DocArray[Podcast]:
da.audio.embedding = self.audio_encoder(da.audio.tensor)
da.text.embedding = self.text_encoder(da.text.tensor)
da.image.embedding = self.image_encoder(da.image.tensor)
def forward_podcast(self, docs: DocList[Podcast]) -> DocList[Podcast]:
docs.audio.embedding = self.audio_encoder(docs.audio.tensor)
docs.text.embedding = self.text_encoder(docs.text.tensor)
docs.image.embedding = self.image_encoder(docs.image.tensor)

return da
return docs

def forward(self, da: DocArray[PairPodcast]) -> DocArray[PairPodcast]:
da.left = self.forward_podcast(da.left)
da.right = self.forward_podcast(da.right)
def forward(self, docs: DocList[PairPodcast]) -> DocList[PairPodcast]:
docs.left = self.forward_podcast(docs.left)
docs.right = self.forward_podcast(docs.right)

return da
return docs
```

Looks much better, doesn't it?
Expand Down Expand Up @@ -297,7 +298,7 @@ This would look like the following:
```python
from typing import Optional

from docarray import DocArray, BaseDoc
from docarray import DocList, BaseDoc

import tensorflow as tf

Expand All @@ -312,7 +313,7 @@ class MyPodcastModel(tf.keras.Model):
super().__init__()
self.audio_encoder = AudioEncoder()

def call(self, inputs: DocArray[Podcast]) -> DocArray[Podcast]:
def call(self, inputs: DocList[Podcast]) -> DocList[Podcast]:
inputs.audio_tensor.embedding = self.audio_encoder(
inputs.audio_tensor.tensor
) # access audio_tensor's .tensor attribute
Expand Down Expand Up @@ -407,7 +408,7 @@ store it there, and thus make it searchable:

```python
# NOTE: DocumentStores are not yet implemented in version 2
from docarray import DocArray, BaseDoc
from docarray import DocList, BaseDoc
from docarray.stores import DocumentStore
from docarray.documents import ImageDoc, TextDoc
import numpy as np
Expand All @@ -427,7 +428,7 @@ def _random_my_doc():
)


da = DocArray([_random_my_doc() for _ in range(1000)]) # create some data
da = DocList([_random_my_doc() for _ in range(1000)]) # create some data
store = DocumentStore[MyDoc](
storage='qdrant'
) # create a DocumentStore with Qdrant as backend
Expand Down
4 changes: 2 additions & 2 deletions docarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

import logging

from docarray.array import DocArray, DocArrayStacked
from docarray.array import DocList, DocVec
from docarray.base_doc.doc import BaseDoc

__all__ = ['BaseDoc', 'DocArray', 'DocArrayStacked']
__all__ = ['BaseDoc', 'DocList', 'DocVec']

logger = logging.getLogger('docarray')

Expand Down
7 changes: 4 additions & 3 deletions docarray/array/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from docarray.array.array.array import DocArray
from docarray.array.stacked.array_stacked import DocArrayStacked
from docarray.array.any_array import AnyDocArray
from docarray.array.doc_list.doc_list import DocList
from docarray.array.doc_vec.doc_vec import DocVec

__all__ = ['DocArray', 'DocArrayStacked']
__all__ = ['DocList', 'DocVec', 'AnyDocArray']
66 changes: 32 additions & 34 deletions docarray/array/abstract_array.py → docarray/array/any_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from docarray.utils._internal._typing import change_cls_name

if TYPE_CHECKING:
from docarray.proto import DocumentArrayProto, NodeProto
from docarray.proto import DocListProto, NodeProto
from docarray.typing.tensor.abstract_tensor import AbstractTensor

T = TypeVar('T', bound='AnyDocArray')
Expand All @@ -34,7 +34,7 @@


class AnyDocArray(Sequence[T_doc], Generic[T_doc], AbstractType):
document_type: Type[BaseDoc]
doc_type: Type[BaseDoc]
__typed_da__: Dict[Type['AnyDocArray'], Dict[Type[BaseDoc], Type]] = {}

def __repr__(self):
Expand All @@ -58,9 +58,9 @@ def __class_getitem__(cls, item: Union[Type[BaseDoc], TypeVar, str]):
global _DocArrayTyped

class _DocArrayTyped(cls): # type: ignore
document_type: Type[BaseDoc] = cast(Type[BaseDoc], item)
doc_type: Type[BaseDoc] = cast(Type[BaseDoc], item)

for field in _DocArrayTyped.document_type.__fields__.keys():
for field in _DocArrayTyped.doc_type.__fields__.keys():

def _property_generator(val: str):
def _getter(self):
Expand Down Expand Up @@ -121,34 +121,34 @@ def _set_data_column(
field: str,
values: Union[List, T, 'AbstractTensor'],
):
"""Set all Documents in this DocArray using the passed values
"""Set all Documents in this DocList using the passed values

:param field: name of the fields to extract
:values: the values to set at the DocArray level
:param values: the values to set at the DocList level
"""
...

@classmethod
@abstractmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T:
def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
"""Create a DocList from a protobuf message"""
...

@abstractmethod
def to_protobuf(self) -> 'DocumentArrayProto':
"""Convert DocArray into a Protobuf message"""
def to_protobuf(self) -> 'DocListProto':
"""Convert DocList into a Protobuf message"""
...

def _to_node_protobuf(self) -> 'NodeProto':
"""Convert a DocArray into a NodeProto protobuf message.
This function should be called when a DocArray
"""Convert a DocList into a NodeProto protobuf message.
This function should be called when a DocList
is nested into another Document that needs to be converted into a protobuf

:return: the nested item protobuf message
"""
from docarray.proto import NodeProto

return NodeProto(document_array=self.to_protobuf())
return NodeProto(doc_array=self.to_protobuf())

@abstractmethod
def traverse_flat(
Expand All @@ -157,7 +157,7 @@ def traverse_flat(
) -> Union[List[Any], 'AbstractTensor']:
"""
Return a List of the accessed objects when applying the `access_path`. If this
results in a nested list or list of DocArrays, the list will be flattened
results in a nested list or list of DocLists, the list will be flattened
on the first level. The access path is a string that consists of attribute
names, concatenated and "__"-separated. It describes the path from the first
level to an arbitrary one, e.g. 'content__image__url'.
Expand All @@ -167,7 +167,7 @@ def traverse_flat(

EXAMPLE USAGE
.. code-block:: python
from docarray import BaseDoc, DocArray, Text
from docarray import BaseDoc, DocList, Text


class Author(BaseDoc):
Expand All @@ -179,49 +179,47 @@ class Book(BaseDoc):
content: Text


da = DocArray[Book](
docs = DocList[Book](
Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}'))
for i in range(10) # noqa: E501
)

books = da.traverse_flat(access_path='content') # list of 10 Text objs
books = docs.traverse_flat(access_path='content') # list of 10 Text objs

authors = da.traverse_flat(access_path='author__name') # list of 10 strings
authors = docs.traverse_flat(access_path='author__name') # list of 10 strings

If the resulting list is a nested list, it will be flattened:

EXAMPLE USAGE
.. code-block:: python
from docarray import BaseDoc, DocArray
from docarray import BaseDoc, DocList


class Chapter(BaseDoc):
content: str


class Book(BaseDoc):
chapters: DocArray[Chapter]
chapters: DocList[Chapter]


da = DocArray[Book](
Book(
chapters=DocArray[Chapter]([Chapter(content='some_content') for _ in range(3)])
)
docs = DocList[Book](
Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)]))
for _ in range(10)
)

chapters = da.traverse_flat(access_path='chapters') # list of 30 strings
chapters = docs.traverse_flat(access_path='chapters') # list of 30 Chapter objects

If your DocArray is in stacked mode, and you want to access a field of
type AnyTensor, the stacked tensor will be returned instead of a list:
If your array is a DocVec and you want to access a field of
type AnyTensor, the stacked tensor will be returned instead of a list:

EXAMPLE USAGE
.. code-block:: python
class Image(BaseDoc):
tensor: TorchTensor[3, 224, 224]


batch = DocArray[Image](
batch = DocList[Image](
[
Image(
tensor=torch.zeros(3, 224, 224),
Expand All @@ -243,9 +241,9 @@ def _traverse(node: Any, access_path: str):
if access_path:
curr_attr, _, path_attrs = access_path.partition('__')

from docarray.array import DocArray
from docarray.array import DocList

if isinstance(node, (DocArray, list)):
if isinstance(node, (DocList, list)):
for n in node:
x = getattr(n, curr_attr)
yield from AnyDocArray._traverse(x, path_attrs)
Expand All @@ -257,16 +255,16 @@ def _traverse(node: Any, access_path: str):

@staticmethod
def _flatten_one_level(sequence: List[Any]) -> List[Any]:
from docarray import DocArray
from docarray import DocList

if len(sequence) == 0 or not isinstance(sequence[0], (list, DocArray)):
if len(sequence) == 0 or not isinstance(sequence[0], (list, DocList)):
return sequence
else:
return [item for sublist in sequence for item in sublist]

def summary(self):
"""
Print a summary of this DocArray object and a summary of the schema of its
Print a summary of this DocList object and a summary of the schema of its
Document type.
"""
DocArraySummary(self).summary()
Expand All @@ -278,13 +276,13 @@ def _batch(
show_progress: bool = False,
) -> Generator[T, None, None]:
"""
Creates a `Generator` that yields `DocArray` of size `batch_size`.
Creates a `Generator` that yields `DocList` of size `batch_size`.
Note that the last batch might be smaller than `batch_size`.

:param batch_size: Size of each generated batch.
:param shuffle: If set, shuffle the Documents before dividing into minibatches.
:param show_progress: if set, show a progress bar when batching documents.
:yield: a Generator of `DocArray`, each in the length of `batch_size`
:yield: a Generator of `DocList`, each of length `batch_size` (the last one may be shorter)
"""
from rich.progress import track

Expand Down
File renamed without changes.
Loading