Skip to content

Commit a74b3bb

Browse files
authored
refactor: rename DocArray to DocList (#1334)
* refactor: rename DocArray to DocList Signed-off-by: samsja <[email protected]> * refactor: rename DocArray to DocList Signed-off-by: samsja <[email protected]> * fix: fix CI Signed-off-by: samsja <[email protected]> * refactor: rename DocArrayStack to DocVec Signed-off-by: samsja <[email protected]> * refactor: rename DocArrayStack to DocVec Signed-off-by: samsja <[email protected]> * refactor: rename namespace stacked to doc vec Signed-off-by: samsja <[email protected]> * refactor: rename namespace stacked to doc vec Signed-off-by: samsja <[email protected]> * fix: fix ci Signed-off-by: samsja <[email protected]> * refactor: rename namespace Signed-off-by: samsja <[email protected]> * fix: fix docstring Signed-off-by: samsja <[email protected]> * refactor: rename proto Signed-off-by: samsja <[email protected]> * refactor: document_type to document Signed-off-by: samsja <[email protected]> * refactor: rename document and document_array key from proto to doc and doc_array Signed-off-by: samsja <[email protected]> * fix: add doc vec to init Signed-off-by: samsja <[email protected]> * refactor: rename da to docs Signed-off-by: samsja <[email protected]> * fix: fix jac Signed-off-by: samsja <[email protected]> * refactor: rename docstring Signed-off-by: samsja <[email protected]> * refactor: rename docstring Signed-off-by: samsja <[email protected]> * refactor: rename da to docs Signed-off-by: samsja <[email protected]> * refactor: rename da columns to docs_vec_column Signed-off-by: samsja <[email protected]> * fix: fix readme Signed-off-by: samsja <[email protected]> * fix: rename last da Signed-off-by: samsja <[email protected]> * fix: rename last da Signed-off-by: samsja <[email protected]> * fix: fix docs nested da stack Signed-off-by: samsja <[email protected]> * fix: fix benchmark Signed-off-by: samsja <[email protected]> * fix: fix docs map Signed-off-by: samsja <[email protected]> * fix: fix docs map Signed-off-by: samsja <[email protected]> --------- 
Signed-off-by: samsja <[email protected]>
1 parent 221b440 commit a74b3bb

File tree

80 files changed

+1208
-1236
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+1208
-1236
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ jobs:
6161
poetry install --without dev
6262
poetry run pip install tensorflow==2.11.0
6363
- name: Test basic import
64-
run: poetry run python -c 'from docarray import DocArray, BaseDoc'
64+
run: poetry run python -c 'from docarray import DocList, BaseDoc'
6565

6666

6767
check-mypy:

README.md

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,10 @@ doc = MultiModalDocument(
7777
)
7878
```
7979

80-
### Collect multiple `Documents` into a `DocArray`:
80+
### Collect multiple `Documents` into a `DocList`:
81+
8182
```python
82-
from docarray import DocArray, BaseDoc
83+
from docarray import DocList, BaseDoc
8384
from docarray.typing import AnyTensor, ImageUrl
8485
import numpy as np
8586

@@ -90,9 +91,9 @@ class Image(BaseDoc):
9091
```
9192

9293
```python
93-
from docarray import DocArray
94+
from docarray import DocList
9495

95-
da = DocArray[Image](
96+
da = DocList[Image](
9697
[
9798
Image(
9899
url="https://upload.wikimedia.org/wikipedia/commons/2/2f/Alpamayo.jpg",
@@ -150,16 +151,16 @@ Image.from_protobuf(doc.to_protobuf())
150151

151152
```python
152153
# NOTE: DocumentStores are not yet implemented in version 2
153-
from docarray import DocArray
154+
from docarray import DocList
154155
from docarray.documents import ImageDoc
155156
from docarray.stores import DocumentStore
156157
import numpy as np
157158

158-
da = DocArray([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)])
159+
da = DocList([ImageDoc(embedding=np.zeros((128,))) for _ in range(1000)])
159160
store = DocumentStore[ImageDoc](
160161
storage='qdrant'
161162
) # create a DocumentStore with Qdrant as backend
162-
store.insert(da) # insert the DocArray into the DocumentStore
163+
store.insert(da) # insert the DocList into the DocumentStore
163164
# find the 10 most similar images based on the 'embedding' field
164165
match = store.find(ImageDoc(embedding=np.zeros((128,))), field='embedding', top_k=10)
165166
```
@@ -233,7 +234,7 @@ Not very easy on the eyes if you ask us. And even worse, if you need to add one
233234
So, now let's see what the same code looks like with DocArray:
234235

235236
```python
236-
from docarray import DocArray, BaseDoc
237+
from docarray import DocList, BaseDoc
237238
from docarray.documents import ImageDoc, TextDoc, AudioDoc
238239
from docarray.typing import TorchTensor
239240

@@ -258,18 +259,18 @@ class MyPodcastModel(nn.Module):
258259
self.image_encoder = ImageEncoder()
259260
self.text_encoder = TextEncoder()
260261

261-
def forward_podcast(self, da: DocArray[Podcast]) -> DocArray[Podcast]:
262-
da.audio.embedding = self.audio_encoder(da.audio.tensor)
263-
da.text.embedding = self.text_encoder(da.text.tensor)
264-
da.image.embedding = self.image_encoder(da.image.tensor)
262+
def forward_podcast(self, docs: DocList[Podcast]) -> DocList[Podcast]:
263+
docs.audio.embedding = self.audio_encoder(docs.audio.tensor)
264+
docs.text.embedding = self.text_encoder(docs.text.tensor)
265+
docs.image.embedding = self.image_encoder(docs.image.tensor)
265266

266-
return da
267+
return docs
267268

268-
def forward(self, da: DocArray[PairPodcast]) -> DocArray[PairPodcast]:
269-
da.left = self.forward_podcast(da.left)
270-
da.right = self.forward_podcast(da.right)
269+
def forward(self, docs: DocList[PairPodcast]) -> DocList[PairPodcast]:
270+
docs.left = self.forward_podcast(docs.left)
271+
docs.right = self.forward_podcast(docs.right)
271272

272-
return da
273+
return docs
273274
```
274275

275276
Looks much better, doesn't it?
@@ -297,7 +298,7 @@ This would look like the following:
297298
```python
298299
from typing import Optional
299300

300-
from docarray import DocArray, BaseDoc
301+
from docarray import DocList, BaseDoc
301302

302303
import tensorflow as tf
303304

@@ -312,7 +313,7 @@ class MyPodcastModel(tf.keras.Model):
312313
super().__init__()
313314
self.audio_encoder = AudioEncoder()
314315

315-
def call(self, inputs: DocArray[Podcast]) -> DocArray[Podcast]:
316+
def call(self, inputs: DocList[Podcast]) -> DocList[Podcast]:
316317
inputs.audio_tensor.embedding = self.audio_encoder(
317318
inputs.audio_tensor.tensor
318319
) # access audio_tensor's .tensor attribute
@@ -407,7 +408,7 @@ store it there, and thus make it searchable:
407408

408409
```python
409410
# NOTE: DocumentStores are not yet implemented in version 2
410-
from docarray import DocArray, BaseDoc
411+
from docarray import DocList, BaseDoc
411412
from docarray.stores import DocumentStore
412413
from docarray.documents import ImageDoc, TextDoc
413414
import numpy as np
@@ -427,7 +428,7 @@ def _random_my_doc():
427428
)
428429

429430

430-
da = DocArray([_random_my_doc() for _ in range(1000)]) # create some data
431+
da = DocList([_random_my_doc() for _ in range(1000)]) # create some data
431432
store = DocumentStore[MyDoc](
432433
storage='qdrant'
433434
) # create a DocumentStore with Qdrant as backend

docarray/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
import logging
44

5-
from docarray.array import DocArray, DocArrayStacked
5+
from docarray.array import DocList, DocVec
66
from docarray.base_doc.doc import BaseDoc
77

8-
__all__ = ['BaseDoc', 'DocArray', 'DocArrayStacked']
8+
__all__ = ['BaseDoc', 'DocList', 'DocVec']
99

1010
logger = logging.getLogger('docarray')
1111

docarray/array/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
from docarray.array.array.array import DocArray
2-
from docarray.array.stacked.array_stacked import DocArrayStacked
1+
from docarray.array.any_array import AnyDocArray
2+
from docarray.array.doc_list.doc_list import DocList
3+
from docarray.array.doc_vec.doc_vec import DocVec
34

4-
__all__ = ['DocArray', 'DocArrayStacked']
5+
__all__ = ['DocList', 'DocVec', 'AnyDocArray']
Lines changed: 32 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from docarray.utils._internal._typing import change_cls_name
2626

2727
if TYPE_CHECKING:
28-
from docarray.proto import DocumentArrayProto, NodeProto
28+
from docarray.proto import DocListProto, NodeProto
2929
from docarray.typing.tensor.abstract_tensor import AbstractTensor
3030

3131
T = TypeVar('T', bound='AnyDocArray')
@@ -34,7 +34,7 @@
3434

3535

3636
class AnyDocArray(Sequence[T_doc], Generic[T_doc], AbstractType):
37-
document_type: Type[BaseDoc]
37+
doc_type: Type[BaseDoc]
3838
__typed_da__: Dict[Type['AnyDocArray'], Dict[Type[BaseDoc], Type]] = {}
3939

4040
def __repr__(self):
@@ -58,9 +58,9 @@ def __class_getitem__(cls, item: Union[Type[BaseDoc], TypeVar, str]):
5858
global _DocArrayTyped
5959

6060
class _DocArrayTyped(cls): # type: ignore
61-
document_type: Type[BaseDoc] = cast(Type[BaseDoc], item)
61+
doc_type: Type[BaseDoc] = cast(Type[BaseDoc], item)
6262

63-
for field in _DocArrayTyped.document_type.__fields__.keys():
63+
for field in _DocArrayTyped.doc_type.__fields__.keys():
6464

6565
def _property_generator(val: str):
6666
def _getter(self):
@@ -121,34 +121,34 @@ def _set_data_column(
121121
field: str,
122122
values: Union[List, T, 'AbstractTensor'],
123123
):
124-
"""Set all Documents in this DocArray using the passed values
124+
"""Set all Documents in this DocList using the passed values
125125
126126
:param field: name of the fields to extract
127-
:values: the values to set at the DocArray level
127+
:values: the values to set at the DocList level
128128
"""
129129
...
130130

131131
@classmethod
132132
@abstractmethod
133-
def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayProto') -> T:
133+
def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
134134
"""create a Document from a protobuf message"""
135135
...
136136

137137
@abstractmethod
138-
def to_protobuf(self) -> 'DocumentArrayProto':
139-
"""Convert DocArray into a Protobuf message"""
138+
def to_protobuf(self) -> 'DocListProto':
139+
"""Convert DocList into a Protobuf message"""
140140
...
141141

142142
def _to_node_protobuf(self) -> 'NodeProto':
143-
"""Convert a DocArray into a NodeProto protobuf message.
144-
This function should be called when a DocArray
143+
"""Convert a DocList into a NodeProto protobuf message.
144+
This function should be called when a DocList
145145
is nested into another Document that need to be converted into a protobuf
146146
147147
:return: the nested item protobuf message
148148
"""
149149
from docarray.proto import NodeProto
150150

151-
return NodeProto(document_array=self.to_protobuf())
151+
return NodeProto(doc_array=self.to_protobuf())
152152

153153
@abstractmethod
154154
def traverse_flat(
@@ -157,7 +157,7 @@ def traverse_flat(
157157
) -> Union[List[Any], 'AbstractTensor']:
158158
"""
159159
Return a List of the accessed objects when applying the `access_path`. If this
160-
results in a nested list or list of DocArrays, the list will be flattened
160+
results in a nested list or list of DocLists, the list will be flattened
161161
on the first level. The access path is a string that consists of attribute
162162
names, concatenated and "__"-separated. It describes the path from the first
163163
level to an arbitrary one, e.g. 'content__image__url'.
@@ -167,7 +167,7 @@ def traverse_flat(
167167
168168
EXAMPLE USAGE
169169
.. code-block:: python
170-
from docarray import BaseDoc, DocArray, Text
170+
from docarray import BaseDoc, DocList, Text
171171
172172
173173
class Author(BaseDoc):
@@ -179,49 +179,47 @@ class Book(BaseDoc):
179179
content: Text
180180
181181
182-
da = DocArray[Book](
182+
docs = DocList[Book](
183183
Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}'))
184184
for i in range(10) # noqa: E501
185185
)
186186
187-
books = da.traverse_flat(access_path='content') # list of 10 Text objs
187+
books = docs.traverse_flat(access_path='content') # list of 10 Text objs
188188
189-
authors = da.traverse_flat(access_path='author__name') # list of 10 strings
189+
authors = docs.traverse_flat(access_path='author__name') # list of 10 strings
190190
191191
If the resulting list is a nested list, it will be flattened:
192192
193193
EXAMPLE USAGE
194194
.. code-block:: python
195-
from docarray import BaseDoc, DocArray
195+
from docarray import BaseDoc, DocList
196196
197197
198198
class Chapter(BaseDoc):
199199
content: str
200200
201201
202202
class Book(BaseDoc):
203-
chapters: DocArray[Chapter]
203+
chapters: DocList[Chapter]
204204
205205
206-
da = DocArray[Book](
207-
Book(
208-
chapters=DocArray[Chapter]([Chapter(content='some_content') for _ in range(3)])
209-
)
206+
docs = DocList[Book](
207+
Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)]))
210208
for _ in range(10)
211209
)
212210
213-
chapters = da.traverse_flat(access_path='chapters') # list of 30 strings
211+
chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings
214212
215-
If your DocArray is in stacked mode, and you want to access a field of
216-
type AnyTensor, the stacked tensor will be returned instead of a list:
213+
If your DocList is in doc_vec mode, and you want to access a field of
214+
type AnyTensor, the doc_vec tensor will be returned instead of a list:
217215
218216
EXAMPLE USAGE
219217
.. code-block:: python
220218
class Image(BaseDoc):
221219
tensor: TorchTensor[3, 224, 224]
222220
223221
224-
batch = DocArray[Image](
222+
batch = DocList[Image](
225223
[
226224
Image(
227225
tensor=torch.zeros(3, 224, 224),
@@ -243,9 +241,9 @@ def _traverse(node: Any, access_path: str):
243241
if access_path:
244242
curr_attr, _, path_attrs = access_path.partition('__')
245243

246-
from docarray.array import DocArray
244+
from docarray.array import DocList
247245

248-
if isinstance(node, (DocArray, list)):
246+
if isinstance(node, (DocList, list)):
249247
for n in node:
250248
x = getattr(n, curr_attr)
251249
yield from AnyDocArray._traverse(x, path_attrs)
@@ -257,16 +255,16 @@ def _traverse(node: Any, access_path: str):
257255

258256
@staticmethod
259257
def _flatten_one_level(sequence: List[Any]) -> List[Any]:
260-
from docarray import DocArray
258+
from docarray import DocList
261259

262-
if len(sequence) == 0 or not isinstance(sequence[0], (list, DocArray)):
260+
if len(sequence) == 0 or not isinstance(sequence[0], (list, DocList)):
263261
return sequence
264262
else:
265263
return [item for sublist in sequence for item in sublist]
266264

267265
def summary(self):
268266
"""
269-
Print a summary of this DocArray object and a summary of the schema of its
267+
Print a summary of this DocList object and a summary of the schema of its
270268
Document type.
271269
"""
272270
DocArraySummary(self).summary()
@@ -278,13 +276,13 @@ def _batch(
278276
show_progress: bool = False,
279277
) -> Generator[T, None, None]:
280278
"""
281-
Creates a `Generator` that yields `DocArray` of size `batch_size`.
279+
Creates a `Generator` that yields `DocList` of size `batch_size`.
282280
Note, that the last batch might be smaller than `batch_size`.
283281
284282
:param batch_size: Size of each generated batch.
285283
:param shuffle: If set, shuffle the Documents before dividing into minibatches.
286284
:param show_progress: if set, show a progress bar when batching documents.
287-
:yield: a Generator of `DocArray`, each in the length of `batch_size`
285+
:yield: a Generator of `DocList`, each in the length of `batch_size`
288286
"""
289287
from rich.progress import track
290288

0 commit comments

Comments
 (0)