Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
ae41e6d
test(sqlite): add more tests to cover sqlite backend
hanxiao Jan 26, 2022
b208bc4
fix: adapt embedding setters for storage backends
alaeddine-13 Jan 26, 2022
863d864
test: cover embeddings setter
alaeddine-13 Jan 26, 2022
c8c0c99
fix: texts setter
alaeddine-13 Jan 26, 2022
26b5edc
fix: tensors and blob setters
alaeddine-13 Jan 26, 2022
8683901
fix: linting
alaeddine-13 Jan 26, 2022
a3ca29f
fix: embed for sqlite backend
alaeddine-13 Jan 26, 2022
aef8979
refactor: delegate to __setitem__ in content setters
alaeddine-13 Jan 26, 2022
8bae8d5
fix: linting
alaeddine-13 Jan 26, 2022
be13b62
test: cover set attributes with size 1
alaeddine-13 Jan 26, 2022
bb97b2a
fix: fix set attributes with size 1
alaeddine-13 Jan 26, 2022
caa2b9d
feat: add batching by id
davidbp Jan 27, 2022
bf0e769
feat: add batching by id
davidbp Jan 27, 2022
bccd9ab
fix: change protocol to protobuf in sqlite
davidbp Jan 27, 2022
99da09f
fix: fix setter by sequences
alaeddine-13 Jan 27, 2022
be82921
test: text type should be string not integer
alaeddine-13 Jan 27, 2022
ceebf56
test: cover ellipsis getter
alaeddine-13 Jan 27, 2022
728dc56
feat: raise index error when mask size is not equal to length
alaeddine-13 Jan 27, 2022
bfcf784
fix: setitem raise IndexError properly
alaeddine-13 Jan 27, 2022
ec6e642
test: cover mask with incorrect length
alaeddine-13 Jan 27, 2022
83c5428
test: cover setting docs by mask
alaeddine-13 Jan 27, 2022
5d71a0b
test: cover ValueError raised on wrong number of elements
alaeddine-13 Jan 27, 2022
93b377d
refactor: remove unreachable code
alaeddine-13 Jan 27, 2022
19a673e
test: fix test_single_boolean_and_padding
alaeddine-13 Jan 27, 2022
8abfca0
chore: remove unused method
alaeddine-13 Jan 28, 2022
b0c2631
fix: merge conflicts
alaeddine-13 Jan 28, 2022
14b025f
Merge branch 'main' into tests-sqlite
alaeddine-13 Jan 28, 2022
a7a8efa
refactor: refactor setitem
alaeddine-13 Jan 31, 2022
f1e07b9
test: fix tests
alaeddine-13 Jan 31, 2022
9b8bd3c
test: fix tests
alaeddine-13 Jan 31, 2022
7a8abd3
feat: handle set traversal paths
alaeddine-13 Jan 31, 2022
8712de6
test: fix tests
alaeddine-13 Jan 31, 2022
b1cad20
test: fix tests
alaeddine-13 Jan 31, 2022
883139f
test: fix tests
alaeddine-13 Jan 31, 2022
71b8102
test: fix tests
alaeddine-13 Jan 31, 2022
8b92a1f
refactor: remove _default_protocol
alaeddine-13 Jan 31, 2022
7b380f4
chore: apply suggestions
alaeddine-13 Jan 31, 2022
c45a632
fix: protobuf-array as default protocol for sqlite
alaeddine-13 Jan 31, 2022
75eecad
chore: apply suggestions
alaeddine-13 Jan 31, 2022
00f2ee5
fix: fix _set_doc_value_pairs and empty chunks after flatten
alaeddine-13 Feb 1, 2022
7be25b7
fix: use separate method _set_doc_value_pairs_nested
alaeddine-13 Feb 1, 2022
30ee9a8
fix: add flattened flag to DocumentArray after flattening
alaeddine-13 Feb 1, 2022
459b6c8
feat: do not allow setting by traversal paths with diff ID
alaeddine-13 Feb 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 6 additions & 11 deletions docarray/array/mixins/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ def embeddings(self, value: 'ArrayType'):
"""

if value is None:
for d in self:
d.embedding = None
self[:, 'embedding'] = [None] * len(self)
else:
emb_shape0 = _get_len(value)
self._check_length(emb_shape0)
Expand Down Expand Up @@ -70,8 +69,7 @@ def tensors(self, value: 'ArrayType'):
"""

if value is None:
for d in self:
d.tensor = None
self[:, 'tensor'] = [None] * len(self)
else:
tensors_shape0 = _get_len(value)
self._check_length(tensors_shape0)
Expand All @@ -96,13 +94,11 @@ def texts(self, value: Sequence[str]):
number of Documents
"""
if value is None:
for d in self:
d.text = None
self[:, 'text'] = [None] * len(self)
else:
self._check_length(len(value))

for doc, text in zip(self, value):
doc.text = text
self[:, 'text'] = value

@property
def blobs(self) -> Optional[List[bytes]]:
Expand All @@ -123,13 +119,12 @@ def blobs(self, value: List[bytes]):
"""

if value is None:
for d in self:
d.blob = None
self[:, 'blob'] = [None] * len(self)
else:
self._check_length(len(value))

for doc, blob in zip(self, value):
doc.blob = blob
self[doc.id, 'blob'] = blob

@property
def contents(self) -> Optional[Union[Sequence['DocumentContentType'], 'ArrayType']]:
Expand Down
26 changes: 14 additions & 12 deletions docarray/array/mixins/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ def _set_embeddings_keras(

device = tf.device('/GPU:0') if device == 'cuda' else tf.device('/CPU:0')
with device:
for b in self.batch(batch_size):
r = embed_model(b.tensors, training=False)
b.embeddings = r.numpy() if to_numpy else r
for b_ids in self.batch_ids(batch_size):
r = embed_model(self[b_ids, 'tensor'], training=False)
self[b_ids, 'embedding'] = r.numpy() if to_numpy else r

def _set_embeddings_torch(
self: 'T',
Expand All @@ -59,10 +59,11 @@ def _set_embeddings_torch(
is_training_before = embed_model.training
embed_model.eval()
with torch.inference_mode():
for b in self.batch(batch_size):
batch_inputs = torch.tensor(b.tensors, device=device)
for b_ids in self.batch_ids(batch_size):
batch_inputs = torch.tensor(self[b_ids, 'tensor'], device=device)
r = embed_model(batch_inputs).cpu().detach()
b.embeddings = r.numpy() if to_numpy else r
self[b_ids, 'embedding'] = r.numpy() if to_numpy else r

if is_training_before:
embed_model.train()

Expand All @@ -78,10 +79,11 @@ def _set_embeddings_paddle(
is_training_before = embed_model.training
embed_model.to(device=device)
embed_model.eval()
for b in self.batch(batch_size):
batch_inputs = paddle.to_tensor(b.tensors, place=device)
for b_ids in self.batch_ids(batch_size):
batch_inputs = paddle.to_tensor(self[b_ids, 'tensor'], place=device)
r = embed_model(batch_inputs)
b.embeddings = r.numpy() if to_numpy else r
self[b_ids, 'embedding'] = r.numpy() if to_numpy else r

if is_training_before:
embed_model.train()

Expand All @@ -103,9 +105,9 @@ def _set_embeddings_onnx(
f'Your installed `onnxruntime` supports `{support_device}`, but you give {device}'
)

for b in self.batch(batch_size):
b.embeddings = embed_model.run(
None, {embed_model.get_inputs()[0].name: b.tensors}
for b_ids in self.batch_ids(batch_size):
self[b_ids, 'embedding'] = embed_model.run(
None, {embed_model.get_inputs()[0].name: self[b_ids, 'tensor']}
)[0]


Expand Down
29 changes: 28 additions & 1 deletion docarray/array/mixins/group.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import random
from collections import defaultdict
from typing import Dict, Any, TYPE_CHECKING, Generator
from typing import Dict, Any, TYPE_CHECKING, Generator, List
from ...helper import dunder_get
import numpy as np

Expand Down Expand Up @@ -64,3 +64,30 @@ def batch(

for i in range(n_batches):
yield self[ix[i * batch_size : (i + 1) * batch_size]]

def batch_ids(
    self,
    batch_size: int,
    shuffle: bool = False,
) -> Generator[List[str], None, None]:
    """
    Creates a `Generator` that yields `lists of ids` of size `batch_size` until `self` is fully traversed.
    Note, that the last batch might be smaller than `batch_size`.

    :param batch_size: Size of each generated batch (except the last one, which might be smaller)
    :param shuffle: If set, shuffle the Documents before dividing into minibatches.
    :yield: a Generator of `list` of IDs, each in the length of `batch_size`
    :raises ValueError: if `batch_size` is not a positive integer
    """

    if not (isinstance(batch_size, int) and batch_size > 0):
        raise ValueError('`batch_size` should be a positive integer')

    # Materialize the ids as a plain list: guarantees in-place
    # `random.shuffle` works regardless of the sequence type a storage
    # backend returns, and avoids mutating backend-owned storage.
    ix = list(self[:, 'id'])

    if shuffle:
        random.shuffle(ix)

    # Integer ceiling division; avoids the float round-trip of `np.ceil`.
    n_batches = (len(ix) + batch_size - 1) // batch_size

    for i in range(n_batches):
        yield ix[i * batch_size : (i + 1) * batch_size]
233 changes: 113 additions & 120 deletions docarray/array/mixins/setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,135 +68,28 @@ def __setitem__(
self._set_doc_by_offset(int(index), value)
elif isinstance(index, str):
if index.startswith('@'):
self._set_doc_value_pairs(self.traverse_flat(index[1:]), value)
self._set_doc_value_pairs_nested(self.traverse_flat(index[1:]), value)
else:
self._set_doc_by_id(index, value)
elif isinstance(index, slice):
self._set_docs_by_slice(index, value)
elif index is Ellipsis:
self._set_doc_value_pairs(self.flatten(), value)
elif isinstance(index, Sequence):
if (
isinstance(index, tuple)
and len(index) == 2
and (
isinstance(index[0], (slice, Sequence, str, int))
or index[0] is Ellipsis
)
and isinstance(index[1], (str, Sequence))
):
# TODO: this is added because we are still trying to figure out the proper way
# to set attribute and to get test_path_syntax_indexing_set to pass.
# we may have to refactor the following logic

# NOTE: this check is not proper way to handle, but a temporary hack.
# writing it this way to minimize effect on other docarray classs and
# to make it easier to remove/refactor the following block
if self.__class__.__name__ in {
'DocumentArrayWeaviate',
'DocumentArrayInMemory',
}:
from ..memory import DocumentArrayInMemory

if index[1] in self:
# we first handle the case when second item in index is an id not attr
_docs = DocumentArrayInMemory(
self[index[0]]
) + DocumentArrayInMemory(self[index[1]])
self._set_doc_value_pairs(_docs, value)
return

_docs = self[index[0]]

if not _docs:
return

if isinstance(_docs, Document):
_docs = DocumentArrayInMemory(_docs)
# because we've augmented docs dimension, we do the same for value
value = (value,)

attrs = index[1]
if isinstance(attrs, str):
attrs = (attrs,)
# because we've augmented attrs dimension, we do the same for value
value = (value,)

for attr in attrs:
if not hasattr(_docs[0], attr):
raise ValueError(
f'`{attr}` is neither a valid id nor attribute name'
)

for _a, _v in zip(attrs, value):
self._set_docs_attrs(_docs, _a, _v)
return

if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self:
self._set_doc_value_pairs(
(self[index[0]], self[index[1]]), value
)
elif hasattr(self[index[0]], index[1]):
self._set_doc_attr_by_id(index[0], index[1], value)
else:
# to avoid accidentally add new unsupport attribute
raise ValueError(
f'`{index[1]}` is neither a valid id nor attribute name'
)
elif isinstance(index[0], (slice, Sequence)):
_attrs = index[1]

if isinstance(_attrs, str):
# a -> [a]
# [a, a] -> [a, a]
_attrs = (index[1],)
if isinstance(value, (list, tuple)) and not any(
isinstance(el, (tuple, list)) for el in value
):
# [x] -> [[x]]
# [[x], [y]] -> [[x], [y]]
value = (value,)
if not isinstance(value, (list, tuple)):
# x -> [x]
value = (value,)

_docs = self[index[0]]
for _a, _v in zip(_attrs, value):
if _a in ('tensor', 'embedding'):
if _a == 'tensor':
_docs.tensors = _v
elif _a == 'embedding':
_docs.embeddings = _v
for _d in _docs:
self._set_doc_by_id(_d.id, _d)
else:
if len(_docs) == 1:
self._set_doc_attr_by_id(_docs[0].id, _a, _v)
else:
for _d, _vv in zip(_docs, _v):
self._set_doc_attr_by_id(_d.id, _a, _vv)
if isinstance(index, tuple) and len(index) == 2:
self._set_by_pair(index[0], index[1], value)

elif isinstance(index[0], bool):
if len(index) != len(self):
raise IndexError(
f'Boolean mask index is required to have the same length as {len(self._data)}, '
f'but receiving {len(index)}'
)
_selected = itertools.compress(self, index)
self._set_doc_value_pairs(_selected, value)
self._set_by_mask(index, value)

elif isinstance(index[0], (int, str)):
if not isinstance(value, Sequence) or len(index) != len(value):
raise ValueError(
f'Number of elements for assigning must be '
f'the same as the index length: {len(index)}'
)
if isinstance(value, Document):
for si in index:
self[si] = value # leverage existing setter
else:
for si, _val in zip(index, value):
self[si] = _val # leverage existing setter
for si, _val in zip(index, value):
self[si] = _val # leverage existing setter
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Invalid types in the sequence should be caught and not fail silently; particularly things like da[1.0, 2.0] coming from np or torch might be a pitfall.

Suggested change
self[si] = _val # leverage existing setter
else:
raise IndexError(f"{index} should be either a sequence of bool, int or str")

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

single quote btw

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oups not used to it yet 😕

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

else:
raise IndexError(
f'{index} should be either a sequence of bool, int or str'
)

elif isinstance(index, np.ndarray):
index = index.squeeze()
if index.ndim == 1:
Expand All @@ -207,3 +100,103 @@ def __setitem__(
)
else:
raise IndexError(f'Unsupported index type {typename(index)}: {index}')

def _set_by_pair(self, idx1, idx2, value):
if isinstance(idx1, str) and not idx1.startswith('@'):
# second is an ID
if isinstance(idx2, str) and idx2 in self:
self._set_doc_value_pairs((self[idx1], self[idx2]), value)
# second is an attribute
elif isinstance(idx2, str) and hasattr(self[idx1], idx2):
self._set_doc_attr_by_id(idx1, idx2, value)
# second is a list of attributes:
elif (
isinstance(idx2, Sequence)
and all(isinstance(attr, str) for attr in idx2)
and all(hasattr(self[idx1], attr) for attr in idx2)
):
for attr, _v in zip(idx2, value):
self._set_doc_attr_by_id(idx1, attr, _v)
else:
raise IndexError(f'`{idx2}` is neither a valid id nor attribute name')
elif isinstance(idx1, int):
# second is an offset
if isinstance(idx2, int):
self._set_doc_value_pairs((self[idx1], self[idx2]), value)
# second is an attribute
elif isinstance(idx2, str) and hasattr(self[idx1], idx2):
self._set_doc_attr_by_offset(idx1, idx2, value)
# second is a list of attributes:
elif (
isinstance(idx2, Sequence)
and all(isinstance(attr, str) for attr in idx2)
and all(hasattr(self[idx1], attr) for attr in idx2)
):
for attr, _v in zip(idx2, value):
self._set_doc_attr_by_id(idx1, attr, _v)
else:
raise IndexError(f'`{idx2}` must be an attribute or list of attributes')

elif (
isinstance(idx1, (slice, Sequence))
or idx1 is Ellipsis
or (isinstance(idx1, str) and idx1.startswith('@'))
):
self._set_docs_attributes(idx1, idx2, value)
# TODO: else raise error

def _set_by_mask(self, mask: List[bool], value):
_selected = itertools.compress(self, mask)
self._set_doc_value_pairs(_selected, value)

def _set_docs_attributes(self, index, attributes, value):
# TODO: handle index is Ellipsis
if isinstance(attributes, str):
# a -> [a]
# [a, a] -> [a, a]
attributes = (attributes,)
value = (value,)

if isinstance(index, str) and index.startswith('@'):
self._set_docs_attributes_traversal_paths(index, attributes, value)
else:
_docs = self[index]
if not _docs:
return

for _a, _v in zip(attributes, value):
if _a in ('tensor', 'embedding'):
if _a == 'tensor':
_docs.tensors = _v
elif _a == 'embedding':
_docs.embeddings = _v
for _d in _docs:
self._set_doc_by_id(_d.id, _d)
Comment on lines +168 to +174
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if _a in ('tensor', 'embedding'):
if _a == 'tensor':
_docs.tensors = _v
elif _a == 'embedding':
_docs.embeddings = _v
for _d in _docs:
self._set_doc_by_id(_d.id, _d)
if _a == 'tensor':
_docs.tensors = _v
elif _a == 'embedding':
_docs.embeddings = _v
for _d in _docs:
self._set_doc_by_id(_d.id, _d)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for _d in _docs:
    self._set_doc_by_id(_d.id, _d)

should be common to both branches

else:
if not isinstance(_v, (list, tuple)):
for _d in _docs:
self._set_doc_attr_by_id(_d.id, _a, _v)
else:
for _d, _vv in zip(_docs, _v):
self._set_doc_attr_by_id(_d.id, _a, _vv)

def _set_docs_attributes_traversal_paths(
    self, traversal_paths: str, attributes, value
):
    """Set attributes on the documents reached via ``traversal_paths``.

    :param traversal_paths: a traversal-path index (presumably of the
        ``'@...'`` form handled by ``__getitem__`` — TODO confirm against
        the caller ``_set_docs_attributes``).
    :param attributes: sequence of attribute names to set.
    :param value: sequence of values, one entry per attribute name.
    """
    _docs = self[traversal_paths]
    if not _docs:
        # nothing matched the traversal path: no-op
        return

    for _a, _v in zip(attributes, value):
        # tensor/embedding use the bulk setters (ArrayType-aware)
        if _a == 'tensor':
            _docs.tensors = _v
        elif _a == 'embedding':
            _docs.embeddings = _v
        else:
            if not isinstance(_v, (list, tuple)):
                # scalar value: broadcast the same value to every doc
                for _d in _docs:
                    setattr(_d, _a, _v)
            else:
                # one value per traversed doc
                for _d, _vv in zip(_docs, _v):
                    setattr(_d, _a, _vv)
    # Write the mutated (possibly nested) docs back through the nested
    # setter; NOTE(review): both arguments are the same flattened view —
    # presumably required so storage backends persist nested changes.
    self._set_doc_value_pairs_nested(_docs, _docs)
Loading