Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ def __new__(cls, *args, storage: str = 'memory', **kwargs) -> 'DocumentArrayLike
from .sqlite import DocumentArraySqlite

instance = super().__new__(DocumentArraySqlite)
elif storage == 'weaviate':
from .weaviate import DocumentArrayWeaviate

instance = super().__new__(DocumentArrayWeaviate)
else:
raise ValueError(f'storage=`{storage}` is not supported.')
else:
Expand Down
9 changes: 7 additions & 2 deletions docarray/array/mixins/delitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,14 @@ def __delitem__(self, index: 'DocumentArrayIndexType'):
if (
isinstance(index, tuple)
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
and (
isinstance(index[0], (slice, Sequence, str, int))
or index[0] is Ellipsis
)
and isinstance(index[1], (str, Sequence))
):
if isinstance(index[0], str) and isinstance(index[1], str):
# TODO: add support for cases such as da[1, ['text', 'id']]?
if isinstance(index[0], (str, int)) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self:
del self[index[0]]
Expand Down
16 changes: 13 additions & 3 deletions docarray/array/mixins/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,24 @@ def __getitem__(
if (
isinstance(index, tuple)
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
and (
isinstance(index[0], (slice, Sequence, str, int))
or index[0] is Ellipsis
)
and isinstance(index[1], (str, Sequence))
):
if isinstance(index[0], str) and isinstance(index[1], str):
# TODO: add support for cases such as da[1, ['text', 'id']]?
if isinstance(index[0], (str, int)) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self:
return DocumentArray([self[index[0]], self[index[1]]])
else:
return getattr(self[index[0]], index[1])
_docs = self[index[0]]
if not _docs:
return []
if isinstance(_docs, Document):
return getattr(_docs, index[1])
return _docs._get_attributes(index[1])
elif isinstance(index[0], (slice, Sequence)):
_docs = self[index[0]]
_attrs = index[1]
Expand Down
53 changes: 52 additions & 1 deletion docarray/array/mixins/setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,59 @@ def __setitem__(
if (
isinstance(index, tuple)
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
and (
isinstance(index[0], (slice, Sequence, str, int))
or index[0] is Ellipsis
)
and isinstance(index[1], (str, Sequence))
):
# TODO: this block was added while we are still figuring out the proper way
# to set attributes, and to get test_path_syntax_indexing_set to pass.
# We may have to refactor the following logic.

# NOTE: this check is not the proper way to handle this, but a temporary hack.
# It is written this way to minimize the effect on other docarray classes and
# to make it easier to remove/refactor the following block.
if self.__class__.__name__ in {
'DocumentArrayWeaviate',
'DocumentArrayInMemory',
}:
from ..memory import DocumentArrayInMemory

if index[1] in self:
# we first handle the case when second item in index is an id not attr
_docs = DocumentArrayInMemory(
self[index[0]]
) + DocumentArrayInMemory(self[index[1]])
self._set_doc_value_pairs(_docs, value)
return

_docs = self[index[0]]

if not _docs:
return

if isinstance(_docs, Document):
_docs = DocumentArrayInMemory(_docs)
# because we've augmented docs dimension, we do the same for value
value = (value,)

attrs = index[1]
if isinstance(attrs, str):
attrs = (attrs,)
# because we've augmented attrs dimension, we do the same for value
value = (value,)

for attr in attrs:
if not hasattr(_docs[0], attr):
raise ValueError(
f'`{attr}` is neither a valid id nor attribute name'
)

for _a, _v in zip(attrs, value):
self._set_docs_attrs(_docs, _a, _v)
return

if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self:
Expand Down
54 changes: 43 additions & 11 deletions docarray/array/storage/base/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,29 @@ def _get_doc_by_id(self, _id: str) -> 'Document':

def _get_docs_by_slice(self, _slice: slice) -> Iterable['Document']:
"""This function is derived from :meth:`_get_doc_by_offset`
Override this function if there is a more efficient logic

Override this function if there is a more efficient logic"""
:param _slice: the slice used for indexing
:return: an iterable of document
"""
return (self._get_doc_by_offset(o) for o in range(len(self))[_slice])

def _get_docs_by_offsets(self, offsets: Sequence[int]) -> Iterable['Document']:
"""This function is derived from :meth:`_get_doc_by_offset`
Override this function if there is a more efficient logic

Override this function if there is a more efficient logic"""
:param offsets: the offsets used for indexing
:return: an iterable of document
"""
return (self._get_doc_by_offset(o) for o in offsets)

def _get_docs_by_ids(self, ids: Sequence[str]) -> Iterable['Document']:
"""This function is derived from :meth:`_get_doc_by_id`
Override this function if there is a more efficient logic

Override this function if there is a more efficient logic"""
:param ids: the ids used for indexing
:return: an iterable of document
"""
return (self._get_doc_by_id(_id) for _id in ids)

# Delitem APIs
Expand All @@ -64,15 +73,17 @@ def _del_doc_by_id(self, _id: str):

def _del_docs_by_slice(self, _slice: slice):
"""This function is derived and may not have the most efficient implementation.

Override this function if there is a more efficient logic"""
Override this function if there is a more efficient logic
:param _slice: the slice used for indexing
"""
for j in range(len(self))[_slice]:
self._del_doc_by_offset(j)

def _del_docs_by_mask(self, mask: Sequence[bool]):
"""This function is derived and may not have the most efficient implementation.

Override this function if there is a more efficient logic"""
Override this function if there is a more efficient logic
:param mask: the boolean mask used for indexing
"""
for idx, m in enumerate(mask):
if not m:
self._del_doc_by_offset(idx)
Expand All @@ -98,6 +109,9 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
"""This function is derived and may not have the most efficient implementation.

Override this function if there is a more efficient logic
:param _slice: the slice used for indexing
:param value: the value docs will be updated to
:raises TypeError: error raised when right-hand assignment is not an iterable
"""
if not isinstance(value, Iterable):
raise TypeError(
Expand All @@ -107,17 +121,26 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
self._set_doc_by_offset(_offset, val)

def _set_doc_value_pairs(
self, docs: Iterable['Document'], values: Iterable['Document']
self, docs: Iterable['Document'], values: Sequence['Document']
):
"""This function is derived and may not have the most efficient implementation.

Override this function if there is a more efficient logic
:param docs: the docs to update
:param values: the value docs will be updated to
"""
docs = list(docs)
if len(docs) != len(values):
raise ValueError(
f'length of docs to set({len(docs)}) does not match '
f'length of values({len(values)})'
)

for _d, _v in zip(docs, values):
_d._data = _v._data

for _d in docs:
if _d not in docs:
if _d not in self:
root_d = self._find_root_doc(_d)
else:
# _d is already on the root-level
Expand All @@ -130,6 +153,9 @@ def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any):
"""This function is derived and may not have the most efficient implementation.

Override this function if there is a more efficient logic
:param offset: the offset used for indexing
:param attr: the attribute of document to update
:param value: the value doc's attr will be updated to
"""
d = self._get_doc_by_offset(offset)
if hasattr(d, attr):
Expand All @@ -140,14 +166,20 @@ def _set_doc_attr_by_id(self, _id: str, attr: str, value: Any):
"""This function is derived and may not have the most efficient implementation.

Override this function if there is a more efficient logic
:param _id: the id used for indexing
:param attr: the attribute of document to update
:param value: the value doc's attr will be updated to
"""
d = self._get_doc_by_id(_id)
if hasattr(d, attr):
setattr(d, attr, value)
self._set_doc_by_id(d.id, d)

def _find_root_doc(self, d: Document):
"""Find `d`'s root Document in an exhaustive manner"""
def _find_root_doc(self, d: Document) -> 'Document':
"""Find `d`'s root Document in an exhaustive manner
:param d: the input document
:return: the root of the input document
"""
from docarray import DocumentArray

for _d in self:
Expand Down
22 changes: 20 additions & 2 deletions docarray/array/storage/memory/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
)

from ..base.getsetdel import BaseGetSetDelMixin
from .... import Document
from .... import Document, DocumentArray


class GetSetDelMixin(BaseGetSetDelMixin):
Expand Down Expand Up @@ -46,8 +46,15 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
self._rebuild_id2offset()

def _set_doc_value_pairs(
self, docs: Iterable['Document'], values: Iterable['Document']
self, docs: Iterable['Document'], values: Sequence['Document']
):
docs = list(docs)
if len(docs) != len(values):
raise ValueError(
f'length of docs to set({len(docs)}) does not match '
f'length of values({len(values)})'
)

for _d, _v in zip(docs, values):
_d._data = _v._data
self._rebuild_id2offset()
Expand All @@ -58,6 +65,17 @@ def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any):
def _set_doc_attr_by_id(self, _id: str, attr: str, value: Any):
setattr(self._data[self._id2offset[_id]], attr, value)

def _set_docs_attrs(self, docs: 'DocumentArray', attr: str, values: Iterable[Any]):
# TODO: remove this function to use _set_doc_attr_by_id once
# we find a way to do
if attr == 'embedding':
docs.embeddings = values
elif attr == 'tensor':
docs.tensors = values
else:
for _d, _v in zip(docs, values):
setattr(_d, attr, _v)

def _get_doc_by_offset(self, offset: int) -> 'Document':
return self._data[offset]

Expand Down
9 changes: 8 additions & 1 deletion docarray/array/storage/sqlite/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,14 @@ def _del_docs_by_mask(self, mask: Sequence[bool]):
self._commit()

def _set_doc_value_pairs(
self, docs: Iterable['Document'], values: Iterable['Document']
self, docs: Iterable['Document'], values: Sequence['Document']
):
docs = list(docs)
if len(docs) != len(values):
raise ValueError(
f'length of docs to set({len(docs)}) does not match '
f'length of values({len(values)})'
)

for _d, _v in zip(docs, values):
self._set_doc_by_id(_d.id, _v)
5 changes: 0 additions & 5 deletions docarray/array/storage/sqlite/seqlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,3 @@ def __eq__(self, other):
and type(self._config) is type(other._config)
and self._config == other._config
)

def __add__(self, other: Union['Document', Sequence['Document']]):
v = type(self)(self, storage='sqlite')
v.extend(other)
return v
9 changes: 9 additions & 0 deletions docarray/array/storage/weaviate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .backend import BackendMixin
from .getsetdel import GetSetDelMixin
from .seqlike import SequenceLikeMixin

__all__ = ['StorageMixins']


class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin):
    """Bundle the backend, get/set/del and sequence-like mixins of this package.

    The class adds no members of its own; it only combines its three bases.
    """

    ...
Loading