Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions docarray/array/storage/memory/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
Sequence,
Optional,
TYPE_CHECKING,
Callable,
)
import functools

from ..base.backend import BaseBackendMixin
from .... import Document
Expand All @@ -17,6 +19,17 @@
)


def needs_id2offset_rebuild(func) -> Callable:
    """Decorate a mutating method so it marks the id-to-offset map as stale.

    ``self._id2offset`` has to be rebuilt after every insert or delete.
    Rather than rebuilding eagerly, the wrapped method only raises the
    ``_needs_id2offset_rebuild`` flag on the instance, so the rebuild can be
    done lazily and its result cached.

    :param func: the method to wrap; it is called with the instance as its
        first positional argument.
    :return: a wrapper that sets the flag, then delegates to ``func`` and
        returns its result.
    """

    @functools.wraps(func)
    def mark_stale_then_call(self, *args, **kwargs):
        # The flag is raised before delegating, so ``func`` itself may
        # lower it again (e.g. by rebuilding the map eagerly).
        self._needs_id2offset_rebuild = True
        outcome = func(self, *args, **kwargs)
        return outcome

    return mark_stale_then_call


class BackendMixin(BaseBackendMixin):
"""Provide necessary functions to enable this storage backend."""

Expand All @@ -26,8 +39,9 @@ def _id2offset(self) -> Dict[str, int]:

:return: a Python dict.
"""
if not hasattr(self, '_id_to_index'):
if self._needs_id2offset_rebuild:
self._rebuild_id2offset()

return self._id_to_index

def _rebuild_id2offset(self) -> None:
Expand All @@ -40,26 +54,32 @@ def _rebuild_id2offset(self) -> None:
d.id: i for i, d in enumerate(self._data)
} # type: Dict[str, int]

self._needs_id2offset_rebuild = False

@needs_id2offset_rebuild
def _init_storage(
self, _docs: Optional['DocumentArraySourceType'] = None, copy: bool = False
):
from ... import DocumentArray
from ...memory import DocumentArrayInMemory

self._data = []
self._id_to_index = {}
if _docs is None:
return
elif isinstance(
_docs, (DocumentArray, Sequence, Generator, Iterator, itertools.chain)
):
if copy:
self._data = [Document(d, copy=True) for d in _docs]
self._rebuild_id2offset()
elif isinstance(_docs, DocumentArray):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
elif isinstance(_docs, DocumentArray):
elif isinstance(_docs, DocumentArrayInMemory):

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in the last commit.

self._data = _docs._data
self._id_to_index = _docs._id2offset
else:
self._data = list(_docs)
self._rebuild_id2offset()

if isinstance(_docs, DocumentArrayInMemory):
self._id_to_index = _docs._id2offset
self._needs_id2offset_rebuild = _docs._needs_id2offset_rebuild
else:
if isinstance(_docs, Document):
if copy:
Expand Down
14 changes: 6 additions & 8 deletions docarray/array/storage/memory/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,31 @@
Iterable,
Any,
)

from ..memory.backend import needs_id2offset_rebuild
from ..base.getsetdel import BaseGetSetDelMixin
from .... import Document


class GetSetDelMixin(BaseGetSetDelMixin):
"""Implement required and derived functions that power `getitem`, `setitem`, `delitem`"""

@needs_id2offset_rebuild
def _del_docs_by_mask(self, mask: Sequence[bool]):
self._data = list(itertools.compress(self._data, (not _i for _i in mask)))
self._rebuild_id2offset()

def _del_all_docs(self):
self._data.clear()
self._id2offset.clear()

@needs_id2offset_rebuild
def _del_docs_by_slice(self, _slice: slice):
del self._data[_slice]
self._rebuild_id2offset()

def _del_doc_by_id(self, _id: str):
del self._data[self._id2offset[_id]]
self._id2offset.pop(_id)
self._del_doc_by_offset(self._id2offset[_id])

@needs_id2offset_rebuild
def _del_doc_by_offset(self, offset: int):
self._id2offset.pop(self._data[offset].id)
del self._data[offset]

def _set_doc_by_offset(self, offset: int, value: 'Document'):
Expand All @@ -41,16 +40,15 @@ def _set_doc_by_id(self, _id: str, value: 'Document'):
self._data[old_idx] = value
self._id2offset[value.id] = old_idx

@needs_id2offset_rebuild
def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
self._data[_slice] = value
self._rebuild_id2offset()

def _set_doc_value_pairs(
self, docs: Iterable['Document'], values: Iterable['Document']
):
for _d, _v in zip(docs, values):
_d._data = _v._data
self._rebuild_id2offset()

def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any):
setattr(self._data[offset], attr, value)
Expand Down
8 changes: 6 additions & 2 deletions docarray/array/storage/memory/seqlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@

from .... import Document

from ..memory.backend import needs_id2offset_rebuild


class SequenceLikeMixin(MutableSequence[Document]):
"""Implement sequence-like methods"""

@needs_id2offset_rebuild
def insert(self, index: int, value: 'Document'):
"""Insert `doc` at `index`.

:param index: Position of the insertion.
:param value: The doc needs to be inserted.
"""
self._data.insert(index, value)
self._id2offset[value.id] = index

def __eq__(self, other):
return (
Expand Down Expand Up @@ -56,5 +58,7 @@ def __add__(self, other: Union['Document', Sequence['Document']]):
return v

def extend(self, values: Iterable['Document']) -> None:
values = list(values) # consume the iterator only once
last_idx = len(self._id2offset)
self._data.extend(values)
self._rebuild_id2offset()
self._id_to_index.update({d.id: i + last_idx for i, d in enumerate(values)})
Empty file added test.py
Empty file.
42 changes: 42 additions & 0 deletions tests/unit/array/mixins/test_del.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pytest

from docarray import DocumentArray, Document


@pytest.fixture()
def docs():
    """Provide a fresh DocumentArray of nine Documents with ids '1' to '9'."""
    nine_documents = [Document(id=str(number)) for number in range(1, 10)]
    return DocumentArray(nine_documents)


@pytest.mark.parametrize(
    'to_delete',
    [
        0,
        1,
        4,
        -1,
        list(range(1, 4)),
        [2, 4, 7, 1, 1],
        slice(0, 2),
        slice(2, 4),
        slice(4, -1),
        [True, True, False],
        ...,
    ],
)
def test_del_all(docs, to_delete):
    """Deleting by any supported selector removes the selected documents.

    The ids of the selected documents are recorded before the deletion and
    compared against the ids that remain afterwards. Comparing ids keeps
    the check meaningful when the selector yields a collection rather than
    a single Document.
    """
    selected = docs[to_delete]
    # NOTE(review): assumes an integer selector yields one Document and every
    # other selector yields an iterable of Documents -- confirm against
    # DocumentArray.__getitem__.
    if isinstance(selected, Document):
        deleted_ids = [selected.id]
    else:
        deleted_ids = [doc.id for doc in selected]
    # Guard against a vacuous pass: every parametrized selector picks
    # at least one of the nine fixture documents.
    assert deleted_ids

    del docs[to_delete]

    remaining_ids = {doc.id for doc in docs}
    assert remaining_ids.isdisjoint(deleted_ids)


@pytest.mark.parametrize(
    ['deleted_ids', 'expected_ids'],
    [
        (['1', '2', '3', '4'], ['5', '6', '7', '8', '9']),
        (['2', '4', '7', '1'], ['3', '5', '6', '8', '9']),
    ],
)
def test_del_by_multiple_idx(docs, deleted_ids, expected_ids):
    """Deleting by a list of ids leaves the other ids in their original order."""
    del docs[deleted_ids]
    surviving_ids = docs[:, 'id']
    assert surviving_ids == expected_ids
6 changes: 4 additions & 2 deletions tests/unit/array/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
def test_insert(da_cls):
da = da_cls()
assert not len(da)
da.insert(0, Document(text='hello'))
da.insert(0, Document(text='world'))
da.insert(0, Document(text='hello', id="0"))
da.insert(0, Document(text='world', id="1"))
assert len(da) == 2
assert da[0].text == 'world'
assert da[1].text == 'hello'
assert da["1"].text == 'world'
assert da["0"].text == 'hello'


@pytest.mark.parametrize('da_cls', [DocumentArray])
Expand Down