From ee3338aa4c2d79d0a6af99a3dc9f4864ed2a04ff Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 26 Jan 2022 11:46:09 +0100 Subject: [PATCH 01/15] test(del): add test to cover deleting doc --- test.py | 0 tests/unit/array/mixins/test_del.py | 42 +++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 test.py create mode 100644 tests/unit/array/mixins/test_del.py diff --git a/test.py b/test.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unit/array/mixins/test_del.py b/tests/unit/array/mixins/test_del.py new file mode 100644 index 00000000000..654bc60b2c2 --- /dev/null +++ b/tests/unit/array/mixins/test_del.py @@ -0,0 +1,42 @@ +import pytest + +from docarray import DocumentArray, Document + + +@pytest.fixture() +def docs(): + return DocumentArray([Document(id=f"{i}") for i in range(1, 10)]) + + +@pytest.mark.parametrize( + "to_delete", + [ + 0, + 1, + 4, + -1, + list(range(1, 4)), + [2, 4, 7, 1, 1], + slice(0, 2), + slice(2, 4), + slice(4, -1), + [True, True, False], + ..., + ], +) +def test_del_all(docs, to_delete): + doc_to_delete = docs[to_delete] + del docs[to_delete] + assert doc_to_delete not in docs + + +@pytest.mark.parametrize( + ["deleted_ids", "expected_ids"], + [ + (["1", "2", "3", "4"], ["5", "6", "7", "8", "9"]), + (["2", "4", "7", "1"], ["3", "5", "6", "8", "9"]), + ], +) +def test_del_by_multiple_idx(docs, deleted_ids, expected_ids): + del docs[deleted_ids] + assert docs[:, "id"] == expected_ids From 34807b4c93cc59ca9f8635a221f7c791218cc34f Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 26 Jan 2022 11:51:38 +0100 Subject: [PATCH 02/15] fix(del): deleting by ids in DA no longer bugs --- docarray/array/storage/memory/getsetdel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/memory/getsetdel.py b/docarray/array/storage/memory/getsetdel.py index 8ea62471cce..a48b2fbc163 100644 --- a/docarray/array/storage/memory/getsetdel.py +++ b/docarray/array/storage/memory/getsetdel.py @@ -26,7 +26,7 @@ def _del_docs_by_slice(self, _slice: slice): def _del_doc_by_id(self, _id: str): del self._data[self._id2offset[_id]] - self._id2offset.pop(_id) + self._rebuild_id2offset() def _del_doc_by_offset(self, offset: int): self._id2offset.pop(self._data[offset].id) From 6aab5fc97fd442d11a8e0fa48dc9833fea8e8f30 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 26 Jan 2022 14:54:48 +0100 Subject: [PATCH 03/15] test(seq): add tests for acces by id after insert --- tests/unit/array/test_sequence.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/unit/array/test_sequence.py b/tests/unit/array/test_sequence.py index 6ba6679f936..d846161c2e8 100644 --- a/tests/unit/array/test_sequence.py +++ b/tests/unit/array/test_sequence.py @@ -7,11 +7,13 @@ def test_insert(da_cls): da = da_cls() assert not len(da) - da.insert(0, Document(text='hello')) - da.insert(0, Document(text='world')) + da.insert(0, Document(text='hello', id="0")) + da.insert(0, Document(text='world', id="1")) assert len(da) == 2 assert da[0].text == 'world' assert da[1].text == 'hello' + assert da["1"].text == 'world' + assert da["0"].text == 'hello' @pytest.mark.parametrize('da_cls', [DocumentArray]) From 2728244f03c73ee6b1f84428ea7db8d33ea6b20b Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 26 Jan 2022 14:55:54 +0100 Subject: [PATCH 04/15] fix(seq): rebuild index after insert --- docarray/array/storage/memory/seqlike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/memory/seqlike.py b/docarray/array/storage/memory/seqlike.py index 0121780fadf..9f1284d83af 100644 --- a/docarray/array/storage/memory/seqlike.py +++ b/docarray/array/storage/memory/seqlike.py @@ -13,7 +13,7 @@ def insert(self, index: int, value: 'Document'): :param value: The doc needs to be inserted. """ self._data.insert(index, value) - self._id2offset[value.id] = index + self._rebuild_id2offset() def __eq__(self, other): return ( From 6bd5542963ddca6bbcb222aa12be9036fb6e4df4 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 26 Jan 2022 15:36:35 +0100 Subject: [PATCH 05/15] fix(del): del by id call del by offset --- docarray/array/storage/memory/getsetdel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docarray/array/storage/memory/getsetdel.py b/docarray/array/storage/memory/getsetdel.py index a48b2fbc163..8126ba40022 100644 --- a/docarray/array/storage/memory/getsetdel.py +++ b/docarray/array/storage/memory/getsetdel.py @@ -25,12 +25,11 @@ def _del_docs_by_slice(self, _slice: slice): self._rebuild_id2offset() def _del_doc_by_id(self, _id: str): - del self._data[self._id2offset[_id]] - self._rebuild_id2offset() + self._del_doc_by_offset(self._id2offset[_id]) def _del_doc_by_offset(self, offset: int): - self._id2offset.pop(self._data[offset].id) del self._data[offset] + self._rebuild_id2offset() def _set_doc_by_offset(self, offset: int, value: 'Document'): self._data[offset] = value From 40242f243dcb084acd323816a5928002a2bbbad0 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Wed, 26 Jan 2022 16:47:31 +0100 Subject: [PATCH 06/15] fix(del): improve efficiency by rebuilding id2offset lazily when needed --- docarray/array/storage/memory/backend.py | 13 +++++++++---- docarray/array/storage/memory/getsetdel.py | 10 +++++----- docarray/array/storage/memory/seqlike.py | 4 ++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/docarray/array/storage/memory/backend.py b/docarray/array/storage/memory/backend.py index ea596702c85..dca0f333692 100644 --- a/docarray/array/storage/memory/backend.py +++ b/docarray/array/storage/memory/backend.py @@ -26,8 +26,9 @@ def _id2offset(self) -> Dict[str, int]: :return: a Python dict. """ - if not hasattr(self, '_id_to_index'): + if self._needs_id2offset_rebuild: self._rebuild_id2offset() + return self._id_to_index def _rebuild_id2offset(self) -> None: @@ -40,11 +41,17 @@ def _rebuild_id2offset(self) -> None: d.id: i for i, d in enumerate(self._data) } # type: Dict[str, int] + self._needs_id2offset_rebuild = False + def _init_storage( - self, _docs: Optional['DocumentArraySourceType'] = None, copy: bool = False + self, _docs: Optional["DocumentArraySourceType"] = None, copy: bool = False ): from ... import DocumentArray + self._needs_id2offset_rebuild = True + # self._id2offset needs to be rebuilt after every insert or delete + # this flag allows to do it lazily and cache the result + self._data = [] if _docs is None: return @@ -53,13 +60,11 @@ def _init_storage( ): if copy: self._data = [Document(d, copy=True) for d in _docs] - self._rebuild_id2offset() elif isinstance(_docs, DocumentArray): self._data = _docs._data self._id_to_index = _docs._id2offset else: self._data = list(_docs) - self._rebuild_id2offset() else: if isinstance(_docs, Document): if copy: diff --git a/docarray/array/storage/memory/getsetdel.py b/docarray/array/storage/memory/getsetdel.py index 8126ba40022..175c9c222a6 100644 --- a/docarray/array/storage/memory/getsetdel.py +++ b/docarray/array/storage/memory/getsetdel.py @@ -14,7 +14,7 @@ class GetSetDelMixin(BaseGetSetDelMixin): def _del_docs_by_mask(self, mask: Sequence[bool]): self._data = list(itertools.compress(self._data, (not _i for _i in mask))) - self._rebuild_id2offset() + self._needs_id2offset_rebuild = True def _del_all_docs(self): self._data.clear() @@ -22,14 +22,14 @@ def _del_all_docs(self): def _del_docs_by_slice(self, _slice: slice): del self._data[_slice] - self._rebuild_id2offset() + self._needs_id2offset_rebuild = True def _del_doc_by_id(self, _id: str): self._del_doc_by_offset(self._id2offset[_id]) def _del_doc_by_offset(self, offset: int): del self._data[offset] - self._rebuild_id2offset() + self._needs_id2offset_rebuild = True def _set_doc_by_offset(self, offset: int, value: 'Document'): self._data[offset] = value @@ -42,14 +42,14 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']): self._data[_slice] = value - self._rebuild_id2offset() + self._needs_id2offset_rebuild = True def _set_doc_value_pairs( self, docs: Iterable['Document'], values: Iterable['Document'] ): for _d, _v in zip(docs, values): _d._data = _v._data - self._rebuild_id2offset() + self._needs_id2offset_rebuild = True def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any): setattr(self._data[offset], attr, value) diff --git a/docarray/array/storage/memory/seqlike.py b/docarray/array/storage/memory/seqlike.py index 9f1284d83af..0bf694af48a 100644 --- a/docarray/array/storage/memory/seqlike.py +++ b/docarray/array/storage/memory/seqlike.py @@ -13,7 +13,7 @@ def insert(self, index: int, value: 'Document'): :param value: The doc needs to be inserted. """ self._data.insert(index, value) - self._rebuild_id2offset() + self._needs_id2offset_rebuild = True def __eq__(self, other): return ( @@ -57,4 +57,4 @@ def __add__(self, other: Union['Document', Sequence['Document']]): def extend(self, values: Iterable['Document']) -> None: self._data.extend(values) - self._rebuild_id2offset() + self._needs_id2offset_rebuild = True From fd744f470c0caf3ef5fd5afd81be862fde883cb5 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 27 Jan 2022 08:59:10 +0100 Subject: [PATCH 07/15] fix(del): double quotes ==> simple quote --- docarray/array/storage/memory/backend.py | 2 +- tests/unit/array/mixins/test_del.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docarray/array/storage/memory/backend.py b/docarray/array/storage/memory/backend.py index dca0f333692..46f88706163 100644 --- a/docarray/array/storage/memory/backend.py +++ b/docarray/array/storage/memory/backend.py @@ -44,7 +44,7 @@ def _rebuild_id2offset(self) -> None: self._needs_id2offset_rebuild = False def _init_storage( - self, _docs: Optional["DocumentArraySourceType"] = None, copy: bool = False + self, _docs: Optional['DocumentArraySourceType'] = None, copy: bool = False ): from ... import DocumentArray diff --git a/tests/unit/array/mixins/test_del.py b/tests/unit/array/mixins/test_del.py index 654bc60b2c2..45d2b05402a 100644 --- a/tests/unit/array/mixins/test_del.py +++ b/tests/unit/array/mixins/test_del.py @@ -5,11 +5,11 @@ @pytest.fixture() def docs(): - return DocumentArray([Document(id=f"{i}") for i in range(1, 10)]) + return DocumentArray([Document(id=f'{i}') for i in range(1, 10)]) @pytest.mark.parametrize( - "to_delete", + 'to_delete', [ 0, 1, @@ -31,12 +31,12 @@ def test_del_all(docs, to_delete): @pytest.mark.parametrize( - ["deleted_ids", "expected_ids"], + ['deleted_ids', 'expected_ids'], [ - (["1", "2", "3", "4"], ["5", "6", "7", "8", "9"]), - (["2", "4", "7", "1"], ["3", "5", "6", "8", "9"]), + (['1', '2', '3', '4'], ['5', '6', '7', '8', '9']), + (['2', '4', '7', '1'], ['3', '5', '6', '8', '9']), ], ) def test_del_by_multiple_idx(docs, deleted_ids, expected_ids): del docs[deleted_ids] - assert docs[:, "id"] == expected_ids + assert docs[:, 'id'] == expected_ids From 6621bc46c313d3a116090bea9be3173fd7915390 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 27 Jan 2022 09:48:53 +0100 Subject: [PATCH 08/15] feat(getdelset): use a decorator for needs id2offset rebuild --- docarray/array/storage/memory/backend.py | 18 ++++++++++++++---- docarray/array/storage/memory/getsetdel.py | 12 ++++++------ docarray/array/storage/memory/seqlike.py | 6 ++++-- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/docarray/array/storage/memory/backend.py b/docarray/array/storage/memory/backend.py index 46f88706163..da5e7c7ddcc 100644 --- a/docarray/array/storage/memory/backend.py +++ b/docarray/array/storage/memory/backend.py @@ -6,7 +6,9 @@ Sequence, Optional, TYPE_CHECKING, + Callable, ) +import functools from ..base.backend import BaseBackendMixin from .... import Document @@ -17,6 +19,17 @@ ) +def needs_id2offset_rebuild(func) -> Callable: + # self._id2offset needs to be rebuilt after every insert or delete + # this flag allows to do it lazily and cache the result + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + self._needs_id2offset_rebuild = True + return func(self, *args, **kwargs) + + return wrapper + + class BackendMixin(BaseBackendMixin): """Provide necessary functions to enable this storage backend.""" @@ -43,15 +56,12 @@ def _rebuild_id2offset(self) -> None: self._needs_id2offset_rebuild = False + @needs_id2offset_rebuild def _init_storage( self, _docs: Optional['DocumentArraySourceType'] = None, copy: bool = False ): from ... import DocumentArray - self._needs_id2offset_rebuild = True - # self._id2offset needs to be rebuilt after every insert or delete - # this flag allows to do it lazily and cache the result - self._data = [] if _docs is None: return diff --git a/docarray/array/storage/memory/getsetdel.py b/docarray/array/storage/memory/getsetdel.py index 175c9c222a6..993786e8a66 100644 --- a/docarray/array/storage/memory/getsetdel.py +++ b/docarray/array/storage/memory/getsetdel.py @@ -4,7 +4,7 @@ Iterable, Any, ) - +from ..memory.backend import needs_id2offset_rebuild from ..base.getsetdel import BaseGetSetDelMixin from .... import Document @@ -12,24 +12,24 @@ class GetSetDelMixin(BaseGetSetDelMixin): """Implement required and derived functions that power `getitem`, `setitem`, `delitem`""" + @needs_id2offset_rebuild def _del_docs_by_mask(self, mask: Sequence[bool]): self._data = list(itertools.compress(self._data, (not _i for _i in mask))) - self._needs_id2offset_rebuild = True def _del_all_docs(self): self._data.clear() self._id2offset.clear() + @needs_id2offset_rebuild def _del_docs_by_slice(self, _slice: slice): del self._data[_slice] - self._needs_id2offset_rebuild = True def _del_doc_by_id(self, _id: str): self._del_doc_by_offset(self._id2offset[_id]) + @needs_id2offset_rebuild def _del_doc_by_offset(self, offset: int): del self._data[offset] - self._needs_id2offset_rebuild = True def _set_doc_by_offset(self, offset: int, value: 'Document'): self._data[offset] = value @@ -40,16 +40,16 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): self._data[old_idx] = value self._id2offset[value.id] = old_idx + @needs_id2offset_rebuild def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']): self._data[_slice] = value - self._needs_id2offset_rebuild = True + @needs_id2offset_rebuild def _set_doc_value_pairs( self, docs: Iterable['Document'], values: Iterable['Document'] ): for _d, _v in zip(docs, values): _d._data = _v._data - self._needs_id2offset_rebuild = True def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any): setattr(self._data[offset], attr, value) diff --git a/docarray/array/storage/memory/seqlike.py b/docarray/array/storage/memory/seqlike.py index 0bf694af48a..a272e07c82b 100644 --- a/docarray/array/storage/memory/seqlike.py +++ b/docarray/array/storage/memory/seqlike.py @@ -2,10 +2,13 @@ from .... import Document +from ..memory.backend import needs_id2offset_rebuild + class SequenceLikeMixin(MutableSequence[Document]): """Implement sequence-like methods""" + @needs_id2offset_rebuild def insert(self, index: int, value: 'Document'): """Insert `doc` at `index`. @@ -13,7 +16,6 @@ def insert(self, index: int, value: 'Document'): :param value: The doc needs to be inserted. """ self._data.insert(index, value) - self._needs_id2offset_rebuild = True def __eq__(self, other): return ( @@ -55,6 +57,6 @@ def __add__(self, other: Union['Document', Sequence['Document']]): v.extend(other) return v + @needs_id2offset_rebuild def extend(self, values: Iterable['Document']) -> None: self._data.extend(values) - self._needs_id2offset_rebuild = True From b32ac41443a93dc2a672987374bc846449f94cd8 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 27 Jan 2022 11:31:09 +0100 Subject: [PATCH 09/15] fix(extend): extend is faster no rebuild id2offset index --- docarray/array/storage/memory/backend.py | 1 + docarray/array/storage/memory/seqlike.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/memory/backend.py b/docarray/array/storage/memory/backend.py index da5e7c7ddcc..28de8da8755 100644 --- a/docarray/array/storage/memory/backend.py +++ b/docarray/array/storage/memory/backend.py @@ -63,6 +63,7 @@ def _init_storage( from ... import DocumentArray self._data = [] + self._id_to_index = {} if _docs is None: return elif isinstance( diff --git a/docarray/array/storage/memory/seqlike.py b/docarray/array/storage/memory/seqlike.py index a272e07c82b..a1decf726cd 100644 --- a/docarray/array/storage/memory/seqlike.py +++ b/docarray/array/storage/memory/seqlike.py @@ -57,6 +57,8 @@ def __add__(self, other: Union['Document', Sequence['Document']]): v.extend(other) return v - @needs_id2offset_rebuild def extend(self, values: Iterable['Document']) -> None: self._data.extend(values) + # extend _id_to_index, no rebuilt needed + last_idx = len(self._id_to_index) + self._id_to_index.update({d.id: i + last_idx for i, d in enumerate(values)}) From 485d99d06688c1bb82761e7d955b665f9fca6205 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 27 Jan 2022 13:06:37 +0100 Subject: [PATCH 10/15] fix(set): no id2offset rebuild for set_do_value_pairs --- docarray/array/storage/memory/getsetdel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docarray/array/storage/memory/getsetdel.py b/docarray/array/storage/memory/getsetdel.py index 993786e8a66..1a33b28e750 100644 --- a/docarray/array/storage/memory/getsetdel.py +++ b/docarray/array/storage/memory/getsetdel.py @@ -44,7 +44,6 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']): self._data[_slice] = value - @needs_id2offset_rebuild def _set_doc_value_pairs( self, docs: Iterable['Document'], values: Iterable['Document'] ): From 94b23881f5648c96b79ca7baf09004e4e9b57ffc Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 27 Jan 2022 14:20:05 +0100 Subject: [PATCH 11/15] fix(seqlike): insert use id2offset now Co-authored-by: AlaeddineAbdessalem --- docarray/array/storage/memory/seqlike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/memory/seqlike.py b/docarray/array/storage/memory/seqlike.py index a1decf726cd..3ea1f967065 100644 --- a/docarray/array/storage/memory/seqlike.py +++ b/docarray/array/storage/memory/seqlike.py @@ -60,5 +60,5 @@ def __add__(self, other: Union['Document', Sequence['Document']]): def extend(self, values: Iterable['Document']) -> None: self._data.extend(values) # extend _id_to_index, no rebuilt needed - last_idx = len(self._id_to_index) + last_idx = len(self._id2offset) self._id_to_index.update({d.id: i + last_idx for i, d in enumerate(values)}) From ddfa8a96ef2c6035a643ba4a8ec15189782c5981 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Thu, 27 Jan 2022 14:22:13 +0100 Subject: [PATCH 12/15] fix(newdoc): avoid rebuilding index when copying da Co-authored-by: AlaeddineAbdessalem --- docarray/array/storage/memory/backend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docarray/array/storage/memory/backend.py b/docarray/array/storage/memory/backend.py index 28de8da8755..ae2aad746a0 100644 --- a/docarray/array/storage/memory/backend.py +++ b/docarray/array/storage/memory/backend.py @@ -61,6 +61,7 @@ def _init_storage( self, _docs: Optional['DocumentArraySourceType'] = None, copy: bool = False ): from ... import DocumentArray + from ...memory import DocumentArrayInMemory self._data = [] self._id_to_index = {} @@ -76,6 +77,9 @@ def _init_storage( self._id_to_index = _docs._id2offset else: self._data = list(_docs) + + if isinstance(_docs, DocumentArrayInMemory): + self._needs_id2offset_rebuild = _docs._needs_id2offset_rebuild else: if isinstance(_docs, Document): if copy: From 697cba0662171f6d08d56ab5e6e32c0c9a916801 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 27 Jan 2022 15:05:49 +0100 Subject: [PATCH 13/15] fix(extend): values iterable is only consume once in extend --- docarray/array/storage/memory/seqlike.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docarray/array/storage/memory/seqlike.py b/docarray/array/storage/memory/seqlike.py index 3ea1f967065..4973a0e2bd8 100644 --- a/docarray/array/storage/memory/seqlike.py +++ b/docarray/array/storage/memory/seqlike.py @@ -58,7 +58,7 @@ def __add__(self, other: Union['Document', Sequence['Document']]): return v def extend(self, values: Iterable['Document']) -> None: - self._data.extend(values) - # extend _id_to_index, no rebuilt needed last_idx = len(self._id2offset) - self._id_to_index.update({d.id: i + last_idx for i, d in enumerate(values)}) + for i, d in enumerate(values): + self._id_to_index[d.id] = last_idx + i + self._data.append(d) From b40ef888adba31c4e62a37af70b60ff29868ce41 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 27 Jan 2022 17:48:06 +0100 Subject: [PATCH 14/15] fix(initstorage): only daInMemory has an _id2offset --- docarray/array/storage/memory/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/memory/backend.py b/docarray/array/storage/memory/backend.py index ae2aad746a0..c38e1acaab0 100644 --- a/docarray/array/storage/memory/backend.py +++ b/docarray/array/storage/memory/backend.py @@ -74,11 +74,11 @@ def _init_storage( self._data = [Document(d, copy=True) for d in _docs] elif isinstance(_docs, DocumentArray): self._data = _docs._data - self._id_to_index = _docs._id2offset else: self._data = list(_docs) if isinstance(_docs, DocumentArrayInMemory): + self._id_to_index = _docs._id2offset self._needs_id2offset_rebuild = _docs._needs_id2offset_rebuild else: if isinstance(_docs, Document): From 30524947f2ee2321718ac900b90e94e46f537262 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 27 Jan 2022 18:00:00 +0100 Subject: [PATCH 15/15] perf(extend): cast iterator to a list to use extend --- docarray/array/storage/memory/seqlike.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docarray/array/storage/memory/seqlike.py b/docarray/array/storage/memory/seqlike.py index 4973a0e2bd8..633d7c1fd66 100644 --- a/docarray/array/storage/memory/seqlike.py +++ b/docarray/array/storage/memory/seqlike.py @@ -58,7 +58,7 @@ def __add__(self, other: Union['Document', Sequence['Document']]): return v def extend(self, values: Iterable['Document']) -> None: + values = list(values) # consume the iterator only once last_idx = len(self._id2offset) - for i, d in enumerate(values): - self._id_to_index[d.id] = last_idx + i - self._data.append(d) + self._data.extend(values) + self._id_to_index.update({d.id: i + last_idx for i, d in enumerate(values)})