diff --git a/docarray/array/document.py b/docarray/array/document.py index d119d2f2983..89e513060d9 100644 --- a/docarray/array/document.py +++ b/docarray/array/document.py @@ -144,10 +144,9 @@ def __enter__(self): def __exit__(self, *args, **kwargs): """ - Ensures that offset2ids are stored in the db after - operations in the DocumentArray are performed. + Ensures that we sync the data to the storage backend when exiting the context manager """ - self._save_offset2ids() + self.sync() def __new__(cls, *args, storage: str = 'memory', **kwargs): if cls is DocumentArray: diff --git a/docarray/array/storage/annlite/backend.py b/docarray/array/storage/annlite/backend.py index 874c4ab6f0e..e2cfde9cb5c 100644 --- a/docarray/array/storage/annlite/backend.py +++ b/docarray/array/storage/annlite/backend.py @@ -88,8 +88,7 @@ def _init_storage( elif isinstance(config, dict): config = dataclass_from_dict(AnnliteConfig, config) - self._persist = bool(config.data_path) - if not self._persist: + if config.data_path is None: from tempfile import TemporaryDirectory config.data_path = TemporaryDirectory().name diff --git a/docarray/array/storage/annlite/getsetdel.py b/docarray/array/storage/annlite/getsetdel.py index 584edf63c3a..d153175fe03 100644 --- a/docarray/array/storage/annlite/getsetdel.py +++ b/docarray/array/storage/annlite/getsetdel.py @@ -43,14 +43,8 @@ def _del_docs_by_ids(self, ids): self._annlite.delete(ids) def __del__(self) -> None: - if not self._persist: - self._offset2ids.clear() - self._annlite.clear() - self._annlite.close() - super().__del__() - def _load_offset2ids(self): self._offsetmapping = OffsetMapping( data_path=self._config.data_path, in_memory=False diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py index ab49d0f4e22..682b6964cb2 100644 --- a/docarray/array/storage/base/getsetdel.py +++ b/docarray/array/storage/base/getsetdel.py @@ -325,6 +325,6 @@ def _load_offset2ids(self): def _save_offset2ids(self): ... - def __del__(self): + def sync(self): if hasattr(self, '_offset2ids'): self._save_offset2ids() diff --git a/docarray/array/storage/elastic/backend.py b/docarray/array/storage/elastic/backend.py index 28bb90f07e6..7519c94ed73 100644 --- a/docarray/array/storage/elastic/backend.py +++ b/docarray/array/storage/elastic/backend.py @@ -83,11 +83,8 @@ def _init_storage( config = dataclass_from_dict(ElasticConfig, config) if config.index_name is None: - self._persist = False id = uuid.uuid4().hex config.index_name = 'index_name__' + id - else: - self._persist = True self._index_name_offset2id = 'offset2id__' + config.index_name self._config = config diff --git a/docarray/array/storage/elastic/seqlike.py b/docarray/array/storage/elastic/seqlike.py index 3ca78acc0cc..9c2007b3c7b 100644 --- a/docarray/array/storage/elastic/seqlike.py +++ b/docarray/array/storage/elastic/seqlike.py @@ -45,13 +45,6 @@ def __contains__(self, x: Union[str, 'Document']): else: return False - def __del__(self): - """Delete this :class:`DocumentArrayElastic` object""" - self._save_offset2ids() - - # if not self._persist: - # self._offset2ids.clear() - def __repr__(self): """Return the string representation of :class:`DocumentArrayElastic` object :return: string representation of this object diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index 5561def8801..d153d65aafa 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -86,7 +86,6 @@ def _init_storage( self._client = QdrantClient(host=config.host, port=config.port) self._config = config - self._persist = bool(self._config.collection_name) self._config.columns = self._normalize_columns(self._config.columns) @@ -96,7 +95,6 @@ def _init_storage( else self._config.collection_name ) - self._persist = self._config.collection_name self._initialize_qdrant_schema() super()._init_storage() diff --git a/docarray/array/storage/sqlite/backend.py b/docarray/array/storage/sqlite/backend.py index 7422738a1e7..043d00f5151 100644 --- a/docarray/array/storage/sqlite/backend.py +++ b/docarray/array/storage/sqlite/backend.py @@ -91,7 +91,6 @@ def _init_storage( if config.table_name is None else _sanitize_table_name(config.table_name) ) - self._persist = bool(config.table_name) config.table_name = self._table_name initialize_table( self._table_name, self.__class__.__name__, self.schema_version, self._cursor diff --git a/docarray/array/storage/sqlite/seqlike.py b/docarray/array/storage/sqlite/seqlike.py index 06fb4f0a732..ec871a7b6f0 100644 --- a/docarray/array/storage/sqlite/seqlike.py +++ b/docarray/array/storage/sqlite/seqlike.py @@ -48,16 +48,6 @@ def _append(self, doc: 'Document', commit: bool = True, **kwargs) -> None: if commit: self._commit() - def __del__(self) -> None: - super().__del__() - if not self._persist: - self._sql( - 'DELETE FROM metadata WHERE table_name=? AND container_type=?', - (self._table_name, self.__class__.__name__), - ) - self._sql(f'DROP TABLE IF EXISTS {self._table_name}') - self._commit() - def __contains__(self, item: Union[str, 'Document']): if isinstance(item, str): r = self._sql(f'SELECT 1 FROM {self._table_name} WHERE doc_id=?', (item,)) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index 9d747908562..52b41908baa 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -111,8 +111,6 @@ def _init_storage( 'Please capitalize when declaring the name field in config.' ) - self._persist = bool(config.name) - self._client = weaviate.Client( f'{config.protocol}://{config.host}:{config.port}', timeout_config=config.timeout_config, diff --git a/docarray/array/storage/weaviate/seqlike.py b/docarray/array/storage/weaviate/seqlike.py index f0a86cf2df6..f480899ccab 100644 --- a/docarray/array/storage/weaviate/seqlike.py +++ b/docarray/array/storage/weaviate/seqlike.py @@ -54,17 +54,6 @@ def __contains__(self, x: Union[str, 'Document']): else: return False - def __del__(self): - """Delete this :class:`DocumentArrayWeaviate` object""" - super().__del__() - if ( - not self._persist - and len(_REGISTRY[self.__class__.__name__][self._class_name]) == 1 - ): - self._client.schema.delete_class(self._class_name) - self._client.schema.delete_class(self._meta_name) - _REGISTRY[self.__class__.__name__][self._class_name].remove(self) - def __repr__(self): """Return the string representation of :class:`DocumentArrayWeaviate` object :return: string representation of this object diff --git a/docs/advanced/document-store/index.md b/docs/advanced/document-store/index.md index 5db58353f22..b8c9dc7be9f 100644 --- a/docs/advanced/document-store/index.md +++ b/docs/advanced/document-store/index.md @@ -70,7 +70,7 @@ The procedures for creating, retrieving, updating, and deleting Documents are id ## Construct -There are two ways for initializing a DocumentArray with an external storage backend. +There are two ways to initialize a DocumentArray with an external storage backend. ````{tab} Specify storage @@ -145,6 +145,14 @@ da = DocumentArray( Using dataclass gives you better type-checking in IDE but requires an extra import; using dict is more flexible but can be error-prone. You can choose the style that fits best to your context. +```{admonition} Creating DocumentArrays without specifying index +:class: warning +When you specify an index (table name for SQL stores) in the config, the index will be used to persist the DocumentArray in the document store. +If you create a DocumentArray but do not specify an index, a randomized placeholder index will be created to persist the data. + +Creating DocumentArrays without indexes is useful during prototyping but should not be used in a production setting as randomized placeholder data will be persisted in the document store unnecessarily. +``` + ## Feature summary @@ -349,8 +357,8 @@ array([[7., 7., 7.], ## Persistence, mutations and context manager Having DocumentArrays that are backed by a document store introduces an extra consideration into the way you think about DocumentArrays. -The DocumentArray object created in your Python program is now a view of the underlying implementation in the external store. -This means that your DocumentArray object in Python can be out of sync with what is persisted to the external store. +The DocumentArray object created in your Python program is now a view of the underlying implementation in the document store. +This means that your DocumentArray object in Python can be out of sync with what is persisted to the document store. **For example** ```python @@ -415,8 +423,10 @@ Length of da1 is 3 ```` Now that you know the issue, let's explore what you should do to work with DocumentArrays backed by document store in a more predictable manner. -### Using Context Manager -The recommended way is to use the DocumentArray as a context manager like so: + +````{tab} Use with + +The data will be synced when the context manager is exited. ```python from docarray import DocumentArray, Document @@ -429,6 +439,24 @@ print(f"Length of da1 is {len(da1)}") da2 = DocumentArray(storage='redis', config=dict(n_dim=3, index_name="my_index")) print(f"Length of da2 is {len(da2)}") ``` +```` + +````{tab} Use sync + +Explicitly calling the `sync` method of the DocumentArray will save the data to the document store. + +```python +from docarray import DocumentArray, Document + +da1 = DocumentArray(storage='redis', config=dict(n_dim=3, index_name="another_index")) +da1.append(Document()) +da.sync() # Call the sync method +print(f"Length of da1 is {len(da1)}") + +da2 = DocumentArray(storage='redis', config=dict(n_dim=3, index_name="another_index")) +print(f"Length of da2 is {len(da2)}") +``` +```` **First run output** ```console Length of da1 is 1 @@ -447,6 +475,7 @@ Length of da2 is 3 The append you made to the DocumentArray is now persisted properly. Hurray! +The recommended way to sync data to the document store is to use the DocumentArray inside the `with` context manager. ## Known limitations diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 8cf7780c3ba..666ae8596b9 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -689,25 +689,32 @@ def test_edge_case_two_strings(storage, config_gen, start_storage): def test_offset2ids_persistence(storage, config, start_storage): da = DocumentArray(storage=storage, config=config) - da.extend( - [ - Document(id='0'), - Document(id='2'), - Document(id='4'), - ] - ) - da.insert(1, Document(id='1')) - da.insert(3, Document(id='3')) + with da: + da.extend( + [ + Document(id='0'), + Document(id='2'), + Document(id='4'), + ] + ) + da.insert(1, Document(id='1')) + da.insert(3, Document(id='3')) config = da._config da_ids = da[:, 'id'] assert da_ids == [str(i) for i in range(5)] - da._persist = True - da.__del__() + da.sync() - da = DocumentArray(storage=storage, config=config) + da1 = DocumentArray(storage=storage, config=config) + + assert da1[:, 'id'] == da_ids + + with da1: + da1.extend([Document(id=i) for i in 'abc']) + assert len(da1) == 8 - assert da[:, 'id'] == da_ids + da2 = DocumentArray(storage=storage, config=config) + assert da2[:, 'id'] == da1[:, 'id'] def test_dam_conflicting_ids(): diff --git a/tests/unit/array/test_sequence.py b/tests/unit/array/test_sequence.py index 458e563f7c5..92b04995357 100644 --- a/tests/unit/array/test_sequence.py +++ b/tests/unit/array/test_sequence.py @@ -104,8 +104,10 @@ def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfi assert len(da2) == 2 assert len(da2._offset2ids.ids) == 2 - del da - del da2 + # Cleanup modifications made in test + with da: + del da[0] + del da[0] @pytest.mark.parametrize(