Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,9 @@ def __enter__(self):

def __exit__(self, *args, **kwargs):
"""
Ensures that offset2ids are stored in the db after
operations in the DocumentArray are performed.
Ensures that we sync the data to the storage backend when exiting the context manager
"""
self._save_offset2ids()
self.sync()

def __new__(cls, *args, storage: str = 'memory', **kwargs):
if cls is DocumentArray:
Expand Down
3 changes: 1 addition & 2 deletions docarray/array/storage/annlite/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,7 @@ def _init_storage(
elif isinstance(config, dict):
config = dataclass_from_dict(AnnliteConfig, config)

self._persist = bool(config.data_path)
if not self._persist:
if config.data_path is None:
from tempfile import TemporaryDirectory

config.data_path = TemporaryDirectory().name
Expand Down
6 changes: 0 additions & 6 deletions docarray/array/storage/annlite/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,8 @@ def _del_docs_by_ids(self, ids):
self._annlite.delete(ids)

def __del__(self) -> None:
if not self._persist:
self._offset2ids.clear()
self._annlite.clear()

self._annlite.close()

super().__del__()

def _load_offset2ids(self):
self._offsetmapping = OffsetMapping(
data_path=self._config.data_path, in_memory=False
Expand Down
2 changes: 1 addition & 1 deletion docarray/array/storage/base/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,6 @@ def _load_offset2ids(self):
def _save_offset2ids(self):
...

def __del__(self):
def sync(self):
if hasattr(self, '_offset2ids'):
self._save_offset2ids()
3 changes: 0 additions & 3 deletions docarray/array/storage/elastic/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,8 @@ def _init_storage(
config = dataclass_from_dict(ElasticConfig, config)

if config.index_name is None:
self._persist = False
id = uuid.uuid4().hex
config.index_name = 'index_name__' + id
else:
self._persist = True

self._index_name_offset2id = 'offset2id__' + config.index_name
self._config = config
Expand Down
7 changes: 0 additions & 7 deletions docarray/array/storage/elastic/seqlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,6 @@ def __contains__(self, x: Union[str, 'Document']):
else:
return False

def __del__(self):
"""Delete this :class:`DocumentArrayElastic` object"""
self._save_offset2ids()

# if not self._persist:
# self._offset2ids.clear()

def __repr__(self):
"""Return the string representation of :class:`DocumentArrayElastic` object
:return: string representation of this object
Expand Down
2 changes: 0 additions & 2 deletions docarray/array/storage/qdrant/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ def _init_storage(
self._client = QdrantClient(host=config.host, port=config.port)

self._config = config
self._persist = bool(self._config.collection_name)

self._config.columns = self._normalize_columns(self._config.columns)

Expand All @@ -96,7 +95,6 @@ def _init_storage(
else self._config.collection_name
)

self._persist = self._config.collection_name
self._initialize_qdrant_schema()

super()._init_storage()
Expand Down
1 change: 0 additions & 1 deletion docarray/array/storage/sqlite/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ def _init_storage(
if config.table_name is None
else _sanitize_table_name(config.table_name)
)
self._persist = bool(config.table_name)
config.table_name = self._table_name
initialize_table(
self._table_name, self.__class__.__name__, self.schema_version, self._cursor
Expand Down
10 changes: 0 additions & 10 deletions docarray/array/storage/sqlite/seqlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,6 @@ def _append(self, doc: 'Document', commit: bool = True, **kwargs) -> None:
if commit:
self._commit()

def __del__(self) -> None:
super().__del__()
if not self._persist:
self._sql(
'DELETE FROM metadata WHERE table_name=? AND container_type=?',
(self._table_name, self.__class__.__name__),
)
self._sql(f'DROP TABLE IF EXISTS {self._table_name}')
self._commit()

def __contains__(self, item: Union[str, 'Document']):
if isinstance(item, str):
r = self._sql(f'SELECT 1 FROM {self._table_name} WHERE doc_id=?', (item,))
Expand Down
2 changes: 0 additions & 2 deletions docarray/array/storage/weaviate/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,6 @@ def _init_storage(
'Please capitalize when declaring the name field in config.'
)

self._persist = bool(config.name)

self._client = weaviate.Client(
f'{config.protocol}://{config.host}:{config.port}',
timeout_config=config.timeout_config,
Expand Down
11 changes: 0 additions & 11 deletions docarray/array/storage/weaviate/seqlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,6 @@ def __contains__(self, x: Union[str, 'Document']):
else:
return False

def __del__(self):
"""Delete this :class:`DocumentArrayWeaviate` object"""
super().__del__()
if (
not self._persist
and len(_REGISTRY[self.__class__.__name__][self._class_name]) == 1
):
self._client.schema.delete_class(self._class_name)
self._client.schema.delete_class(self._meta_name)
_REGISTRY[self.__class__.__name__][self._class_name].remove(self)

def __repr__(self):
"""Return the string representation of :class:`DocumentArrayWeaviate` object
:return: string representation of this object
Expand Down
39 changes: 34 additions & 5 deletions docs/advanced/document-store/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ The procedures for creating, retrieving, updating, and deleting Documents are id

## Construct

There are two ways for initializing a DocumentArray with an external storage backend.
There are two ways to initialize a DocumentArray with an external storage backend.

````{tab} Specify storage

Expand Down Expand Up @@ -145,6 +145,14 @@ da = DocumentArray(

Using a dataclass gives you better type-checking in your IDE but requires an extra import; using a dict is more flexible but can be error-prone. Choose the style that best fits your context.

```{admonition} Creating DocumentArrays without specifying index
:class: warning
When you specify an index (table name for SQL stores) in the config, the index will be used to persist the DocumentArray in the document store.
If you create a DocumentArray but do not specify an index, a randomized placeholder index will be created to persist the data.

Creating DocumentArrays without specifying an index is useful during prototyping, but should be avoided in a production setting, because the data stored under the randomized placeholder index will be persisted in the document store unnecessarily.
```


## Feature summary

Expand Down Expand Up @@ -349,8 +357,8 @@ array([[7., 7., 7.],
## Persistence, mutations and context manager

Having DocumentArrays that are backed by a document store introduces an extra consideration into the way you think about DocumentArrays.
The DocumentArray object created in your Python program is now a view of the underlying implementation in the external store.
This means that your DocumentArray object in Python can be out of sync with what is persisted to the external store.
The DocumentArray object created in your Python program is now a view of the underlying implementation in the document store.
This means that your DocumentArray object in Python can be out of sync with what is persisted to the document store.

**For example**
```python
Expand Down Expand Up @@ -415,8 +423,10 @@ Length of da1 is 3
````

Now that you know the issue, let's explore what you should do to work with DocumentArrays backed by a document store in a more predictable manner.
### Using Context Manager
The recommended way is to use the DocumentArray as a context manager like so:

````{tab} Use with

The data will be synced when the context manager is exited.

```python
from docarray import DocumentArray, Document
Expand All @@ -429,6 +439,24 @@ print(f"Length of da1 is {len(da1)}")
da2 = DocumentArray(storage='redis', config=dict(n_dim=3, index_name="my_index"))
print(f"Length of da2 is {len(da2)}")
```
````

````{tab} Use sync

Explicitly calling the `sync` method of the DocumentArray will save the data to the document store.

```python
from docarray import DocumentArray, Document

da1 = DocumentArray(storage='redis', config=dict(n_dim=3, index_name="another_index"))
da1.append(Document())
da1.sync()  # Call the sync method
print(f"Length of da1 is {len(da1)}")

da2 = DocumentArray(storage='redis', config=dict(n_dim=3, index_name="another_index"))
print(f"Length of da2 is {len(da2)}")
```
````
**First run output**
```console
Length of da1 is 1
Expand All @@ -447,6 +475,7 @@ Length of da2 is 3

The append you made to the DocumentArray is now persisted properly. Hurray!

The recommended way to sync data to the document store is to use the DocumentArray as a context manager in a `with` statement.

## Known limitations

Expand Down
33 changes: 20 additions & 13 deletions tests/unit/array/test_advance_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,25 +689,32 @@ def test_edge_case_two_strings(storage, config_gen, start_storage):
def test_offset2ids_persistence(storage, config, start_storage):
da = DocumentArray(storage=storage, config=config)

da.extend(
[
Document(id='0'),
Document(id='2'),
Document(id='4'),
]
)
da.insert(1, Document(id='1'))
da.insert(3, Document(id='3'))
with da:
da.extend(
[
Document(id='0'),
Document(id='2'),
Document(id='4'),
]
)
da.insert(1, Document(id='1'))
da.insert(3, Document(id='3'))

config = da._config
da_ids = da[:, 'id']
assert da_ids == [str(i) for i in range(5)]
da._persist = True
da.__del__()
da.sync()

da = DocumentArray(storage=storage, config=config)
da1 = DocumentArray(storage=storage, config=config)

assert da1[:, 'id'] == da_ids

with da1:
da1.extend([Document(id=i) for i in 'abc'])
assert len(da1) == 8

assert da[:, 'id'] == da_ids
da2 = DocumentArray(storage=storage, config=config)
assert da2[:, 'id'] == da1[:, 'id']


def test_dam_conflicting_ids():
Expand Down
6 changes: 4 additions & 2 deletions tests/unit/array/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,10 @@ def test_context_manager_from_disk(storage, config, start_storage, tmpdir, tmpfi
assert len(da2) == 2
assert len(da2._offset2ids.ids) == 2

del da
del da2
# Cleanup modifications made in test
with da:
del da[0]
del da[0]


@pytest.mark.parametrize(
Expand Down