diff --git a/docarray/array/sqlite.py b/docarray/array/sqlite.py index c203ef5dc53..df4c44df403 100644 --- a/docarray/array/sqlite.py +++ b/docarray/array/sqlite.py @@ -1,5 +1,8 @@ from .document import DocumentArray -from .storage.sqlite import StorageMixins + +from .storage.sqlite import StorageMixins, SqliteConfig + +__all__ = ['SqliteConfig', 'DocumentArraySqlite'] class DocumentArraySqlite(StorageMixins, DocumentArray): diff --git a/docarray/array/storage/weaviate/__init__.py b/docarray/array/storage/weaviate/__init__.py index e0f499bee0a..c7a24670b44 100644 --- a/docarray/array/storage/weaviate/__init__.py +++ b/docarray/array/storage/weaviate/__init__.py @@ -1,8 +1,8 @@ -from .backend import BackendMixin +from .backend import BackendMixin, WeaviateConfig from .getsetdel import GetSetDelMixin from .seqlike import SequenceLikeMixin -__all__ = ['StorageMixins'] +__all__ = ['StorageMixins', 'WeaviateConfig'] class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin): diff --git a/docarray/array/weaviate.py b/docarray/array/weaviate.py index c51efc68255..f7b13211b74 100644 --- a/docarray/array/weaviate.py +++ b/docarray/array/weaviate.py @@ -1,6 +1,7 @@ from .document import DocumentArray -from .storage.weaviate import StorageMixins -from .storage.weaviate.backend import WeaviateConfig +from .storage.weaviate import StorageMixins, WeaviateConfig + +__all__ = ['DocumentArrayWeaviate', 'WeaviateConfig'] class DocumentArrayWeaviate(StorageMixins, DocumentArray): diff --git a/docs/advanced/document-store/index.md b/docs/advanced/document-store/index.md new file mode 100644 index 00000000000..af09dcab7c7 --- /dev/null +++ b/docs/advanced/document-store/index.md @@ -0,0 +1,134 @@ +# Document Store + +```{toctree} +:hidden: + +sqlite +``` + +Documents inside a DocumentArray can live in a [document store](https://en.wikipedia.org/wiki/Document-oriented_database) instead of in memory, e.g. in SQLite, Redis. 
Compared to in-memory storage, the benefit of using an external store is often about longer persistence and faster retrieval. + +The look-and-feel of a DocumentArray with an external store is **almost the same** as a regular in-memory DocumentArray. This allows users to easily switch between backends under the same DocArray idiom. + +Take SQLite as an example: using it as the store backend of a DocumentArray is as simple as follows: + +```python +from docarray import DocumentArray, Document + +da = DocumentArray(storage='sqlite', config={'connection': 'example.db'}) + +da.append(Document()) +da.summary() +``` + +```text + Documents Summary + + Length 1 + Homogenous Documents True + Common Attributes ('id',) + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ────────────────────────────────────────────────────────── + id ('str',) 1 False + +``` + +Creating, retrieving, updating, and deleting Documents are identical to the regular {ref}`DocumentArray`. All DocumentArray methods such as `.summary()`, `.embed()`, `.plot_embeddings()` should work out of the box. + +## Construct + +There are two ways to initialize a DocumentArray with a store backend. + +````{tab} Specify storage + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='sqlite') +``` + + +```text + +``` +```` + +````{tab} Import the class and alias it + +```python +from docarray.array.sqlite import DocumentArraySqlite as DocumentArray + +da = DocumentArray() +``` + +```text + +``` + +```` + +Depending on the context, you can choose the style that fits better. For example, if one wants to use a class method such as `DocumentArray.empty(10)`, then explicitly importing `DocumentArraySqlite` is the way to go. Of course, you can choose not to alias the imported class to make the code even more explicit. + +### Construct with config + +The config of a store backend is either a store-specific dataclass object or a `dict` that can be parsed into the former. 
+ +One can pass the config in the constructor via `config`: + +````{tab} Use dataclass + +```python +from docarray import DocumentArray +from docarray.array.sqlite import SqliteConfig + +cfg = SqliteConfig(connection='example.db', table_name='test') + +da = DocumentArray(storage='sqlite', config=cfg) +``` + +```` + +````{tab} Use dict + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='sqlite', config={'connection': 'example.db', 'table_name': 'test'}) +``` + +```` + +Using a dataclass gives you better type-checking in an IDE but requires an extra import; using a dict is more flexible but can be error-prone. You can choose the style that best fits your context. + +## Known limitations + +### Out-of-array modification + +One cannot take a Document *out* of a DocumentArray and modify it, then expect its modification to be committed back to the DocumentArray. + +Specifically, the pattern below is not supported by any external store backend: + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='any_store_beyond_in_memory') +d = da[0] # or any access-element method, e.g. by id, by slice +d.text = 'hello' + +print(da[0]) # this will NOT change to `hello` +``` + +The solution is simple: use {ref}`column-selector`: + +```python +da[0, 'text'] = 'hello' +``` + +### Elements access is slower + +Obviously, a DocumentArray with on-disk storage is slower than an in-memory DocumentArray. However, if you choose to use on-disk storage, then often your concern for persistence outweighs your concern for efficiency. + +Slowness can affect all functions of DocumentArray. On the bright side, it may not be as severe as you would expect. Modern databases are highly optimized. Moreover, some databases provide faster methods for resolving certain queries, e.g. nearest-neighbour queries. We are actively and continuously improving DocArray to better leverage those features. 
\ No newline at end of file diff --git a/docs/advanced/document-store/sqlite.md b/docs/advanced/document-store/sqlite.md new file mode 100644 index 00000000000..610f7f1fe03 --- /dev/null +++ b/docs/advanced/document-store/sqlite.md @@ -0,0 +1,38 @@ +# SQLite + +One can use SQLite as the document store for DocumentArray. It is useful when you want to access a large number of Documents that cannot fit into memory. + +## Usage + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='sqlite') # with default config + +da1 = DocumentArray(storage='sqlite', config={'connection': 'example.db'}) # with customized config +``` + +To reconnect to a formerly persisted database, one needs to specify *both* `connection` and `table_name` in `config`: + +```python +from docarray import DocumentArray + +da = DocumentArray(storage='sqlite', config={'connection': 'example.db', 'table_name': 'mine'}) + +da.summary() +``` + +Other functions behave the same as in an in-memory DocumentArray. + +## Config + +The following configs can be set: + +| Name | Description | Default | +|--------------------|------------------------------------------------------------------------------------------------------------------|--| +| `connection` | SQLite database filename | a random temp file | +| `table_name` | SQLite table name | a random name | +| `serialize_config` | [Serialization config of each Document](../../fundamentals/document/serialization.md) | None | +| `conn_config` | [Connection config passed to `sqlite3.connect`](https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection) | None | +| `journal_mode` | [SQLite Pragma: journal mode](https://www.sqlite.org/pragma.html#pragma_journal_mode) | `'DELETE'` | +| `synchronous` | [SQLite Pragma: synchronous](https://www.sqlite.org/pragma.html#pragma_synchronous) | `'OFF'` | diff --git a/docs/changelog/index.md b/docs/changelog/index.md index 70c97b0b060..738362b91ef 100644 --- a/docs/changelog/index.md +++ 
b/docs/changelog/index.md @@ -5,6 +5,11 @@ DocArray follows semantic versioning. However, before the project reach 1.0.0, a This chapter only tracks the most important breaking changes and explain the rationale behind them. +## 0.5.0: add storage backend to DocumentArray + +0.5 introduces an important feature that enables an external Document Store as the backend of DocumentArray. It also refactors the implementation of DocumentArray. The change should be backward-compatible; the version bump is mainly due to the introduction of the new concept "`storage`". + + ## 0.4.0: change on the DocumentArray serialization format This change affects `DocumentArray.load_binary`, `DocumentArray.from_bytes`, `DocumentArray.to_bytes` and users can not load old DocumentArray back if they store it with `protocol='pickle'` and `protocol='protobuf'` under old version. diff --git a/docs/index.md b/docs/index.md index d38fe4ef424..a9b9bae4323 100644 --- a/docs/index.md +++ b/docs/index.md @@ -105,7 +105,12 @@ fundamentals/notebook-support/index fundamentals/fastapi-support/index ``` +```{toctree} +:caption: Advanced Topics +:hidden: +advanced/document-store/index +``` ```{toctree} :caption: Developer References