Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions docarray/array/mixins/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,15 @@ def summary(self):
)
tables.append(attr_table)

storage_table = Table(box=box.SIMPLE, title='Storage Summary')
storage_table.show_header = False
storage_infos = self._get_storage_infos()
for k, v in storage_infos.items():
storage_table.add_row(k, v)
if storage_infos:
storage_table = Table(box=box.SIMPLE, title='Storage Summary')
storage_table.show_header = False

tables.append(storage_table)
for k, v in storage_infos.items():
storage_table.add_row(k, v)

tables.append(storage_table)

console.print(*tables)

Expand Down
4 changes: 2 additions & 2 deletions docarray/array/storage/base/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ def _init_storage(
):
self._load_offset2ids()

def _get_storage_infos(self) -> Dict:
return {'Class': self.__class__.__name__}
def _get_storage_infos(self) -> Optional[Dict]:
...
5 changes: 0 additions & 5 deletions docarray/array/storage/memory/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,3 @@ def _init_storage(
self.append(Document(_docs, copy=True))
else:
self.append(_docs)

def _get_storage_infos(self) -> Dict:
storage_infos = super()._get_storage_infos()
storage_infos['Backend'] = 'In Memory'
return storage_infos
4 changes: 1 addition & 3 deletions docarray/array/storage/pqlite/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,9 @@ def __setstate__(self, state):
self._pqlite = PQLite(n_dim, lock=False, **config)

def _get_storage_infos(self) -> Dict:
storage_infos = super()._get_storage_infos()
return {
'Backend': 'PQLite (https://github.com/jina-ai/pqlite)',
'Backend': 'PQLite',
'Distance Metric': self._pqlite.metric.name,
'Data Path': self._config.data_path,
'Serialization Protocol': self._config.serialize_config.get('protocol'),
**storage_infos,
}
4 changes: 1 addition & 3 deletions docarray/array/storage/qdrant/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,11 @@ def _update_offset2ids_meta(self):
)

def _get_storage_infos(self) -> Dict:
storage_infos = super()._get_storage_infos()
return {
'Backend': 'Qdrant (https://qdrant.tech)',
'Backend': 'Qdrant',
'Host': self._config.host,
'Port': str(self._config.port),
'Collection Name': self.collection_name,
'Distance': self._config.distance,
'Serialization Protocol': self._config.serialize_config.get('protocol'),
**storage_infos,
}
4 changes: 1 addition & 3 deletions docarray/array/storage/sqlite/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,9 @@ def __setstate__(self, state):
)

def _get_storage_infos(self) -> Dict:
storage_infos = super()._get_storage_infos()
return {
'Backend': 'SQLite (https://www.sqlite.org)',
'Backend': 'SQLite',
'Connection': self._config.connection,
'Table Name': self._table_name,
'Serialization Protocol': self._config.serialize_config.get('protocol'),
**storage_infos,
}
4 changes: 1 addition & 3 deletions docarray/array/storage/weaviate/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,11 +316,9 @@ def _wmap(self, doc_id: str):
return str(uuid.uuid5(uuid.NAMESPACE_URL, doc_id + self._class_name))

def _get_storage_infos(self) -> Dict:
storage_infos = super()._get_storage_infos()
return {
'Backend': 'Weaviate (www.semi.technology/developers/weaviate)',
'Backend': 'Weaviate',
'Hostname': self._config.client,
'Schema Name': self._config.name,
'Serialization Protocol': self._config.serialize_config.get('protocol'),
**storage_infos,
}
48 changes: 33 additions & 15 deletions docs/fundamentals/documentarray/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ Afterwards, `doc1_bytes` describes how many bytes are used to serialize `doc1`,
The pattern `dock_bytes` and `dock.to_bytes` is repeated `len(docs)` times.


### From/to Disk
### From/to disk

If you want to store a `DocumentArray` to disk you can use `.save_binary(filename, protocol, compress)` where `protocol` and `compress` refer to the protocol and compression methods used to serialize the data.
If you want to load a `DocumentArray` from disk you can use `.load_binary(filename, protocol, compress)`.
Expand All @@ -177,31 +177,49 @@ da = DocumentArray([Document(text='hello'), Document(text='world')])

da.save_binary('my_docarray.bin', protocol='protobuf', compress='lz4')
da_rec = DocumentArray.load_binary('my_docarray.bin', protocol='protobuf', compress='lz4')
da_rec == da
da_rec.summary()
```

Note that in the previous code snippet the user needs to remember the protocol and compression methods used to store the data in order to load it back correctly. `DocArray` allows you to specify `protocol` and `compress` as file extensions.
By doing so you can forget later on which protocol and compression methods were used to serialize the data to disk.
This functionality assumes `.save_binary` and `.load_binary` are called with `filename` following the form `file_name.$protocol.$compress`, where `$protocol` and `$compress` refer to a string interpolation of the respective `protocol` and `compress` methods.
```text
Documents Summary

Length 2
Homogenous Documents True
Common Attributes ('id', 'mime_type', 'text')

Attributes Summary

Attribute Data type #Unique values Has empty value
──────────────────────────────────────────────────────────
id ('str',) 2 False
mime_type ('str',) 1 False
text ('str',) 2 False

```

For example if `file=my_docarray.protobuf.lz4` then the binary data will be created using `protocol=protobuf` and `compress=lz4`.

The previous code snippet can be simplified to
Users do not need to remember the protocol and compression methods when loading. You can simply specify `protocol` and `compress` in the file extension:

```python
from docarray import DocumentArray, Document
```text
filename.protobuf.gzip
~~~~~~~~ ^^^^
| |
| |-- compress
|
|-- protocol
```

da = DocumentArray([Document(text='hello'), Document(text='world')])

When a filename in the above format is given to `.save_binary`, you can simply load it back with `.load_binary` without specifying the protocol and compression method again.


The previous code snippet can be simplified to

```python
da.save_binary('my_docarray.protobuf.lz4')
da_rec = DocumentArray.load_binary('my_docarray.protobuf.lz4')
da_rec == da
```

```{tip}
If you don't want to specify and remember `protocol` and `compress` to store/load to/from disk, save your `DocumentArray` `da` using
`da.save_binary('file_name.$protocol.$compress')` so that it can be loaded back with `DocumentArray.load_binary('file_name.$protocol.$compress')`
```

### Stream large binary serialization from disk

Expand Down