From d01414830560df729166ce654a2c6db03a701420 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Fri, 25 Feb 2022 22:39:43 +0100 Subject: [PATCH] fix(array): storage info in summary --- docarray/array/mixins/plot.py | 12 +++-- docarray/array/storage/base/backend.py | 4 +- docarray/array/storage/memory/backend.py | 5 -- docarray/array/storage/pqlite/backend.py | 4 +- docarray/array/storage/qdrant/backend.py | 4 +- docarray/array/storage/sqlite/backend.py | 4 +- docarray/array/storage/weaviate/backend.py | 4 +- .../documentarray/serialization.md | 48 +++++++++++++------ 8 files changed, 46 insertions(+), 39 deletions(-) diff --git a/docarray/array/mixins/plot.py b/docarray/array/mixins/plot.py index 816ae8322d0..ec778ad05c8 100644 --- a/docarray/array/mixins/plot.py +++ b/docarray/array/mixins/plot.py @@ -98,13 +98,15 @@ def summary(self): ) tables.append(attr_table) - storage_table = Table(box=box.SIMPLE, title='Storage Summary') - storage_table.show_header = False storage_infos = self._get_storage_infos() - for k, v in storage_infos.items(): - storage_table.add_row(k, v) + if storage_infos: + storage_table = Table(box=box.SIMPLE, title='Storage Summary') + storage_table.show_header = False - tables.append(storage_table) + for k, v in storage_infos.items(): + storage_table.add_row(k, v) + + tables.append(storage_table) console.print(*tables) diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py index 10093a53228..0341dbb886a 100644 --- a/docarray/array/storage/base/backend.py +++ b/docarray/array/storage/base/backend.py @@ -19,5 +19,5 @@ def _init_storage( ): self._load_offset2ids() - def _get_storage_infos(self) -> Dict: - return {'Class': self.__class__.__name__} + def _get_storage_infos(self) -> Optional[Dict]: + ... 
diff --git a/docarray/array/storage/memory/backend.py b/docarray/array/storage/memory/backend.py index 693f0059819..c7ec93e4dda 100644 --- a/docarray/array/storage/memory/backend.py +++ b/docarray/array/storage/memory/backend.py @@ -49,8 +49,3 @@ def _init_storage( self.append(Document(_docs, copy=True)) else: self.append(_docs) - - def _get_storage_infos(self) -> Dict: - storage_infos = super()._get_storage_infos() - storage_infos['Backend'] = 'In Memory' - return storage_infos diff --git a/docarray/array/storage/pqlite/backend.py b/docarray/array/storage/pqlite/backend.py index 6b079abc1e3..6b983e21a30 100644 --- a/docarray/array/storage/pqlite/backend.py +++ b/docarray/array/storage/pqlite/backend.py @@ -90,11 +90,9 @@ def __setstate__(self, state): self._pqlite = PQLite(n_dim, lock=False, **config) def _get_storage_infos(self) -> Dict: - storage_infos = super()._get_storage_infos() return { - 'Backend': 'PQLite (https://github.com/jina-ai/pqlite)', + 'Backend': 'PQLite', 'Distance Metric': self._pqlite.metric.name, 'Data Path': self._config.data_path, 'Serialization Protocol': self._config.serialize_config.get('protocol'), - **storage_infos, } diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index ec1139e82a3..19c42570f11 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -173,13 +173,11 @@ def _update_offset2ids_meta(self): ) def _get_storage_infos(self) -> Dict: - storage_infos = super()._get_storage_infos() return { - 'Backend': 'Qdrant (https://qdrant.tech)', + 'Backend': 'Qdrant', 'Host': self._config.host, 'Port': str(self._config.port), 'Collection Name': self.collection_name, 'Distance': self._config.distance, 'Serialization Protocol': self._config.serialize_config.get('protocol'), - **storage_infos, } diff --git a/docarray/array/storage/sqlite/backend.py b/docarray/array/storage/sqlite/backend.py index adfb4824998..28f2fa1a92c 100644 --- 
a/docarray/array/storage/sqlite/backend.py +++ b/docarray/array/storage/sqlite/backend.py @@ -143,11 +143,9 @@ def __setstate__(self, state): ) def _get_storage_infos(self) -> Dict: - storage_infos = super()._get_storage_infos() return { - 'Backend': 'SQLite (https://www.sqlite.org)', + 'Backend': 'SQLite', 'Connection': self._config.connection, 'Table Name': self._table_name, 'Serialization Protocol': self._config.serialize_config.get('protocol'), - **storage_infos, } diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index 56654fc22ae..4f0f34c7b71 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -316,11 +316,9 @@ def _wmap(self, doc_id: str): return str(uuid.uuid5(uuid.NAMESPACE_URL, doc_id + self._class_name)) def _get_storage_infos(self) -> Dict: - storage_infos = super()._get_storage_infos() return { - 'Backend': 'Weaviate (www.semi.technology/developers/weaviate)', + 'Backend': 'Weaviate', 'Hostname': self._config.client, 'Schema Name': self._config.name, 'Serialization Protocol': self._config.serialize_config.get('protocol'), - **storage_infos, } diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index 50bdd01c18f..a5fc2d75b5f 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -163,7 +163,7 @@ Afterwards, `doc1_bytes` describes how many bytes are used to serialize `doc1`, The pattern `dock_bytes` and `dock.to_bytes` is repeated `len(docs)` times. -### From/to Disk +### From/to disk If you want to store a `DocumentArray` to disk you can use `.save_binary(filename, protocol, compress)` where `protocol` and `compress` refer to the protocol and compression methods used to serialize the data. If you want to load a `DocumentArray` from disk you can use `.load_binary(filename, protocol, compress)`. 
@@ -177,31 +177,49 @@ da = DocumentArray([Document(text='hello'), Document(text='world')]) da.save_binary('my_docarray.bin', protocol='protobuf', compress='lz4') da_rec = DocumentArray.load_binary('my_docarray.bin', protocol='protobuf', compress='lz4') -da_rec == da +da_rec.summary() ``` -Note that in the previous code snippet the user needs to remember the protol and compression methods used to store the data in order to load it back correctly. `DocArray` allows you to specify `protocol` and `compress` as file extensions. -By doing so you can forget later on which protocol and compression methods were used to serialize the data to disk. -This functionality assumes `.save_binary` and `.load_binary` are called with `filename` following the form `file_name.$protocol.$compress`, where `$protocol` and `$compress` refer to a string interpolation of the respective `protocol` and `compress` methods. +```text + Documents Summary + + Length 2 + Homogenous Documents True + Common Attributes ('id', 'mime_type', 'text') + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ────────────────────────────────────────────────────────── + id ('str',) 2 False + mime_type ('str',) 1 False + text ('str',) 2 False + +``` -For example if `file=my_docarray.protobuf.lz4` then the binary data will be created using `protocol=protobuf` and `compress=lz4`. -The previous code snippet can be simplified to +Users do not need to remember the protocol and compression methods when loading. You can simply specify `protocol` and `compress` in the file extension via: -```python -from docarray import DocumentArray, Document +```text +filename.protobuf.gzip + ~~~~~~~~ ^^^^ + | | + | |-- compress + | + |-- protocol +``` -da = DocumentArray([Document(text='hello'), Document(text='world')]) +When a filename is given in the above format in `.save_binary`, you can simply load it back with `.load_binary` without specifying the protocol and compression method again.
+ + +The previous code snippet can be simplified to + +```python da.save_binary('my_docarray.protobuf.lz4') da_rec = DocumentArray.load_binary('my_docarray.protobuf.lz4') -da_rec == da ``` -```{tip} -If you don't want to specify and remember `protocol` and `compress` to store/load to/from disk, save your `DocumentArray` `da` using -`da.save_binary('file_name.$protocol.$compress')` so that it can be loaded back with `DocumentArray.load_binary('file_name.$protocol.$compress')` -``` ### Stream large binary serialization from disk