diff --git a/docarray/array/storage/base/backend.py b/docarray/array/storage/base/backend.py index 0341dbb886a..5ca74c2b4df 100644 --- a/docarray/array/storage/base/backend.py +++ b/docarray/array/storage/base/backend.py @@ -1,12 +1,8 @@ from abc import ABC from typing import Dict, Optional, TYPE_CHECKING -from docarray.array.storage.base.helper import Offset2ID - if TYPE_CHECKING: - from ....types import ( - DocumentArraySourceType, - ) + from ....types import DocumentArraySourceType, ArrayType class BaseBackendMixin(ABC): @@ -21,3 +17,11 @@ def _init_storage( def _get_storage_infos(self) -> Optional[Dict]: ... + + def _map_id(self, _id: str) -> str: + return _id + + def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType': + from ....math.ndarray import to_numpy_array + + return to_numpy_array(embedding) diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py index adc36f4c915..be10730ce74 100644 --- a/docarray/array/storage/base/getsetdel.py +++ b/docarray/array/storage/base/getsetdel.py @@ -4,6 +4,7 @@ Sequence, Any, Iterable, + Dict, ) from .helper import Offset2ID @@ -151,7 +152,7 @@ def _set_doc(self, _id: str, value: 'Document'): def _set_doc_by_id(self, _id: str, value: 'Document'): ... 
- def _set_docs_by_ids(self, ids, docs: Iterable['Document']): + def _set_docs_by_ids(self, ids, docs: Iterable['Document'], mismatch_ids: Dict): """This function is derived from :meth:`_set_doc_by_id` Override this function if there is a more efficient logic @@ -162,8 +163,8 @@ def _set_docs_by_ids(self, ids, docs: Iterable['Document']): def _set_docs(self, ids, docs: Iterable['Document']): docs = list(docs) - self._set_docs_by_ids(ids, docs) mismatch_ids = {_id: doc.id for _id, doc in zip(ids, docs) if _id != doc.id} + self._set_docs_by_ids(ids, docs, mismatch_ids) self._offset2ids.update_ids(mismatch_ids) def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']): diff --git a/docarray/array/storage/pqlite/__init__.py b/docarray/array/storage/pqlite/__init__.py index cd5b422e2f0..d9b388443d2 100644 --- a/docarray/array/storage/pqlite/__init__.py +++ b/docarray/array/storage/pqlite/__init__.py @@ -16,9 +16,3 @@ class StorageMixins(FindMixin, BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC): ... 
- - def _to_numpy_embedding(self, doc: 'Document'): - if doc.embedding is None: - doc.embedding = np.zeros(self._pqlite.dim, dtype=np.float32) - elif isinstance(doc.embedding, list): - doc.embedding = np.array(doc.embedding, dtype=np.float32) diff --git a/docarray/array/storage/pqlite/backend.py b/docarray/array/storage/pqlite/backend.py index 6b983e21a30..8130c10e697 100644 --- a/docarray/array/storage/pqlite/backend.py +++ b/docarray/array/storage/pqlite/backend.py @@ -9,15 +9,15 @@ Generator, Iterator, ) + +import numpy as np from pqlite import PQLite from ..base.backend import BaseBackendMixin from ....helper import dataclass_from_dict if TYPE_CHECKING: - from ....types import ( - DocumentArraySourceType, - ) + from ....types import DocumentArraySourceType, ArrayType @dataclass @@ -96,3 +96,10 @@ def _get_storage_infos(self) -> Dict: 'Data Path': self._config.data_path, 'Serialization Protocol': self._config.serialize_config.get('protocol'), } + + def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType': + if embedding is None: + embedding = np.zeros(self._pqlite.dim, dtype=np.float32) + elif isinstance(embedding, list): + embedding = np.array(embedding, dtype=np.float32) + return embedding diff --git a/docarray/array/storage/pqlite/getsetdel.py b/docarray/array/storage/pqlite/getsetdel.py index cf438a80d90..cf0dcdbb29e 100644 --- a/docarray/array/storage/pqlite/getsetdel.py +++ b/docarray/array/storage/pqlite/getsetdel.py @@ -1,4 +1,4 @@ -from typing import Iterable +from typing import Iterable, Dict from .helper import OffsetMapping from ..base.getsetdel import BaseGetSetDelMixin @@ -19,7 +19,9 @@ def _get_doc_by_id(self, _id: str) -> 'Document': return doc def _set_doc_by_id(self, _id: str, value: 'Document'): - self._to_numpy_embedding(value) + if _id != value.id: + self._pqlite.delete([_id]) + value.embedding = self._map_embedding(value.embedding) docs = DocumentArrayInMemory([value]) self._pqlite.update(docs) @@ -29,10 +31,11 @@ def 
_del_doc_by_id(self, _id: str): def _clear_storage(self): self._pqlite.clear() - def _set_docs_by_ids(self, ids, docs: Iterable['Document']): + def _set_docs_by_ids(self, ids, docs: Iterable['Document'], mismatch_ids: Dict): + self._pqlite.delete(list(mismatch_ids.keys())) docs = DocumentArrayInMemory(docs) for doc in docs: - self._to_numpy_embedding(doc) + doc.embedding = self._map_embedding(doc.embedding) self._pqlite.update(docs) def _del_docs_by_ids(self, ids): diff --git a/docarray/array/storage/pqlite/seqlike.py b/docarray/array/storage/pqlite/seqlike.py index a37087fabc6..727eb6afd4b 100644 --- a/docarray/array/storage/pqlite/seqlike.py +++ b/docarray/array/storage/pqlite/seqlike.py @@ -15,7 +15,7 @@ def extend(self, values: Iterable['Document']) -> None: return for doc in docs: - self._to_numpy_embedding(doc) + doc.embedding = self._map_embedding(doc.embedding) self._pqlite.index(docs) self._offset2ids.extend([doc.id for doc in docs]) diff --git a/docarray/array/storage/qdrant/__init__.py b/docarray/array/storage/qdrant/__init__.py index d51a885dfba..3646fc07f4a 100644 --- a/docarray/array/storage/qdrant/__init__.py +++ b/docarray/array/storage/qdrant/__init__.py @@ -1,9 +1,9 @@ -from typing import Iterable, TYPE_CHECKING +from typing import TYPE_CHECKING from .backend import BackendMixin, QdrantConfig -from .helper import DISTANCES from .find import FindMixin from .getsetdel import GetSetDelMixin +from .helper import DISTANCES from .seqlike import SequenceLikeMixin __all__ = ['StorageMixins', 'QdrantConfig'] @@ -11,7 +11,6 @@ if TYPE_CHECKING: from qdrant_client import QdrantClient from qdrant_openapi_client.models.models import Distance - from docarray import Document class StorageMixins(FindMixin, BackendMixin, GetSetDelMixin, SequenceLikeMixin): @@ -23,11 +22,6 @@ def serialize_config(self) -> dict: def distance(self) -> 'Distance': return DISTANCES[self._config.distance] - def extend(self, docs: Iterable['Document']): - docs = list(docs) - 
self._upload_batch(docs) - self._offset2ids.extend([doc.id for doc in docs]) - @property def serialization_config(self) -> dict: return self._serialize_config diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index 19c42570f11..4c9c942622c 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -12,6 +12,7 @@ List, ) +import numpy as np from qdrant_client import QdrantClient from qdrant_openapi_client.models.models import ( Distance, @@ -24,11 +25,10 @@ from docarray.array.storage.base.backend import BaseBackendMixin from docarray.array.storage.qdrant.helper import DISTANCES from docarray.helper import dataclass_from_dict, random_identity +from docarray.math.helper import EPSILON if TYPE_CHECKING: - from docarray.types import ( - DocumentArraySourceType, - ) + from docarray.types import DocumentArraySourceType, ArrayType @dataclass @@ -125,7 +125,7 @@ def _collection_exists(self, collection_name): return collection_name in collections @staticmethod - def _qmap(doc_id: str): + def _map_id(doc_id: str): # if doc_id is a random ID in hex format, just translate back to UUID str # otherwise, create UUID5 from doc_id try: @@ -181,3 +181,18 @@ def _get_storage_infos(self) -> Dict: 'Distance': self._config.distance, 'Serialization Protocol': self._config.serialize_config.get('protocol'), } + + def _map_embedding(self, embedding: 'ArrayType') -> List[float]: + if embedding is None: + embedding = np.random.rand(self.n_dim) + else: + from ....math.ndarray import to_numpy_array + + embedding = to_numpy_array(embedding) + + if embedding.ndim > 1: + embedding = np.asarray(embedding).squeeze() + + if np.all(embedding == 0): + embedding = embedding + EPSILON + return embedding.tolist() diff --git a/docarray/array/storage/qdrant/find.py b/docarray/array/storage/qdrant/find.py index b3e08c2bf31..6e0925af0e9 100644 --- a/docarray/array/storage/qdrant/find.py +++ 
b/docarray/array/storage/qdrant/find.py @@ -7,7 +7,6 @@ List, ) -from .helper import QdrantStorageHelper from .... import Document, DocumentArray from ....math import ndarray from ....score import NamedScore @@ -50,7 +49,7 @@ def distance(self) -> 'Distance': raise NotImplementedError() def _find_similar_vectors(self, q: 'QdrantArrayType', limit=10): - query_vector = QdrantStorageHelper.embedding_to_array(q, default_dim=0) + query_vector = self._map_embedding(q) search_result = self.client.search( self.collection_name, diff --git a/docarray/array/storage/qdrant/getsetdel.py b/docarray/array/storage/qdrant/getsetdel.py index abad79a16ea..0938c7fbbf4 100644 --- a/docarray/array/storage/qdrant/getsetdel.py +++ b/docarray/array/storage/qdrant/getsetdel.py @@ -13,7 +13,6 @@ from docarray import Document from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin from docarray.array.storage.base.helper import Offset2ID -from docarray.array.storage.qdrant.helper import QdrantStorageHelper class GetSetDelMixin(BaseGetSetDelMixin): @@ -67,15 +66,15 @@ def _qdrant_to_document(self, qdrant_record: dict) -> 'Document': def _document_to_qdrant(self, doc: 'Document') -> 'PointStruct': return PointStruct( - id=self._qmap(doc.id), + id=self._map_id(doc.id), payload=dict(_serialized=doc.to_base64(**self.serialization_config)), - vector=QdrantStorageHelper.embedding_to_array(doc.embedding, self.n_dim), + vector=self._map_embedding(doc.embedding), ) def _get_doc_by_id(self, _id: str) -> 'Document': try: resp = self.client.http.points_api.get_point( - name=self.collection_name, id=self._qmap(_id) + name=self.collection_name, id=self._map_id(_id) ) return self._qdrant_to_document(resp.result.payload) except UnexpectedResponse as response_error: @@ -86,7 +85,7 @@ def _del_doc_by_id(self, _id: str): self.client.http.points_api.delete_points( name=self.collection_name, wait=True, - points_selector=PointIdsList(points=[self._qmap(_id)]), + 
points_selector=PointIdsList(points=[self._map_id(_id)]), ) def _set_doc_by_id(self, _id: str, value: 'Document'): diff --git a/docarray/array/storage/qdrant/helper.py b/docarray/array/storage/qdrant/helper.py index 791735794ed..24dc93cf50b 100644 --- a/docarray/array/storage/qdrant/helper.py +++ b/docarray/array/storage/qdrant/helper.py @@ -1,35 +1,5 @@ -from typing import List, TYPE_CHECKING - -import numpy as np -import scipy.sparse from qdrant_openapi_client.models.models import Distance -from docarray.math.helper import EPSILON - -if TYPE_CHECKING: - from docarray.types import ArrayType - - -class QdrantStorageHelper: - @classmethod - def embedding_to_array( - cls, embedding: 'ArrayType', default_dim: int - ) -> List[float]: - if embedding is None: - embedding = np.random.rand(default_dim) - else: - from ....math.ndarray import to_numpy_array - - embedding = to_numpy_array(embedding) - - if embedding.ndim > 1: - embedding = np.asarray(embedding).squeeze() - - if np.all(embedding == 0): - embedding = embedding + EPSILON - return embedding.tolist() - - DISTANCES = { 'cosine': Distance.COSINE, 'euclidean': Distance.EUCLID, diff --git a/docarray/array/storage/qdrant/seqlike.py b/docarray/array/storage/qdrant/seqlike.py index cc15525bea0..bea3ba119e2 100644 --- a/docarray/array/storage/qdrant/seqlike.py +++ b/docarray/array/storage/qdrant/seqlike.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import MutableSequence, Iterable, Iterator, Union +from typing import Iterable, Union from docarray import Document from qdrant_client import QdrantClient @@ -23,6 +23,10 @@ def collection_name(self) -> str: def config(self): raise NotImplementedError() + @abstractmethod + def _upload_batch(self, docs: Iterable['Document']): + raise NotImplementedError() + def __eq__(self, other): """Compare this object to the other, returns True if and only if other as the same type as self and other has the same meta information @@ -67,3 +71,8 @@ def __bool__(self): :return: 
returns true if the length of the array is larger than 0 """ return len(self) > 0 + + def extend(self, docs: Iterable['Document']): + docs = list(docs) + self._upload_batch(docs) + self._offset2ids.extend([doc.id for doc in docs]) diff --git a/docarray/array/storage/weaviate/backend.py b/docarray/array/storage/weaviate/backend.py index 13aabb03ca8..64f3103348b 100644 --- a/docarray/array/storage/weaviate/backend.py +++ b/docarray/array/storage/weaviate/backend.py @@ -23,9 +23,7 @@ from ..registry import _REGISTRY if TYPE_CHECKING: - from ....types import ( - DocumentArraySourceType, - ) + from ....types import DocumentArraySourceType, ArrayType @dataclass @@ -277,30 +275,14 @@ def _doc2weaviate_create_payload(self, value: 'Document'): :param value: document to create a payload for :return: the payload dictionary """ - if value.embedding is not None: - from ....math.ndarray import to_numpy_array - - embedding = to_numpy_array(value.embedding) - - if embedding.ndim > 1: - embedding = np.asarray(embedding).squeeze() - - # Weaviate expects vector to have dim 2 at least - # or get weaviate.exceptions.UnexpectedStatusCodeException: models.C11yVector - # hence we cast it to list of a single element - if len(embedding) == 1: - embedding = [embedding[0]] - else: - embedding = None - return dict( data_object={'_serialized': value.to_base64(**self._serialize_config)}, class_name=self._class_name, - uuid=self._wmap(value.id), - vector=embedding, + uuid=self._map_id(value.id), + vector=self._map_embedding(value.embedding), ) - def _wmap(self, doc_id: str): + def _map_id(self, doc_id: str): """the function maps doc id to weaviate id :param doc_id: id of the document @@ -312,6 +294,24 @@ def _wmap(self, doc_id: str): # daw2[0, 'text'] == 'hi' # this will be False if we don't append class name return str(uuid.uuid5(uuid.NAMESPACE_URL, doc_id + self._class_name)) + def _map_embedding(self, embedding: 'ArrayType'): + if embedding is not None: + from ....math.ndarray import 
to_numpy_array + + embedding = to_numpy_array(embedding) + + if embedding.ndim > 1: + embedding = np.asarray(embedding).squeeze() + + # Weaviate expects vector to have dim 2 at least + # or get weaviate.exceptions.UnexpectedStatusCodeException: models.C11yVector + # hence we cast it to list of a single element + if len(embedding) == 1: + embedding = [embedding[0]] + else: + embedding = None + return embedding + def _get_storage_infos(self) -> Dict: return { 'Backend': 'Weaviate', diff --git a/docarray/array/storage/weaviate/getsetdel.py b/docarray/array/storage/weaviate/getsetdel.py index f3a0da4af0e..7018e22cbcc 100644 --- a/docarray/array/storage/weaviate/getsetdel.py +++ b/docarray/array/storage/weaviate/getsetdel.py @@ -1,3 +1,5 @@ +from typing import Iterable, Dict + from ..base.getsetdel import BaseGetSetDelMixin from ..base.helper import Offset2ID from .... import Document @@ -28,9 +30,9 @@ def _get_doc_by_id(self, _id: str) -> 'Document': :param _id: the id of the document :return: the retrieved document from weaviate """ - return self._getitem(self._wmap(_id)) + return self._getitem(self._map_id(_id)) - def _set_doc_by_id(self, _id: str, value: 'Document'): + def _set_doc_by_id(self, _id: str, value: 'Document', flush: bool = True): """Concrete implementation of base class' ``_set_doc_by_id`` :param _id: the id of doc to update @@ -40,16 +42,25 @@ def _set_doc_by_id(self, _id: str, value: 'Document'): self._del_doc_by_id(_id) payload = self._doc2weaviate_create_payload(value) - if self._client.data_object.exists(payload['uuid']): - self._client.data_object.delete(payload['uuid']) - self._client.data_object.create(**payload) + self._client.batch.add_data_object(**payload) + if flush: + self._client.batch.flush() + + def _set_docs_by_ids(self, ids, docs: Iterable['Document'], mismatch_ids: Dict): + """Overridden implementation of _set_docs_by_ids in order to add docs in batches and flush at the end + + :param ids: the ids used for indexing + """ + for _id, 
doc in zip(ids, docs): + self._set_doc_by_id(_id, doc, flush=False) + self._client.batch.flush() def _del_doc_by_id(self, _id: str): """Concrete implementation of base class' ``_del_doc_by_id`` :param _id: the id of the document to delete """ - wid = self._wmap(_id) + wid = self._map_id(_id) if self._client.data_object.exists(wid): self._client.data_object.delete(wid) diff --git a/docarray/array/storage/weaviate/seqlike.py b/docarray/array/storage/weaviate/seqlike.py index de96d3644e7..c2f7a0525a7 100644 --- a/docarray/array/storage/weaviate/seqlike.py +++ b/docarray/array/storage/weaviate/seqlike.py @@ -48,9 +48,9 @@ def __contains__(self, x: Union[str, 'Document']): :return: True if ``x`` is contained in self """ if isinstance(x, str): - return self._client.data_object.exists(self._wmap(x)) + return self._client.data_object.exists(self._map_id(x)) elif isinstance(x, Document): - return self._client.data_object.exists(self._wmap(x.id)) + return self._client.data_object.exists(self._map_id(x.id)) else: return False diff --git a/docs/advanced/document-store/qdrant.md b/docs/advanced/document-store/qdrant.md index 5ba409a1d80..64d3987f0e8 100644 --- a/docs/advanced/document-store/qdrant.md +++ b/docs/advanced/document-store/qdrant.md @@ -17,7 +17,7 @@ services: qdrant: image: qdrant/qdrant:v0.5.1 ports: - - 6333:6333 + - "6333:6333" ulimits: # Only required for tests, as there are a lot of collections created nofile: soft: 65535 diff --git a/docs/advanced/document-store/weaviate.md b/docs/advanced/document-store/weaviate.md index e9614e1ca8c..176f08a6a61 100644 --- a/docs/advanced/document-store/weaviate.md +++ b/docs/advanced/document-store/weaviate.md @@ -15,15 +15,15 @@ version: '3.4' services: weaviate: command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http + - --host + - 0.0.0.0 + - --port + - '8080' + - --scheme + - http image: semitechnologies/weaviate:1.10.0 ports: - - 8080:8080 + - "8080:8080" restart: on-failure:0 environment: 
QUERY_DEFAULTS_LIMIT: 25 diff --git a/docs/fundamentals/document/attribute.md b/docs/fundamentals/document/attribute.md index 3fa0797da3e..099427c8292 100644 --- a/docs/fundamentals/document/attribute.md +++ b/docs/fundamentals/document/attribute.md @@ -44,11 +44,11 @@ Among all attributes, content attributes, namely `.text`, `.tensor`, and `.blob` They correspond to string-like data (e.g. for natural language), `ndarray`-like data (e.g. for image/audio/video data), and binary data for general purpose, respectively. -| Attribute | Accept type | Use case | -| --- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| --- | -| `doc.text` | Python string | Contain text | -| `doc.tensor` | A Python (nested) list/tuple of numbers, Numpy `ndarray`, SciPy sparse matrix (`spmatrix`), TensorFlow dense & sparse tensor, PyTorch dense & sparse tensor, PaddlePaddle dense tensor | Contain image/video/audio | -| `doc.blob` | Binary string | Contain intermediate IO buffer | +| Attribute | Accept type | Use case | +|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------| +| `doc.text` | Python string | Contain text | +| `doc.tensor` | A Python (nested) list/tuple of numbers, Numpy `ndarray`, SciPy sparse matrix (`spmatrix`), TensorFlow dense & sparse tensor, PyTorch dense & sparse tensor, PaddlePaddle dense tensor | Contain image/video/audio | +| `doc.blob` | Binary string | Contain intermediate IO buffer | Each Document can contain only one type of content. That means these three attributes are mutually exclusive. 
Let's see an example: diff --git a/docs/fundamentals/document/nested.md b/docs/fundamentals/document/nested.md index 5cfd405d596..62b8ecaccbe 100644 --- a/docs/fundamentals/document/nested.md +++ b/docs/fundamentals/document/nested.md @@ -6,12 +6,12 @@ Document can be nested both horizontally and vertically via `.matches` and `.chu ```{figure} images/nested-structure.svg ``` -| Attribute | Description | -| --- |-------------------------------------------------------------------------------------------------| -| `doc.chunks` | The list of sub-Documents of this Document. They have `granularity + 1` but same `adjacency` | -| `doc.matches` | The list of matched Documents of this Document. They have `adjacency + 1` but same `granularity` | -| `doc.granularity` | The "depth" of the nested chunks structure | -| `doc.adjacency` | The "width" of the nested match structure | +| Attribute | Description | +|-------------------|--------------------------------------------------------------------------------------------------| +| `doc.chunks` | The list of sub-Documents of this Document. They have `granularity + 1` but same `adjacency` | +| `doc.matches` | The list of matched Documents of this Document. 
They have `adjacency + 1` but same `granularity` | +| `doc.granularity` | The "depth" of the nested chunks structure | +| `doc.adjacency` | The "width" of the nested match structure | You can add **chunks** (sub-Document) and **matches** (neighbour-Document) to a Document: diff --git a/docs/fundamentals/documentarray/access-attributes.md b/docs/fundamentals/documentarray/access-attributes.md index a4c3cd19e7f..38734f88857 100644 --- a/docs/fundamentals/documentarray/access-attributes.md +++ b/docs/fundamentals/documentarray/access-attributes.md @@ -23,7 +23,7 @@ As in element selector, one can use attribute selector to **get/set/delete** att | `da[1:3, ('id', 'scores')]` | a list of two list, first is all `.id` from the first three Documents, second is all `.scores` from the first three Documents | | `da[:, 'scores__cosine__value']` | all `.scores['cosine'].value` from the first three Documents | | `da[1:3, 'embedding']`, `da[1:3].embeddings` | a NdArray-like object of the first three Documents embeddings | -| `da[:, 'tensor']`, `da.tensors` | a NdArray-like object of the all top-level Documents tensors | +| `da[:, 'tensor']`, `da.tensors` | a NdArray-like object of the all top-level Documents tensors | Let's see an example. diff --git a/docs/fundamentals/documentarray/access-elements.md b/docs/fundamentals/documentarray/access-elements.md index 9b83d8738ba..07de33980b2 100644 --- a/docs/fundamentals/documentarray/access-elements.md +++ b/docs/fundamentals/documentarray/access-elements.md @@ -7,14 +7,14 @@ If it is just a `list` and you can only access elements via `[1]`, `[-1]`, `[1:3 The table below summarizes all indexing routines that DocumentArray supports. You can use them to **get, set, and delete** items in DocumentArray. 
-| Indexing routine | Example | Return | -|---------------------|------------------------------------------------------------------------------|----------------| -| by integer | `da[1]`, `da[-1]` | Document | -| by integers | `da[1,2,3]` | DocumentArray | -| by slice | `da[1:10:2]`, `da[5:]` | DocumentArray | -| by `id` | `da['a04633546e6211ec8ad31e008a366d49']` | Document | -| by `id`s | `da['a04633546e6211ec8ad31e008a366d49', 'af7923406e6211ecbc811e008a366d49']` | DocumentArray | -| by boolean mask | `da[True, False, True, False] ` | DocumentArray | +| Indexing routine | Example | Return | +|---------------------|------------------------------------------------------------------------------|---------------| +| by integer | `da[1]`, `da[-1]` | Document | +| by integers | `da[1,2,3]` | DocumentArray | +| by slice | `da[1:10:2]`, `da[5:]` | DocumentArray | +| by `id` | `da['a04633546e6211ec8ad31e008a366d49']` | Document | +| by `id`s | `da['a04633546e6211ec8ad31e008a366d49', 'af7923406e6211ecbc811e008a366d49']` | DocumentArray | +| by boolean mask | `da[True, False, True, False] ` | DocumentArray | | by Ellipsis | `da[...]` | DocumentArray | | by nested structure | `da['@cm,m,c']`, `da['@c1:3m']` | DocumentArray | diff --git a/docs/fundamentals/documentarray/evaluation.md b/docs/fundamentals/documentarray/evaluation.md index 01a05b38462..65a9be846fa 100644 --- a/docs/fundamentals/documentarray/evaluation.md +++ b/docs/fundamentals/documentarray/evaluation.md @@ -10,17 +10,17 @@ The results are stored in `.evaluations` field of each Document. DocArray provides some common metrics used in the information retrieval community that allows one to evaluate the nearest-neighbour matches. 
Different metric accepts different arguments as `kwargs`: -| Metric | Accept `kwargs` | -|---------------------|-------------------------| -| `r_precision` | None | -| `average_precision` | None | -| `reciprocal_rank` | None | -| `precision_at_k` | `k` | -| `hit_at_k` | `k` | -| `recall_at_k` | `max_rel`, `k` | -| `f1_score_at_k` | `max_rel`, `k` | -| `dcg_at_k` | `method`, `k` | -| `ndcg_at_k` | `method`, `k` | +| Metric | Accept `kwargs` | +|---------------------|------------------| +| `r_precision` | None | +| `average_precision` | None | +| `reciprocal_rank` | None | +| `precision_at_k` | `k` | +| `hit_at_k` | `k` | +| `recall_at_k` | `max_rel`, `k` | +| `f1_score_at_k` | `max_rel`, `k` | +| `dcg_at_k` | `method`, `k` | +| `ndcg_at_k` | `method`, `k` | For example, let's create a DocumentArray with random embeddings and matching it to itself: diff --git a/docs/fundamentals/documentarray/matching.md b/docs/fundamentals/documentarray/matching.md index afa2e023f42..408ecf44d11 100644 --- a/docs/fundamentals/documentarray/matching.md +++ b/docs/fundamentals/documentarray/matching.md @@ -106,13 +106,13 @@ match emb = (0, 0) 1.0 The following metrics are supported: -| Metric | Frameworks | -|----------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------| +| Metric | Frameworks | +|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------| | `cosine` | Scipy, Numpy, Tensorflow, Pytorch, Paddle | | `sqeuclidean` | Scipy, Numpy, Tensorflow, Pytorch, Paddle | | `euclidean` | Scipy, Numpy, Tensorflow, Pytorch, Paddle | -| [Metrics supported by Scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html) | Scipy | -| User defined callable | Depending on the callable | +| [Metrics supported by 
Scipy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html) | Scipy | +| User defined callable | Depending on the callable | Note that framework is auto-chosen based on the type of `.embeddings`. For example, if `.embeddings` is a Tensorflow Tensor, then Tensorflow will be used for computing. One exception is when `.embeddings` is a Numpy `ndarray`, you can choose to use Numpy or Scipy (by specify `.match(..., use_scipy=True)`) for computing. diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md index a5fc2d75b5f..44b45ae1e29 100644 --- a/docs/fundamentals/documentarray/serialization.md +++ b/docs/fundamentals/documentarray/serialization.md @@ -119,12 +119,12 @@ If you go with default `protcol` and `compress` settings, you can simply use `by The table below summarize the supported serialization protocols and compressions: -| `protocol=...` | Description | Remarks | -|--------------------------|-----------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------| -| `pickle-array` (default) | Serialize the whole array in one-shot using Python `pickle` | Often fastest. Not portable to other languages. Insecure in production. | +| `protocol=...` | Description | Remarks | +|--------------------------|------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------| +| `pickle-array` (default) | Serialize the whole array in one-shot using Python `pickle` | Often fastest. Not portable to other languages. Insecure in production. | | `protobuf-array` | Serialize the whole array using [`DocumentArrayProto`](../../../proto/#docarray.DocumentArrayProto). 
| Portable to other languages if they implement `DocumentArrayProto`. 2GB max-size (pre-compression) restriction by Protobuf. | -| `pickle` | Serialize elements one-by-one using Python `pickle`. | Allow streaming. Not portable to other languages. Insecure in production. | -| `protobuf` | Serialize elements one-by-one using [`DocumentProto`](../../../proto/#docarray.DocumentProto). | Allow streaming. Portable to other languages if they implement `DocumentProto`. No max-size restriction | +| `pickle` | Serialize elements one-by-one using Python `pickle`. | Allow streaming. Not portable to other languages. Insecure in production. | +| `protobuf` | Serialize elements one-by-one using [`DocumentProto`](../../../proto/#docarray.DocumentProto). | Allow streaming. Portable to other languages if they implement `DocumentProto`. No max-size restriction | For compressions, the following algorithms are supported: `lz4`, `bz2`, `lzma`, `zlib`, `gzip`. The most frequently used ones are `lz4` (fastest) and `gzip` (most widely used). 
"""Tests that updates made through a ``DocumentArray`` are visible afterwards.

Every test is parametrized over several storage backends.  The weaviate and
qdrant services are published on ports 8080 and 6333 by the sibling
``docker-compose.yml`` (port mappings written as the quoted strings
``"8080:8080"`` and ``"6333:6333"``).  They are presumably brought up by the
``start_storage`` fixture, which is defined outside this module -- TODO confirm.
"""
import numpy as np
import pytest

from docarray import DocumentArray, Document
from docarray.math.ndarray import to_numpy_array


def _storage_cases(with_memory=True):
    """Return the ``storage,config`` parametrization used by the tests below.

    The case list is rebuilt on every call so that no config dict is shared
    between two tests.

    :param with_memory: when true, the ``memory`` backend is placed first.
    """
    cases = [
        ('sqlite', None),
        ('weaviate', {'n_dim': 2}),
        ('pqlite', {'n_dim': 2}),
        ('qdrant', {'n_dim': 2}),
    ]
    if with_memory:
        cases.insert(0, ('memory', None))
    return pytest.mark.parametrize('storage,config', cases)


def _make_da(docs, storage, config):
    """Create a ``DocumentArray`` over ``docs``, passing ``config`` only if truthy."""
    if config:
        return DocumentArray(docs, storage=storage, config=config)
    return DocumentArray(docs, storage=storage)


def _assert_ranking(da, *expected):
    """Query ``da`` with ``[1, 9]`` and check the leading result ids, in order."""
    matches = da.find(np.array([1, 9]))
    for rank, doc in enumerate(expected):
        assert matches[rank].id == doc.id


@pytest.fixture(scope='function')
def docs():
    """Yield three Documents with embeddings [10, 0], [0, 10] and [-10, -10]."""
    vectors = ([10, 0], [0, 10], [-10, -10])
    yield tuple(Document(embedding=np.array(vec)) for vec in vectors)


@_storage_cases()
def test_update_embedding(docs, storage, config, start_storage):
    """Setting one embedding via ``da[0, 'embedding']`` changes the search ranking."""
    first, second, third = docs
    da = _make_da(docs, storage, config)

    # Before the update the second document is expected to rank first.
    _assert_ranking(da, second, first, third)

    da[0, 'embedding'] = np.array([1.1, 9.1])

    _assert_ranking(da, first, second, third)

    stored = to_numpy_array(da[0].embedding)
    np.testing.assert_almost_equal(stored, np.array([1.1, 9.1]))


@_storage_cases()
def test_update_doc_embedding(docs, storage, config, start_storage):
    """Replacing a Document (same id, new embedding) changes the search ranking."""
    first, second, third = docs
    da = _make_da(docs, storage, config)

    _assert_ranking(da, second, first, third)

    replacement = Document(id=docs[0].id, embedding=np.array([1.1, 9.1]))
    da[0] = replacement

    _assert_ranking(da, first, second, third)

    stored = to_numpy_array(da[0].embedding)
    np.testing.assert_almost_equal(stored, np.array([1.1, 9.1]))


@_storage_cases()
def test_batch_update_embedding(docs, storage, config, start_storage):
    """Assigning all embeddings at once via ``da[:, 'embedding']`` is searchable."""
    first, second, third = docs
    da = _make_da(docs, storage, config)

    _assert_ranking(da, second, first, third)

    # The first two documents trade embeddings; the third keeps its own.
    da[:, 'embedding'] = np.array([[0, 10], [10, 0], [-10, -10]])

    _assert_ranking(da, first, second, third)

    stored = to_numpy_array(da[0].embedding)
    np.testing.assert_almost_equal(stored, np.array([0, 10]))


@_storage_cases()
def test_batch_update_doc_embedding(docs, storage, config, start_storage):
    """Replacing a slice of Documents (same ids, new embeddings) is searchable."""
    first, second, third = docs
    da = _make_da(docs, storage, config)

    _assert_ranking(da, second, first, third)

    # The first two documents keep their ids but trade embeddings.
    swapped = [
        Document(id=docs[0].id, embedding=np.array([0, 10])),
        Document(id=docs[1].id, embedding=np.array([10, 0])),
    ]
    da[:2] = swapped

    _assert_ranking(da, first, second, third)

    stored = to_numpy_array(da[0].embedding)
    np.testing.assert_almost_equal(stored, np.array([0, 10]))


# NOTE(review): 'memory' is not among the cases here; presumably the in-memory
# store holds the very same Document objects, so the membership check below
# would not apply -- confirm.
@_storage_cases(with_memory=False)
def test_update_id(docs, storage, config, start_storage):
    """Overwriting one id via ``da[0, 'id']`` removes the original document."""
    da = _make_da(docs, storage, config)

    fresh_id = Document(embedding=np.random.rand(2)).id
    da[0, 'id'] = fresh_id

    assert docs[0] not in da


@_storage_cases()
def test_update_doc_id(docs, storage, config, start_storage):
    """Replacing position 0 with a new Document removes the original one."""
    da = _make_da(docs, storage, config)

    newcomer = Document(embedding=np.random.rand(2))
    da[0] = newcomer

    assert docs[0] not in da


# NOTE(review): 'memory' is not among the cases here either; see the note on
# test_update_id -- confirm.
@_storage_cases(with_memory=False)
def test_batch_update_id(docs, storage, config, start_storage):
    """Overwriting every id via ``da[:, 'id']`` removes all original documents."""
    da = _make_da(docs, storage, config)

    fresh_ids = [Document(embedding=np.random.rand(2)).id for _ in range(3)]
    da[:, 'id'] = fresh_ids

    for original in docs:
        assert original not in da
@pytest.mark.parametrize(
    'storage,config',
    [
        ('memory', None),
        ('sqlite', None),
        ('weaviate', {'n_dim': 2}),
        ('pqlite', {'n_dim': 2}),
        ('qdrant', {'n_dim': 2}),
    ],
)
def test_batch_update_doc_id(docs, storage, config, start_storage):
    """Replacing every element with new Documents removes all original ones.

    The replacement Documents are built without an explicit id, each with a
    random 2-dimensional embedding.
    """
    # Forward ``config`` only when one is given for the backend.
    init_kwargs = {'storage': storage}
    if config:
        init_kwargs['config'] = config
    da = DocumentArray(docs, **init_kwargs)

    replacements = [Document(embedding=np.random.rand(2)) for _ in range(3)]
    da[:] = replacements

    # None of the three documents from the fixture may remain in the array.
    for original in docs:
        assert original not in da