From 7879f83f91af581dc8d3992ec04a30ea1100f19d Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Wed, 5 Oct 2022 16:14:39 +0200 Subject: [PATCH 01/14] feat: update qdrant to version 0.10.1 --- docarray/array/storage/qdrant/backend.py | 89 +++++++++++++--------- docarray/array/storage/qdrant/find.py | 2 +- docarray/array/storage/qdrant/getsetdel.py | 57 +++++++------- docarray/array/storage/qdrant/seqlike.py | 14 ++-- docs/advanced/document-store/qdrant.md | 8 +- scripts/docker-compose.yml | 5 +- setup.py | 2 +- tests/unit/array/docker-compose.yml | 3 +- 8 files changed, 101 insertions(+), 79 deletions(-) diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index 5561def8801..9ac0b517edf 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -1,4 +1,5 @@ import uuid +from abc import abstractmethod from dataclasses import dataclass, field, asdict from typing import ( Optional, @@ -18,6 +19,7 @@ PointsList, PointStruct, HnswConfigDiff, + VectorParams, ) from docarray import Document @@ -37,6 +39,10 @@ class QdrantConfig: collection_name: Optional[str] = None host: Optional[str] = field(default="localhost") port: Optional[int] = field(default=6333) + grpc_port: Optional[int] = field(default=6334) + prefer_grpc: Optional[bool] = field(default=True) + api_key: Optional[str] = field(default=None) + https: Optional[bool] = field(default=None) serialize_config: Dict = field(default_factory=dict) scroll_batch_size: int = 64 ef_construct: Optional[int] = None @@ -46,15 +52,20 @@ class QdrantConfig: class BackendMixin(BaseBackendMixin): + @property + @abstractmethod + def client(self) -> 'QdrantClient': + raise NotImplementedError() + @classmethod def _tmp_collection_name(cls) -> str: return uuid.uuid4().hex def _init_storage( - self, - docs: Optional['DocumentArraySourceType'] = None, - config: Optional[Union[QdrantConfig, Dict]] = None, - **kwargs, + self, + docs: 
Optional['DocumentArraySourceType'] = None, + config: Optional[Union[QdrantConfig, Dict]] = None, + **kwargs, ): """Initialize qdrant storage. @@ -83,7 +94,14 @@ def _init_storage( self._n_dim = config.n_dim self._serialize_config = config.serialize_config - self._client = QdrantClient(host=config.host, port=config.port) + self._client = QdrantClient( + host=config.host, + port=config.port, + prefer_grpc=config.prefer_grpc, + grpc_port=config.grpc_port, + api_key=config.api_key, + https=config.https, + ) self._config = config self._persist = bool(self._config.collection_name) @@ -114,15 +132,15 @@ def _init_storage( self.append(docs) def _ensure_unique_config( - self, - config_root: dict, - config_subindex: dict, - config_joined: dict, - subindex_name: str, + self, + config_root: dict, + config_subindex: dict, + config_joined: dict, + subindex_name: str, ) -> dict: if 'collection_name' not in config_subindex: config_joined['collection_name'] = ( - config_joined['collection_name'] + '_subindex_' + subindex_name + config_joined['collection_name'] + '_subindex_' + subindex_name ) return config_joined @@ -133,18 +151,18 @@ def _initialize_qdrant_schema(self): full_scan_threshold=self._config.full_scan_threshold, m=self._config.m, ) - self.client.http.collections_api.create_collection( - self.collection_name, - CreateCollection( - vector_size=self.n_dim, + self.client.recreate_collection( + collection_name=self.collection_name, + vectors_config=VectorParams( + size=self.n_dim, distance=self.distance, - hnsw_config=hnsw_config, ), + hnsw_config=hnsw_config, ) def _collection_exists(self, collection_name): - resp = self.client.http.collections_api.get_collections() - collections = [collection.name for collection in resp.result.collections] + resp = self.client.get_collections() + collections = [collection.name for collection in resp.collections] return collection_name in collections @staticmethod @@ -164,33 +182,36 @@ def __getstate__(self): def __setstate__(self, state): 
self.__dict__ = state self._client = QdrantClient( - host=state['_config'].host, port=state['_config'].port + host=state['_config'].host, + port=state['_config'].port, + prefer_grpc=state['_config'].prefer_grpc, + grpc_port=state['_config'].grpc_port, + api_key=state['_config'].api_key, + https=state['_config'].https, ) def _get_offset2ids_meta(self) -> List[str]: if not self._collection_exists(self.collection_name_meta): return [] - return self.client.http.points_api.get_point( - self.collection_name_meta, id=1 - ).result.payload['offset2id'] + return self.client.retrieve(self.collection_name_meta, ids=[1])[0].payload[ + 'offset2id' + ] def _update_offset2ids_meta(self): if not self._collection_exists(self.collection_name_meta): - self.client.http.collections_api.create_collection( - self.collection_name_meta, - CreateCollection(vector_size=1, distance=Distance.COSINE), + self.client.recreate_collection( + collection_name=self.collection_name_meta, + vectors_config={}, # no vectors ) - self.client.http.points_api.upsert_points( + self.client.upsert( collection_name=self.collection_name_meta, + points=[ + PointStruct( + id=1, payload={"offset2id": self._offset2ids.ids}, vector={} + ) + ], wait=True, - point_insert_operations=PointsList( - points=[ - PointStruct( - id=1, payload={"offset2id": self._offset2ids.ids}, vector=[1] - ) - ] - ), ) def _map_embedding(self, embedding: 'ArrayType') -> List[float]: @@ -209,4 +230,4 @@ def _map_embedding(self, embedding: 'ArrayType') -> List[float]: if np.all(embedding == 0): embedding = embedding + EPSILON - return embedding.tolist() + return embedding.tolist() \ No newline at end of file diff --git a/docarray/array/storage/qdrant/find.py b/docarray/array/storage/qdrant/find.py index dbfc348721c..0ed9dad8ced 100644 --- a/docarray/array/storage/qdrant/find.py +++ b/docarray/array/storage/qdrant/find.py @@ -60,7 +60,7 @@ def _find_similar_vectors( query_vector=query_vector, query_filter=filter, search_params=None, - top=limit, + 
limit=limit, append_payload=['_serialized'], ) diff --git a/docarray/array/storage/qdrant/getsetdel.py b/docarray/array/storage/qdrant/getsetdel.py index 17e5194ca49..bf448b0cfc4 100644 --- a/docarray/array/storage/qdrant/getsetdel.py +++ b/docarray/array/storage/qdrant/getsetdel.py @@ -5,9 +5,8 @@ from qdrant_client.http.exceptions import UnexpectedResponse from qdrant_client.http.models.models import ( PointIdsList, - PointsList, - ScrollRequest, PointStruct, + VectorParams, ) from docarray import Document @@ -46,17 +45,17 @@ def _upload_batch(self, docs: Iterable['Document']): for doc in docs: batch.append(self._document_to_qdrant(doc)) if len(batch) > self.scroll_batch_size: - self.client.http.points_api.upsert_points( + self.client.upsert( collection_name=self.collection_name, + points=batch, wait=True, - point_insert_operations=PointsList(points=batch), ) batch = [] if len(batch) > 0: - self.client.http.points_api.upsert_points( + self.client.upsert( collection_name=self.collection_name, wait=True, - point_insert_operations=PointsList(points=batch), + points=batch, ) def _qdrant_to_document(self, qdrant_record: dict) -> 'Document': @@ -79,49 +78,47 @@ def _document_to_qdrant(self, doc: 'Document') -> 'PointStruct': def _get_doc_by_id(self, _id: str) -> 'Document': try: - resp = self.client.http.points_api.get_point( - collection_name=self.collection_name, id=self._map_id(_id) + resp = self.client.retrieve( + collection_name=self.collection_name, ids=[self._map_id(_id)] ) - return self._qdrant_to_document(resp.result.payload) + if len(resp) == 0: + raise KeyError(_id) + return self._qdrant_to_document(resp[0].payload) except UnexpectedResponse as response_error: if response_error.status_code in [404, 400]: raise KeyError(_id) def _del_doc_by_id(self, _id: str): - self.client.http.points_api.delete_points( + self.client.delete( collection_name=self.collection_name, - wait=True, points_selector=PointIdsList(points=[self._map_id(_id)]), + wait=True, ) def 
_set_doc_by_id(self, _id: str, value: 'Document'): if _id != value.id: self._del_doc_by_id(_id) - self.client.http.points_api.upsert_points( + self.client.upsert( collection_name=self.collection_name, wait=True, - point_insert_operations=PointsList( - points=[self._document_to_qdrant(value)] - ), + points=[self._document_to_qdrant(value)], ) def scan(self) -> Iterator['Document']: offset = None while True: - response = self.client.http.points_api.scroll_points( + response, next_page = self.client.scroll( collection_name=self.collection_name, - scroll_request=ScrollRequest( - offset=offset, - limit=self.scroll_batch_size, - with_payload=['_serialized'], - with_vector=False, - ), + offset=offset, + limit=self.scroll_batch_size, + with_payload=['_serialized'], + with_vectors=False, ) - for point in response.result.points: + for point in response: yield self._qdrant_to_document(point.payload) - if response.result.next_page_offset: - offset = response.result.next_page_offset + if next_page: + offset = next_page else: break @@ -133,8 +130,10 @@ def _save_offset2ids(self): self._update_offset2ids_meta() def _clear_storage(self): - self._client.recreate_collection( + self.client.recreate_collection( self.collection_name, - vector_size=self.n_dim, - distance=self.distance, - ) + vectors_config=VectorParams( + size=self.n_dim, + distance=self.distance, + ), + ) \ No newline at end of file diff --git a/docarray/array/storage/qdrant/seqlike.py b/docarray/array/storage/qdrant/seqlike.py index 7ded158bc4d..a800694fa93 100644 --- a/docarray/array/storage/qdrant/seqlike.py +++ b/docarray/array/storage/qdrant/seqlike.py @@ -36,16 +36,14 @@ def __eq__(self, other): """ # two DAW are considered as the same if they have the same client meta data return ( - type(self) is type(other) - and self.client.openapi_client.client.host - == other.openapi_client.client.host - and self.config == other.config + type(self) is type(other) + and self.client.openapi_client.client.host + == 
other.openapi_client.client.host + and self.config == other.config ) def __len__(self): - return self.client.http.collections_api.get_collection( - self.collection_name - ).result.vectors_count + return self.client.get_collection(self.collection_name).points_count def __contains__(self, x: Union[str, 'Document']): if isinstance(x, str): @@ -68,4 +66,4 @@ def __repr__(self): def _extend(self, docs: Iterable['Document'], **kwargs): docs = list(docs) self._upload_batch(docs) - self._offset2ids.extend([doc.id for doc in docs]) + self._offset2ids.extend([doc.id for doc in docs]) \ No newline at end of file diff --git a/docs/advanced/document-store/qdrant.md b/docs/advanced/document-store/qdrant.md index d5a8b1b35c5..5c8378d8a24 100644 --- a/docs/advanced/document-store/qdrant.md +++ b/docs/advanced/document-store/qdrant.md @@ -19,9 +19,10 @@ server. Create `docker-compose.yml` as follows: version: '3.4' services: qdrant: - image: qdrant/qdrant:v0.7.0 + image: qdrant/qdrant:v0.10.1 ports: - "6333:6333" + - "6334:6334" ulimits: # Only required for tests, as there are a lot of collections created nofile: soft: 65535 @@ -98,9 +99,10 @@ Create `docker-compose.yml`: version: '3.4' services: qdrant: - image: qdrant/qdrant:v0.7.0 + image: qdrant/qdrant:v0.10.1 ports: - "6333:6333" + - "6334:6334" ulimits: # Only required for tests, as there are a lot of collections created nofile: soft: 65535 @@ -205,4 +207,4 @@ Embeddings Nearest Neighbours with "price" at most 7: embedding=[6. 6. 6.], price=6 embedding=[5. 5. 5.], price=5 embedding=[4. 4. 
4.], price=4 -``` +``` \ No newline at end of file diff --git a/scripts/docker-compose.yml b/scripts/docker-compose.yml index 4d917761b60..1d1e73d24bd 100644 --- a/scripts/docker-compose.yml +++ b/scripts/docker-compose.yml @@ -10,9 +10,10 @@ services: AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' PERSISTENCE_DATA_PATH: '/var/lib/weaviate' qdrant: - image: qdrant/qdrant:v0.7.0 + image: qdrant/qdrant:v0.10.1 ports: - - "41233:41233" + - "41237:41237" + - "41238:41238" ulimits: # Only required for tests, as there are a lot of collections created nofile: soft: 65535 diff --git a/setup.py b/setup.py index 3124418193d..7e84b7d4570 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ 'strawberry-graphql', ], 'qdrant': [ - 'qdrant-client~=0.7.3', + 'qdrant-client~=0.10.3', ], 'annlite': [ 'annlite>=0.3.12', diff --git a/tests/unit/array/docker-compose.yml b/tests/unit/array/docker-compose.yml index 07de4842154..ddc76f3f1cf 100644 --- a/tests/unit/array/docker-compose.yml +++ b/tests/unit/array/docker-compose.yml @@ -10,9 +10,10 @@ services: AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' PERSISTENCE_DATA_PATH: '/var/lib/weaviate' qdrant: - image: qdrant/qdrant:v0.7.0 + image: qdrant/qdrant:v0.10.1 ports: - "6333:6333" + - "6334:6334" ulimits: # Only required for tests, as there are a lot of collections created nofile: soft: 65535 From 0d066e0d3d70e858aba4a5e24a7526223dccfc1c Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Wed, 5 Oct 2022 16:37:43 +0200 Subject: [PATCH 02/14] fix: set prefer_grpc False --- docarray/array/storage/qdrant/backend.py | 24 +++++++++++----------- docarray/array/storage/qdrant/getsetdel.py | 2 +- docarray/array/storage/qdrant/seqlike.py | 10 ++++----- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index 9ac0b517edf..b35f187588c 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ 
-40,7 +40,7 @@ class QdrantConfig: host: Optional[str] = field(default="localhost") port: Optional[int] = field(default=6333) grpc_port: Optional[int] = field(default=6334) - prefer_grpc: Optional[bool] = field(default=True) + prefer_grpc: Optional[bool] = field(default=False) api_key: Optional[str] = field(default=None) https: Optional[bool] = field(default=None) serialize_config: Dict = field(default_factory=dict) @@ -62,10 +62,10 @@ def _tmp_collection_name(cls) -> str: return uuid.uuid4().hex def _init_storage( - self, - docs: Optional['DocumentArraySourceType'] = None, - config: Optional[Union[QdrantConfig, Dict]] = None, - **kwargs, + self, + docs: Optional['DocumentArraySourceType'] = None, + config: Optional[Union[QdrantConfig, Dict]] = None, + **kwargs, ): """Initialize qdrant storage. @@ -132,15 +132,15 @@ def _init_storage( self.append(docs) def _ensure_unique_config( - self, - config_root: dict, - config_subindex: dict, - config_joined: dict, - subindex_name: str, + self, + config_root: dict, + config_subindex: dict, + config_joined: dict, + subindex_name: str, ) -> dict: if 'collection_name' not in config_subindex: config_joined['collection_name'] = ( - config_joined['collection_name'] + '_subindex_' + subindex_name + config_joined['collection_name'] + '_subindex_' + subindex_name ) return config_joined @@ -230,4 +230,4 @@ def _map_embedding(self, embedding: 'ArrayType') -> List[float]: if np.all(embedding == 0): embedding = embedding + EPSILON - return embedding.tolist() \ No newline at end of file + return embedding.tolist() diff --git a/docarray/array/storage/qdrant/getsetdel.py b/docarray/array/storage/qdrant/getsetdel.py index bf448b0cfc4..b0974816851 100644 --- a/docarray/array/storage/qdrant/getsetdel.py +++ b/docarray/array/storage/qdrant/getsetdel.py @@ -136,4 +136,4 @@ def _clear_storage(self): size=self.n_dim, distance=self.distance, ), - ) \ No newline at end of file + ) diff --git a/docarray/array/storage/qdrant/seqlike.py 
b/docarray/array/storage/qdrant/seqlike.py index a800694fa93..92d068997e8 100644 --- a/docarray/array/storage/qdrant/seqlike.py +++ b/docarray/array/storage/qdrant/seqlike.py @@ -36,10 +36,10 @@ def __eq__(self, other): """ # two DAW are considered as the same if they have the same client meta data return ( - type(self) is type(other) - and self.client.openapi_client.client.host - == other.openapi_client.client.host - and self.config == other.config + type(self) is type(other) + and self.client.openapi_client.client.host + == other.openapi_client.client.host + and self.config == other.config ) def __len__(self): @@ -66,4 +66,4 @@ def __repr__(self): def _extend(self, docs: Iterable['Document'], **kwargs): docs = list(docs) self._upload_batch(docs) - self._offset2ids.extend([doc.id for doc in docs]) \ No newline at end of file + self._offset2ids.extend([doc.id for doc in docs]) From 460d1682cd3c8c811e44c78f1b4fecb1ee1a764d Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 28 Sep 2022 13:29:26 +0100 Subject: [PATCH 03/14] fix: free paddle version in tests --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e51183656d3..23c22e20c48 100644 --- a/setup.py +++ b/setup.py @@ -93,7 +93,7 @@ 'pytest-custom_exit_code', 'black==22.3.0', 'tensorflow==2.7.0', - 'paddlepaddle==2.2.0', + 'paddlepaddle', 'torch==1.9.0', 'torchvision==0.10.0', 'datasets', From 59f9893d27e46f979463a41221cef130e597bffd Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 28 Sep 2022 14:31:48 +0100 Subject: [PATCH 04/14] test: install protobuf <==3.20.0 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 23c22e20c48..b3ceb027e4c 100644 --- a/setup.py +++ b/setup.py @@ -83,6 +83,7 @@ 'seaborn', ], 'test': [ + 'protobuf>=3.13.0,<=3.20.0', # pip dependency resolution does not respect this restriction from paddle 'pytest', 'pytest-timeout', 'pytest-mock', From 
f4b16ef2a6e6cb560f4933b6928492ddbba825eb Mon Sep 17 00:00:00 2001 From: AlaeddineAbdessalem Date: Fri, 4 Nov 2022 09:45:36 +0100 Subject: [PATCH 05/14] test: unpin rocksdict in tests (#721) --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 6a18f1f848c..fe6d0e5c660 100644 --- a/setup.py +++ b/setup.py @@ -106,7 +106,6 @@ 'elasticsearch>=8.2.0', 'redis>=4.3.0', 'jina', - 'rocksdict<=0.2.16', ], }, classifiers=[ From a6784d0c9bfbbd9ebedc0cd69a39bf4450576284 Mon Sep 17 00:00:00 2001 From: samsja <55492238+samsja@users.noreply.github.com> Date: Fri, 4 Nov 2022 16:11:42 +0100 Subject: [PATCH 06/14] chore: fix type gov (#723) --- GOVERNANCE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GOVERNANCE.md b/GOVERNANCE.md index bbad6eb5f08..1801a0cd16b 100644 --- a/GOVERNANCE.md +++ b/GOVERNANCE.md @@ -28,9 +28,9 @@ Project releases will occur on a scheduled basis as agreed to by the committers. # Communication -This project, just like all of open source, is a global community. In addition to the [Code of Conduct](./.github/CODE_OF_CONDUCT.md), this project will: +This project, just like all open source, is a global community. In addition to the [Code of Conduct](./.github/CODE_OF_CONDUCT.md), this project will: -* Keep all communucation on open channels ( mailing list, forums, chat ). +* Keep all communication on open channels ( mailing list, forums, chat ). * Be respectful of time and language differences between community members ( such as scheduling meetings, email/issue responsiveness, etc ). * Ensure tools are able to be used by community members regardless of their region. 
From f475f433181275d6e54b72e5fd11122eaacdfd95 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sun, 6 Nov 2022 15:24:02 +0800 Subject: [PATCH 07/14] fix: update qdrant schema api --- docarray/array/storage/qdrant/backend.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index cfd0eff60d9..88ed5dfb07c 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -151,12 +151,11 @@ def _initialize_qdrant_schema(self): full_scan_threshold=self._config.full_scan_threshold, m=self._config.m, ) - self.client.http.collections_api.create_collection( + self.client.recreate_collection( collection_name=self.collection_name, - create_collection=CreateCollection( - vector_size=self._n_dim, - distance=DISTANCES[self._distance], - hnsw_config=hnsw_config, + vectors_config=VectorParams( + size=self.n_dim, + distance=self.distance, ), hnsw_config=hnsw_config, ) From f9355cdfe179942b50be7294770048d0a7146bb3 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sun, 6 Nov 2022 15:53:34 +0800 Subject: [PATCH 08/14] fix: convert embedding to list of float --- docarray/array/storage/qdrant/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/array/storage/qdrant/backend.py b/docarray/array/storage/qdrant/backend.py index 88ed5dfb07c..c5509e623f0 100644 --- a/docarray/array/storage/qdrant/backend.py +++ b/docarray/array/storage/qdrant/backend.py @@ -230,4 +230,5 @@ def _map_embedding(self, embedding: 'ArrayType') -> List[float]: if np.all(embedding == 0): embedding = embedding + EPSILON - return embedding.tolist() + + return embedding.astype(float).tolist() From 3c3f94cd370465d8f571cffe091f99d44f7df36d Mon Sep 17 00:00:00 2001 From: Anne Yang Date: Wed, 26 Oct 2022 14:54:50 +0800 Subject: [PATCH 09/14] feat(qdrant): pass search_params in find (#675) --- docarray/array/storage/qdrant/find.py | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/docarray/array/storage/qdrant/find.py b/docarray/array/storage/qdrant/find.py index 86ddd6470e2..3086bb286b3 100644 --- a/docarray/array/storage/qdrant/find.py +++ b/docarray/array/storage/qdrant/find.py @@ -57,8 +57,10 @@ def _find_similar_vectors( self.collection_name, query_vector=query_vector, query_filter=filter, - search_params=None, - limit=limit, + search_params=None + if not search_params + else rest.SearchParams(**search_params), + top=limit, append_payload=['_serialized'], ) From 27062269b9fc6cc6bb23ec2fd3c5d9e457cc5969 Mon Sep 17 00:00:00 2001 From: AnneY Date: Sun, 6 Nov 2022 16:29:40 +0800 Subject: [PATCH 10/14] fix: transformers need oldproto --- tests/unit/array/mixins/{ => oldproto}/test_eval_class.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/array/mixins/{ => oldproto}/test_eval_class.py (100%) diff --git a/tests/unit/array/mixins/test_eval_class.py b/tests/unit/array/mixins/oldproto/test_eval_class.py similarity index 100% rename from tests/unit/array/mixins/test_eval_class.py rename to tests/unit/array/mixins/oldproto/test_eval_class.py From 83e06255a7c0ad2469f5d397a4451d8bca7c844a Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 7 Nov 2022 11:24:04 +0800 Subject: [PATCH 11/14] fix: use limit instead of top --- docarray/array/storage/qdrant/find.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/array/storage/qdrant/find.py b/docarray/array/storage/qdrant/find.py index 3086bb286b3..4692b37d394 100644 --- a/docarray/array/storage/qdrant/find.py +++ b/docarray/array/storage/qdrant/find.py @@ -60,7 +60,7 @@ def _find_similar_vectors( search_params=None if not search_params else rest.SearchParams(**search_params), - top=limit, + limit=limit, append_payload=['_serialized'], ) From a8cde14da91d80833e967dff4af5c49d11b5ea1a Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 7 Nov 2022 14:46:45 +0800 Subject: [PATCH 12/14] ci: extent timeout-minutes for 
old-proto --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5c29ec99a19..b9e79e24fbc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -238,7 +238,7 @@ jobs: pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \ -v -s -m "not gpu" ${{ matrix.test-path }} echo "::set-output name=codecov_flag::docarray" - timeout-minutes: 30 + timeout-minutes: 40 env: JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}" - name: Check codecov file From 44922cccae99f3ba5e2a41b0a82aaaca3e6ca493 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 7 Nov 2022 18:13:27 +0800 Subject: [PATCH 13/14] test: add some grpc tests for qdrant --- tests/unit/array/test_advance_indexing.py | 51 +++++++++++++++---- .../unit/array/test_backend_configuration.py | 4 +- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py index 4586b3edede..df8e005ddc0 100644 --- a/tests/unit/array/test_advance_indexing.py +++ b/tests/unit/array/test_advance_indexing.py @@ -27,6 +27,7 @@ def indices(): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -61,6 +62,7 @@ def test_getter_int_str(docs, storage, config, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('redis', RedisConfig(n_dim=123)), ], ) @@ -90,6 +92,7 @@ def test_setter_int_str(docs, storage, config, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), 
('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -125,6 +128,7 @@ def test_del_int_str(docs, storage, config, start_storage, indices): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -164,6 +168,7 @@ def test_slice(docs, storage, config, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -211,6 +216,7 @@ def test_sequence_bool_index(docs, storage, config, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -248,6 +254,7 @@ def test_sequence_int(docs, nparray, storage, config, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -283,6 +290,7 @@ def test_sequence_str(docs, storage, config, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -304,6 +312,7 @@ def test_docarray_list_tuple(docs, storage, config, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + 
('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -344,6 +353,7 @@ def test_path_syntax_indexing(storage, config, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -441,6 +451,7 @@ def test_path_syntax_indexing_set(storage, config, use_subindex, start_storage): ('weaviate', WeaviateConfig(n_dim=123)), ('annlite', AnnliteConfig(n_dim=123)), ('qdrant', QdrantConfig(n_dim=123)), + ('qdrant', QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', ElasticConfig(n_dim=123)), ('redis', RedisConfig(n_dim=123)), ], @@ -487,6 +498,7 @@ def test_getset_subindex(storage, config, start_storage): ('weaviate', lambda: WeaviateConfig(n_dim=123)), ('annlite', lambda: AnnliteConfig(n_dim=123)), ('qdrant', lambda: QdrantConfig(n_dim=123)), + ('qdrant', lambda: QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', lambda: ElasticConfig(n_dim=123)), ('redis', lambda: RedisConfig(n_dim=123)), ], @@ -519,18 +531,27 @@ def test_attribute_indexing(storage, config_gen, start_storage, size): @pytest.mark.parametrize( - 'storage', - ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', 'redis'], + 'storage,config_gen', + [ + ('memory', None), + ('sqlite', None), + ('weaviate', lambda: WeaviateConfig(n_dim=10)), + ('annlite', lambda: AnnliteConfig(n_dim=10)), + ('qdrant', lambda: QdrantConfig(n_dim=10)), + ('qdrant', lambda: QdrantConfig(n_dim=10, prefer_grpc=True)), + ('elasticsearch', lambda: ElasticConfig(n_dim=10)), + ('redis', lambda: RedisConfig(n_dim=10)), + ], ) -def test_tensor_attribute_selector(storage, start_storage): +def test_tensor_attribute_selector(storage, config_gen, start_storage): import scipy.sparse sp_embed = 
np.random.random([3, 10]) sp_embed[sp_embed > 0.1] = 0 sp_embed = scipy.sparse.coo_matrix(sp_embed) - if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis'): - da = DocumentArray(storage=storage, config={'n_dim': 10}) + if config_gen: + da = DocumentArray(storage=storage, config=config_gen()) else: da = DocumentArray(storage=storage) @@ -572,12 +593,21 @@ def test_advance_selector_mixed(storage): @pytest.mark.parametrize( - 'storage', - ['memory', 'sqlite', 'weaviate', 'annlite', 'qdrant', 'elasticsearch', 'redis'], + 'storage,config_gen', + [ + ('memory', None), + ('sqlite', None), + ('weaviate', lambda: WeaviateConfig(n_dim=10)), + ('annlite', lambda: AnnliteConfig(n_dim=10)), + ('qdrant', lambda: QdrantConfig(n_dim=10)), + ('qdrant', lambda: QdrantConfig(n_dim=10, prefer_grpc=True)), + ('elasticsearch', lambda: ElasticConfig(n_dim=10)), + ('redis', lambda: RedisConfig(n_dim=10)), + ], ) -def test_single_boolean_and_padding(storage, start_storage): - if storage in ('annlite', 'weaviate', 'qdrant', 'elasticsearch', 'redis'): - da = DocumentArray(storage=storage, config={'n_dim': 10}) +def test_single_boolean_and_padding(storage, config_gen, start_storage): + if config_gen: + da = DocumentArray(storage=storage, config=config_gen()) else: da = DocumentArray(storage=storage) da.extend(DocumentArray.empty(3)) @@ -604,6 +634,7 @@ def test_single_boolean_and_padding(storage, start_storage): ('weaviate', lambda: WeaviateConfig(n_dim=123)), ('annlite', lambda: AnnliteConfig(n_dim=123)), ('qdrant', lambda: QdrantConfig(n_dim=123)), + ('qdrant', lambda: QdrantConfig(n_dim=123, prefer_grpc=True)), ('elasticsearch', lambda: ElasticConfig(n_dim=123)), ('redis', lambda: RedisConfig(n_dim=123)), ], diff --git a/tests/unit/array/test_backend_configuration.py b/tests/unit/array/test_backend_configuration.py index 8255375e6d2..e10b080d9da 100644 --- a/tests/unit/array/test_backend_configuration.py +++ b/tests/unit/array/test_backend_configuration.py @@ -130,7 
+130,8 @@ def test_cast_columns_annlite(start_storage, type_da, type_column): @pytest.mark.parametrize('type_da', [int, float, str]) @pytest.mark.parametrize('type_column', ['int', 'float', 'str']) -def test_cast_columns_qdrant(start_storage, type_da, type_column, request): +@pytest.mark.parametrize('prefer_grpc', [False, True]) +def test_cast_columns_qdrant(start_storage, type_da, type_column, prefer_grpc, request): test_id = request.node.callspec.id.replace( '-', '' @@ -143,6 +144,7 @@ def test_cast_columns_qdrant(start_storage, type_da, type_column, request): 'collection_name': f'test{test_id}', 'n_dim': 3, 'columns': {'price': type_column}, + 'prefer_grpc': prefer_grpc, }, ) From e8cba21cb1536822572b540f8d00cf134ba0a358 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 7 Nov 2022 18:18:18 +0800 Subject: [PATCH 14/14] docs: add new config params to doc --- docs/advanced/document-store/qdrant.md | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/advanced/document-store/qdrant.md b/docs/advanced/document-store/qdrant.md index d1fecf49714..e6ff683de6f 100644 --- a/docs/advanced/document-store/qdrant.md +++ b/docs/advanced/document-store/qdrant.md @@ -80,13 +80,19 @@ The following configs can be set: |-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------| | `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** | | `collection_name` | Qdrant collection name client | **Random collection name generated** | -| `host` | Hostname of the Qdrant server | 'localhost' | -| `port` | port of the Qdrant server | 6333 | -| `distance` | Distance metric to be used during search. 
Can be 'cosine', 'dot' or 'euclidean' | 'cosine' | -| `scroll_batch_size` | batch size used when scrolling over the storage | 64 | -| `ef_construct` | Number of neighbours to consider during the index building. Larger the value - more accurate the search, more time required to build index. | `None`, defaults to the default value in Qdrant* | -| `full_scan_threshold` | Minimal amount of points for additional payload-based indexing. | `None`, defaults to the default value in Qdrant* | -| `m` | Number of edges per node in the index graph. Larger the value - more accurate the search, more space required. | `None`, defaults to the default value in Qdrant* | +| `distance` | Distance metric to be used during search. Can be 'cosine', 'dot' or 'euclidean' | `'cosine'` | +| `host` | Hostname of the Qdrant server | `'localhost'` | +| `port` | Port of the Qdrant server | `6333` | +| `grpc_port` | Port of the Qdrant gRPC interface | `6334` | +| `prefer_grpc` | Set `true` to use gRPC interface whenever possible in custom methods | `False` | +| `api_key` | API key for authentication in Qdrant Cloud | `None` | +| `https` | Set `true` to use HTTPS(SSL) protocol | `None` | +| `serialize_config` | [Serialization config of each Document](../../../fundamentals/document/serialization.md) | `None` | +| `scroll_batch_size` | Batch size used when scrolling over the storage | 64 | +| `ef_construct` | Number of neighbours to consider during the index building. Larger the value - more accurate the search, more time required to build index | `None`, defaults to the default value in Qdrant* | +| `full_scan_threshold` | Minimal amount of points for additional payload-based indexing | `None`, defaults to the default value in Qdrant* | +| `m` | Number of edges per node in the index graph.
Larger the value - more accurate the search, more space required | `None`, defaults to the default value in Qdrant* | +| `columns` | Other fields to store in Document | `None` | *You can read more about the HNSW parameters and their default values [here](https://qdrant.tech/documentation/indexing/#vector-index)