Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
33b8d92
Merge branch 'main' of https://github.com/jina-ai/docarray into feat-…
Oct 5, 2022
7879f83
feat: update qdrant to version 0.10.1
Oct 5, 2022
0d066e0
fix: set prefer_grpc False
Oct 5, 2022
7ce7d04
Merge branch 'feat-upgrade-qdrant-client' of https://github.com/jina-…
Oct 5, 2022
6d976cf
Merge branch 'feat-upgrade-qdrant-client' into feat-qdrant-version-2
Oct 5, 2022
e1e4c16
Merge branch 'feat-upgrade-qdrant-client' into feat-qdrant-version-2
Oct 5, 2022
52ad866
Merge branch 'feat-upgrade-qdrant-client' into feat-qdrant-version-2
Oct 6, 2022
71a3e17
Merge branch 'feat-upgrade-qdrant-client' into feat-qdrant-version-2
AnneYang720 Nov 4, 2022
cae5c4c
Merge remote-tracking branch 'origin/main' into feat-qdrant-version-2
AnneYang720 Nov 4, 2022
dd3b723
Merge branch 'feat-upgrade-qdrant-client' into feat-qdrant-version-2
AnneYang720 Nov 4, 2022
460d168
fix: free paddle version in tests
alaeddine-13 Sep 28, 2022
59f9893
test: install protobuf <==3.20.0
alaeddine-13 Sep 28, 2022
f4b16ef
test: unpin rocksdict in tests (#721)
alaeddine-13 Nov 4, 2022
a6784d0
chore: fix type gov (#723)
samsja Nov 4, 2022
f475f43
fix: update qdrant schema api
AnneYang720 Nov 6, 2022
f9355cd
fix: convert embedding to list of float
AnneYang720 Nov 6, 2022
a7fda11
Merge remote-tracking branch 'origin/main' into feat-qdrant-version-2
AnneYang720 Nov 6, 2022
3c3f94c
feat(qdrant): pass search_params in find (#675)
AnneYang720 Oct 26, 2022
2706226
fix: transformers need oldproto
AnneYang720 Nov 6, 2022
83e0625
fix: use limit instead of top
AnneYang720 Nov 7, 2022
a8cde14
ci: extent timeout-minutes for old-proto
AnneYang720 Nov 7, 2022
44922cc
test: add some grpc tests for qdrant
AnneYang720 Nov 7, 2022
e8cba21
docs: add new config params to doc
AnneYang720 Nov 7, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ jobs:
pytest --suppress-no-test-exit-code --cov=docarray --cov-report=xml \
-v -s -m "not gpu" ${{ matrix.test-path }}
echo "::set-output name=codecov_flag::docarray"
timeout-minutes: 30
timeout-minutes: 40
env:
JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
- name: Check codecov file
Expand Down
4 changes: 2 additions & 2 deletions GOVERNANCE.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ Project releases will occur on a scheduled basis as agreed to by the committers.

# Communication

This project, just like all of open source, is a global community. In addition to the [Code of Conduct](./.github/CODE_OF_CONDUCT.md), this project will:
This project, just like all open source, is a global community. In addition to the [Code of Conduct](./.github/CODE_OF_CONDUCT.md), this project will:

* Keep all communucation on open channels ( mailing list, forums, chat ).
* Keep all communication on open channels ( mailing list, forums, chat ).
* Be respectful of time and language differences between community members ( such as scheduling meetings, email/issue responsiveness, etc ).
* Ensure tools are able to be used by community members regardless of their region.

Expand Down
69 changes: 43 additions & 26 deletions docarray/array/storage/qdrant/backend.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import copy
import uuid
from abc import abstractmethod
from dataclasses import dataclass, field, asdict
from typing import (
Optional,
Expand All @@ -19,6 +20,7 @@
PointsList,
PointStruct,
HnswConfigDiff,
VectorParams,
)

from docarray import Document
Expand All @@ -38,6 +40,10 @@ class QdrantConfig:
collection_name: Optional[str] = None
host: Optional[str] = field(default="localhost")
port: Optional[int] = field(default=6333)
grpc_port: Optional[int] = field(default=6334)
prefer_grpc: Optional[bool] = field(default=False)
api_key: Optional[str] = field(default=None)
https: Optional[bool] = field(default=None)
serialize_config: Dict = field(default_factory=dict)
scroll_batch_size: int = 64
ef_construct: Optional[int] = None
Expand All @@ -47,6 +53,11 @@ class QdrantConfig:


class BackendMixin(BaseBackendMixin):
@property
@abstractmethod
def client(self) -> 'QdrantClient':
raise NotImplementedError()

@classmethod
def _tmp_collection_name(cls) -> str:
return uuid.uuid4().hex
Expand Down Expand Up @@ -85,7 +96,14 @@ def _init_storage(
self._distance = config.distance
self._serialize_config = config.serialize_config

self._client = QdrantClient(host=config.host, port=config.port)
self._client = QdrantClient(
host=config.host,
port=config.port,
prefer_grpc=config.prefer_grpc,
grpc_port=config.grpc_port,
api_key=config.api_key,
https=config.https,
)

self._config = config

Expand Down Expand Up @@ -133,13 +151,13 @@ def _initialize_qdrant_schema(self):
full_scan_threshold=self._config.full_scan_threshold,
m=self._config.m,
)
self.client.http.collections_api.create_collection(
self.client.recreate_collection(
collection_name=self.collection_name,
create_collection=CreateCollection(
vector_size=self._n_dim,
distance=DISTANCES[self._distance],
hnsw_config=hnsw_config,
vectors_config=VectorParams(
size=self.n_dim,
distance=self.distance,
),
hnsw_config=hnsw_config,
)

def _collection_exists(self, collection_name):
Expand All @@ -164,38 +182,36 @@ def __getstate__(self):
def __setstate__(self, state):
self.__dict__ = state
self._client = QdrantClient(
host=state['_config'].host, port=state['_config'].port
host=state['_config'].host,
port=state['_config'].port,
prefer_grpc=state['_config'].prefer_grpc,
grpc_port=state['_config'].grpc_port,
api_key=state['_config'].api_key,
https=state['_config'].https,
)

def _get_offset2ids_meta(self) -> List[str]:
if not self._collection_exists(self.collection_name_meta):
return []
results = self.client.retrieve(
collection_name=self.collection_name_meta, ids=[1]
)
if len(results) == 0:
return []
else:
return results[0].payload.get('offset2id', [])
return self.client.retrieve(self.collection_name_meta, ids=[1])[0].payload[
'offset2id'
]

def _update_offset2ids_meta(self):
if not self._collection_exists(self.collection_name_meta):
self.client.recreate_collection(
self.collection_name_meta,
vector_size=1,
distance=Distance.COSINE,
collection_name=self.collection_name_meta,
vectors_config={}, # no vectors
)

self.client.http.points_api.upsert_points(
self.client.upsert(
collection_name=self.collection_name_meta,
points=[
PointStruct(
id=1, payload={"offset2id": self._offset2ids.ids}, vector={}
)
],
wait=True,
point_insert_operations=PointsList(
points=[
PointStruct(
id=1, payload={"offset2id": self._offset2ids.ids}, vector=[1]
)
]
),
)

def _map_embedding(self, embedding: 'ArrayType') -> List[float]:
Expand All @@ -214,4 +230,5 @@ def _map_embedding(self, embedding: 'ArrayType') -> List[float]:

if np.all(embedding == 0):
embedding = embedding + EPSILON
return embedding.tolist()

return embedding.astype(float).tolist()
2 changes: 1 addition & 1 deletion docarray/array/storage/qdrant/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def _find_similar_vectors(
search_params=None
if not search_params
else rest.SearchParams(**search_params),
top=limit,
limit=limit,
append_payload=['_serialized'],
)

Expand Down
55 changes: 27 additions & 28 deletions docarray/array/storage/qdrant/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
from qdrant_client.http.exceptions import UnexpectedResponse
from qdrant_client.http.models.models import (
PointIdsList,
PointsList,
ScrollRequest,
PointStruct,
VectorParams,
)

from docarray import Document
Expand Down Expand Up @@ -46,17 +45,17 @@ def _upload_batch(self, docs: Iterable['Document']):
for doc in docs:
batch.append(self._document_to_qdrant(doc))
if len(batch) > self.scroll_batch_size:
self.client.http.points_api.upsert_points(
self.client.upsert(
collection_name=self.collection_name,
points=batch,
wait=True,
point_insert_operations=PointsList(points=batch),
)
batch = []
if len(batch) > 0:
self.client.http.points_api.upsert_points(
self.client.upsert(
collection_name=self.collection_name,
wait=True,
point_insert_operations=PointsList(points=batch),
points=batch,
)

def _qdrant_to_document(self, qdrant_record: dict) -> 'Document':
Expand All @@ -79,49 +78,47 @@ def _document_to_qdrant(self, doc: 'Document') -> 'PointStruct':

def _get_doc_by_id(self, _id: str) -> 'Document':
try:
resp = self.client.http.points_api.get_point(
collection_name=self.collection_name, id=self._map_id(_id)
resp = self.client.retrieve(
collection_name=self.collection_name, ids=[self._map_id(_id)]
)
return self._qdrant_to_document(resp.result.payload)
if len(resp) == 0:
raise KeyError(_id)
return self._qdrant_to_document(resp[0].payload)
except UnexpectedResponse as response_error:
if response_error.status_code in [404, 400]:
raise KeyError(_id)

def _del_doc_by_id(self, _id: str):
self.client.http.points_api.delete_points(
self.client.delete(
collection_name=self.collection_name,
wait=True,
points_selector=PointIdsList(points=[self._map_id(_id)]),
wait=True,
)

def _set_doc_by_id(self, _id: str, value: 'Document'):
if _id != value.id:
self._del_doc_by_id(_id)
self.client.http.points_api.upsert_points(
self.client.upsert(
collection_name=self.collection_name,
wait=True,
point_insert_operations=PointsList(
points=[self._document_to_qdrant(value)]
),
points=[self._document_to_qdrant(value)],
)

def scan(self) -> Iterator['Document']:
offset = None
while True:
response = self.client.http.points_api.scroll_points(
response, next_page = self.client.scroll(
collection_name=self.collection_name,
scroll_request=ScrollRequest(
offset=offset,
limit=self.scroll_batch_size,
with_payload=['_serialized'],
with_vector=False,
),
offset=offset,
limit=self.scroll_batch_size,
with_payload=['_serialized'],
with_vectors=False,
)
for point in response.result.points:
for point in response:
yield self._qdrant_to_document(point.payload)

if response.result.next_page_offset:
offset = response.result.next_page_offset
if next_page:
offset = next_page
else:
break

Expand All @@ -133,8 +130,10 @@ def _save_offset2ids(self):
self._update_offset2ids_meta()

def _clear_storage(self):
self._client.recreate_collection(
self.client.recreate_collection(
self.collection_name,
vector_size=self.n_dim,
distance=self.distance,
vectors_config=VectorParams(
size=self.n_dim,
distance=self.distance,
),
)
4 changes: 1 addition & 3 deletions docarray/array/storage/qdrant/seqlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,7 @@ def __eq__(self, other):
)

def __len__(self):
return self.client.http.collections_api.get_collection(
self.collection_name
).result.vectors_count
return self.client.get_collection(self.collection_name).points_count

def __contains__(self, x: Union[str, 'Document']):
if isinstance(x, str):
Expand Down
Loading