Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
b8bed08
feat: find method weaviate
davidbp Feb 3, 2022
d8f9fde
refactor: generalize find to all weaviate types
davidbp Feb 3, 2022
962c6c6
feat: add method to detect rows in ndarray like
davidbp Feb 4, 2022
3f3fe74
test: add config to weaviate
alaeddine-13 Feb 4, 2022
63187d6
chore: n_dim no more optional
alaeddine-13 Feb 4, 2022
53844b9
Merge branch 'feat-find' of https://github.com/jina-ai/docarray into …
alaeddine-13 Feb 4, 2022
8404ced
fix: merge conflict
alaeddine-13 Feb 4, 2022
1487a0a
fix: merge conflict
alaeddine-13 Feb 4, 2022
e8354ce
test: test get array rows
davidbp Feb 4, 2022
c4e7e23
test: add get array rows
davidbp Feb 4, 2022
d523342
chore: raise value error when config is None
alaeddine-13 Feb 4, 2022
b8bc557
Merge branch 'feat-find' of https://github.com/jina-ai/docarray into …
alaeddine-13 Feb 4, 2022
2917787
fix: fix tests
alaeddine-13 Feb 4, 2022
70c84d5
test: fix tests
Feb 4, 2022
ee9dd90
test: add find test
davidbp Feb 4, 2022
597245b
test: fix tests
alaeddine-13 Feb 4, 2022
9bbbcb6
feat: add find method pqlite
davidbp Feb 4, 2022
2cdc424
test: fix tests
alaeddine-13 Feb 4, 2022
965cbdd
Merge branch 'feat-find' of https://github.com/jina-ai/docarray into …
alaeddine-13 Feb 4, 2022
cdf077c
test: fix tests
alaeddine-13 Feb 4, 2022
9164cc1
test: add configs as lambda
davidbp Feb 4, 2022
3ba2c18
refactor: add frameworks embedding
davidbp Feb 4, 2022
a736695
refactor: simplify tests removing if else branches
davidbp Feb 4, 2022
7706b2b
test: fix test_contruct
alaeddine-13 Feb 4, 2022
65c0a16
test: fix test_find
alaeddine-13 Feb 4, 2022
09e55a6
test: fix test_plot
alaeddine-13 Feb 4, 2022
15559c4
test: fix tests
alaeddine-13 Feb 7, 2022
38a0bcf
test: fix tests
alaeddine-13 Feb 7, 2022
709fd57
test: fix tests
alaeddine-13 Feb 7, 2022
1093bfc
test: add weaviate tests for sequence
davidbp Feb 7, 2022
8c3b927
Merge branch 'feat-find' of https://github.com/jina-ai/docarray into …
davidbp Feb 7, 2022
dfcd376
fix: weaviate paylod cast to list if embedding has len 1
davidbp Feb 7, 2022
1b7e5ad
test: fix tests
alaeddine-13 Feb 7, 2022
06ff576
Merge branch 'feat-find' of https://github.com/jina-ai/docarray into …
alaeddine-13 Feb 7, 2022
19d6d1f
fix: add name main for parallel
davidbp Feb 7, 2022
d9be172
Merge branch 'main' into feat-find
alaeddine-13 Feb 7, 2022
3bb5d66
test: fix parallel tests
davidbp Feb 7, 2022
b6d85b8
test: test config accepts dict
davidbp Feb 7, 2022
0280b72
test: fix n dim parameter in test
davidbp Feb 7, 2022
a868551
style(import): fix imports
hanxiao Feb 7, 2022
813867c
style(import): fix imports
hanxiao Feb 7, 2022
fbd1371
style(import): fix imports
hanxiao Feb 7, 2022
1d5f5d7
style(import): fix imports
hanxiao Feb 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docarray/array/mixins/group.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import random
from collections import defaultdict
from typing import Dict, Any, TYPE_CHECKING, Generator, List
from ...helper import dunder_get

import numpy as np

from ...helper import dunder_get

if TYPE_CHECKING:
from ... import DocumentArray

Expand Down
23 changes: 18 additions & 5 deletions docarray/array/mixins/io/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from contextlib import nullcontext
from typing import Union, BinaryIO, TYPE_CHECKING, Type, Optional, Generator

from ....helper import random_uuid, __windows__, get_compress_ctx, decompress_bytes
from ....helper import __windows__, get_compress_ctx, decompress_bytes

if TYPE_CHECKING:
from ....types import T
Expand All @@ -24,6 +24,8 @@ def load_binary(
compress: Optional[str] = None,
_show_progress: bool = False,
streaming: bool = False,
*args,
**kwargs,
) -> Union['DocumentArray', Generator['Document', None, None]]:
"""Load array elements from a compressed binary file.

Expand Down Expand Up @@ -51,7 +53,9 @@ def load_binary(
_show_progress=_show_progress,
)
else:
return cls._load_binary_all(file_ctx, protocol, compress, _show_progress)
return cls._load_binary_all(
file_ctx, protocol, compress, _show_progress, *args, **kwargs
)

@classmethod
def _load_binary_stream(
Expand Down Expand Up @@ -97,7 +101,9 @@ def _load_binary_stream(
)

@classmethod
def _load_binary_all(cls, file_ctx, protocol, compress, show_progress):
def _load_binary_all(
cls, file_ctx, protocol, compress, show_progress, *args, **kwargs
):
"""Read a `DocumentArray` object from a binary file

:param protocol: protocol to use
Expand Down Expand Up @@ -156,8 +162,7 @@ def _load_binary_all(cls, file_ctx, protocol, compress, show_progress):
d[start_doc_pos:end_doc_pos], protocol=protocol, compress=compress
)
docs.append(doc)

return cls(docs)
return cls(docs, *args, **kwargs)

@classmethod
def from_bytes(
Expand All @@ -166,12 +171,16 @@ def from_bytes(
protocol: str = 'pickle-array',
compress: Optional[str] = None,
_show_progress: bool = False,
*args,
**kwargs,
) -> 'T':
return cls.load_binary(
data,
protocol=protocol,
compress=compress,
_show_progress=_show_progress,
*args,
**kwargs,
)

def save_binary(
Expand Down Expand Up @@ -298,12 +307,16 @@ def from_base64(
protocol: str = 'pickle-array',
compress: Optional[str] = None,
_show_progress: bool = False,
*args,
**kwargs,
) -> 'T':
return cls.load_binary(
base64.b64decode(data),
protocol=protocol,
compress=compress,
_show_progress=_show_progress,
*args,
**kwargs,
)

def to_base64(
Expand Down
7 changes: 5 additions & 2 deletions docarray/array/mixins/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ def save(

@classmethod
def load(
cls: Type['T'], file: Union[str, TextIO, BinaryIO], file_format: str = 'binary'
cls: Type['T'],
file: Union[str, TextIO, BinaryIO],
file_format: str = 'binary',
**kwargs
) -> 'T':
"""Load array elements from a JSON or a binary file, or a CSV file.

Expand All @@ -40,7 +43,7 @@ def load(
:return: the loaded DocumentArray object
"""
if file_format == 'json':
return cls.load_json(file)
return cls.load_json(file, **kwargs)
elif file_format == 'binary':
return cls.load_binary(file)
elif file_format == 'csv':
Expand Down
4 changes: 2 additions & 2 deletions docarray/array/mixins/io/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ def to_dataframe(self, **kwargs) -> 'DataFrame':
return DataFrame.from_dict(self.to_list(), **kwargs)

@classmethod
def from_dataframe(cls: Type['T'], df: 'DataFrame') -> 'T':
def from_dataframe(cls: Type['T'], df: 'DataFrame', *args, **kwargs) -> 'T':
"""Import a :class:`DocumentArray` from a :class:`pandas.DataFrame` object.

:param df: a :class:`pandas.DataFrame` object.
:return: a :class:`DocumentArray` object
"""
da = cls()
da = cls(**kwargs)
from .... import Document

for m in df.to_dict(orient='records'):
Expand Down
4 changes: 3 additions & 1 deletion docarray/array/mixins/io/from_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def _from_generator(cls: Type['T'], meth: str, *args, **kwargs) -> 'T':
from ....document import generators

from_fn = getattr(generators, meth)
da_like = cls()
da_like = cls(**kwargs)
da_like.extend(from_fn(*args, **kwargs))
return da_like

Expand Down Expand Up @@ -67,6 +67,8 @@ def from_files(
sampling_rate: Optional[float] = None,
read_mode: Optional[str] = None,
to_dataturi: bool = False,
*args,
**kwargs,
) -> 'T':
"""Build from a list of file path or the content of the files.

Expand Down
2 changes: 1 addition & 1 deletion docarray/array/mixins/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def load_json(
constructor = Document.from_dict

with file_ctx as fp:
return cls(constructor(v, protocol=protocol, **kwargs) for v in fp)
return cls([constructor(v, protocol=protocol) for v in fp], **kwargs)

@classmethod
def from_json(
Expand Down
8 changes: 6 additions & 2 deletions docarray/array/mixins/io/pushpull.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import io
import json
from contextlib import nullcontext
from typing import Type, TYPE_CHECKING
from functools import lru_cache
from typing import Type, TYPE_CHECKING
from urllib.request import Request, urlopen

from ....helper import get_request_header
Expand Down Expand Up @@ -90,7 +90,9 @@ def read(self, n=-1):
)

@classmethod
def pull(cls: Type['T'], token: str, show_progress: bool = False) -> 'T':
def pull(
cls: Type['T'], token: str, show_progress: bool = False, *args, **kwargs
) -> 'T':
"""Pulling a :class:`DocumentArray` from Jina Cloud Service to local.

:param token: the upload token set during :meth:`.push`
Expand Down Expand Up @@ -131,6 +133,8 @@ def pull(cls: Type['T'], token: str, show_progress: bool = False) -> 'T':
protocol='protobuf',
compress='gzip',
_show_progress=show_progress,
*args,
**kwargs,
)

def _get_dict_data(self, token, show_progress):
Expand Down
2 changes: 0 additions & 2 deletions docarray/array/mixins/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@

import numpy as np

from ...helper import deprecate_by


class PlotMixin:
"""Helper functions for plotting the arrays. """
Expand Down
8 changes: 6 additions & 2 deletions docarray/array/storage/memory/backend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import functools
import itertools
from typing import (
Generator,
Expand All @@ -8,7 +9,6 @@
TYPE_CHECKING,
Callable,
)
import functools

from ..base.backend import BaseBackendMixin
from .... import Document
Expand Down Expand Up @@ -58,7 +58,11 @@ def _rebuild_id2offset(self) -> None:

@needs_id2offset_rebuild
def _init_storage(
self, _docs: Optional['DocumentArraySourceType'] = None, copy: bool = False
self,
_docs: Optional['DocumentArraySourceType'] = None,
copy: bool = False,
*args,
**kwargs
):
from ... import DocumentArray
from ...memory import DocumentArrayInMemory
Expand Down
5 changes: 3 additions & 2 deletions docarray/array/storage/memory/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
Iterable,
Any,
)
from ..memory.backend import needs_id2offset_rebuild

from ..base.getsetdel import BaseGetSetDelMixin
from .... import Document, DocumentArray
from ..memory.backend import needs_id2offset_rebuild
from .... import Document


class GetSetDelMixin(BaseGetSetDelMixin):
Expand Down
5 changes: 2 additions & 3 deletions docarray/array/storage/memory/seqlike.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from typing import Iterator, Union, Sequence, Iterable, MutableSequence

from .... import Document
from typing import Iterator, Union, Iterable, MutableSequence

from ..memory.backend import needs_id2offset_rebuild
from .... import Document


class SequenceLikeMixin(MutableSequence[Document]):
Expand Down
3 changes: 2 additions & 1 deletion docarray/array/storage/pqlite/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from abc import ABC

from .backend import BackendMixin, PqliteConfig
from .find import FindMixin
from .getsetdel import GetSetDelMixin
from .seqlike import SequenceLikeMixin

__all__ = ['StorageMixins', 'PqliteConfig']


class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC):
class StorageMixins(FindMixin, BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC):
...
50 changes: 50 additions & 0 deletions docarray/array/storage/pqlite/find.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from typing import (
Union,
TYPE_CHECKING,
List,
)

if TYPE_CHECKING:
import numpy as np
from .... import DocumentArray


class FindMixin:
    """Nearest-neighbour lookup for the PQLite storage backend."""

    def _find_similar_vectors(self, q: 'np.ndarray', limit=10):
        """Query the underlying PQLite index with one or several vectors.

        :param q: a 1-D array (one query) or a 2-D array (one query per row).
        :param limit: number of matches requested per query; forwarded
            unchanged to the index.
        :return: the single result when the index returns exactly one result,
            otherwise the list of per-query results as returned by the index
            (presumably ``DocumentArray`` objects -- confirm against PQLite).
        """
        # The index is always queried with a 2-D batch, so a lone vector is
        # promoted to a batch holding a single row.
        if q.ndim == 1:
            q = q.reshape((1, -1))

        # Only the second element of the returned pair is used here.
        _, list_of_docs = self._pqlite._search_documents(q, limit=limit)

        # Exactly one result is unwrapped. This also applies to a 2-D batch
        # that contains a single row, not only to 1-D input.
        if len(list_of_docs) == 1:
            return list_of_docs[0]
        return list_of_docs

    def find(
        self, query: 'np.ndarray', limit: int = 10
    ) -> Union['DocumentArray', List['DocumentArray']]:
        """Returns approximate nearest neighbors given a batch of input queries.

        :param query: a ``np.ndarray`` holding either one query vector (1-D)
            or a batch of query vectors (2-D, one query per row).
        :param limit: number of retrieved items per query.

        :return: DocumentArray containing the closest documents to the query
            if it is a single query, otherwise a list of DocumentArrays
            containing the closest Document objects for each of the queries
            in `query`.
        """
        return self._find_similar_vectors(query, limit=limit)
4 changes: 3 additions & 1 deletion docarray/array/storage/pqlite/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
Sequence,
Iterable,
)

import numpy as np
from ...memory import DocumentArrayInMemory

from ..base.getsetdel import BaseGetSetDelMixin
from ...memory import DocumentArrayInMemory
from .... import Document


Expand Down
4 changes: 3 additions & 1 deletion docarray/array/storage/pqlite/seqlike.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import Iterator, Union, Iterable, Sequence, MutableSequence

import numpy as np
from .... import Document

from ...memory import DocumentArrayInMemory
from .... import Document


class SequenceLikeMixin(MutableSequence[Document]):
Expand Down
2 changes: 1 addition & 1 deletion docarray/array/storage/sqlite/seqlike.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Iterator, Union, Iterable, MutableSequence, Optional, Sequence
from typing import Iterator, Union, Iterable, MutableSequence, Optional

from .... import Document

Expand Down
3 changes: 2 additions & 1 deletion docarray/array/storage/weaviate/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from abc import ABC

from .backend import BackendMixin, WeaviateConfig
from .find import FindMixin
from .getsetdel import GetSetDelMixin
from .seqlike import SequenceLikeMixin

__all__ = ['StorageMixins', 'WeaviateConfig']


class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC):
class StorageMixins(FindMixin, BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC):
...
Loading