Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docarray/typing/tensor/video/video_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,15 @@ class MyVideoDoc(BaseDoc):
video_tensor=np.random.random((100, 224, 224, 3)),
)

doc_1.video_tensor.save(file_path='file_1.mp4')
doc_1.video_tensor.save(file_path='/tmp/file_1.mp4')

doc_2 = MyVideoDoc(
title='my_second_video_doc',
url='file_1.mp4',
url='/tmp/file_1.mp4',
)

doc_2.video_tensor = parse_obj_as(VideoNdArray, doc_2.url.load().video)
doc_2.video_tensor.save(file_path='file_2.mp4')
doc_2.video_tensor.save(file_path='/tmp/file_2.mp4')
```

---
Expand Down
79 changes: 43 additions & 36 deletions docarray/utils/filter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
__all__ = ['filter_docs']

import json
from typing import Dict, List, Union

Expand All @@ -13,50 +15,55 @@ def filter_docs(
Filter the Documents in the index according to the given filter query.


EXAMPLE USAGE

.. code-block:: python
---

from docarray import DocArray, BaseDoc
from docarray.documents import Text, Image
from docarray.util.filter import filter_docs
```python
from docarray import DocArray, BaseDoc
from docarray.documents import TextDoc, ImageDoc
from docarray.utils.filter import filter_docs


class MyDocument(BaseDoc):
caption: Text
image: Image
price: int
class MyDocument(BaseDoc):
caption: TextDoc
ImageDoc: ImageDoc
price: int


docs = DocArray[MyDocument](
[
MyDocument(
caption='A tiger in the jungle',
image=Image(url='tigerphoto.png'),
price=100,
),
MyDocument(
caption='A swimming turtle', image=Image(url='turtlepic.png'), price=50
),
MyDocument(
caption='A couple birdwatching with binoculars',
image=Image(url='binocularsphoto.png'),
price=30,
),
]
)
query = {
'$and': {
'image__url': {'$regex': 'photo'},
'price': {'$lte': 50},
}
docs = DocArray[MyDocument](
[
MyDocument(
caption='A tiger in the jungle',
ImageDoc=ImageDoc(url='tigerphoto.png'),
price=100,
),
MyDocument(
caption='A swimming turtle',
ImageDoc=ImageDoc(url='turtlepic.png'),
price=50,
),
MyDocument(
caption='A couple birdwatching with binoculars',
ImageDoc=ImageDoc(url='binocularsphoto.png'),
price=30,
),
]
)
query = {
'$and': {
'ImageDoc__url': {'$regex': 'photo'},
'price': {'$lte': 50},
}
}

results = filter_docs(docs, query)
assert len(results) == 1
assert results[0].price == 30
assert results[0].caption == 'A couple birdwatching with binoculars'
assert results[0].ImageDoc.url == 'binocularsphoto.png'
```

results = filter_docs(docs, query)
assert len(results) == 1
assert results[0].price == 30
assert results[0].caption == 'A couple birdwatching with binoculars'
assert results[0].image.url == 'binocularsphoto.png'
---

:param docs: the DocArray where to apply the filter
:param query: the query to filter by
Expand Down
165 changes: 80 additions & 85 deletions docarray/utils/find.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
__all__ = ['find', 'find_batched']

from typing import Any, Dict, List, NamedTuple, Optional, Type, Union, cast

from typing_inspect import is_union_type
Expand Down Expand Up @@ -34,52 +36,48 @@ def find(
Find the closest Documents in the index to the query.
Supports PyTorch and NumPy embeddings.

.. note::
This utility function is likely to be removed once
Document Stores are available.
At that point, an in-memory Document Store will serve the same purpose
by exposing a .find() method.

.. note::
This is a simple implementation that assumes the same embedding field name for
both query and index, does not support nested search, and does not support
hybrid (multi-vector) search. These shortcomings will be addressed in future
versions.
!!! note
This is a simple implementation of exact search. If you need to do advanced
search using approximate nearest neighbours search, hybrid search, or
multi-vector search, please take a look at the [BaseDoc][docarray.base_doc.doc.BaseDoc].

EXAMPLE USAGE
---

.. code-block:: python
```python
from docarray import DocArray, BaseDoc
from docarray.typing import TorchTensor
from docarray.utils.find import find
import torch

from docarray import DocArray, BaseDoc
from docarray.typing import TorchTensor
from docarray.util.find import find

class MyDocument(BaseDoc):
embedding: TorchTensor

class MyDocument(BaseDoc):
embedding: TorchTensor

index = DocArray[MyDocument](
[MyDocument(embedding=torch.rand(128)) for _ in range(100)]
)

index = DocArray[MyDocument](
[MyDocument(embedding=torch.rand(128)) for _ in range(100)]
)
# use Document as query
query = MyDocument(embedding=torch.rand(128))
top_matches, scores = find(
index=index,
query=query,
embedding_field='embedding',
metric='cosine_sim',
)

# use Document as query
query = MyDocument(embedding=torch.rand(128))
top_matches, scores = find(
index=index,
query=query,
embedding_field='tensor',
metric='cosine_sim',
)
# use tensor as query
query = torch.rand(128)
top_matches, scores = find(
index=index,
query=query,
embedding_field='embedding',
metric='cosine_sim',
)
```

# use tensor as query
query = torch.rand(128)
top_matches, scores = find(
index=index,
query=query,
embedding_field='tensor',
metric='cosine_sim',
)
---

:param index: the index of Documents to search in
:param query: the query to search for
Expand Down Expand Up @@ -123,54 +121,51 @@ def find_batched(
Find the closest Documents in the index to the queries.
Supports PyTorch and NumPy embeddings.

.. note::
This utility function is likely to be removed once
Document Stores are available.
At that point, an in-memory Document Store will serve the same purpose
by exposing a .find() method.

.. note::
This is a simple implementation that assumes the same embedding field name for
both query and index, does not support nested search, and does not support
hybrid (multi-vector) search. These shortcomings will be addressed in future
versions.

EXAMPLE USAGE

.. code-block:: python

from docarray import DocArray, BaseDoc
from docarray.typing import TorchTensor
from docarray.util.find import find


class MyDocument(BaseDoc):
embedding: TorchTensor


index = DocArray[MyDocument](
[MyDocument(embedding=torch.rand(128)) for _ in range(100)]
)

# use DocArray as query
query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)])
results = find(
index=index,
query=query,
embedding_field='tensor',
metric='cosine_sim',
)
top_matches, scores = results[0]

# use tensor as query
query = torch.rand(3, 128)
results, scores = find(
index=index,
query=query,
embedding_field='tensor',
metric='cosine_sim',
)
top_matches, scores = results[0]
!!! note
This is a simple implementation of exact search. If you need to do advanced
search using approximate nearest neighbours search, hybrid search, or
multi-vector search, please take a look at the [BaseDoc][docarray.base_doc.doc.BaseDoc].


---

```python
# from docarray import DocArray, BaseDoc
# from docarray.typing import TorchTensor
# from docarray.utils.find import find
# import torch
#
#
# class MyDocument(BaseDoc):
# embedding: TorchTensor
#
#
# index = DocArray[MyDocument](
# [MyDocument(embedding=torch.rand(128)) for _ in range(100)]
# )
#
# # use DocArray as query
# query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)])
# results = find(
# index=index,
# query=query,
# embedding_field='embedding',
# metric='cosine_sim',
# )
# top_matches, scores = results[0]
#
# # use tensor as query
# query = torch.rand(3, 128)
# results, scores = find(
# index=index,
# query=query,
# embedding_field='embedding',
# metric='cosine_sim',
# )
# top_matches, scores = results[0]
```

---

:param index: the index of Documents to search in
:param query: the query to search for
Expand Down
Loading