Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions docarray/array/queryset/lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@ def lookup(key, val, doc: 'Document') -> bool:
)

if '__' in get_key:
is_empty = False
try:
is_empty = not value
except:
# ndarray-like will end up here
pass

return is_empty != val
if value is None and val is True:
return False
elif value is None and val is False:
return True
elif value is not None and val is True:
return True
else: # value is not None and val is False:
return False
else:
return (_is_not_empty(get_key, value)) == val
else:
Expand Down
162 changes: 115 additions & 47 deletions docs/fundamentals/documentarray/find.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
(find-documentarray)=

# Query by Conditions

You can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based on conditions specified in a `query` object.
You can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based on the
conditions specified in a `query` object. You can use `da.find(query)` to filter Documents or to get nearest neighbors
from `da`:

- To filter Documents, the `query` object is a Python dictionary object that defines the filtering conditions using
a [MongoDB](https://docs.mongodb.com/manual/reference/operator/query/)-like query language.
- To find nearest neighbors, the `query` object needs to be an ndarray-like object, a Document, or a DocumentArray that
defines the embedding(s). You can also use the `.match()` function for this purpose; there is a minor interface difference
between these two functions, which is described {ref}`in the next chapter<match-documentarray>`.

- To filter Documents, the `query` object is a Python dictionary object that defines the filtering conditions using a [MongoDB](https://docs.mongodb.com/manual/reference/operator/query/)-like query language.
- To find nearest neighbors, the `query` object needs to be an ndarray-like, Document, or DocumentArray that defines embedding(s). You can also use the `.match()` function for this purpose, and there's a minor interface difference between these two functions which is covered {ref}`in the next chapter<match-documentarray>`.

```{admonition} filter query syntax
:class: note
Expand Down Expand Up @@ -81,9 +88,11 @@ A query filter document uses query operators to specify conditions:
{ <field1>: { <operator1>: <value1> }, ... }
```

Here `field1` is {ref}`any field name<doc-fields>` of a Document object. To access nested fields, you can use the dunder expression. For example, `tags__timestamp` accesses the `doc.tags['timestamp']` field.

`value1` can be either a user given Python object, or a substitution field with curly bracket `{field}`
Here `field1` is {ref}`any field name<doc-fields>` of a Document object. To access nested fields, you can use the dunder expression.
For example, `tags__timestamp` accesses the `doc.tags['timestamp']` field.

`value1` can be either a user-given Python object or a substitution field written in curly brackets, `{field}`.

Finally, `operator1` can be one of the following:

Expand All @@ -99,8 +108,7 @@ Finally, `operator1` can be one of the following:
| `$nin` | Not in an array |
| `$regex` | Match the specified regular expression |
| `$size` | Match array/dict fields that have the specified size. `$size` does not accept ranges of values. |
| `$exists` | Matches documents that have the specified field. And empty string content is also considered as not exists. |

| `$exists` | Match Documents that have the specified field. {ref}`Predefined fields<doc-fields>` that hold their default value (for example, an empty string or 0) are considered not to exist. If the expression specifies a field `x` in `tags` (`tags__x`), the operator tests that `x` is not `None`. |

To select all `modality='D'` Documents:

Expand All @@ -111,16 +119,30 @@ pprint(r.to_dict(exclude_none=True)) # just for pretty print
```

```json
[{"id": "92aee5d665d0c4dd34db10d83642aded",
"modality": "D",
"tags": {"h": 8.5, "uom": "in", "w": 11.0},
"text": "paper",
"weight": 100.0},
{"id": "1a9d2139b02bc1c7842ecda94b347889",
"modality": "D",
"tags": {"h": 22.85, "uom": "cm", "w": 30.0},
"text": "planner",
"weight": 75.0}]
[
{
"id": "92aee5d665d0c4dd34db10d83642aded",
"modality": "D",
"tags": {
"h": 8.5,
"uom": "in",
"w": 11.0
},
"text": "paper",
"weight": 100.0
},
{
"id": "1a9d2139b02bc1c7842ecda94b347889",
"modality": "D",
"tags": {
"h": 22.85,
"uom": "cm",
"w": 30.0
},
"text": "planner",
"weight": 75.0
}
]
```

To select all Documents whose `.tags['h'] > 10`:
Expand All @@ -130,36 +152,62 @@ r = da.find({'tags__h': {'$gt': 10}})
```

```json
[{"id": "4045a9659875fd1299e482d710753de3",
"modality": "A",
"tags": {"h": 14.0, "uom": "cm", "w": 21.0},
"text": "journal",
"weight": 25.0},
{"id": "cf7691c445220b94b88ff116911bad24",
"modality": "D",
"tags": {"h": 22.85, "uom": "cm", "w": 30.0},
"text": "planner",
"weight": 75.0}]
[
{
"id": "4045a9659875fd1299e482d710753de3",
"modality": "A",
"tags": {
"h": 14.0,
"uom": "cm",
"w": 21.0
},
"text": "journal",
"weight": 25.0
},
{
"id": "cf7691c445220b94b88ff116911bad24",
"modality": "D",
"tags": {
"h": 22.85,
"uom": "cm",
"w": 30.0
},
"text": "planner",
"weight": 75.0
}
]
```


Besides using a predefined value, you can also use a substitution with `{field}`. Note the curly braces. For example:


```python
r = da.find({'tags__h': {'$gt': '{tags__w}'}})
```

```json
[{"id": "44c6a4b18eaa005c6dbe15a28a32ebce",
"modality": "A",
"tags": {"h": 14.0, "uom": "cm", "w": 10.0},
"text": "journal",
"weight": 25.0}]
[
{
"id": "44c6a4b18eaa005c6dbe15a28a32ebce",
"modality": "A",
"tags": {
"h": 14.0,
"uom": "cm",
"w": 10.0
},
"text": "journal",
"weight": 25.0
}
]
```

## Combine multiple conditions


You can combine multiple conditions using the following operators:


| Boolean Operator | Description |
|------------------|----------------------------------------------------|
| `$and` | Join query clauses with a logical AND |
Expand All @@ -171,19 +219,39 @@ r = da.find({'$or': [{'weight': {'$eq': 45}}, {'modality': {'$eq': 'D'}}]})
```

```json
[{"id": "22985b71b6d483c31cbe507ed4d02bd1",
"modality": "D",
"tags": {"h": 8.5, "uom": "in", "w": 11.0},
"text": "paper",
"weight": 100.0},
{"id": "a071faf19feac5809642e3afcd3a5878",
"modality": "D",
"tags": {"h": 22.85, "uom": "cm", "w": 30.0},
"text": "planner",
"weight": 75.0},
{"id": "411ecc70a71a3f00fc3259bf08c239d1",
"modality": "A",
"tags": {"h": 10.0, "uom": "cm", "w": 15.25},
"text": "postcard",
"weight": 45.0}]
[
{
"id": "22985b71b6d483c31cbe507ed4d02bd1",
"modality": "D",
"tags": {
"h": 8.5,
"uom": "in",
"w": 11.0
},
"text": "paper",
"weight": 100.0
},
{
"id": "a071faf19feac5809642e3afcd3a5878",
"modality": "D",
"tags": {
"h": 22.85,
"uom": "cm",
"w": 30.0
},
"text": "planner",
"weight": 75.0
},
{
"id": "411ecc70a71a3f00fc3259bf08c239d1",
"modality": "A",
"tags": {
"h": 10.0,
"uom": "cm",
"w": 15.25
},
"text": "postcard",
"weight": 45.0
}
]
```
11 changes: 8 additions & 3 deletions tests/unit/array/test_lookup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest

from docarray import Document
import numpy as np


@pytest.fixture
Expand All @@ -9,6 +10,8 @@ def doc():
text='test',
embedding=np.random.random(10),
tags={
'v': np.zeros(3),
'w': 0,
'x': 0.1,
'y': 1.5,
'z': 1,
Expand Down Expand Up @@ -43,13 +46,15 @@ def test_lookup_ops(doc):
assert lookup('text__regex', '^test', doc)
assert not lookup('text__regex', '^est', doc)

assert lookup('tags__size', 6, doc)
assert lookup('tags__size', 8, doc)
assert lookup('tags__labels__size', 3, doc)

assert lookup('tags__exists', True, doc)
assert lookup('tags__z__exists', True, doc)
assert lookup('tags__v__exists', True, doc)
assert lookup('tags__w__exists', True, doc)
assert lookup('tags__foo__exists', False, doc)
assert lookup('tags__bar__exists', False, doc)
assert lookup('tags__bar__exists', True, doc)
assert lookup('embedding__exists', True, doc)
assert lookup('tensor__exists', False, doc)
assert lookup('blob__exists', False, doc)
Expand Down