diff --git a/docarray/array/queryset/lookup.py b/docarray/array/queryset/lookup.py index 452e7dfae60..be1e23c7846 100644 --- a/docarray/array/queryset/lookup.py +++ b/docarray/array/queryset/lookup.py @@ -114,14 +114,14 @@ def lookup(key, val, doc: 'Document') -> bool: ) if '__' in get_key: - is_empty = False - try: - is_empty = not value - except: - # ndarray-like will end up here - pass - - return is_empty != val + if value is None and val is True: + return False + elif value is None and val is False: + return True + elif value is not None and val is True: + return True + else: # value is not None and val is False: + return False else: return (_is_not_empty(get_key, value)) == val else: diff --git a/docs/fundamentals/documentarray/find.md b/docs/fundamentals/documentarray/find.md index 32ad37e93e8..000dd3eaa71 100644 --- a/docs/fundamentals/documentarray/find.md +++ b/docs/fundamentals/documentarray/find.md @@ -1,10 +1,17 @@ (find-documentarray)= + # Query by Conditions -You can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based on conditions specified in a `query` object. +You can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based on the +conditions specified in a `query` object. You can use `da.find(query)` to filter Documents and get nearest neighbors +from `da`: + +- To filter Documents, the `query` object is a Python dictionary object that defines the filtering conditions using + a [MongoDB](https://docs.mongodb.com/manual/reference/operator/query/)-like query language. +- To find nearest neighbors, the `query` object needs to be an NdArray-like, a Document, or a DocumentArray object that + defines embedding. You can also use the `.match()` function for this purpose, and there is a minor interface difference + between these two functions, which is described {ref}`in the next chapter`. 
-- To filter Documents, the `query` object is a Python dictionary object that defines the filtering conditions using a [MongoDB](https://docs.mongodb.com/manual/reference/operator/query/)-like query language. -- To find nearest neighbors, the `query` object needs to be an ndarray-like, Document, or DocumentArray that defines embedding(s). You can also use the `.match()` function for this purpose, and there's a minor interface difference between these two functions which is covered {ref}`in the next chapter`. ```{admonition} filter query syntax :class: note @@ -81,9 +88,11 @@ A query filter document uses query operators to specify conditions: { <field1>: { <operator1>: <value1> }, ... } ``` -Here `field1` is {ref}`any field name` of a Document object. To access nested fields, you can use the dunder expression. For example, `tags__timestamp` accesses the `doc.tags['timestamp']` field. -`value1` can be either a user given Python object, or a substitution field with curly bracket `{field}` +Here `field1` is {ref}`any field name` of a Document object. To access nested fields, you can use the dunder expression. +For example, `tags__timestamp` accesses the `doc.tags['timestamp']` field. + +`value1` can be either a user given Python object, or a substitution field with curly bracket `{field}` Finally, `operator1` can be one of the following: @@ -99,8 +108,7 @@ Finally, `operator1` can be one of the following: | `$nin` | Not in an array | | `$regex` | Match the specified regular expression | | `$size` | Match array/dict field that have the specified size. `$size` does not accept ranges of values. | -| `$exists` | Matches documents that have the specified field. And empty string content is also considered as not exists. 
| - +| `$exists` | Matches documents that have the specified field; {ref}`predefined fields` having a default value (for example empty string, or 0) are considered as not existing; if the expression specifies a field `x` in `tags` (`tags__x`), then the operator tests that `x` is not `None`. | To select all `modality='D'` Documents: @@ -111,16 +119,30 @@ pprint(r.to_dict(exclude_none=True)) # just for pretty print ``` ```json -[{"id": "92aee5d665d0c4dd34db10d83642aded", - "modality": "D", - "tags": {"h": 8.5, "uom": "in", "w": 11.0}, - "text": "paper", - "weight": 100.0}, - {"id": "1a9d2139b02bc1c7842ecda94b347889", - "modality": "D", - "tags": {"h": 22.85, "uom": "cm", "w": 30.0}, - "text": "planner", - "weight": 75.0}] +[ + { + "id": "92aee5d665d0c4dd34db10d83642aded", + "modality": "D", + "tags": { + "h": 8.5, + "uom": "in", + "w": 11.0 + }, + "text": "paper", + "weight": 100.0 + }, + { + "id": "1a9d2139b02bc1c7842ecda94b347889", + "modality": "D", + "tags": { + "h": 22.85, + "uom": "cm", + "w": 30.0 + }, + "text": "planner", + "weight": 75.0 + } +] ``` To select all Documents whose `.tags['h']>10`, @@ -130,36 +152,62 @@ r = da.find({'tags__h': {'$gt': 10}}) ``` ```json -[{"id": "4045a9659875fd1299e482d710753de3", - "modality": "A", - "tags": {"h": 14.0, "uom": "cm", "w": 21.0}, - "text": "journal", - "weight": 25.0}, - {"id": "cf7691c445220b94b88ff116911bad24", - "modality": "D", - "tags": {"h": 22.85, "uom": "cm", "w": 30.0}, - "text": "planner", - "weight": 75.0}] +[ + { + "id": "4045a9659875fd1299e482d710753de3", + "modality": "A", + "tags": { + "h": 14.0, + "uom": "cm", + "w": 21.0 + }, + "text": "journal", + "weight": 25.0 + }, + { + "id": "cf7691c445220b94b88ff116911bad24", + "modality": "D", + "tags": { + "h": 22.85, + "uom": "cm", + "w": 30.0 + }, + "text": "planner", + "weight": 75.0 + } +] ``` + Beside using a predefined value, you can also use a substitution with `{field}`. Notice those curly braces. 
For example: + ```python r = da.find({'tags__h': {'$gt': '{tags__w}'}}) ``` ```json -[{"id": "44c6a4b18eaa005c6dbe15a28a32ebce", - "modality": "A", - "tags": {"h": 14.0, "uom": "cm", "w": 10.0}, - "text": "journal", - "weight": 25.0}] +[ + { + "id": "44c6a4b18eaa005c6dbe15a28a32ebce", + "modality": "A", + "tags": { + "h": 14.0, + "uom": "cm", + "w": 10.0 + }, + "text": "journal", + "weight": 25.0 + } +] ``` ## Combine multiple conditions + You can combine multiple conditions using the following operators: + | Boolean Operator | Description | |------------------|----------------------------------------------------| | `$and` | Join query clauses with a logical AND | @@ -171,19 +219,39 @@ r = da.find({'$or': [{'weight': {'$eq': 45}}, {'modality': {'$eq': 'D'}}]}) ``` ```json -[{"id": "22985b71b6d483c31cbe507ed4d02bd1", - "modality": "D", - "tags": {"h": 8.5, "uom": "in", "w": 11.0}, - "text": "paper", - "weight": 100.0}, - {"id": "a071faf19feac5809642e3afcd3a5878", - "modality": "D", - "tags": {"h": 22.85, "uom": "cm", "w": 30.0}, - "text": "planner", - "weight": 75.0}, - {"id": "411ecc70a71a3f00fc3259bf08c239d1", - "modality": "A", - "tags": {"h": 10.0, "uom": "cm", "w": 15.25}, - "text": "postcard", - "weight": 45.0}] +[ + { + "id": "22985b71b6d483c31cbe507ed4d02bd1", + "modality": "D", + "tags": { + "h": 8.5, + "uom": "in", + "w": 11.0 + }, + "text": "paper", + "weight": 100.0 + }, + { + "id": "a071faf19feac5809642e3afcd3a5878", + "modality": "D", + "tags": { + "h": 22.85, + "uom": "cm", + "w": 30.0 + }, + "text": "planner", + "weight": 75.0 + }, + { + "id": "411ecc70a71a3f00fc3259bf08c239d1", + "modality": "A", + "tags": { + "h": 10.0, + "uom": "cm", + "w": 15.25 + }, + "text": "postcard", + "weight": 45.0 + } +] ``` diff --git a/tests/unit/array/test_lookup.py b/tests/unit/array/test_lookup.py index 4b45d365954..87526b53b7e 100644 --- a/tests/unit/array/test_lookup.py +++ b/tests/unit/array/test_lookup.py @@ -1,6 +1,7 @@ +import numpy as np import pytest + from 
docarray import Document -import numpy as np @pytest.fixture @@ -9,6 +10,8 @@ def doc(): text='test', embedding=np.random.random(10), tags={ + 'v': np.zeros(3), + 'w': 0, 'x': 0.1, 'y': 1.5, 'z': 1, @@ -43,13 +46,15 @@ def test_lookup_ops(doc): assert lookup('text__regex', '^test', doc) assert not lookup('text__regex', '^est', doc) - assert lookup('tags__size', 6, doc) + assert lookup('tags__size', 8, doc) assert lookup('tags__labels__size', 3, doc) assert lookup('tags__exists', True, doc) assert lookup('tags__z__exists', True, doc) + assert lookup('tags__v__exists', True, doc) + assert lookup('tags__w__exists', True, doc) assert lookup('tags__foo__exists', False, doc) - assert lookup('tags__bar__exists', False, doc) + assert lookup('tags__bar__exists', True, doc) assert lookup('embedding__exists', True, doc) assert lookup('tensor__exists', False, doc) assert lookup('blob__exists', False, doc)