From 906405bc16e7106b0a7819abe87347cb902a938a Mon Sep 17 00:00:00 2001 From: marcosbodio Date: Fri, 9 Dec 2022 10:38:06 +0000 Subject: [PATCH 1/2] fix: query operator does not work correctly with tags (#911) Signed-off-by: marcosbodio --- docarray/array/queryset/lookup.py | 16 +-- docs/fundamentals/documentarray/find.md | 164 ++++++++++++++++-------- tests/unit/array/test_lookup.py | 11 +- 3 files changed, 127 insertions(+), 64 deletions(-) diff --git a/docarray/array/queryset/lookup.py b/docarray/array/queryset/lookup.py index 452e7dfae60..be1e23c7846 100644 --- a/docarray/array/queryset/lookup.py +++ b/docarray/array/queryset/lookup.py @@ -114,14 +114,14 @@ def lookup(key, val, doc: 'Document') -> bool: ) if '__' in get_key: - is_empty = False - try: - is_empty = not value - except: - # ndarray-like will end up here - pass - - return is_empty != val + if value is None and val is True: + return False + elif value is None and val is False: + return True + elif value is not None and val is True: + return True + else: # value is not None and val is False: + return False else: return (_is_not_empty(get_key, value)) == val else: diff --git a/docs/fundamentals/documentarray/find.md b/docs/fundamentals/documentarray/find.md index 6cba8ea373e..88f6f290ace 100644 --- a/docs/fundamentals/documentarray/find.md +++ b/docs/fundamentals/documentarray/find.md @@ -1,10 +1,16 @@ (find-documentarray)= + # Query by Conditions -We can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based the conditions specified in a `query` object. One can use `da.find(query)` to filter Documents and get nearest neighbours from `da`: +We can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based the +conditions specified in a `query` object. 
One can use `da.find(query)` to filter Documents and get nearest neighbours +from `da`: -- To filter Documents, the `query` object is a Python dictionary object that defines the filtering conditions using a [MongoDB](https://docs.mongodb.com/manual/reference/operator/query/)-like query language. -- To find nearest neighbours, the `query` object needs to be a NdArray-like, a Document, or a DocumentArray object that defines embedding. One can also use `.match()` function for this purpose, and there is a minor interface difference between these two functions, which will be described {ref}`in the next chapter`. +- To filter Documents, the `query` object is a Python dictionary object that defines the filtering conditions using + a [MongoDB](https://docs.mongodb.com/manual/reference/operator/query/)-like query language. +- To find nearest neighbours, the `query` object needs to be a NdArray-like, a Document, or a DocumentArray object that + defines embedding. One can also use `.match()` function for this purpose, and there is a minor interface difference + between these two functions, which will be described {ref}`in the next chapter`. ```{admonition} filter query syntax :class: note @@ -82,9 +88,10 @@ A query filter document can use the query operators to specify conditions in the { : { : }, ... } ``` -Here `field1` is {ref}`any field name` of a Document object. To access nested fields, one can use the dunder expression. For example, `tags__timestamp` is to access `doc.tags['timestamp']` field. +Here `field1` is {ref}`any field name` of a Document object. To access nested fields, one can use the dunder +expression. For example, `tags__timestamp` is to access `doc.tags['timestamp']` field. 
-`value1` can be either a user given Python object, or a substitution field with curly bracket `{field}` +`value1` can be either a user given Python object, or a substitution field with curly bracket `{field}` Finally, `operator1` can be one of the following: @@ -100,8 +107,7 @@ Finally, `operator1` can be one of the following: | `$nin` | Not in an array | | `$regex` | Match the specified regular expression | | `$size` | Match array/dict field that have the specified size. `$size` does not accept ranges of values. | -| `$exists` | Matches documents that have the specified field. And empty string content is also considered as not exists. | - +| `$exists` | Matches documents that have the specified field; {ref}`predefined fields` having a default value (for example empty string, or 0) are considered as not existing; if the expression specifies a field `x` in `tags` (`tags__x`), then the operator tests that `x` is not `None`. | For example, to select all `modality='D'` Documents, @@ -112,16 +118,30 @@ pprint(r.to_dict(exclude_none=True)) # just for pretty print ``` ```json -[{"id": "92aee5d665d0c4dd34db10d83642aded", - "modality": "D", - "tags": {"h": 8.5, "uom": "in", "w": 11.0}, - "text": "paper", - "weight": 100.0}, - {"id": "1a9d2139b02bc1c7842ecda94b347889", - "modality": "D", - "tags": {"h": 22.85, "uom": "cm", "w": 30.0}, - "text": "planner", - "weight": 75.0}] +[ + { + "id": "92aee5d665d0c4dd34db10d83642aded", + "modality": "D", + "tags": { + "h": 8.5, + "uom": "in", + "w": 11.0 + }, + "text": "paper", + "weight": 100.0 + }, + { + "id": "1a9d2139b02bc1c7842ecda94b347889", + "modality": "D", + "tags": { + "h": 22.85, + "uom": "cm", + "w": 30.0 + }, + "text": "planner", + "weight": 75.0 + } +] ``` To select all Documents whose `.tags['h']>10`, @@ -131,37 +151,57 @@ r = da.find({'tags__h': {'$gt': 10}}) ``` ```json -[{"id": "4045a9659875fd1299e482d710753de3", - "modality": "A", - "tags": {"h": 14.0, "uom": "cm", "w": 21.0}, - "text": "journal", - "weight": 25.0}, 
- {"id": "cf7691c445220b94b88ff116911bad24", - "modality": "D", - "tags": {"h": 22.85, "uom": "cm", "w": 30.0}, - "text": "planner", - "weight": 75.0}] +[ + { + "id": "4045a9659875fd1299e482d710753de3", + "modality": "A", + "tags": { + "h": 14.0, + "uom": "cm", + "w": 21.0 + }, + "text": "journal", + "weight": 25.0 + }, + { + "id": "cf7691c445220b94b88ff116911bad24", + "modality": "D", + "tags": { + "h": 22.85, + "uom": "cm", + "w": 30.0 + }, + "text": "planner", + "weight": 75.0 + } +] ``` -Beside using a predefined value, one can also use a substitution with `{field}`, notice the curly brackets there. For example, +Beside using a predefined value, one can also use a substitution with `{field}`, notice the curly brackets there. For +example, ```python r = da.find({'tags__h': {'$gt': '{tags__w}'}}) ``` ```json -[{"id": "44c6a4b18eaa005c6dbe15a28a32ebce", - "modality": "A", - "tags": {"h": 14.0, "uom": "cm", "w": 10.0}, - "text": "journal", - "weight": 25.0}] +[ + { + "id": "44c6a4b18eaa005c6dbe15a28a32ebce", + "modality": "A", + "tags": { + "h": 14.0, + "uom": "cm", + "w": 10.0 + }, + "text": "journal", + "weight": 25.0 + } +] ``` - - ## Combine multiple conditions - You can combine multiple conditions using the following operators | Boolean Operator | Description | @@ -170,26 +210,44 @@ You can combine multiple conditions using the following operators | `$or` | Join query clauses with a logical OR | | `$not` | Inverts the effect of a query expression | - - ```python r = da.find({'$or': [{'weight': {'$eq': 45}}, {'modality': {'$eq': 'D'}}]}) ``` ```json -[{"id": "22985b71b6d483c31cbe507ed4d02bd1", - "modality": "D", - "tags": {"h": 8.5, "uom": "in", "w": 11.0}, - "text": "paper", - "weight": 100.0}, - {"id": "a071faf19feac5809642e3afcd3a5878", - "modality": "D", - "tags": {"h": 22.85, "uom": "cm", "w": 30.0}, - "text": "planner", - "weight": 75.0}, - {"id": "411ecc70a71a3f00fc3259bf08c239d1", - "modality": "A", - "tags": {"h": 10.0, "uom": "cm", "w": 15.25}, - 
"text": "postcard", - "weight": 45.0}] +[ + { + "id": "22985b71b6d483c31cbe507ed4d02bd1", + "modality": "D", + "tags": { + "h": 8.5, + "uom": "in", + "w": 11.0 + }, + "text": "paper", + "weight": 100.0 + }, + { + "id": "a071faf19feac5809642e3afcd3a5878", + "modality": "D", + "tags": { + "h": 22.85, + "uom": "cm", + "w": 30.0 + }, + "text": "planner", + "weight": 75.0 + }, + { + "id": "411ecc70a71a3f00fc3259bf08c239d1", + "modality": "A", + "tags": { + "h": 10.0, + "uom": "cm", + "w": 15.25 + }, + "text": "postcard", + "weight": 45.0 + } +] ``` diff --git a/tests/unit/array/test_lookup.py b/tests/unit/array/test_lookup.py index 4b45d365954..87526b53b7e 100644 --- a/tests/unit/array/test_lookup.py +++ b/tests/unit/array/test_lookup.py @@ -1,6 +1,7 @@ +import numpy as np import pytest + from docarray import Document -import numpy as np @pytest.fixture @@ -9,6 +10,8 @@ def doc(): text='test', embedding=np.random.random(10), tags={ + 'v': np.zeros(3), + 'w': 0, 'x': 0.1, 'y': 1.5, 'z': 1, @@ -43,13 +46,15 @@ def test_lookup_ops(doc): assert lookup('text__regex', '^test', doc) assert not lookup('text__regex', '^est', doc) - assert lookup('tags__size', 6, doc) + assert lookup('tags__size', 8, doc) assert lookup('tags__labels__size', 3, doc) assert lookup('tags__exists', True, doc) assert lookup('tags__z__exists', True, doc) + assert lookup('tags__v__exists', True, doc) + assert lookup('tags__w__exists', True, doc) assert lookup('tags__foo__exists', False, doc) - assert lookup('tags__bar__exists', False, doc) + assert lookup('tags__bar__exists', True, doc) assert lookup('embedding__exists', True, doc) assert lookup('tensor__exists', False, doc) assert lookup('blob__exists', False, doc) From bb03e746296d90dfb991bdaa59fddc5845ff9633 Mon Sep 17 00:00:00 2001 From: Johannes Messner <44071807+JohannesMessner@users.noreply.github.com> Date: Mon, 12 Dec 2022 14:57:38 +0100 Subject: [PATCH 2/2] docs: rewording Signed-off-by: Johannes Messner 
<44071807+JohannesMessner@users.noreply.github.com> --- docs/fundamentals/documentarray/find.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/fundamentals/documentarray/find.md b/docs/fundamentals/documentarray/find.md index 88f6f290ace..869ac48315e 100644 --- a/docs/fundamentals/documentarray/find.md +++ b/docs/fundamentals/documentarray/find.md @@ -2,15 +2,15 @@ # Query by Conditions -We can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based the -conditions specified in a `query` object. One can use `da.find(query)` to filter Documents and get nearest neighbours +You can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based on the +conditions specified in a `query` object. You can use `da.find(query)` to filter Documents and get nearest neighbors from `da`: - To filter Documents, the `query` object is a Python dictionary object that defines the filtering conditions using a [MongoDB](https://docs.mongodb.com/manual/reference/operator/query/)-like query language. - To find nearest neighbours, the `query` object needs to be a NdArray-like, a Document, or a DocumentArray object that - defines embedding. One can also use `.match()` function for this purpose, and there is a minor interface difference - between these two functions, which will be described {ref}`in the next chapter`. + defines embedding. You can also use the `.match()` function for this purpose, and there is a minor interface difference + between these two functions, which is described {ref}`in the next chapter`. ```{admonition} filter query syntax :class: note @@ -88,7 +88,7 @@ A query filter document can use the query operators to specify conditions in the { : { : }, ... } ``` -Here `field1` is {ref}`any field name` of a Document object. To access nested fields, one can use the dunder +Here `field1` is {ref}`any field name` of a Document object. 
To access nested fields, you can use the dunder expression. For example, `tags__timestamp` is to access `doc.tags['timestamp']` field. `value1` can be either a user given Python object, or a substitution field with curly bracket `{field}` @@ -177,7 +177,7 @@ r = da.find({'tags__h': {'$gt': 10}}) ] ``` -Beside using a predefined value, one can also use a substitution with `{field}`, notice the curly brackets there. For +Besides using a predefined value, you can also use a substitution with `{field}`, notice the curly brackets there. For example, ```python