diff --git a/docarray/array/queryset/lookup.py b/docarray/array/queryset/lookup.py index 05cce78dbd2..c896556f565 100644 --- a/docarray/array/queryset/lookup.py +++ b/docarray/array/queryset/lookup.py @@ -107,13 +107,22 @@ def lookup(key, val, doc: 'Document') -> bool: elif last == 'size': return iff_not_none(value, lambda y: len(y) == val) elif last == 'exists': - if value is None: - return True != val - elif isinstance(value, (str, bytes)): - return (value == '' or value == b'') != val + if not isinstance(val, bool): + raise ValueError( + '$exists operator can only accept True/False as value for comparison' + ) + + if '__' in get_key: + is_empty = False + try: + is_empty = not value + except: + # ndarray-like will end up here + pass + + return is_empty != val else: - return True == val - # return (value is None or value == '' or value == b'') != val + return (get_key in doc.non_empty_fields) == val else: # return value == val raise ValueError( diff --git a/docarray/array/queryset/parser.py b/docarray/array/queryset/parser.py index 0c0957fb623..83773944550 100644 --- a/docarray/array/queryset/parser.py +++ b/docarray/array/queryset/parser.py @@ -51,11 +51,15 @@ def _parse_lookups(data: Dict = {}, root_node: Optional[LookupNode] = None): f'The operator {key} is not supported yet, please double check the given filters!' ) else: - items = list(value.items()) - if len(items) == 0: - raise ValueError(f'The query is illegal: {data}') + if not value or not isinstance(value, dict): + raise ValueError( + '''Not a valid query. It should follow the format: + { : { : }, ... 
} + ''' + ) - elif len(items) == 1: + items = list(value.items()) + if len(items) == 1: op, val = items[0] if op in LOGICAL_OPERATORS: if op == '$not': diff --git a/docs/fundamentals/document/index.md b/docs/fundamentals/document/index.md index b602d621f8f..adeee3752e3 100644 --- a/docs/fundamentals/document/index.md +++ b/docs/fundamentals/document/index.md @@ -4,6 +4,7 @@ A Document object has a predefined data schema as below, each of the attributes can be set/get with the dot expression as you would do with any Python object. +(doc-fields)= | Attribute | Type | Description | |-------------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------| | id | string | A hexdigest that represents a unique document ID | diff --git a/docs/fundamentals/documentarray/access-elements.md b/docs/fundamentals/documentarray/access-elements.md index 2b3c54446bf..4f0390925e6 100644 --- a/docs/fundamentals/documentarray/access-elements.md +++ b/docs/fundamentals/documentarray/access-elements.md @@ -1,5 +1,5 @@ (access-elements)= -# Access Elements +# Access Documents This is probably my favorite chapter so far. Readers come to this far may ask: okay you re-implement Python List coin it as DocumentArray, what's the big deal? diff --git a/docs/fundamentals/documentarray/find.md b/docs/fundamentals/documentarray/find.md new file mode 100644 index 00000000000..a64bf10a917 --- /dev/null +++ b/docs/fundamentals/documentarray/find.md @@ -0,0 +1,159 @@ +(find-documentarray)= +# Query by Conditions + +We can use {meth}`~docarray.array.mixins.find.FindMixin.find` to select Documents from a DocumentArray based on the conditions specified in a `query` object. 
One can use `da.find(query)` to filter Documents and get nearest neighbours from `da`: + +- To filter Documents, the `query` object is a Python dictionary object that defines the filtering conditions using a [MongoDB](https://docs.mongodb.com/manual/reference/operator/query/)-like query language. +- To find nearest neighbours, the `query` object needs to be an NdArray-like object, a Document, or a DocumentArray that defines the embedding. One can also use the `.match()` function for this purpose, and there is a minor interface difference between these two functions, which will be described {ref}`in the next chapter`. + +Let's see some examples in action. First, let's prepare a DocumentArray we will use. + +```python +from jina import Document, DocumentArray + +da = DocumentArray([Document(text='journal', weight=25, tags={'h': 14, 'w': 21, 'uom': 'cm'}, modality='A'), + Document(text='notebook', weight=50, tags={'h': 8.5, 'w': 11, 'uom': 'in'}, modality='A'), + Document(text='paper', weight=100, tags={'h': 8.5, 'w': 11, 'uom': 'in'}, modality='D'), + Document(text='planner', weight=75, tags={'h': 22.85, 'w': 30, 'uom': 'cm'}, modality='D'), + Document(text='postcard', weight=45, tags={'h': 10, 'w': 15.25, 'uom': 'cm'}, modality='A')]) + +da.summary() +``` + +```text + Documents Summary + + Length 5 + Homogenous Documents True + Common Attributes ('id', 'text', 'tags', 'weight', 'modality') + + Attributes Summary + + Attribute Data type #Unique values Has empty value + ────────────────────────────────────────────────────────── + id ('str',) 5 False + weight ('int',) 5 False + modality ('str',) 2 False + tags ('dict',) 5 False + text ('str',) 5 False +``` + +## Filter with query operators + +A query filter document can use the query operators to specify conditions in the following form: + +```text +{ <field1>: { <operator1>: <value1> }, ... } +``` + +Here `field1` is {ref}`any field name<doc-fields>` of a Document object. To access nested fields, one can use the dunder expression. 
For example, `tags__timestamp` accesses the `doc.tags['timestamp']` field. + +`value1` can be either a user-given Python object, or a substitution field in curly brackets, `{field}`. + +Finally, `operator1` can be one of the following: + +| Query Operator | Description | +|----------------|------------------------------------------------------------------------------------------------------------| +| `$eq` | Equal to (number, string) | +| `$ne` | Not equal to (number, string) | +| `$gt` | Greater than (number) | +| `$gte` | Greater than or equal to (number) | +| `$lt` | Less than (number) | +| `$lte` | Less than or equal to (number) | +| `$in` | Is in an array | +| `$nin` | Not in an array | +| `$regex` | Match the specified regular expression | +| `$size` | Matches array/dict fields that have the specified size. `$size` does not accept ranges of values. | +| `$exists` | Matches documents that have the specified field. An empty string is also considered as not existing. | + + +For example, to select all `modality='D'` Documents, + +```python +r = da.find({'modality': {'$eq': 'D'}}) + +pprint(r.to_dict(exclude_none=True)) # just for pretty print +``` + +```text +[{'id': '92aee5d665d0c4dd34db10d83642aded', + 'modality': 'D', + 'tags': {'h': 8.5, 'uom': 'in', 'w': 11.0}, + 'text': 'paper', + 'weight': 100.0}, + {'id': '1a9d2139b02bc1c7842ecda94b347889', + 'modality': 'D', + 'tags': {'h': 22.85, 'uom': 'cm', 'w': 30.0}, + 'text': 'planner', + 'weight': 75.0}] +``` + +To select all Documents whose `.tags['h']>10`, + +```python +r = da.find({'tags__h': {'$gt': 10}}) +``` + +```text +[{'id': '4045a9659875fd1299e482d710753de3', + 'modality': 'A', + 'tags': {'h': 14.0, 'uom': 'cm', 'w': 21.0}, + 'text': 'journal', + 'weight': 25.0}, + {'id': 'cf7691c445220b94b88ff116911bad24', + 'modality': 'D', + 'tags': {'h': 22.85, 'uom': 'cm', 'w': 30.0}, + 'text': 'planner', + 'weight': 75.0}] +``` + +Besides using a predefined value, one can also use a substitution with `{field}`, 
notice the curly brackets there. For example, + +```python +r = da.find({'tags__h': {'$gt': '{tags__w}'}}) +``` + +```text +[{'id': '44c6a4b18eaa005c6dbe15a28a32ebce', + 'modality': 'A', + 'tags': {'h': 14.0, 'uom': 'cm', 'w': 10.0}, + 'text': 'journal', + 'weight': 25.0}] +``` + + + +## Combine multiple conditions + + +You can combine multiple conditions using the following operators + +| Boolean Operator | Description | +|------------------|----------------------------------------------------| +| `$and` | Join query clauses with a logical AND | +| `$or` | Join query clauses with a logical OR | +| `$not` | Inverts the effect of a query expression | + + + +```python +r = da.find({'$or': [{'weight': {'$eq': 45}}, {'modality': {'$eq': 'D'}}]}) +``` + +```text +[{'id': '22985b71b6d483c31cbe507ed4d02bd1', + 'modality': 'D', + 'tags': {'h': 8.5, 'uom': 'in', 'w': 11.0}, + 'text': 'paper', + 'weight': 100.0}, + {'id': 'a071faf19feac5809642e3afcd3a5878', + 'modality': 'D', + 'tags': {'h': 22.85, 'uom': 'cm', 'w': 30.0}, + 'text': 'planner', + 'weight': 75.0}, + {'id': '411ecc70a71a3f00fc3259bf08c239d1', + 'modality': 'A', + 'tags': {'h': 10.0, 'uom': 'cm', 'w': 15.25}, + 'text': 'postcard', + 'weight': 45.0}] +``` diff --git a/docs/fundamentals/documentarray/index.md b/docs/fundamentals/documentarray/index.md index 6fff3dfc252..62f05a870da 100644 --- a/docs/fundamentals/documentarray/index.md +++ b/docs/fundamentals/documentarray/index.md @@ -36,9 +36,10 @@ serialization access-elements access-attributes embedding +find matching evaluation parallelization visualization post-external -``` \ No newline at end of file +``` diff --git a/docs/fundamentals/documentarray/matching.md b/docs/fundamentals/documentarray/matching.md index 408ecf44d11..21ab3b1a3aa 100644 --- a/docs/fundamentals/documentarray/matching.md +++ b/docs/fundamentals/documentarray/matching.md @@ -3,10 +3,31 @@ ```{important} -{meth}`~docarray.array.mixins.match.MatchMixin.match` supports both CPU & GPU. 
+{meth}`~docarray.array.mixins.match.MatchMixin.match` and {meth}`~docarray.array.mixins.find.FindMixin.find` support both CPU & GPU. ``` -Once `.embeddings` is set, one can use {func}`~docarray.array.mixins.match.MatchMixin.match` function to find the nearest-neighbour Documents from another DocumentArray (or itself) based on their `.embeddings`. +Once `.embeddings` is set, one can use {meth}`~docarray.array.mixins.find.FindMixin.find` or {func}`~docarray.array.mixins.match.MatchMixin.match` function to find the nearest-neighbour Documents from another DocumentArray (or itself) based on their `.embeddings` and distance metrics. + + +## Difference between find and match + +Though both `.find()` and `.match()` are about finding nearest neighbours of a given "query" and both accept similar arguments, there are some differences between them: + +##### Which side is the query at? +- `.find()` always requires the query on the right-hand side. Say you have a DocumentArray with one million Documents, to find one query's nearest neighbours you should write `one_million_docs.find(query)`; +- `.match()` assumes the query is on the left-hand side. `A.match(B)` semantically means "A matches against B and saves the results to A". So with `.match()` you should write `query.match(one_million_docs)`. + +##### What is the type of the query? + - query (RHS) in `.find()` can be a plain NdArray-like object or a single Document or a DocumentArray. + - query (LHS) in `.match()` can be either a Document or a DocumentArray. + +##### What is the return? + - `.find()` returns a List of DocumentArray, each of which corresponds to one element/row in the query. + - `.match()` does not return anything. Match results are stored inside the left-hand side's `.matches`. + +In the example below, we will use `.match()` to describe the feature. But keep in mind, `.find()` should always work by simply switching the right and left-hand sides. 
+ +## Example The following example finds for each element in `da1` the three closest Documents from the elements in `da2` according to Euclidean distance. @@ -104,6 +125,22 @@ match emb = (0, 0) 1.0 ```` +The above example, rewritten with `.find()`: + +```python +da2.find(da1, metric='euclidean', limit=3) +``` + +or simply: + +```python +da2.find(np.array( + [[0, 0, 0, 0, 1], + [1, 0, 0, 0, 0], + [1, 1, 1, 1, 0], + [1, 2, 2, 1, 0]]), metric='euclidean', limit=3) +``` + The following metrics are supported: | Metric | Frameworks | diff --git a/tests/unit/array/test_lookup.py b/tests/unit/array/test_lookup.py index b229a599eda..4b45d365954 100644 --- a/tests/unit/array/test_lookup.py +++ b/tests/unit/array/test_lookup.py @@ -11,7 +11,7 @@ def doc(): tags={ 'x': 0.1, 'y': 1.5, - 'z': 0, + 'z': 1, 'name': 'test', 'bar': '', 'labels': ['a', 'b', 'test'],