From 0d00c3f2d62b745a9a65cb013a7c3a85456600d4 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Fri, 7 Apr 2023 18:00:58 +0200 Subject: [PATCH 01/35] docs: add hnswDocumentIndex Signed-off-by: nan-wang --- docs/.gitignore | 2 +- docs/user_guide/storing/index.md | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 docs/user_guide/storing/index.md diff --git a/docs/.gitignore b/docs/.gitignore index 38f32345848..c528ce87543 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -2,5 +2,5 @@ api/* proto/* README.md -index.md +#index.md CONTRIBUTING.md \ No newline at end of file diff --git a/docs/user_guide/storing/index.md b/docs/user_guide/storing/index.md new file mode 100644 index 00000000000..07511f4b2a2 --- /dev/null +++ b/docs/user_guide/storing/index.md @@ -0,0 +1,35 @@ +# Index +This section show you how to use the `DocArray.index` module. `DocArray.index` module is used to create index for the tensors so that one can search the document based on the vector similarity. `DocArray.index` implements the following index. + +## Hnswlib + +[HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implement the index based on [hnswlib](https://github.com/nmslib/hnswlib). This is a lightweight implementation with vectors stored in memory. + +!!! note + To use [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex], one need to install the extra dependency with the following command + + ```console + pip install "docarray[hnswlib]" + ``` + +### Construct +To construct an index, you need to define the schema first. You can define the schema in the same way as define a `Doc`. The only difference is that you need to define the dimensionality of the vector space by `dim` and the name of the space by `space`. The `dim` argument must be an integer. The `space` argument can be one of `l2`, `ip` or `cosine`. 
TODO: add links to the detailed explaination + +```python +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import HnswDocumentIndex +from docarray.typing import NdArray + + +class SimpleSchema(BaseDoc): + tens: NdArray[10] = Field(dim=128, space='cosine') + + +doc_index = HnswDocumentIndex[SimpleSchema](work_dir='./tmp') +``` + +### Index + +## ElasticSearch From 4ea4b7d4f6d60ad13d65505560b5bb1b11d11887 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Fri, 7 Apr 2023 18:21:56 +0200 Subject: [PATCH 02/35] docs: add the crud operations Signed-off-by: nan-wang --- docs/user_guide/storing/index.md | 44 +++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/user_guide/storing/index.md b/docs/user_guide/storing/index.md index 07511f4b2a2..9cda1a7d0a6 100644 --- a/docs/user_guide/storing/index.md +++ b/docs/user_guide/storing/index.md @@ -24,12 +24,54 @@ from docarray.typing import NdArray class SimpleSchema(BaseDoc): - tens: NdArray[10] = Field(dim=128, space='cosine') + tensor: NdArray[128] = Field(dim=128, space='cosine') doc_index = HnswDocumentIndex[SimpleSchema](work_dir='./tmp') ``` ### Index +Use `.index()` to add `Doc` into the index. You need to define the `Doc` following the schema of the index. + +```python +from docarray import BaseDoc +from docarray.typing import NdArray +import numpy as np + +class SimpleDoc(BaseDoc): + tensor: NdArray[128] + +index_docs = [SimpleDoc(tensor=np.zeros(128)) for _ in range(64)] + +doc_index.index(index_docs) +``` + +### Access +To access the `Doc`, you need to specify the `id`. You can also pass a list of `id` to access multiple `Doc`. + +```python +# access a single Doc +doc_index[index_docs[16].id] + +# access multiple Docs +doc_index[index_docs[16].id, index_docs[17].id] +``` + +### Delete +To delete the `Doc`, use the built-in function `del` with the `id` of the `Doc` to be deleted. You can also pass a list of `id` to delete multiple `Doc`. 
+ +```python +# delete a single Doc +del doc_index[index_docs[16].id] + +# delete multiple Docs +del doc_index[index_docs[16].id, index_docs[17].id] +``` + +### Find nearest neighbors + +#### Find by a field + +#### Find a nested Doc ## ElasticSearch From 1429e4c5a0a375a9e8e9a1890f21fddcbf0865d4 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Fri, 7 Apr 2023 18:47:43 +0200 Subject: [PATCH 03/35] docs: add docs Signed-off-by: nan-wang --- docs/user_guide/storing/index.md | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/docs/user_guide/storing/index.md b/docs/user_guide/storing/index.md index 9cda1a7d0a6..723a5fd9a0a 100644 --- a/docs/user_guide/storing/index.md +++ b/docs/user_guide/storing/index.md @@ -15,6 +15,8 @@ This section show you how to use the `DocArray.index` module. `DocArray.index` m ### Construct To construct an index, you need to define the schema first. You can define the schema in the same way as define a `Doc`. The only difference is that you need to define the dimensionality of the vector space by `dim` and the name of the space by `space`. The `dim` argument must be an integer. The `space` argument can be one of `l2`, `ip` or `cosine`. TODO: add links to the detailed explaination +`work_dir` is the directory for storing the index. If there is an index in the directory, it will be automatically loaded. When the schema of the saved and the defined index do not match, an exception will be raised. + ```python from pydantic import Field @@ -31,7 +33,7 @@ doc_index = HnswDocumentIndex[SimpleSchema](work_dir='./tmp') ``` ### Index -Use `.index()` to add `Doc` into the index. You need to define the `Doc` following the schema of the index. +Use `.index()` to add `Doc` into the index. You need to define the `Doc` following the schema of the index. `.num_docs()` returns the total number of `Doc` in the index. 
```python from docarray import BaseDoc @@ -44,6 +46,7 @@ class SimpleDoc(BaseDoc): index_docs = [SimpleDoc(tensor=np.zeros(128)) for _ in range(64)] doc_index.index(index_docs) +print(f'number of docs in the index: {doc_index.num_docs()}') ``` ### Access @@ -69,9 +72,29 @@ del doc_index[index_docs[16].id, index_docs[17].id] ``` ### Find nearest neighbors +Use `.find()` to find the nearest neighbors. You can use `limit` argument to configurate how much `Doc` to return. + +```python +query = SimpleDoc(tensor=np.ones(10)) -#### Find by a field +docs, scores = doc_index.find(query, limit=5) +``` -#### Find a nested Doc +### Nested index +When using the index, you can define multiple fields as well as the nested structure. + +```python +# example of construct nested and flat index +``` -## ElasticSearch +Use the `search_field` to specify which field to be used when performing the vector search. You can use the dunder operator to specify the field defined in the nested data. + +```python +# example of find nested and flat index +``` + +To delete a nested data, ... 
+ +```python +# example of delete nested and flat index +``` \ No newline at end of file From 8cd8d81090125baa94548a8d8d069430838d892d Mon Sep 17 00:00:00 2001 From: nan-wang Date: Fri, 7 Apr 2023 18:48:32 +0200 Subject: [PATCH 04/35] docs: rename the file Signed-off-by: nan-wang --- docs/user_guide/storing/{index.md => index_hnswlib.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/user_guide/storing/{index.md => index_hnswlib.md} (100%) diff --git a/docs/user_guide/storing/index.md b/docs/user_guide/storing/index_hnswlib.md similarity index 100% rename from docs/user_guide/storing/index.md rename to docs/user_guide/storing/index_hnswlib.md From de8b0cb3f8b9944cddfee85fa5519a8a4d624ba2 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Sat, 8 Apr 2023 10:16:29 +0200 Subject: [PATCH 05/35] docs: complete the hnswlib index Signed-off-by: nan-wang --- docs/user_guide/storing/index_hnswlib.md | 75 ++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 6 deletions(-) diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md index 723a5fd9a0a..0951caaff12 100644 --- a/docs/user_guide/storing/index_hnswlib.md +++ b/docs/user_guide/storing/index_hnswlib.md @@ -26,7 +26,7 @@ from docarray.typing import NdArray class SimpleSchema(BaseDoc): - tensor: NdArray[128] = Field(dim=128, space='cosine') + tensor: NdArray = Field(dim=128, space='cosine') doc_index = HnswDocumentIndex[SimpleSchema](work_dir='./tmp') @@ -41,7 +41,7 @@ from docarray.typing import NdArray import numpy as np class SimpleDoc(BaseDoc): - tensor: NdArray[128] + tensor: NdArray index_docs = [SimpleDoc(tensor=np.zeros(128)) for _ in range(64)] @@ -81,20 +81,83 @@ docs, scores = doc_index.find(query, limit=5) ``` ### Nested index -When using the index, you can define multiple fields as well as the nested structure. +When using the index, you can define multiple fields as well as the nested structure. 
In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTubeVideoDoc` has `thumbnail` and `video` fields, each of which has its own `tensor`. ```python -# example of construct nested and flat index +from docarray import BaseDoc +from docarray.typing import ImageUrl, VideoUrl, AnyTensor +from docarray.index import HnswDocumentIndex +import numpy as np +from pydantic import Field + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: AnyTensor = Field(space='cosine', dim=64) + + +class VideoDoc(BaseDoc): + url: VideoUrl + tensor: AnyTensor = Field(space='cosine', dim=128) + + +class YouTubeVideoDoc(BaseDoc): + title: str + description: str + thumbnail: ImageDoc + video: VideoDoc + tensor: AnyTensor = Field(space='cosine', dim=256) + + +doc_index = HnswDocumentIndex[YouTubeVideoDoc](work_dir='./tmp') +index_docs = [ + YouTubeVideoDoc( + title=f'video {i+1}', + description=f'this is video from author {10*i}', + thumbnail=ImageDoc( + url=f'http://example.ai/images/{i}', + tensor=np.ones(64)), + video=VideoDoc( + url=f'http://example.ai/videos/{i}', + tensor=np.ones(128) + ), + tensor=np.ones(256) + ) for i in range(8) +] +doc_index.index(index_docs) ``` Use the `search_field` to specify which field to use when performing the vector search. You can use the dunder operator to specify the field defined in the nested data. In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` or `video` field.
```python # example of find nested and flat index +query_doc = YouTubeVideoDoc( + title=f'video query', + description=f'this is a query video', + thumbnail=ImageDoc( + url=f'http://example.ai/images/1024', + tensor=np.ones(64) + ), + video=VideoDoc( + url=f'http://example.ai/videos/1024', + tensor=np.ones(128) + ), + tensor=np.ones(256) +) +# find by the youtubevideo tensor +docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3) +# find by the thumbnail tensor +docs, scores = doc_index.find(query_doc, search_field='thumbnail__tensor', limit=3) +# find by the video tensor +docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3) ``` -To delete a nested data, ... +To delete a nested data, you need to specify the `id`. + +!!! note + You can only delete `Doc` at the top level. Deletion of the `Doc` on the lower level is not supported yet. ```python # example of delete nested and flat index +del doc_index[index_docs[16].id, index_docs[32].id] ``` \ No newline at end of file From b151c5276dfc1c350cc677709ca2fc6c93c0d15d Mon Sep 17 00:00:00 2001 From: nan-wang Date: Sat, 8 Apr 2023 14:00:39 +0200 Subject: [PATCH 06/35] docs: add the index api Signed-off-by: nan-wang --- docs/api_references/index/backends.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 docs/api_references/index/backends.md diff --git a/docs/api_references/index/backends.md b/docs/api_references/index/backends.md new file mode 100644 index 00000000000..6bfcaf17670 --- /dev/null +++ b/docs/api_references/index/backends.md @@ -0,0 +1,3 @@ +# Backends + +::: docarray.index.backends From 3d14556026efb63b52e013fe80acaaff15031803 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Sat, 8 Apr 2023 14:07:25 +0200 Subject: [PATCH 07/35] docs: add elastic index Signed-off-by: nan-wang --- docs/user_guide/storing/first_step.md | 12 ++++++++++++ docs/user_guide/storing/index_elastic.md | 2 ++ docs/user_guide/storing/index_hnswlib.md | 15 ++++++--------- 3 files changed, 
20 insertions(+), 9 deletions(-) create mode 100644 docs/user_guide/storing/index_elastic.md diff --git a/docs/user_guide/storing/first_step.md b/docs/user_guide/storing/first_step.md index 5be8b39165b..91a6ed8d0d5 100644 --- a/docs/user_guide/storing/first_step.md +++ b/docs/user_guide/storing/first_step.md @@ -1 +1,13 @@ # Storing + +## Index +This section show you how to use the `DocArray.index` module. `DocArray.index` module is used to create index for the tensors so that one can search the document based on the vector similarity. `DocArray.index` implements the following index. + +- link to hnswlib +- link to elastic + +## Store +This section show you how to use the `DocArray.store` module. `DocArray.store` module is used to store the `Doc`. + +- link to jac +- link to s3 diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md new file mode 100644 index 00000000000..b703406c1d9 --- /dev/null +++ b/docs/user_guide/storing/index_elastic.md @@ -0,0 +1,2 @@ +# Elastic +[] \ No newline at end of file diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md index 0951caaff12..00df75d1b68 100644 --- a/docs/user_guide/storing/index_hnswlib.md +++ b/docs/user_guide/storing/index_hnswlib.md @@ -1,7 +1,4 @@ -# Index -This section show you how to use the `DocArray.index` module. `DocArray.index` module is used to create index for the tensors so that one can search the document based on the vector similarity. `DocArray.index` implements the following index. - -## Hnswlib +# Hnswlib [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implement the index based on [hnswlib](https://github.com/nmslib/hnswlib). This is a lightweight implementation with vectors stored in memory. @@ -12,7 +9,7 @@ This section show you how to use the `DocArray.index` module. 
`DocArray.index` m pip install "docarray[hnswlib]" ``` -### Construct +## Construct To construct an index, you need to define the schema first. You can define the schema in the same way as define a `Doc`. The only difference is that you need to define the dimensionality of the vector space by `dim` and the name of the space by `space`. The `dim` argument must be an integer. The `space` argument can be one of `l2`, `ip` or `cosine`. TODO: add links to the detailed explaination `work_dir` is the directory for storing the index. If there is an index in the directory, it will be automatically loaded. When the schema of the saved and the defined index do not match, an exception will be raised. @@ -32,7 +29,7 @@ class SimpleSchema(BaseDoc): doc_index = HnswDocumentIndex[SimpleSchema](work_dir='./tmp') ``` -### Index +## Index Use `.index()` to add `Doc` into the index. You need to define the `Doc` following the schema of the index. `.num_docs()` returns the total number of `Doc` in the index. ```python @@ -49,7 +46,7 @@ doc_index.index(index_docs) print(f'number of docs in the index: {doc_index.num_docs()}') ``` -### Access +## Access To access the `Doc`, you need to specify the `id`. You can also pass a list of `id` to access multiple `Doc`. ```python @@ -60,7 +57,7 @@ doc_index[index_docs[16].id] doc_index[index_docs[16].id, index_docs[17].id] ``` -### Delete +## Delete To delete the `Doc`, use the built-in function `del` with the `id` of the `Doc` to be deleted. You can also pass a list of `id` to delete multiple `Doc`. ```python @@ -80,7 +77,7 @@ query = SimpleDoc(tensor=np.ones(10)) docs, scores = doc_index.find(query, limit=5) ``` -### Nested index +## Nested index When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTbueVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`. 
```python From bffd2b5be9eb8f443a7c11bca3bf65a62ba7973e Mon Sep 17 00:00:00 2001 From: nan-wang Date: Sat, 8 Apr 2023 22:11:52 +0200 Subject: [PATCH 08/35] docs: complete the elastic index Signed-off-by: nan-wang --- docs/user_guide/storing/index_elastic.md | 252 ++++++++++++++++++++++- docs/user_guide/storing/index_hnswlib.md | 4 +- 2 files changed, 253 insertions(+), 3 deletions(-) diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index b703406c1d9..7ac5825363e 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -1,2 +1,252 @@ # Elastic -[] \ No newline at end of file +[ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex] implement the index based on [Elasticsearch 7.0](https://github.com/elastic/elasticsearch). This is an implementation with vectors stored and supporting text/range search. + +!!! note + To use [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex], one need to install the extra dependency with the following command + + ```console + pip install "docarray[elasticsearch]" + ``` + + +In the following examples, we use docker-compose to create a local elasticsearch service with the following `docker-compose.yml`. + +```yaml +version: "3.3" +services: + elastic: + image: docker.elastic.co/elasticsearch/elasticsearch:7.10.2 + environment: + - xpack.security.enabled=false + - discovery.type=single-node + - ES_JAVA_OPTS=-Xmx1024m + ports: + - "9200:9200" + networks: + - elastic + +networks: + elastic: + name: elastic +``` + +Run the following command in the folder of the above `docker-compose.yml` to start the service, + +```bash +docker-compose up +``` + +## Construct +To construct an index, you need to define the schema first. You can define the schema in the same way as define a `Doc`. The only difference is that you need to define the dimensionality of the vector space by `dims`. The `dims` argument must be an integer. 
TODO: add links to the detailed explaination + +`hosts` is the argument for setting the elasticsearch hosts. By default, it is using `http://localhost:9200`. TODO: add more detailed explaination of the ES-related parameters. + +```python +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticV7DocIndex +from docarray.typing import NdArray + + +class SimpleDoc(BaseDoc): + tensor: NdArray = Field(dims=128) + + +doc_index = ElasticV7DocIndex[SimpleDoc]() + +``` + +## Index +Use `.index()` to add `Doc` into the index. You could use the same class as the schema for defining the `Doc`. Alternatively, you need to define the `Doc` following the schema of the index. `.num_docs()` returns the total number of `Doc` in the index. + +```python + +index_docs = [SimpleDoc(tensor=np.ones(128)) for _ in range(64)] + +doc_index.index(index_docs) + +print(f'number of docs in the index: {doc_index.num_docs()}') +``` + +## Access +To access the `Doc`, you need to specify the `id`. You can also pass a list of `id` to access multiple `Doc`. + +```python +# access a single Doc +doc_index[index_docs[16].id] + +# access multiple Docs +doc_index[index_docs[16].id, index_docs[17].id] +``` + +## Delete +To delete the `Doc`, use the built-in function `del` with the `id` of the `Doc` to be deleted. You can also pass a list of `id` to delete multiple `Doc`. + +```python +# delete a single Doc +del doc_index[index_docs[16].id] + +# delete multiple Docs +del doc_index[index_docs[16].id, index_docs[17].id] +``` + +## Find Nearest Neighbors +Use `.find()` to find the nearest neighbors. You can use `limit` argument to configurate how much `Doc` to return. + +```python +query = SimpleDoc(tensor=np.ones(128)) + +docs, scores = doc_index.find(query, limit=5) +``` + +!!! note + [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex] is using Elasticsearch v7.x which does not support approximate nearest neighbour algorithms as Hnswlib. 
This could lead to a poor performance when the search involves too many vectors. + +## Nested Index +When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTubeVideoDoc` has `thumbnail` and `video` fields, each of which has its own `tensor`. + +```python +from docarray import BaseDoc +from docarray.typing import ImageUrl, VideoUrl, AnyTensor +from docarray.index import ElasticV7DocIndex +import numpy as np +from pydantic import Field + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: AnyTensor = Field(space='cosine', dim=64) + + +class VideoDoc(BaseDoc): + url: VideoUrl + tensor: AnyTensor = Field(space='cosine', dim=128) + + +class YouTubeVideoDoc(BaseDoc): + title: str + description: str + thumbnail: ImageDoc + video: VideoDoc + tensor: AnyTensor = Field(space='cosine', dim=256) + + +doc_index = ElasticV7DocIndex[YouTubeVideoDoc]() +index_docs = [ + YouTubeVideoDoc( + title=f'video {i+1}', + description=f'this is video from author {10*i}', + thumbnail=ImageDoc( + url=f'http://example.ai/images/{i}', + tensor=np.ones(64)), + video=VideoDoc( + url=f'http://example.ai/videos/{i}', + tensor=np.ones(128) + ), + tensor=np.ones(256) + ) for i in range(8) +] +doc_index.index(index_docs) +``` + +Use the `search_field` to specify which field to use when performing the vector search. You can use the dunder operator to specify the field defined in the nested data. In the following code, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` or `video` field.
+ +```python +# example of find nested and flat index +query_doc = YouTubeVideoDoc( + title=f'video query', + description=f'this is a query video', + thumbnail=ImageDoc( + url=f'http://example.ai/images/1024', + tensor=np.ones(64) + ), + video=VideoDoc( + url=f'http://example.ai/videos/1024', + tensor=np.ones(128) + ), + tensor=np.ones(256) +) +# find by the youtubevideo tensor +docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3) +# find by the thumbnail tensor +docs, scores = doc_index.find(query_doc, search_field='thumbnail__tensor', limit=3) +# find by the video tensor +docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3) +``` + +To delete a nested data, you need to specify the `id`. + +!!! note +You can only delete `Doc` at the top level. Deletion of the `Doc` on the lower level is not supported yet. + +```python +# example of delete nested and flat index +del doc_index[index_docs[16].id, index_docs[32].id] +``` + +## Elasticsearch Query +Besides the vector search, you can also perform other queries supported by Elasticsearch. + +### Text Search +As in elasticsearch, you could use text search directly on the field of the type `str`. + +```python +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticV7DocIndex + + +class NewsDoc(BaseDoc): + text: str + + +doc_index = ElasticV7DocIndex[NewsDoc]() +index_docs = [ + NewsDoc(id='0', text='this is a news for sport'), + NewsDoc(id='1', text='this is a news for finance'), + NewsDoc(id='2', text='this is another news for sport'), +] +doc_index.index(index_docs) +query = 'finance' +# search with text +docs, scores = doc_index.text_search(query, search_field='text') +``` + +### Query Filter +To filter the docs, you can use `col_type` to configurate the fields. + +#### Keyword filter +To filter the docs, you can use `col_type='keyword'` to configurate the keyword search for the fields. 
+ +```python +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticV7DocIndex + + +class NewsDoc(BaseDoc): + text: str + category: str = Field(col_type='keyword') + + +doc_index = ElasticV7DocIndex[NewsDoc]() +index_docs = [ + NewsDoc(id='0', text='this is a news for sport', category='sport'), + NewsDoc(id='1', text='this is a news for finance', category='finance'), + NewsDoc(id='2', text='this is another news for sport', category='sport'), +] +doc_index.index(index_docs) + +# search with filer +query_filter = {'terms': {'category': ['sport']}} +docs = doc_index.filter(query_filter) +``` + +#### Range filter +To filter the docs, you can use `col_type='range'` to configurate the keyword search for the fields. + +#### Geolocation filter +To filter the docs, you can use `col_type='geo_point'` to configurate the keyword search for the fields. diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md index 00df75d1b68..ad80af820b4 100644 --- a/docs/user_guide/storing/index_hnswlib.md +++ b/docs/user_guide/storing/index_hnswlib.md @@ -68,7 +68,7 @@ del doc_index[index_docs[16].id] del doc_index[index_docs[16].id, index_docs[17].id] ``` -### Find nearest neighbors +## Find Nearest Neighbors Use `.find()` to find the nearest neighbors. You can use `limit` argument to configurate how much `Doc` to return. ```python @@ -77,7 +77,7 @@ query = SimpleDoc(tensor=np.ones(10)) docs, scores = doc_index.find(query, limit=5) ``` -## Nested index +## Nested Index When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTbueVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`. 
```python From 4538559c12fb701396835c1b1578bc55a58d6354 Mon Sep 17 00:00:00 2001 From: nan-wang Date: Sun, 9 Apr 2023 09:08:21 +0200 Subject: [PATCH 09/35] docs: add geolocation filter example Signed-off-by: nan-wang --- docs/user_guide/storing/index_elastic.md | 42 ++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index 7ac5825363e..abef8bc952c 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -245,8 +245,44 @@ query_filter = {'terms': {'category': ['sport']}} docs = doc_index.filter(query_filter) ``` +#### Geolocation filter +To filter the docs, you can use `col_type='geo_point'` to configurate the keyword search for the fields. You need to construct the query and use `execute_query()` to perform the query. + +```python +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticV7DocIndex + + +class NewsDoc(BaseDoc): + text: str + location: dict = Field(col_type='geo_point') + +doc_index = ElasticV7DocIndex[NewsDoc]() +index_docs = [ + NewsDoc(text='this is from Berlin', location={'lon': 13.24, 'lat': 50.31}), + NewsDoc(text='this is from Beijing', location={'lon': 116.22, 'lat': 39.55}), + NewsDoc(text='this is from San Jose', location={'lon': -121.89, 'lat': 37.34}), +] +doc_index.index(index_docs) + +# filter the eastern hemisphere +query = { + 'query': { + 'geo_bounding_box': { + 'location': { + 'top_left': {'lon': 0, 'lat': 90}, + 'bottom_right': {'lon': 180, 'lat': 0}, + } + } + } +} + +docs, _ = doc_index.execute_query(query) +``` + #### Range filter -To filter the docs, you can use `col_type='range'` to configurate the keyword search for the fields. +You can use `col_type='date_range'` is used to filter the docs based on the range of the date. TODO: find a use case. 
+ -#### Geolocation filter -To filter the docs, you can use `col_type='geo_point'` to configurate the keyword search for the fields. From 6aaa21e916dead9e7da5452a006d1daaa08c7845 Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 12 Apr 2023 15:06:15 +0800 Subject: [PATCH 10/35] docs: update elastic index Signed-off-by: AnneY --- docs/user_guide/storing/index_elastic.md | 83 ++++++++++++++++-------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index abef8bc952c..ba915ddf371 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -1,5 +1,5 @@ # Elastic -[ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex] implement the index based on [Elasticsearch 7.0](https://github.com/elastic/elasticsearch). This is an implementation with vectors stored and supporting text/range search. +[ElasticV7DocIndex](docarray.index.backends.elastic.ElasticV7DocIndex) implement the index based on [Elasticsearch 7.10](https://github.com/elastic/elasticsearch). This is an implementation with vectors stored and supporting text/range search. !!! note To use [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex], one need to install the extra dependency with the following command @@ -8,14 +8,24 @@ pip install "docarray[elasticsearch]" ``` +[ElasticDocIndex](docarray.index.backends.elastic.ElasticDocIndex) is based on [Elasticsearch 8](https://github.com/elastic/elasticsearch) and supports hnsw based vector search as well. -In the following examples, we use docker-compose to create a local elasticsearch service with the following `docker-compose.yml`. +!!! 
note + To use [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex], one need to install the extra dependency with the following command + + ```console + pip install elasticsearch==8.6.2 + pip install elastic-transport + ``` + + +The following examples is based on `ElasticDocIndex`. We use docker-compose to create a local elasticsearch service with the following `docker-compose.yml`. ```yaml version: "3.3" services: elastic: - image: docker.elastic.co/elasticsearch/elasticsearch:7.10.2 + image: docker.elastic.co/elasticsearch/elasticsearch:8.6.2 environment: - xpack.security.enabled=false - discovery.type=single-node @@ -37,31 +47,32 @@ docker-compose up ``` ## Construct -To construct an index, you need to define the schema first. You can define the schema in the same way as define a `Doc`. The only difference is that you need to define the dimensionality of the vector space by `dims`. The `dims` argument must be an integer. TODO: add links to the detailed explaination +To construct an index, you need to define the schema first. You can define the schema in the same way as defining a `Doc`. Dimensionality is necessary for vector space, you need to specify the shape or define it by `dims`. TODO: add links to the detailed explaination. + +`hosts` is the argument for setting the elasticsearch hosts. By default, it is `http://localhost:9200`. -`hosts` is the argument for setting the elasticsearch hosts. By default, it is using `http://localhost:9200`. TODO: add more detailed explaination of the ES-related parameters. ```python from pydantic import Field from docarray import BaseDoc -from docarray.index import ElasticV7DocIndex +from docarray.index import ElasticDocIndex from docarray.typing import NdArray class SimpleDoc(BaseDoc): tensor: NdArray = Field(dims=128) + # tensor: NdArray[128] -doc_index = ElasticV7DocIndex[SimpleDoc]() - +doc_index = ElasticDocIndex[SimpleDoc]() ``` +TODO some common info: specifying col_type, custom_config, Union etc. 
## Index Use `.index()` to add `Doc` into the index. You could use the same class as the schema for defining the `Doc`. Alternatively, you need to define the `Doc` following the schema of the index. `.num_docs()` returns the total number of `Doc` in the index. ```python - index_docs = [SimpleDoc(tensor=np.ones(128)) for _ in range(64)] doc_index.index(index_docs) @@ -80,6 +91,19 @@ doc_index[index_docs[16].id] doc_index[index_docs[16].id, index_docs[17].id] ``` +### Persistence +To access a `Doc` formerly persisted, you can specify `index_name` and the `hosts`. + +```python +doc_index = ElasticDocIndex[SimpleDoc](index_name='previously_stored') +doc_index.index(index_docs) + +doc_index2 = ElasticDocIndex[SimpleDoc](index_name='previously_stored') + +print(f'number of docs in the persisted index: {doc_index2.num_docs()}') +``` + + ## Delete To delete the `Doc`, use the built-in function `del` with the `id` of the `Doc` to be deleted. You can also pass a list of `id` to delete multiple `Doc`. @@ -101,7 +125,7 @@ docs, scores = doc_index.find(query, limit=5) ``` !!! note - [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex] is using Elasticsearch v7.x which does not support approximate nearest neighbour algorithms as Hnswlib. This could lead to a poor performance when the search involves too many vectors. + [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex] is using Elasticsearch v7.10.1 which does not support approximate nearest neighbour algorithms as Hnswlib. This could lead to a poor performance when the search involves too many vectors. ## Nested Index When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTbueVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`. 
@@ -137,15 +161,11 @@ index_docs = [ YouTubeVideoDoc( title=f'video {i+1}', description=f'this is video from author {10*i}', - thumbnail=ImageDoc( - url=f'http://example.ai/images/{i}', - tensor=np.ones(64)), - video=VideoDoc( - url=f'http://example.ai/videos/{i}', - tensor=np.ones(128) - ), - tensor=np.ones(256) - ) for i in range(8) + thumbnail=ImageDoc(url=f'http://example.ai/images/{i}', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/{i}', tensor=np.ones(128)), + tensor=np.ones(256), + ) + for i in range(8) ] doc_index.index(index_docs) ``` @@ -157,15 +177,9 @@ Use the `search_field` to specify which field to be used when performing the vec query_doc = YouTubeVideoDoc( title=f'video query', description=f'this is a query video', - thumbnail=ImageDoc( - url=f'http://example.ai/images/1024', - tensor=np.ones(64) - ), - video=VideoDoc( - url=f'http://example.ai/videos/1024', - tensor=np.ones(128) - ), - tensor=np.ones(256) + thumbnail=ImageDoc(url=f'http://example.ai/images/1024', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/1024', tensor=np.ones(128)), + tensor=np.ones(256), ) # find by the youtubevideo tensor docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3) @@ -259,6 +273,7 @@ class NewsDoc(BaseDoc): text: str location: dict = Field(col_type='geo_point') + doc_index = ElasticV7DocIndex[NewsDoc]() index_docs = [ NewsDoc(text='this is from Berlin', location={'lon': 13.24, 'lat': 50.31}), @@ -286,3 +301,15 @@ docs, _ = doc_index.execute_query(query) You can use `col_type='date_range'` is used to filter the docs based on the range of the date. TODO: find a use case. 
+ +## Config + +The following configs can be set in `DBConfig`: + +| Name | Description | Default | +|-------------------|----------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| `hosts` | Hostname of the Elasticsearch server | `http://localhost:9200` | +| `es_config` | Other ES [configuration options](https://www.elastic.co/guide/en/elasticsearch/client/python-api/8.6/config.html) in a Dict and pass to `Elasticsearch` client constructor, e.g. `cloud_id`, `api_key` | None | +| `index_name` | Elasticsearch index name, the name of Elasticsearch index object | None | +| `index_settings` | Other [index settings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/index-modules.html#index-modules-settings) in a Dict for creating the index | dict | +| `index_mappings` | Other [index mappings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping.html) in a Dict for creating the index | dict | From 2c7146f031cac8bdf65a99ea3f1b4830a5fbce1f Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 12 Apr 2023 16:32:38 +0800 Subject: [PATCH 11/35] docs: update es index filter and todo Signed-off-by: AnneY --- docs/user_guide/storing/index_elastic.md | 57 +++++++++++++++--------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index ba915ddf371..9f569aafebe 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -116,7 +116,7 @@ del doc_index[index_docs[16].id, index_docs[17].id] ``` ## Find Nearest Neighbors -Use `.find()` to find the nearest neighbors. You can use `limit` argument to configurate how much `Doc` to return. +Use `.find()` to find the nearest neighbors of a tensor. 
You can use `limit` argument to configurate how much `Doc` to return, and `search_field` argument to configurate the name of the field to search on. ```python query = SimpleDoc(tensor=np.ones(128)) @@ -133,19 +133,19 @@ When using the index, you can define multiple fields as well as the nested struc ```python from docarray import BaseDoc from docarray.typing import ImageUrl, VideoUrl, AnyTensor -from docarray.index import ElasticV7DocIndex +from docarray.index import ElasticDocIndex import numpy as np from pydantic import Field class ImageDoc(BaseDoc): url: ImageUrl - tensor: AnyTensor = Field(space='cosine', dim=64) + tensor: AnyTensor = Field(similarity='cosine', dims=64) class VideoDoc(BaseDoc): url: VideoUrl - tensor: AnyTensor = Field(space='cosine', dim=128) + tensor: AnyTensor = Field(similarity='cosine', dims=128) class YouTubeVideoDoc(BaseDoc): @@ -153,10 +153,10 @@ class YouTubeVideoDoc(BaseDoc): description: str thumbnail: ImageDoc video: VideoDoc - tensor: AnyTensor = Field(space='cosine', dim=256) + tensor: AnyTensor = Field(similarity='cosine', dims=256) -doc_index = ElasticV7DocIndex[YouTubeVideoDoc]() +doc_index = ElasticDocIndex[YouTubeVideoDoc]() index_docs = [ YouTubeVideoDoc( title=f'video {i+1}', @@ -199,24 +199,26 @@ You can only delete `Doc` at the top level. Deletion of the `Doc` on the lower l del doc_index[index_docs[16].id, index_docs[32].id] ``` +TODO field_name of nested level + ## Elasticsearch Query Besides the vector search, you can also perform other queries supported by Elasticsearch. ### Text Search -As in elasticsearch, you could use text search directly on the field of the type `str`. +As in elasticsearch, you could use text search directly on the field of type `str`. 
```python from pydantic import Field from docarray import BaseDoc -from docarray.index import ElasticV7DocIndex +from docarray.index import ElasticDocIndex class NewsDoc(BaseDoc): text: str -doc_index = ElasticV7DocIndex[NewsDoc]() +doc_index = ElasticDocIndex[NewsDoc]() index_docs = [ NewsDoc(id='0', text='this is a news for sport'), NewsDoc(id='1', text='this is a news for finance'), @@ -229,7 +231,7 @@ docs, scores = doc_index.text_search(query, search_field='text') ``` ### Query Filter -To filter the docs, you can use `col_type` to configurate the fields. +To filter the docs, you can use `col_type` to configurate the fields. `filter()` accepts queries that follow [Elasticsearch Query DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) and consists of leaf and compound clauses. #### Keyword filter To filter the docs, you can use `col_type='keyword'` to configurate the keyword search for the fields. @@ -238,7 +240,7 @@ To filter the docs, you can use `col_type='keyword'` to configurate the keyword from pydantic import Field from docarray import BaseDoc -from docarray.index import ElasticV7DocIndex +from docarray.index import ElasticDocIndex class NewsDoc(BaseDoc): @@ -246,7 +248,7 @@ class NewsDoc(BaseDoc): category: str = Field(col_type='keyword') -doc_index = ElasticV7DocIndex[NewsDoc]() +doc_index = ElasticDocIndex[NewsDoc]() index_docs = [ NewsDoc(id='0', text='this is a news for sport', category='sport'), NewsDoc(id='1', text='this is a news for finance', category='finance'), @@ -260,13 +262,13 @@ docs = doc_index.filter(query_filter) ``` #### Geolocation filter -To filter the docs, you can use `col_type='geo_point'` to configurate the keyword search for the fields. You need to construct the query and use `execute_query()` to perform the query. +To filter the docs, you can use `col_type='geo_point'` to configurate the keyword search for the fields. 
```python from pydantic import Field from docarray import BaseDoc -from docarray.index import ElasticV7DocIndex +from docarray.index import ElasticDocIndex class NewsDoc(BaseDoc): @@ -274,7 +276,7 @@ class NewsDoc(BaseDoc): location: dict = Field(col_type='geo_point') -doc_index = ElasticV7DocIndex[NewsDoc]() +doc_index = ElasticDocIndex[NewsDoc]() index_docs = [ NewsDoc(text='this is from Berlin', location={'lon': 13.24, 'lat': 50.31}), NewsDoc(text='this is from Beijing', location={'lon': 116.22, 'lat': 39.55}), @@ -284,26 +286,35 @@ doc_index.index(index_docs) # filter the eastern hemisphere query = { - 'query': { - 'geo_bounding_box': { - 'location': { - 'top_left': {'lon': 0, 'lat': 90}, - 'bottom_right': {'lon': 180, 'lat': 0}, + 'bool': { + 'filter': { + 'geo_bounding_box': { + 'location': { + 'top_left': {'lon': 0, 'lat': 90}, + 'bottom_right': {'lon': 180, 'lat': 0}, + } } } } } -docs, _ = doc_index.execute_query(query) +docs = doc_index.filter(query) ``` #### Range filter -You can use `col_type='date_range'` is used to filter the docs based on the range of the date. TODO: find a use case. +You can use `col_type='date_range'` is used to filter the docs based on the range of the date. +TODO: find a use case. 
+### QueryBuilder + + +## Batched Operation + ## Config +### DBConfig The following configs can be set in `DBConfig`: | Name | Description | Default | @@ -313,3 +324,5 @@ The following configs can be set in `DBConfig`: | `index_name` | Elasticsearch index name, the name of Elasticsearch index object | None | | `index_settings` | Other [index settings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/index-modules.html#index-modules-settings) in a Dict for creating the index | dict | | `index_mappings` | Other [index mappings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping.html) in a Dict for creating the index | dict | + +### RuntimeConfig \ No newline at end of file From 0eba06257d6201389af10b75571a625041ee261e Mon Sep 17 00:00:00 2001 From: AnneY Date: Wed, 12 Apr 2023 21:06:14 +0800 Subject: [PATCH 12/35] docs: es index querybuilder and runtimeconfig Signed-off-by: AnneY --- docs/user_guide/storing/index_elastic.md | 91 ++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 5 deletions(-) diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index 9f569aafebe..2f41e1ff4eb 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -199,7 +199,7 @@ You can only delete `Doc` at the top level. Deletion of the `Doc` on the lower l del doc_index[index_docs[16].id, index_docs[32].id] ``` -TODO field_name of nested level +TODO style of field_name of nested level ## Elasticsearch Query Besides the vector search, you can also perform other queries supported by Elasticsearch. @@ -302,15 +302,79 @@ docs = doc_index.filter(query) ``` #### Range filter -You can use `col_type='date_range'` is used to filter the docs based on the range of the date. -TODO: find a use case. 
+You can have [range field types](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/range.html) in your `Doc` schema and set `col_type='integer_range'`(or also `date_range`, etc.) to filter the docs based on the range of the field. +```python +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticDocIndex + + +class NewsDoc(BaseDoc): + time_frame: dict = Field(col_type='date_range', format='yyyy-MM-dd') + + +doc_index = ElasticDocIndex[NewsDoc]() +index_docs = [ + NewsDoc(time_frame={'gte': '2023-01-01', 'lt': '2023-02-01'}), + NewsDoc(time_frame={'gte': '2023-02-01', 'lt': '2023-03-01'}), + NewsDoc(time_frame={'gte': '2023-03-01', 'lt': '2023-04-01'}), +] +doc_index.index(index_docs) + +query = { + 'bool': { + 'filter': { + 'range': { + 'time_frame': { + 'gte': '2023-02-05', + 'lt': '2023-02-10', + 'relation': 'contains', + } + } + } + } +} +docs = doc_index.filter(query) +``` ### QueryBuilder +You can use `QueryBuilder` to build your own query. `find()`, `filter()` and `text_search()` methods and their combination are supported. + +```python +import numpy as np +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticDocIndex +from docarray.typing import NdArray + +class MyDoc(BaseDoc): + tens: NdArray[10] = Field(similarity='l2_norm') + num: int + text: str -## Batched Operation +doc_index = ElasticDocIndex[MyDoc]() +index_docs = [ + MyDoc(id=f'{i}', tens=np.ones(10) * i, num=int(i / 2), text=f'text {int(i/2)}') + for i in range(10) +] +doc_index.index(index_docs) + +q = ( + doc_index.build_query() + .filter({'range': {'num': {'lte': 3}}}) + .find(index_docs[-1], search_field='tens') + .text_search('0', search_field='text') + .build() +) +docs, _ = doc_index.execute_query(q) +``` + +You can also directly pass a query to `execute_query()` method. 
## Config @@ -325,4 +389,21 @@ The following configs can be set in `DBConfig`: | `index_settings` | Other [index settings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/index-modules.html#index-modules-settings) in a Dict for creating the index | dict | | `index_mappings` | Other [index mappings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping.html) in a Dict for creating the index | dict | -### RuntimeConfig \ No newline at end of file +### RuntimeConfig + +The `RuntimeConfig` dataclass of `ElasticDocIndex` consists of `default_column_config` and `chunk_size`. You can change `chunk_size` for batch operations. + +```python +doc_index = ElasticDocIndex[SimpleDoc]() +doc_index.configure(ElasticDocIndex.RuntimeConfig(chunk_size=1000)) +``` + +`default_column_config` is the default configurations for every column type. Since there are many column types in Elasticsearch, you can also consider changing the column config when defining the schema. + +```python +class SimpleDoc(BaseDoc): + tensor: NdArray[128] = Field(similarity='l2_norm', 'm'=32, 'num_candidates'=5000) + + +doc_index = ElasticDocIndex[SimpleDoc]() +``` \ No newline at end of file From f89a710e47a0fded769c3c4455e87e557c8e5e6b Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Wed, 12 Apr 2023 17:48:10 +0200 Subject: [PATCH 13/35] docs: add doc index docs Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_step.md | 13 -- docs/user_guide/storing/first_steps.md | 221 +++++++++++++++++++++++++ 2 files changed, 221 insertions(+), 13 deletions(-) delete mode 100644 docs/user_guide/storing/first_step.md create mode 100644 docs/user_guide/storing/first_steps.md diff --git a/docs/user_guide/storing/first_step.md b/docs/user_guide/storing/first_step.md deleted file mode 100644 index 91a6ed8d0d5..00000000000 --- a/docs/user_guide/storing/first_step.md +++ /dev/null @@ -1,13 +0,0 @@ -# Storing - -## Index -This section show you how to use the `DocArray.index` module. 
`DocArray.index` module is used to create index for the tensors so that one can search the document based on the vector similarity. `DocArray.index` implements the following index.
-
-- link to hnswlib
-- link to elastic
-
-## Store
-This section show you how to use the `DocArray.store` module. `DocArray.store` module is used to store the `Doc`.
-
-- link to jac
-- link to s3
diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md
new file mode 100644
index 00000000000..746c7080b4d
--- /dev/null
+++ b/docs/user_guide/storing/first_steps.md
@@ -0,0 +1,221 @@
+# Store
+
+If you work with multi-modal data, usually you want to **store** it somewhere.
+
+DocArray offers two ways of storing your data:
+
+1. In a **[Document Index](#document-index)** for fast retrieval using vector similarity
+2. In a **[Document Store](#document-store)** for simple long-term storage
+
+## Document Index
+
+A Document Index lets you store your Documents and search through them using vector similarity.
+
+This is useful if you want to store a bunch of data, and at a later point retrieve Documents that are similar to
+some query that you provide.
+Concrete examples where this is relevant are neural search applications, augmenting LLMs and chatbots with domain knowledge ([Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401)),
+or recommender systems.
+
+!!! question "How does vector similarity search work?"
+    TODO
+
+DocArray's Document Index concept achieves this by providing a unified interface to a number of [vector databases](https://learn.microsoft.com/en-us/semantic-kernel/concepts-ai/vectordb).
+In fact, you can think of Document Index as an **[ORM](https://sqlmodel.tiangolo.com/db-to-code/) for vector databases**.
+ +Currently, DocArray supports the following vector databases: +- [Weaviate](https://weaviate.io/) | [Docs](TODO) +- [Qdrant](https://qdrant.tech/) | [Docs](TODO) +- [Elasticsearch](https://www.elastic.co/elasticsearch/) | [Docs v8](TODO), [Docs v7](TODO) +- [HNSWlib](https://github.com/nmslib/hnswlib) | [Docs](TODO) + +For this user guide you will use the [HNSWLibDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) +because it doesn't require you to launch a database server. Instead, it will store your data locally. + +!!! note "Using a different vector database" + You can easily use Weaviate, Qdrant, or Elasticsearch instead, they share the same API! + To do so, check out their respective documentation sections. + +!!! note "HNSWLib-specific settings" + The following sections explain the general concept of Document Index by using + [HNSWLibDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) as an example. + For HNSWLib-specific settings, check out the [HNSWLibDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) documentation. + TODO link docs + +### Create a Document Index + +To create a Document Index, your first need a Document that defines the schema of your index. + +```python +from docarray import BaseDoc +from docarray.index import HNSWLibDocumentIndex +from docarray.typing import NdArray + + +class MyDoc(BaseDoc): + embedding: NdArray[128] + text: str + + +db = HNSWLibDocumentIndex[MyDoc](work_dir='./my_test_db') +``` + +**Schema definition:** + +In this code snippet, `HNSWLibDocumentIndex` takes a schema of the form of `MyDoc`. +The Document Index then _creates column for each field in `MyDoc`_. + +The column types in the backend database are determined the type hints of the fields in the Document. +Optionally, you can customize the database types for every field TODO link to this. + +Most vector databases need to know the dimensionality of the vectors that will be stored. 
+Here, that is automatically inferred from the type hint of the `embedding` field: `NdArray[128]` means that +the database will store vectors with 128 dimensions. + +!!! note "PyTorch and TensorFlow support" + Instead of using `NdArray` you can use `TorchTensor` or `TensorFlowTensor` and the Document Index will handle that + for you. No need to convert your tensors to numpy arrays! + +**Database location:** + +For `HNSWLibDocumentIndex` you need to specify a `work_dir` where the data will be stored; for other backends you +usually specify a `host` and a `port` instead. + +Either way, if the location does not yet contain any data, we start from a blank slate. +If the location already contains data from a previous session, it will be accessible through the Document Index. + +### Index data + +Now that you have a Document Index, you can add data to it, using the [index()][docarray.index.backends.hnswlib.HnswDocumentIndex.index] method: + +```python +import numpy as np +from docarray import DocList + +# create some random data +docs = DocList[MyDoc]( + [MyDoc(embedding=np.random.rand(128), text=f'text {i}') for i in range(100)] +) + +# index the data +db.index(docs) +``` + +That call to [index()][docarray.index.backends.hnswlib.HnswDocumentIndex.index] stores all Documents in `docs` into the Document Index, +ready to be retrieved in the next step. + +As you can see, `DocList[MyDoc]` and `HNSWLibDocumentIndex[MyDoc]` are both parameterized with `MyDoc`. +This means that they share the same schema, and in general, the schema of a Document Index and the data that you want to store +need to have compatible schemas. + +!!! question "When are two schemas compatible?" + The schema of your Document Index and of your data need to be compatible with each other. + + Let's say A is the schema of your Document Index and B is the schema of your data. + There are a few rules that determine if a schema A is compatible with a schema B. 
+ If _any_ of the following is true, then A and B are compatible: + - A and B are the same class + - A and B have the same field names and field types + - A and B have the same field names, and, for every field, the type of B is a subclass of the type of A + +### Perform vector similarity search + +Now that you have indexed your data, you can perform vector similarity search using the [find()][docarray.index.backends.hnswlib.HnswDocumentIndex.find] method. + + +Provided with a Document of type `MyDoc`, [find()][docarray.index.backends.hnswlib.HnswDocumentIndex.find] can find +similar Documents in the Document Index. + +=== "Search by Document" + +```python +# create a query Document +query = MyDoc(embedding=np.random.rand(128), text='query') + +# find similar Documents +matches, scores = db.find(query, search_field='embedding', limit=5) + +print(f'{matches=}') +print(f'{matches.text=}') +print(f'{scores=}') +``` + +=== "Search by raw vector" + +```python +# create a query vector +query = np.random.rand(128) + +# find similar Documents +matches, scores = db.find(query, search_field='embedding', limit=5) + +print(f'{matches=}') +print(f'{matches.text=}') +print(f'{scores=}') +``` + +To succesfully peform a vector search, you need to specify a `search_field`. This is the field that serves as the +basis of comparison between your query and the documents in the Document Index. + +In this particular example you only have one field (`embedding`) that is a vector, so you can trivially choose that one. +In general, you could have multiple fields of type `NdArray` or `TorchTensor` or `TensorFlowTensor`, and you can choose +which one to use for the search. + +The [find()][docarray.index.backends.hnswlib.HnswDocumentIndex.find] method returns a named tuple containing the closest +matching documents and their associated similarity scores. + +How these scores are calculated depends on the backend, and can usually be configured TODO link. 
+ +**Batched search:** + +You can also search for multiple Documents at once, in a batch, using the [find_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.find_batched] method. + +=== "Search by Documents" + +```python +# create some query Documents +queries = DocList[MyDoc]( + MyDoc(embedding=np.random.rand(128), text=f'query {i}') for _ in range(3) +) + +# find similar Documents +matches, scores = db.find(queries, search_field='embedding', limit=5) + +print(f'{matches=}') +print(f'{matches.text=}') +print(f'{scores=}') +``` + +=== "Search by raw vector" + +```python +# create some query vectors +query = np.random.rand(3, 128) + +# find similar Documents +matches, scores = db.find(query, search_field='embedding', limit=5) + +print(f'{matches=}') +print(f'{matches[0].text=}') +print(f'{scores=}') +``` + +The [find_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.find_batched] method returns a named tuple containing +a list of `DocList`s, one for each query, containing the closest matching documents; and the associated similarity scores. + +### Perform filter search + +You can also perform filter search using the [filter()][docarray.index.backends.hnswlib.HnswDocumentIndex.filter] method. + +This method takes in a filter query and returns Documents that fulfill the conditions expressed through that filter: + +```python +# create a filter +``` + +### Delete data + +## Document Store +This section show you how to use the `DocArray.store` module. `DocArray.store` module is used to store the `Doc`. 
+ +- link to jac +- link to s3 From b3bc25d8597ba99d6bda0e5a19f306ca6d14eebf Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 13 Apr 2023 19:51:39 +0800 Subject: [PATCH 14/35] ci: fix elastic in documentation Signed-off-by: AnneY --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f49442f938a..3027d59fa4b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -158,6 +158,7 @@ jobs: python -m pip install poetry rm poetry.lock poetry install --all-extras + poetry run pip install elasticsearch==8.6.2 sudo apt-get update sudo apt-get install --no-install-recommends ffmpeg From d971e95316b2ec2736fb25fcb02c914b91fe91d4 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 13 Apr 2023 20:15:34 +0800 Subject: [PATCH 15/35] docs: fix elastic code examples Signed-off-by: AnneY --- docs/user_guide/storing/index_elastic.md | 45 +++--------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index 2f41e1ff4eb..d4c0d486222 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -53,6 +53,7 @@ To construct an index, you need to define the schema first. You can define the s ```python +import numpy as np from pydantic import Field from docarray import BaseDoc @@ -112,7 +113,7 @@ To delete the `Doc`, use the built-in function `del` with the `id` of the `Doc` del doc_index[index_docs[16].id] # delete multiple Docs -del doc_index[index_docs[16].id, index_docs[17].id] +del doc_index[index_docs[17].id, index_docs[18].id] ``` ## Find Nearest Neighbors @@ -121,7 +122,7 @@ Use `.find()` to find the nearest neighbors of a tensor. You can use `limit` arg ```python query = SimpleDoc(tensor=np.ones(128)) -docs, scores = doc_index.find(query, limit=5) +docs, scores = doc_index.find(query, limit=5, search_field='tensor') ``` !!! 
note @@ -131,11 +132,7 @@ docs, scores = doc_index.find(query, limit=5) When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTbueVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`. ```python -from docarray import BaseDoc from docarray.typing import ImageUrl, VideoUrl, AnyTensor -from docarray.index import ElasticDocIndex -import numpy as np -from pydantic import Field class ImageDoc(BaseDoc): @@ -196,7 +193,7 @@ You can only delete `Doc` at the top level. Deletion of the `Doc` on the lower l ```python # example of delete nested and flat index -del doc_index[index_docs[16].id, index_docs[32].id] +del doc_index[index_docs[3].id, index_docs[4].id] ``` TODO style of field_name of nested level @@ -208,12 +205,6 @@ Besides the vector search, you can also perform other queries supported by Elast As in elasticsearch, you could use text search directly on the field of type `str`. ```python -from pydantic import Field - -from docarray import BaseDoc -from docarray.index import ElasticDocIndex - - class NewsDoc(BaseDoc): text: str @@ -237,12 +228,6 @@ To filter the docs, you can use `col_type` to configurate the fields. `filter()` To filter the docs, you can use `col_type='keyword'` to configurate the keyword search for the fields. ```python -from pydantic import Field - -from docarray import BaseDoc -from docarray.index import ElasticDocIndex - - class NewsDoc(BaseDoc): text: str category: str = Field(col_type='keyword') @@ -265,12 +250,6 @@ docs = doc_index.filter(query_filter) To filter the docs, you can use `col_type='geo_point'` to configurate the keyword search for the fields. 
```python -from pydantic import Field - -from docarray import BaseDoc -from docarray.index import ElasticDocIndex - - class NewsDoc(BaseDoc): text: str location: dict = Field(col_type='geo_point') @@ -305,12 +284,6 @@ docs = doc_index.filter(query) You can have [range field types](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/range.html) in your `Doc` schema and set `col_type='integer_range'`(or also `date_range`, etc.) to filter the docs based on the range of the field. ```python -from pydantic import Field - -from docarray import BaseDoc -from docarray.index import ElasticDocIndex - - class NewsDoc(BaseDoc): time_frame: dict = Field(col_type='date_range', format='yyyy-MM-dd') @@ -343,14 +316,6 @@ docs = doc_index.filter(query) You can use `QueryBuilder` to build your own query. `find()`, `filter()` and `text_search()` methods and their combination are supported. ```python -import numpy as np -from pydantic import Field - -from docarray import BaseDoc -from docarray.index import ElasticDocIndex -from docarray.typing import NdArray - - class MyDoc(BaseDoc): tens: NdArray[10] = Field(similarity='l2_norm') num: int @@ -402,7 +367,7 @@ doc_index.configure(ElasticDocIndex.RuntimeConfig(chunk_size=1000)) ```python class SimpleDoc(BaseDoc): - tensor: NdArray[128] = Field(similarity='l2_norm', 'm'=32, 'num_candidates'=5000) + tensor: NdArray[128] = Field(similarity='l2_norm', m=32, num_candidates=5000) doc_index = ElasticDocIndex[SimpleDoc]() From 43c4122c698722108e35d187ac2c5450d1f56abd Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 13 Apr 2023 14:31:26 +0200 Subject: [PATCH 16/35] docs: add more stuff Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_steps.md | 114 +++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 5 deletions(-) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index 746c7080b4d..d91daf3cf40 100644 --- a/docs/user_guide/storing/first_steps.md +++ 
b/docs/user_guide/storing/first_steps.md @@ -43,6 +43,13 @@ because it doesn't require you to launch a database server. Instead, it will sto ### Create a Document Index +!!! note + To use [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex], you need to install extra dependencies with the following command: + + ```console + pip install "docarray[hnswlib]" + ``` + To create a Document Index, your first need a Document that defines the schema of your index. ```python @@ -202,17 +209,114 @@ print(f'{scores=}') The [find_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.find_batched] method returns a named tuple containing a list of `DocList`s, one for each query, containing the closest matching documents; and the associated similarity scores. -### Perform filter search +### Perform filter search and text search + +In addition to vector similarity search, the Document Index interface offers methods for text search and filter search: +[text_search()][docarray.index.backends.hnswlib.HnswDocumentIndex.text_search] and [filter()][docarray.index.backends.hnswlib.HnswDocumentIndex.filter], +as well as their batched versions [text_search_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.text_search_batched] and [filter_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.filter_batched] + +The [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implementation does not offer support for filter +or text search. + +To see how to perform these operations, you can check out other backends that do: TODO add link to those + +### Perform hybrid search through the query builder + +Document Index support atomic operations for vector similarity search, text search and filter search. 
+
+In order to combine these operations into a single, hybrid search query, you can use the query builder that is accessible
+through [build_query()][docarray.index.backends.hnswlib.HnswDocumentIndex.build_query]:
+
+```python
+# prepare a query
+q_doc = MyDoc(embedding=np.random.rand(128), text='query')
+# TODO black doesnt like the code below
+# query = db.build_query() \  # get empty query object
+#     .find(query=q_doc, search_field='embedding') \  # add vector similarity search
+#     .filter(filter_query={'tens': {'$exists': True}}) \  # add filter search
+#     .build()  # build the query
+
+# execute the combined query and return the results
+results = db.execute_query(query)
+print(f'{results=}')
+```
+
+In the example above you can see how to form a hybrid query that combines vector similarity search and filter search
+to obtain a combined set of results.
+
+What kinds of atomic queries can be combined in this way depends on the backend.
+Some can combine text search and vector search, others can perform filters and vectors search, etc.
+To see what backend can do what, check out the specific docs TODO add links
+
+### Access Documents by id
+
+To retrieve a Document from a Document Index, you don't necessarily need to perform some fancy search.
+
+You can also access data by the id that was assigned to every Document:
+
+```python
+# prepare some data
+data = DocList[MyDoc](
+    MyDoc(embedding=np.random.rand(128), text=f'query {i}') for i in range(3)
+)
+
+# remember the Document ids and index the data
+ids = data.ids
+db.index(data)
+
+# access the Documents by id
+doc = db[ids[0]]  # get by single id
+docs = db[ids]  # get by list of ids
+```
+
+### Delete Documents
+
-You can also perform filter search using the [filter()][docarray.index.backends.hnswlib.HnswDocumentIndex.filter] method.
+### Delete Documents -This method takes in a filter query and returns Documents that fulfill the conditions expressed through that filter: +In the same way you can access Documents by id, you can delete them: ```python -# create a filter +# prepare some data +data = DocList[MyDoc]( + MyDoc(embedding=np.random.rand(128), text=f'query {i}') for _ in range(3) +) + +# remember the Document ids and index the data +ids = data.ids +db.index(data) + +# access the Documents by id +del db[ids[0]] # del by single id +del db[ids[1:]] # del by list of ids ``` -### Delete data +### Customize configurations + +It is DocArray's philosophy that each Document Index should "just work", meaning that it comes with a sane set of default +settings that can get you most of the way there. + +However, there are different configurations that you may want to tweak, including: +- The [ANN](https://ignite.apache.org/docs/latest/machine-learning/binary-classification/ann) algorithm used, for example [HNSW](https://www.pinecone.io/learn/hnsw/) or [ScaNN](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html) +- Hyperparameters of the ANN algorithm, such as `ef_construction` for HNSW +- The distance metric to use, such as cosine or L2 distance +- The data type of each column in the database +- ... + +The specific configurations that you can tweak depend on the backend, but the interface to do so is universal. + +Document Indexes differentiate between three different kind of configurations: + +**Database configurations** + +_Database configurations_ are configurations that pertain to the entire DB or DB table (as opposed to just a specific column), +and that you don't dynamically change at runtime. + +This commonly includes: +- host and port +- index or collection name +- authentication settings +- ... + + +TODO ## Document Store This section show you how to use the `DocArray.store` module. `DocArray.store` module is used to store the `Doc`. 
From 714b14b32dbc3611be5d53923844bbb73dc7321e Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 13 Apr 2023 20:58:57 +0800 Subject: [PATCH 17/35] fix: test docs Signed-off-by: AnneY --- docs/user_guide/storing/index_hnswlib.md | 48 ++++++++---------------- tests/documentation/test_docs.py | 2 + 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md index ad80af820b4..5981b33fe0a 100644 --- a/docs/user_guide/storing/index_hnswlib.md +++ b/docs/user_guide/storing/index_hnswlib.md @@ -15,6 +15,7 @@ To construct an index, you need to define the schema first. You can define the s `work_dir` is the directory for storing the index. If there is an index in the directory, it will be automatically loaded. When the schema of the saved and the defined index do not match, an exception will be raised. ```python +import numpy as np from pydantic import Field from docarray import BaseDoc @@ -33,12 +34,9 @@ doc_index = HnswDocumentIndex[SimpleSchema](work_dir='./tmp') Use `.index()` to add `Doc` into the index. You need to define the `Doc` following the schema of the index. `.num_docs()` returns the total number of `Doc` in the index. ```python -from docarray import BaseDoc -from docarray.typing import NdArray -import numpy as np - class SimpleDoc(BaseDoc): - tensor: NdArray + tensor: NdArray[128] + index_docs = [SimpleDoc(tensor=np.zeros(128)) for _ in range(64)] @@ -65,27 +63,23 @@ To delete the `Doc`, use the built-in function `del` with the `id` of the `Doc` del doc_index[index_docs[16].id] # delete multiple Docs -del doc_index[index_docs[16].id, index_docs[17].id] +del doc_index[index_docs[17].id, index_docs[18].id] ``` ## Find Nearest Neighbors Use `.find()` to find the nearest neighbors. You can use `limit` argument to configurate how much `Doc` to return. 
```python -query = SimpleDoc(tensor=np.ones(10)) +query = SimpleDoc(tensor=np.ones(128)) -docs, scores = doc_index.find(query, limit=5) +docs, scores = doc_index.find(query, limit=5, search_field='tensor') ``` ## Nested Index When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTbueVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`. ```python -from docarray import BaseDoc from docarray.typing import ImageUrl, VideoUrl, AnyTensor -from docarray.index import HnswDocumentIndex -import numpy as np -from pydantic import Field class ImageDoc(BaseDoc): @@ -106,20 +100,16 @@ class YouTubeVideoDoc(BaseDoc): tensor: AnyTensor = Field(space='cosine', dim=256) -doc_index = HnswDocumentIndex[YouTubeVideoDoc](work_dir='./tmp') +doc_index = HnswDocumentIndex[YouTubeVideoDoc](work_dir='./tmp2') index_docs = [ YouTubeVideoDoc( title=f'video {i+1}', description=f'this is video from author {10*i}', - thumbnail=ImageDoc( - url=f'http://example.ai/images/{i}', - tensor=np.ones(64)), - video=VideoDoc( - url=f'http://example.ai/videos/{i}', - tensor=np.ones(128) - ), - tensor=np.ones(256) - ) for i in range(8) + thumbnail=ImageDoc(url=f'http://example.ai/images/{i}', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/{i}', tensor=np.ones(128)), + tensor=np.ones(256), + ) + for i in range(8) ] doc_index.index(index_docs) ``` @@ -131,15 +121,9 @@ Use the `search_field` to specify which field to be used when performing the vec query_doc = YouTubeVideoDoc( title=f'video query', description=f'this is a query video', - thumbnail=ImageDoc( - url=f'http://example.ai/images/1024', - tensor=np.ones(64) - ), - video=VideoDoc( - url=f'http://example.ai/videos/1024', - tensor=np.ones(128) - ), - tensor=np.ones(256) + thumbnail=ImageDoc(url=f'http://example.ai/images/1024', 
tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/1024', tensor=np.ones(128)), + tensor=np.ones(256), ) # find by the youtubevideo tensor docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3) @@ -156,5 +140,5 @@ To delete a nested data, you need to specify the `id`. ```python # example of delete nested and flat index -del doc_index[index_docs[16].id, index_docs[32].id] +del doc_index[index_docs[6].id] ``` \ No newline at end of file diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index 085022b5a00..6aae74e71a4 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -4,6 +4,8 @@ from mktestdocs import grab_code_blocks from mktestdocs.__main__ import _executors, check_raw_string +from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 + def check_raw_file_full(raw, lang="python", keyword_ignore=[]): if lang not in _executors: From c4b5ea760e1f78e7c6e8da2acbf020172449246c Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 13 Apr 2023 21:20:03 +0800 Subject: [PATCH 18/35] fix: import es fixture for docs test Signed-off-by: AnneY --- tests/documentation/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/documentation/__init__.py diff --git a/tests/documentation/__init__.py b/tests/documentation/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From 0cb7f9d937240395024ff292350ff591a222fe25 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 13 Apr 2023 15:40:57 +0200 Subject: [PATCH 19/35] docs: add back code snippet Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_steps.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index d91daf3cf40..9c8cfb889f2 100644 --- a/docs/user_guide/storing/first_steps.md +++ b/docs/user_guide/storing/first_steps.md @@ -231,10 +231,12 @@ through 
[build_query()][docarray.index.backends.hnswlib.HnswDocumentIndex.build_ # prepare a query q_doc = MyDoc(embedding=np.random.rand(128), text='query') # TODO black doesnt like the code below -# query = db.build_query() \ # get empty query object -# .find(query=q_doc, search_field='embedding') \ # add vector similarity search -# .filter(filter_query={'tens': {'$exists': True}}) \ # add filter search -# .build() # build the query +query = ( + db.build_query() # get empty query object + .find(query=q_doc, search_field='embedding') # add vector similarity search + .filter(filter_query={'tens': {'$exists': True}}) # add filter search + .build() # build the query +) # execute the combined query and return the results results = store.execute_query(q) From bbd4f1865ca81ab5619630804f097fd87f10e2f0 Mon Sep 17 00:00:00 2001 From: AnneY Date: Thu, 13 Apr 2023 22:04:04 +0800 Subject: [PATCH 20/35] fix: minor fix Signed-off-by: AnneY --- .github/workflows/ci.yml | 1 + docs/user_guide/storing/index_elastic.md | 11 ++++------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3027d59fa4b..bedddf289e5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,6 +112,7 @@ jobs: python -m pip install --upgrade pip python -m pip install poetry poetry install --all-extras + poetry run pip install elasticsearch==8.6.2 sudo apt-get update sudo apt-get install --no-install-recommends ffmpeg diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index d4c0d486222..dba74cfc586 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -47,7 +47,7 @@ docker-compose up ``` ## Construct -To construct an index, you need to define the schema first. You can define the schema in the same way as defining a `Doc`. Dimensionality is necessary for vector space, you need to specify the shape or define it by `dims`. 
TODO: add links to the detailed explaination. +To construct an index, you need to define the schema first. You can define the schema in the same way as defining a `Doc`. Dimensionality is necessary for vector space that you want to perform similarity search in the future. You need to specify the shape or define it by `dims`. You can set `col_type` to configurate [field data types in ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping-types.html). `hosts` is the argument for setting the elasticsearch hosts. By default, it is `http://localhost:9200`. @@ -68,7 +68,6 @@ class SimpleDoc(BaseDoc): doc_index = ElasticDocIndex[SimpleDoc]() ``` -TODO some common info: specifying col_type, custom_config, Union etc. ## Index Use `.index()` to add `Doc` into the index. You could use the same class as the schema for defining the `Doc`. Alternatively, you need to define the `Doc` following the schema of the index. `.num_docs()` returns the total number of `Doc` in the index. @@ -117,7 +116,7 @@ del doc_index[index_docs[17].id, index_docs[18].id] ``` ## Find Nearest Neighbors -Use `.find()` to find the nearest neighbors of a tensor. You can use `limit` argument to configurate how much `Doc` to return, and `search_field` argument to configurate the name of the field to search on. +The `.find()` method is used to find the nearest neighbors of a tensor. You need to specify `search_field` that is used when performing the vector search. You can use `limit` argument to configurate how much `Doc` to return. ```python query = SimpleDoc(tensor=np.ones(128)) @@ -167,7 +166,7 @@ index_docs = [ doc_index.index(index_docs) ``` -Use the `search_field` to specify which field to be used when performing the vector search. You can use the dunder operator to specify the field defined in the nested data. In the following codes, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` and `video` field. 
+You can use the dunder operator to specify the field defined in the nested data. In the following codes, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` and `video` field. ```python # example of find nested and flat index @@ -196,8 +195,6 @@ You can only delete `Doc` at the top level. Deletion of the `Doc` on the lower l del doc_index[index_docs[3].id, index_docs[4].id] ``` -TODO style of field_name of nested level - ## Elasticsearch Query Besides the vector search, you can also perform other queries supported by Elasticsearch. @@ -222,7 +219,7 @@ docs, scores = doc_index.text_search(query, search_field='text') ``` ### Query Filter -To filter the docs, you can use `col_type` to configurate the fields. `filter()` accepts queries that follow [Elasticsearch Query DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) and consists of leaf and compound clauses. +`filter()` accepts queries that follow [Elasticsearch Query DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) and consists of leaf and compound clauses. #### Keyword filter To filter the docs, you can use `col_type='keyword'` to configurate the keyword search for the fields. 
From d1ad967da41f8de162baabcb4b0355d1b15777f1 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Thu, 13 Apr 2023 16:40:39 +0200 Subject: [PATCH 21/35] docs: add info about config stuff Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_steps.md | 132 ++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 3 deletions(-) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index 9c8cfb889f2..b66a57024bc 100644 --- a/docs/user_guide/storing/first_steps.md +++ b/docs/user_guide/storing/first_steps.md @@ -234,7 +234,7 @@ q_doc = MyDoc(embedding=np.random.rand(128), text='query') query = ( db.build_query() # get empty query object .find(query=q_doc, search_field='embedding') # add vector similarity search - .filter(filter_query={'tens': {'$exists': True}}) # add filter search + .filter(filter_query={'text': {'$exists': True}}) # add filter search .build() # build the query ) @@ -309,7 +309,7 @@ Document Indexes differentiate between three different kind of configurations: **Database configurations** _Database configurations_ are configurations that pertain to the entire DB or DB table (as opposed to just a specific column), -and that you don't dynamically change at runtime. +and that you _don't_ dynamically change at runtime. This commonly includes: - host and port @@ -318,7 +318,133 @@ This commonly includes: - ... -TODO +For every backend, you can get the full list of configurations, and their defaults, like this: + +```python +from docarray.index import HnswDocumentIndex + + +db_config = HnswDocumentIndex.DBConfig() +print(db_config) + +# > HnswDocumentIndex.DBConfig(work_dir='.') +``` + +As you can see, `HnswDocumentIndex.DBConfig` is a dataclass that contains only one possible configuration, `work_dir`, +that defaults to `.`. 
+
+You can customize every field in this configuration:
+
+=== "Pass individual settings"
+
+```python
+db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db')
+
+custom_db_config = db._db_config
+print(custom_db_config)
+
+# > HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db')
+```
+
+=== "Pass entire configuration"
+
+```python
+custom_db_config = HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db')
+
+db = HnswDocumentIndex[MyDoc](custom_db_config)
+
+print(db._db_config)
+
+# > HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db')
+```
+
+**Runtime configurations**
+
+_Runtime configurations_ are configurations that pertain to the entire DB or DB table (as opposed to just a specific column),
+and that you can dynamically change at runtime.
+
+
+This commonly includes:
+- default batch size for batching operations
+- default mapping from python types to DB column types
+- default consistency level for various DB operations
+- ...
+
+
+For every backend, you can get the full list of configurations, and their defaults, like this:
+
+```python
+from docarray.index import HnswDocumentIndex
+
+
+runtime_config = HnswDocumentIndex.RuntimeConfig()
+print(runtime_config)
+
+# > HnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'l2', 'max_elements': 1024, 'ef_construction': 200, 'ef': 10, 'M': 16, 'allow_replace_deleted': True, 'num_threads': 1}, None: {}})
+```
+
+As you can see, `HnswDocumentIndex.RuntimeConfig` is a dataclass that contains only one configuration:
+`default_column_config`, which is a mapping from python types to database column configurations. 
+ +You can customize every field in this configuration using the [configure()][docarray.index.backends.hnswlib.HnswDocumentIndex.configure] method: + +=== "Pass individual settings" + +```python +db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + +db.configure( + default_column_config={ + np.ndarray: { + 'dim': -1, + 'index': True, + 'space': 'ip', + 'max_elements': 2048, + 'ef_construction': 100, + 'ef': 15, + 'M': 8, + 'allow_replace_deleted': True, + 'num_threads': 5, + }, + None: {}, + } +) + +custom_runtime_config = db._runtime_config +print(custom_runtime_config) + +# > HnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'ip', 'max_elements': 2048, 'ef_construction': 100, 'ef': 15, 'M': 8, 'allow_replace_deleted': True, 'num_threads': 5}, None: {}}) +``` + +=== "Pass entire configuration" + +```python +custom_runtime_config = HnswDocumentIndex.RuntimeConfig( + default_column_config={ + np.ndarray: { + 'dim': -1, + 'index': True, + 'space': 'ip', + 'max_elements': 2048, + 'ef_construction': 100, + 'ef': 15, + 'M': 8, + 'allow_replace_deleted': True, + 'num_threads': 5, + }, + None: {}, + } +) + +db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + +db.configure(custom_runtime_config) + +print(db._runtime_config) + +# > HHnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'ip', 'max_elements': 2048, 'ef_construction': 100, 'ef': 15, 'M': 8, 'allow_replace_deleted': True, 'num_threads': 5}, None: {}}) +``` + ## Document Store This section show you how to use the `DocArray.store` module. `DocArray.store` module is used to store the `Doc`. 
From 28aa3ed1d8249a9d2659873f722103f58cd89149 Mon Sep 17 00:00:00 2001 From: AnneY Date: Fri, 14 Apr 2023 10:55:37 +0800 Subject: [PATCH 22/35] fix: mypy Signed-off-by: AnneY --- docarray/index/abstract.py | 2 +- docarray/index/backends/elastic.py | 16 +++++++--------- docarray/index/backends/elasticv7.py | 4 +++- docarray/index/backends/hnswlib.py | 4 +++- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py index 3c423137259..2a8344dbd70 100644 --- a/docarray/index/abstract.py +++ b/docarray/index/abstract.py @@ -573,7 +573,7 @@ def text_search_batched( docs = [self._dict_list_to_docarray(docs) for docs in da_list] return FindResultBatched(documents=docs, scores=scores) - return FindResultBatched(documents=da_list, scores=scores) + return FindResultBatched(documents=da_list, scores=scores) # type: ignore ########################################################## # Helper methods # diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index 52f60e1d098..8542cc20923 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -62,7 +62,7 @@ class ElasticDocIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, db_config=None, **kwargs): super().__init__(db_config=db_config, **kwargs) - self._db_config = cast(self.DBConfig, self._db_config) + self._db_config = cast(ElasticDocIndex.DBConfig, self._db_config) # ElasticSearch client creation if self._db_config.index_name is None: @@ -357,7 +357,7 @@ def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: resp = self._client.search(index=self._index_name, **query) docs, scores = self._format_response(resp) - return _FindResult(documents=docs, scores=scores) + return _FindResult(documents=docs, scores=parse_obj_as(NdArray, scores)) def _find( self, query: np.ndarray, limit: int, search_field: str = '' @@ -368,7 +368,7 @@ def _find( docs, scores = self._format_response(resp) - 
return _FindResult(documents=docs, scores=scores) + return _FindResult(documents=docs, scores=parse_obj_as(NdArray, scores)) def _find_batched( self, @@ -387,7 +387,7 @@ def _find_batched( das, scores = zip( *[self._format_response(resp) for resp in responses['responses']] ) - return _FindResultBatched(documents=list(das), scores=np.array(scores)) + return _FindResultBatched(documents=list(das), scores=list(scores)) def _filter( self, @@ -445,9 +445,7 @@ def _text_search_batched( das, scores = zip( *[self._format_response(resp) for resp in responses['responses']] ) - return _FindResultBatched( - documents=list(das), scores=np.array(scores, dtype=object) - ) + return _FindResultBatched(documents=list(das), scores=list(scores)) ############################################### # Helpers # @@ -529,7 +527,7 @@ def _form_text_search_body( } return body - def _format_response(self, response: Any) -> Tuple[List[Dict], NdArray]: + def _format_response(self, response: Any) -> Tuple[List[Dict], List[Any]]: docs = [] scores = [] for result in response['hits']['hits']: @@ -544,7 +542,7 @@ def _format_response(self, response: Any) -> Tuple[List[Dict], NdArray]: docs.append(doc_dict) scores.append(result['_score']) - return docs, parse_obj_as(NdArray, scores) + return docs, scores def _refresh(self, index_name: str): self._client.indices.refresh(index=index_name) diff --git a/docarray/index/backends/elasticv7.py b/docarray/index/backends/elasticv7.py index e77aedfc2b4..947e21292ab 100644 --- a/docarray/index/backends/elasticv7.py +++ b/docarray/index/backends/elasticv7.py @@ -3,11 +3,13 @@ from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union import numpy as np +from pydantic import parse_obj_as from docarray import BaseDoc from docarray.index import ElasticDocIndex from docarray.index.abstract import BaseDocIndex, _ColumnInfo from docarray.typing import AnyTensor +from docarray.typing.tensor.ndarray import NdArray from docarray.utils.find import _FindResult 
TSchema = TypeVar('TSchema', bound=BaseDoc) @@ -94,7 +96,7 @@ def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: resp = self._client.search(index=self._index_name, body=query) docs, scores = self._format_response(resp) - return _FindResult(documents=docs, scores=scores) + return _FindResult(documents=docs, scores=parse_obj_as(NdArray, scores)) ############################################### # Helpers # diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index d0e11e7e959..3da31fdabab 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -19,6 +19,7 @@ ) import numpy as np +from pydantic import parse_obj_as from docarray import BaseDoc, DocList from docarray.index.abstract import ( @@ -30,6 +31,7 @@ ) from docarray.proto import DocProto from docarray.typing.tensor.abstract_tensor import AbstractTensor +from docarray.typing.tensor.ndarray import NdArray from docarray.utils._internal.misc import import_library, is_np_int from docarray.utils.filter import filter_docs from docarray.utils.find import _FindResult @@ -262,7 +264,7 @@ def _find( docs, scores = self._find_batched( queries=query_batched, limit=limit, search_field=search_field ) - return _FindResult(documents=docs[0], scores=scores[0]) + return _FindResult(documents=docs[0], scores=parse_obj_as(NdArray, scores[0])) def _filter( self, From effd8dd5399c5338205d902290ad445f5a23b075 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 14 Apr 2023 14:01:02 +0200 Subject: [PATCH 23/35] docs: explain advanced configs Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_steps.md | 31 ++++++++++++++++++++++ tests/index/hnswlib/test_configurations.py | 18 +++++++++++++ 2 files changed, 49 insertions(+) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index b66a57024bc..dca1f63d5a8 100644 --- a/docs/user_guide/storing/first_steps.md +++ 
b/docs/user_guide/storing/first_steps.md @@ -445,6 +445,37 @@ print(db._runtime_config) # > HHnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'ip', 'max_elements': 2048, 'ef_construction': 100, 'ef': 15, 'M': 8, 'allow_replace_deleted': True, 'num_threads': 5}, None: {}}) ``` +After this change, the new setting will be applied to _every_ column that corresponds to a `np.ndarray` type. + +**Column configurations** + +For many vector databases, individual columns can have different configurations. + +This commonly includes: +- The data type of the column, e.g. `vector` vs `varchar` +- If it is a vector column, the dimensionality of the vector +- Whether an index should be built for a specific column + +The exact configurations that are available different from backend to backend, but in any case you can pass them +directly in the schema of your Document Index, using the `Field()` syntax: + +```python +class Schema(BaseDoc): + tens: NdArray[100] = Field(max_elements=12, space='cosine') + tens_two: NdArray[10] = Field(M=4, space='ip') + + +db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') +``` + +The `HnswDocumentIndex` above contains two columns which are configured differently: +- `tens` has a dimensionality of 100, can take up to 12 elements, and uses the `cosine` similarity space +- `tens_two` has a dimensionality of 10, and uses the `ip` similarity space, and an `M` hyperparameter of 4 + +All configurations that are not explicitly set will be taken from the `default_column_config` of the `RuntimeConfig`. + +For an explanation of the configurations that are tweaked in this example, see the `HnswDocumentIndex` documentation TODO link. + ## Document Store This section show you how to use the `DocArray.store` module. `DocArray.store` module is used to store the `Doc`. 
diff --git a/tests/index/hnswlib/test_configurations.py b/tests/index/hnswlib/test_configurations.py index dff64fdcc19..80de4fd7ef6 100644 --- a/tests/index/hnswlib/test_configurations.py +++ b/tests/index/hnswlib/test_configurations.py @@ -25,3 +25,21 @@ class Schema(BaseDoc): index.index(docs) assert index.num_docs() == 10 + + +def test_configure_index(tmp_path): + class Schema(BaseDoc): + tens: NdArray[100] = Field(max_elements=12, space='cosine') + tens_two: NdArray[10] = Field(M=4, space='ip') + + index = HnswDocumentIndex[Schema](work_dir=str(tmp_path)) + + assert index._hnsw_indices['tens'].max_elements == 12 + assert index._hnsw_indices['tens'].space == 'cosine' + assert index._hnsw_indices['tens'].M == 16 # default + assert index._hnsw_indices['tens'].dim == 100 + + assert index._hnsw_indices['tens_two'].max_elements == 1024 # default + assert index._hnsw_indices['tens_two'].space == 'ip' + assert index._hnsw_indices['tens_two'].M == 4 + assert index._hnsw_indices['tens_two'].dim == 10 From 7d28c0226201b7cacdcfa900829a008fc9b5205f Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 14 Apr 2023 15:34:42 +0200 Subject: [PATCH 24/35] docs: add missing import Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_steps.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index dca1f63d5a8..0035adfbb1a 100644 --- a/docs/user_guide/storing/first_steps.md +++ b/docs/user_guide/storing/first_steps.md @@ -460,6 +460,9 @@ The exact configurations that are available different from backend to backend, b directly in the schema of your Document Index, using the `Field()` syntax: ```python +from pydantic import Field + + class Schema(BaseDoc): tens: NdArray[100] = Field(max_elements=12, space='cosine') tens_two: NdArray[10] = Field(M=4, space='ip') From 2a96868c39a750337ce28d8a03ae02407b628f2a Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 14 Apr 2023 16:05:53 
+0200 Subject: [PATCH 25/35] docs: add backend specific docs Signed-off-by: Johannes Messner --- docs/user_guide/storing/index_elastic.md | 3 +- docs/user_guide/storing/index_hnswlib.md | 2 +- docs/user_guide/storing/index_weaviate.md | 451 ++++++++++++++++++++++ mkdocs.yml | 9 +- 4 files changed, 461 insertions(+), 4 deletions(-) create mode 100644 docs/user_guide/storing/index_weaviate.md diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index dba74cfc586..416a116a2df 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -1,4 +1,5 @@ -# Elastic +# ElasticSearch Document Index + [ElasticV7DocIndex](docarray.index.backends.elastic.ElasticV7DocIndex) implement the index based on [Elasticsearch 7.10](https://github.com/elastic/elasticsearch). This is an implementation with vectors stored and supporting text/range search. !!! note diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md index 5981b33fe0a..5a2deb1d74d 100644 --- a/docs/user_guide/storing/index_hnswlib.md +++ b/docs/user_guide/storing/index_hnswlib.md @@ -1,4 +1,4 @@ -# Hnswlib +# Hnswlib Document Index [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implement the index based on [hnswlib](https://github.com/nmslib/hnswlib). This is a lightweight implementation with vectors stored in memory. 
diff --git a/docs/user_guide/storing/index_weaviate.md b/docs/user_guide/storing/index_weaviate.md new file mode 100644 index 00000000000..3e23596141f --- /dev/null +++ b/docs/user_guide/storing/index_weaviate.md @@ -0,0 +1,451 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .md + format_name: markdown + format_version: '1.3' + jupytext_version: 1.14.5 + kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Weaviate Document Index + +This is the user guide for the [WeaviateDocumentIndex](docarray.index.backends.hnswlib.WeaviateDocumentIndex), +focussing on special features and configurations of Weaviate. + +For general usage of a Document Index, see the [general user guide](./first_steps.md#document-index). + + +# 1. Start Weaviate service + +To use [WeaviateDocumentIndex](docarray.index.backends.hnswlib.WeaviateDocumentIndex), it needs to hook into a running Weaviate service. +There are multiple ways to start a Weaviate instance, depending on your use case. + + +## 1.1. Options - Overview + +There are multiple ways to start a Weaviate instance. + +| Instance type | General use case | Configurability | Notes | +| ----- | ----- | ----- | ----- | +| **Weaviate Cloud Services (WCS)** | Development and production | Limited | **Recommended for most users** | +| **Embedded Weaviate** | Experimentation | Limited | Experimental (as of Apr 2023) | +| **Docker-Compose** | Development | Yes | **Recommended for development + customizability** | +| **Kubernetes** | Production | Yes | | + +## 1.2. Instantiation instructions + +### 1.2.1. WCS (managed instance) + +Go to the [WCS console](https://console.weaviate.cloud) and create an instance using the visual interface, following [this guide](https://weaviate.io/developers/wcs/guides/create-instance). + +Weaviate instances on WCS come pre-configured, so no further configuration is required. + +### 1.2.2. 
Docker-Compose (self-managed) + +Get a configuration file (`docker-compose.yaml`). You can build it using [this interface](https://weaviate.io/developers/weaviate/installation/docker-compose), or download it directly with: + +```bash +curl -o docker-compose.yml "https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?modules=standalone&runtime=docker-compose&weaviate_version=v" +``` + +Where `v` is the actual version, such as `v1.18.3`. + + +```bash +curl -o docker-compose.yml "https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?modules=standalone&runtime=docker-compose&weaviate_version=v1.18.3" +``` + + +#### 1.2.2.1 Start up Weaviate with Docker-Compose + +Then you can start up Weaviate by running from a shell: + +```shell +docker-compose up -d +``` + +Or running from a jupyter notebook: +```bash +docker-compose up -d +``` + +#### 1.2.2.2 Shut down Weaviate + +Then you can shut down Weaviate by running from a shell: + +```shell +docker-compose down +``` + +Or from a Jupyter notebook: +```bash +docker-compose down +``` + +#### Notes + +Unless data persistence or backups are set up, shutting down the Docker instance will remove all its data. + +See documentation on [Persistent volume](https://weaviate.io/developers/weaviate/installation/docker-compose#persistent-volume) and [Backups](https://weaviate.io/developers/weaviate/configuration/backups) to prevent this if persistence is desired. + + +```bash +docker-compose up -d +``` + + +### 1.2.3. Embedded Weaviate (from the application) + +With Embedded Weaviate, Weaviate database server can be launched from the client, using: + +```python +from docarray.index.backends.weaviate import EmbeddedOptions + +embedded_options = EmbeddedOptions() +``` + +## 1.3. 
Authentication + +Weaviate offers [multiple authentication options](https://weaviate.io/developers/weaviate/configuration/authentication), as well as [authorization options](https://weaviate.io/developers/weaviate/configuration/authorization). + +With DocArray, you can use any of: +- Anonymous access (public instance), +- OIDC with username & password, and +- API-key based authentication. + +To access a Weaviate instance. In general, **Weaviate recommends using API-key based authentication** for balance between security and ease of use. You can create, for example, read-only keys to distribute to certain users, while providing read/write keys to administrators. + +See below for examples of connection to Weaviate for each scenario. + + +## 1.4. Connect to Weaviate + +```python +from docarray.index.backends.weaviate import WeaviateDocumentIndex +``` + +### Public instance + + +If using Embedded Weaviate: + +```python +from docarray.index.backends.weaviate import EmbeddedOptions + +dbconfig = WeaviateDocumentIndex.DBConfig(embedded_options=EmbeddedOptions()) +``` + +For all other options: + + +```python +dbconfig = WeaviateDocumentIndex.DBConfig( + host="http://localhost:8080" +) # Replace with your endpoint) +``` + + +### OIDC with username + password + +To authenticate against a Weaviate instance with OIDC username & password: + +```python +dbconfig = WeaviateDocumentIndex.DBConfig( + username="username", # Replace with your username + password="password", # Replace with your password + host="http://localhost:8080", # Replace with your endpoint +) +``` + + +```python +# dbconfig = WeaviateDocumentIndex.DBConfig( +# username="username", # Replace with your username +# password="password", # Replace with your password +# host="http://localhost:8080", # Replace with your endpoint +# ) +``` + + +### API key-based authentication + +To authenticate against a Weaviate instance an API key: + +```python +dbconfig = WeaviateDocumentIndex.DBConfig( + auth_api_key="apikey", # 
Replace with your own API key + host="http://localhost:8080", # Replace with your endpoint +) +``` + + + + +# 2. Configure Weaviate + +## 2.1. Overview + +**WCS instances come pre-configured**, and as such additional settings are not configurable outside of those chosen at creation, such as whether to enable authentication. + +For other cases, such as **Docker-Compose deployment**, its settings can be modified through the configuration file, such as the `docker-compose.yaml` file. + +Some of the more commonly used settings include: + +- [Persistent volume](https://weaviate.io/developers/weaviate/installation/docker-compose#persistent-volume): Set up data persistence so that data from inside the Docker container is not lost on shutdown +- [Enabling a multi-node setup](https://weaviate.io/developers/weaviate/installation/docker-compose#multi-node-setup) +- [Backups](https://weaviate.io/developers/weaviate/configuration/backups) +- [Authentication (server-side)](https://weaviate.io/developers/weaviate/configuration/authentication) +- [Modules enabled](https://weaviate.io/developers/weaviate/configuration/modules#enable-modules) + +And a list of environment variables is [available on this page](https://weaviate.io/developers/weaviate/config-refs/env-vars). + +## 2.2. DocArray instantiation configuration options + +Additionally, you can specify the below settings when you instantiate a configuration object in DocArray. + +| name | type | explanation | default | example | +| ---- | ---- | ----------- | ------- | ------- | +| **Category: General** | +| host | str | Weaviate instance url | http://localhost:8080 | +| **Category: Authentication** | +| username | str | username known to the specified authentication provider (e.g. 
WCS) | None | `jp@weaviate.io` | +| password | str | corresponding password | None | `p@ssw0rd` | +| auth_api_key | str | API key known to the Weaviate instance | None | `mys3cretk3y` | +| **Category: Data schema** | +| index_name | str | Class name to use to store the document | `Document` | +| **Category: Embedded Weaviate** | +| embedded_options| EmbeddedOptions | options for embedded weaviate | None | + +The type `EmbeddedOptions` can be specified as described [here](https://weaviate.io/developers/weaviate/installation/embedded#embedded-options) + +## 2.3. Runtime configuration + +Weaviate strongly recommends using batches to perform bulk operations such as importing data, as it will significantly impact performance. You can specify a batch configuration as in the below example, and pass it on as runtime configuration. + +```python +batch_config = { + "batch_size": 20, + "dynamic": False, + "timeout_retries": 3, + "num_workers": 1, +} + +runtimeconfig = WeaviateDocumentIndex.RuntimeConfig(batch_config=batch_config) + +dbconfig = WeaviateDocumentIndex.DBConfig( + host="http://localhost:8080" +) # Replace with your endpoint and/or auth settings +store = WeaviateDocumentIndex[Document](db_config=dbconfig) +store.configure(runtimeconfig) # Batch settings being passed on +``` + +| name | type | explanation | default | +| ---- | ---- | ----------- | ------- | +| batch_config | Dict[str, Any] | dictionary to configure the weaviate client's batching logic | see below | + +Read more: +- Weaviate [docs on batching with the Python client](https://weaviate.io/developers/weaviate/client-libraries/python#batching) + + + +## 3. Available column types + +Python data types are mapped to Weaviate type according to the below convention. 
+ +| python type | weaviate type | +| ----------- | ------------- | +| docarray.typing.ID | string | +| str | text | +| int | int | +| float | number | +| bool | boolean | +| np.ndarray | number[] | +| AbstractTensor | number[] | +| bytes | blob | + +You can override this default mapping by passing a `col_type` to the `Field` of a schema. + +For example to map `str` to `string` you can: + +```python +class StringDoc(BaseDoc): + text: str = Field(col_type="string") +``` + +A list of available Weaviate data types [is here](https://weaviate.io/developers/weaviate/config-refs/datatypes). + + +## 4. Adding example data + +Putting it together, we can add data as shown below using Weaviate as the document store. + +```python +import numpy as np +from pydantic import Field +from docarray import BaseDoc +from docarray.typing import NdArray +from docarray.index.backends.weaviate import WeaviateDocumentIndex + +# Define a document schema +class Document(BaseDoc): + text: str + embedding: NdArray[2] = Field( + dims=2, is_embedding=True + ) # Embedding column -> vector representation of the document + file: NdArray[100] = Field(dims=100) + + +# Make a list of 3 docs to index +docs = [ + Document( + text="Hello world", embedding=np.array([1, 2]), file=np.random.rand(100), id="1" + ), + Document( + text="Hello world, how are you?", + embedding=np.array([3, 4]), + file=np.random.rand(100), + id="2", + ), + Document( + text="Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut", + embedding=np.array([5, 6]), + file=np.random.rand(100), + id="3", + ), +] + +batch_config = { + "batch_size": 20, + "dynamic": False, + "timeout_retries": 3, + "num_workers": 1, +} + +runtimeconfig = WeaviateDocumentIndex.RuntimeConfig(batch_config=batch_config) + +store = WeaviateDocumentIndex[Document](db_config=dbconfig) +store.configure(runtimeconfig) # Batch settings being passed on +store.index(docs) +``` + +### 4.1. 
Notes + +- In order to use vector search, you need to specify `is_embedding` for exactly one field. + - This is as Weaviate is configured to allow one vector per data object. + - If you would like to see Weaviate support multiple vectors per object, [upvote the issue](https://github.com/weaviate/weaviate/issues/2465) which will help to prioritize it. +- For a field to be considered as an embedding, its type needs to be of subclass `np.ndarray` or `AbstractTensor` and `is_embedding` needs to be set to `True`. + - If `is_embedding` is set to `False` or not provided, the field will be treated as a `number[]`, and as a result, it will not be added to Weaviate's vector index. +- It is possible to create a schema without specifying `is_embedding` for any field. + - This will however mean that the document will not be vectorized and cannot be searched using vector search. + + +## 5. Query Builder/Hybrid Search + + +### 5.1. Text search + +To perform a text search, follow the below syntax. + +This will perform a text search for the word "hello" in the field "text" and return the first 2 results: + +```python +q = store.build_query().text_search("world", search_field="text").limit(2).build() + +docs = store.execute_query(q) +docs +``` + +### 5.2. Vector similarity search + +To perform a vector similarity search, follow the below syntax. + +This will perform a vector similarity search for the vector [1, 2] and return the first 2 results: + +```python +q = store.build_query().find([1, 2]).limit(2).build() + +docs = store.execute_query(q) +docs +``` + +### 5.3. Hybrid search + +To perform a hybrid search, follow the below syntax. + +This will perform a hybrid search for the word "hello" and the vector [1, 2] and return the first 2 results: + +**Note**: Hybrid search searches through the object vector and all fields. Accordingly, the `search_field` keyword it will have no effect. 
+ +```python +q = ( + store.build_query() + .text_search( + "world", search_field=None # Set as None as it is required but has no effect + ) + .find([1, 2]) + .limit(2) + .build() +) + +docs = store.execute_query(q) +docs +``` + +### 5.4. GraphQL query + +You can also perform a raw GraphQL query using any syntax as you might natively in Weaviate. This allows you to run any of the full range of queries that you might wish to. + +The below will perform a GraphQL query to obtain the count of `Document` objects. + +```python +graphql_query = """ +{ + Aggregate { + Document { + meta { + count + } + } + } +} +""" + +store.execute_query(graphql_query) +``` + +Note that running a raw GraphQL query will return Weaviate-type responses, rather than a DocArray object type. + +You can find the documentation for [Weaviate's GraphQL API here](https://weaviate.io/developers/weaviate/api/graphql). + + +## 6. Other notes + +### 6.1. DocArray IDs vs Weaviate IDs + +As you saw earlier, the `id` field is a special field that is used to identify a document in `BaseDoc`. + +```python +Document( + text="Hello world", embedding=np.array([1, 2]), file=np.random.rand(100), id="1" +), +``` + +This is not the same as Weaviate's own `id`, which is a reserved keyword and can't be used as a field name. + +Accordingly, the DocArray document id is stored internally in Weaviate as `docarrayid`. + + +## 7. 
Shut down Weaviate instance + +```bash +docker-compose down +``` + +----- +----- +----- diff --git a/mkdocs.yml b/mkdocs.yml index 1cdbdb86bd1..b941de96205 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -78,10 +78,10 @@ nav: - Home: README.md - Tutorial/User Guide: - user_guide/intro.md - - Representing data: + - Represent: - user_guide/representing/first_step.md - user_guide/representing/array.md - - Sending: + - Send: - user_guide/sending/first_step.md - Serialization: - user_guide/sending/ser/send_doc.md @@ -90,6 +90,11 @@ nav: - Building API: - user_guide/sending/api/jina.md - user_guide/sending/api/fastAPI.md + - Store: + - user_guide/storing/first_step.md + - user_guide/storing/index_hnswlib.md + - user_guide/storing/index_weaviate.md + - user_guide/storing/index_elastic.md - user_guide/storing/first_step.md From 15f660ddfcd5e5049801816a0a4f112f199dd5dd Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Fri, 14 Apr 2023 17:08:42 +0200 Subject: [PATCH 26/35] docs: remove unneeded snippets Signed-off-by: Johannes Messner --- docs/user_guide/storing/index_weaviate.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/docs/user_guide/storing/index_weaviate.md b/docs/user_guide/storing/index_weaviate.md index 3e23596141f..05aed8b90cf 100644 --- a/docs/user_guide/storing/index_weaviate.md +++ b/docs/user_guide/storing/index_weaviate.md @@ -69,11 +69,6 @@ Then you can start up Weaviate by running from a shell: docker-compose up -d ``` -Or running from a jupyter notebook: -```bash -docker-compose up -d -``` - #### 1.2.2.2 Shut down Weaviate Then you can shut down Weaviate by running from a shell: @@ -82,11 +77,6 @@ Then you can shut down Weaviate by running from a shell: docker-compose down ``` -Or from a Jupyter notebook: -```bash -docker-compose down -``` - #### Notes Unless data persistence or backups are set up, shutting down the Docker instance will remove all its data. 
From 96216f4542af473db07a3811834e1d2b27801841 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 17 Apr 2023 09:32:28 +0200 Subject: [PATCH 27/35] docs: polishing Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_steps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index 0035adfbb1a..b0954e4a4bc 100644 --- a/docs/user_guide/storing/first_steps.md +++ b/docs/user_guide/storing/first_steps.md @@ -468,7 +468,7 @@ class Schema(BaseDoc): tens_two: NdArray[10] = Field(M=4, space='ip') -db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') +db = HnswDocumentIndex[Schema](work_dir='/tmp/my_db') ``` The `HnswDocumentIndex` above contains two columns which are configured differently: From 99038ca21e30995c67ef2e897da46b316979f759 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 17 Apr 2023 10:15:51 +0200 Subject: [PATCH 28/35] docs: add qdrant Signed-off-by: Johannes Messner --- docs/user_guide/storing/index_hnswlib.md | 4 +- docs/user_guide/storing/index_qdrant.md | 112 ++++++++++++++++++++++ docs/user_guide/storing/index_weaviate.md | 7 ++ 3 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 docs/user_guide/storing/index_qdrant.md diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md index 5a2deb1d74d..9cf688250eb 100644 --- a/docs/user_guide/storing/index_hnswlib.md +++ b/docs/user_guide/storing/index_hnswlib.md @@ -1,7 +1,5 @@ # Hnswlib Document Index -[HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implement the index based on [hnswlib](https://github.com/nmslib/hnswlib). This is a lightweight implementation with vectors stored in memory. - !!! 
note To use [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex], one need to install the extra dependency with the following command @@ -9,6 +7,8 @@ pip install "docarray[hnswlib]" ``` +[HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implements the index based on [hnswlib](https://github.com/nmslib/hnswlib). This is a lightweight implementation with vectors stored in memory. + ## Construct To construct an index, you need to define the schema first. You can define the schema in the same way as define a `Doc`. The only difference is that you need to define the dimensionality of the vector space by `dim` and the name of the space by `space`. The `dim` argument must be an integer. The `space` argument can be one of `l2`, `ip` or `cosine`. TODO: add links to the detailed explaination diff --git a/docs/user_guide/storing/index_qdrant.md b/docs/user_guide/storing/index_qdrant.md new file mode 100644 index 00000000000..4d2a64642f5 --- /dev/null +++ b/docs/user_guide/storing/index_qdrant.md @@ -0,0 +1,112 @@ +# Qdrant Document Index + +!!! note "Install dependencies" + To use [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex], you need to install extra dependencies with the following command: + + ```console + pip install "docarray[qdrant]" + ``` + +The following is a starter script for using the [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex], +based on the [Qdrant](https://qdrant.tech/) vector search engine. + +!!! 
tip "See all configuration options" + To see all configuration options for the [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex], + you can do the following: + + ```python + from docarray.index import QdrantDocumentIndex + + # the following can be passed to the __init__() method + db_config = QdrantDocumentIndex.DBConfig() + print(db_config) # shows default values + + # the following can be passed to the configure() method + runtime_config = QdrantDocumentIndex.RuntimeConfig() + print(runtime_config) # shows default values + ``` + +```python +import numpy as np + +from typing import Optional + +from docarray import BaseDoc +from docarray.index import QdrantDocumentIndex +from docarray.typing import NdArray + +from qdrant_client.http import models + + +class MyDocument(BaseDoc): + title: str + title_embedding: NdArray[786] + image_path: Optional[str] + image_embedding: NdArray[512] + + +# Creating an in-memory Qdrant document index +qdrant_config = QdrantDocumentIndex.DBConfig(":memory:") +doc_index = QdrantDocumentIndex[MyDocument](qdrant_config) + +# Indexing the documents +doc_index.index( + [ + MyDocument( + title=f"My document {i}", + title_embedding=np.random.random(786), + image_path=None, + image_embedding=np.random.random(512), + ) + for i in range(100) + ] +) + +# Performing a vector search only +results = doc_index.find( + query=np.random.random(512), + search_field="image_embedding", + limit=3, +) + +# Connecting to a local Qdrant instance with Scalar Quantization enabled, +# and using non-default collection name to store the datapoints +qdrant_config = QdrantDocumentIndex.DBConfig( + "http://localhost:6333", + collection_name="another_collection", + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), +) +doc_index = QdrantDocumentIndex[MyDocument](qdrant_config) + +# Indexing the documents +doc_index.index( + [ + 
MyDocument( + title=f"My document {i}", + title_embedding=np.random.random(786), + image_path=None, + image_embedding=np.random.random(512), + ) + for i in range(100) + ] +) + +# Text lookup, without vector search. Using the Qdrant filtering mechanisms: +# https://qdrant.tech/documentation/filtering/ +results = doc_index.filter( + filter_query=models.Filter( + must=[ + models.FieldCondition( + key="title", + match=models.MatchText(text="document 2"), + ), + ], + ), +) +``` \ No newline at end of file diff --git a/docs/user_guide/storing/index_weaviate.md b/docs/user_guide/storing/index_weaviate.md index 05aed8b90cf..35bf53bf078 100644 --- a/docs/user_guide/storing/index_weaviate.md +++ b/docs/user_guide/storing/index_weaviate.md @@ -14,6 +14,13 @@ jupyter: # Weaviate Document Index +!!! note "Install dependencies" + To use [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex], you need to install extra dependencies with the following command: + + ```console + pip install "docarray[weaviate]" + ``` + This is the user guide for the [WeaviateDocumentIndex](docarray.index.backends.hnswlib.WeaviateDocumentIndex), focussing on special features and configurations of Weaviate. 
From 921fefc884035f154d3fd63b4aa3ff4e32a4ab16 Mon Sep 17 00:00:00 2001 From: AnneY Date: Mon, 17 Apr 2023 16:32:55 +0800 Subject: [PATCH 29/35] docs: fix code snippets Signed-off-by: AnneY --- docs/user_guide/storing/first_steps.md | 34 +++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index b0954e4a4bc..d1e045b5748 100644 --- a/docs/user_guide/storing/first_steps.md +++ b/docs/user_guide/storing/first_steps.md @@ -28,7 +28,7 @@ Currently, DocArray supports the following vector databases: - [Elasticsearch](https://www.elastic.co/elasticsearch/) | [Docs v8](TODO), [Docs v7](TODO) - [HNSWlib](https://github.com/nmslib/hnswlib) | [Docs](TODO) -For this user guide you will use the [HNSWLibDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) +For this user guide you will use the [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) because it doesn't require you to launch a database server. Instead, it will store your data locally. !!! note "Using a different vector database" @@ -37,8 +37,8 @@ because it doesn't require you to launch a database server. Instead, it will sto !!! note "HNSWLib-specific settings" The following sections explain the general concept of Document Index by using - [HNSWLibDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) as an example. - For HNSWLib-specific settings, check out the [HNSWLibDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) documentation. + [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) as an example. + For HNSWLib-specific settings, check out the [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) documentation. 
TODO link docs ### Create a Document Index @@ -54,7 +54,7 @@ To create a Document Index, your first need a Document that defines the schema o ```python from docarray import BaseDoc -from docarray.index import HNSWLibDocumentIndex +from docarray.index import HnswDocumentIndex from docarray.typing import NdArray @@ -63,12 +63,12 @@ class MyDoc(BaseDoc): text: str -db = HNSWLibDocumentIndex[MyDoc](work_dir='./my_test_db') +db = HnswDocumentIndex[MyDoc](work_dir='./my_test_db') ``` **Schema definition:** -In this code snippet, `HNSWLibDocumentIndex` takes a schema of the form of `MyDoc`. +In this code snippet, `HnswDocumentIndex` takes a schema of the form of `MyDoc`. The Document Index then _creates column for each field in `MyDoc`_. The column types in the backend database are determined the type hints of the fields in the Document. @@ -84,7 +84,7 @@ the database will store vectors with 128 dimensions. **Database location:** -For `HNSWLibDocumentIndex` you need to specify a `work_dir` where the data will be stored; for other backends you +For `HnswDocumentIndex` you need to specify a `work_dir` where the data will be stored; for other backends you usually specify a `host` and a `port` instead. Either way, if the location does not yet contain any data, we start from a blank slate. @@ -110,7 +110,7 @@ db.index(docs) That call to [index()][docarray.index.backends.hnswlib.HnswDocumentIndex.index] stores all Documents in `docs` into the Document Index, ready to be retrieved in the next step. -As you can see, `DocList[MyDoc]` and `HNSWLibDocumentIndex[MyDoc]` are both parameterized with `MyDoc`. +As you can see, `DocList[MyDoc]` and `HnswDocumentIndex[MyDoc]` are both parameterized with `MyDoc`. This means that they share the same schema, and in general, the schema of a Document Index and the data that you want to store need to have compatible schemas. 
@@ -181,14 +181,14 @@ You can also search for multiple Documents at once, in a batch, using the [find_ ```python # create some query Documents queries = DocList[MyDoc]( - MyDoc(embedding=np.random.rand(128), text=f'query {i}') for _ in range(3) + MyDoc(embedding=np.random.rand(128), text=f'query {i}') for i in range(3) ) # find similar Documents -matches, scores = db.find(queries, search_field='embedding', limit=5) +matches, scores = db.find_batched(queries, search_field='embedding', limit=5) print(f'{matches=}') -print(f'{matches.text=}') +print(f'{matches[0].text=}') print(f'{scores=}') ``` @@ -199,7 +199,7 @@ print(f'{scores=}') query = np.random.rand(3, 128) # find similar Documents -matches, scores = db.find(query, search_field='embedding', limit=5) +matches, scores = db.find_batched(query, search_field='embedding', limit=5) print(f'{matches=}') print(f'{matches[0].text=}') @@ -239,7 +239,7 @@ query = ( ) # execute the combined query and return the results -results = store.execute_query(q) +results = db.execute_query(query) print(f'{results=}') ``` @@ -259,11 +259,11 @@ You can also access data by the id that as assigned to every Document: ```python # prepare some data data = DocList[MyDoc]( - MyDoc(embedding=np.random.rand(128), text=f'query {i}') for _ in range(3) + MyDoc(embedding=np.random.rand(128), text=f'query {i}') for i in range(3) ) # remember the Document ids and index the data -ids = data.ids +ids = data.id db.index(data) # access the Documents by id @@ -278,11 +278,11 @@ In the same way you can access Documents by id, you can delete them: ```python # prepare some data data = DocList[MyDoc]( - MyDoc(embedding=np.random.rand(128), text=f'query {i}') for _ in range(3) + MyDoc(embedding=np.random.rand(128), text=f'query {i}') for i in range(3) ) # remember the Document ids and index the data -ids = data.ids +ids = data.id db.index(data) # access the Documents by id From 0fbb24e95e47939e5722d816ab2a7470dd1e7a5d Mon Sep 17 00:00:00 2001 From: Johannes 
Messner Date: Mon, 17 Apr 2023 13:26:57 +0200 Subject: [PATCH 30/35] docs: tweak es docs Signed-off-by: Johannes Messner --- docs/user_guide/storing/index_elastic.md | 161 ++++++++++++++++------- docs/user_guide/storing/index_qdrant.md | 2 + 2 files changed, 114 insertions(+), 49 deletions(-) diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index 416a116a2df..bc97b05c1f8 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -1,26 +1,40 @@ # ElasticSearch Document Index -[ElasticV7DocIndex](docarray.index.backends.elastic.ElasticV7DocIndex) implement the index based on [Elasticsearch 7.10](https://github.com/elastic/elasticsearch). This is an implementation with vectors stored and supporting text/range search. +DocArray comes with two Document Indexes for [Elasticsearch](https://www.elastic.co/elasticsearch/): +- [ElasticDocIndex](docarray.index.backends.elastic.ElasticDocIndex), based on [Elasticsearch 8](https://github.com/elastic/elasticsearch). +- [ElasticV7DocIndex](docarray.index.backends.elastic.ElasticV7DocIndex), based on [Elasticsearch 7.10](https://www.elastic.co/downloads/past-releases/elasticsearch-7-10-0). -!!! note - To use [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex], one need to install the extra dependency with the following command +!!! tip "Should you use ES v7 or v8?" + [Elasticsearch v8](https://www.elastic.co/blog/whats-new-elastic-8-0-0) is the current version of ES and offers + **native vector search (ANN) support**, alongside text and range search. - ```console - pip install "docarray[elasticsearch]" - ``` + [Elasticsearch v7.10](https://www.elastic.co/downloads/past-releases/elasticsearch-7-10-0) can store vectors, but + **does _not_ support native ANN vector search**, but only exhaustive (=slow) vector search, alongside text and range search. 
-[ElasticDocIndex](docarray.index.backends.elastic.ElasticDocIndex) is based on [Elasticsearch 8](https://github.com/elastic/elasticsearch) and supports hnsw based vector search as well. + Some users prefer to use ES v7.10 because it is available under a [different license](https://www.elastic.co/pricing/faq/licensing) compared to ES v8.0.0. -!!! note - To use [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex], one need to install the extra dependency with the following command +!!! note "Installation" + To use [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex], you need to install the following dependencies: ```console pip install elasticsearch==8.6.2 pip install elastic-transport ``` + To use [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex], you need to install the following dependencies: + + ```console + pip install elasticsearch==7.10.1 + pip install elastic-transport + ``` + -The following examples is based on `ElasticDocIndex`. We use docker-compose to create a local elasticsearch service with the following `docker-compose.yml`. +The following examples is based on [ElasticDocIndex](docarray.index.backends.elastic.ElasticDocIndex), +but should also work with [ElasticV7DocIndex](docarray.index.backends.elastic.ElasticV7DocIndex) + +# Start ElasticSearch + +You can use docker-compose to create a local Elasticsearch service with the following `docker-compose.yml`. ```yaml version: "3.3" @@ -41,16 +55,22 @@ networks: name: elastic ``` -Run the following command in the folder of the above `docker-compose.yml` to start the service, +Run the following command in the folder of the above `docker-compose.yml` to start the service: ```bash docker-compose up ``` ## Construct -To construct an index, you need to define the schema first. You can define the schema in the same way as defining a `Doc`. Dimensionality is necessary for vector space that you want to perform similarity search in the future. 
You need to specify the shape or define it by `dims`. You can set `col_type` to configurate [field data types in ElasticSearch](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping-types.html). +To construct an index, you first need to define a schema in the form of a `Document`. -`hosts` is the argument for setting the elasticsearch hosts. By default, it is `http://localhost:9200`. +There are a number of configurations you can pack into your schema: +- Every field in your schema will become one column in the database +- For vector fields, such as `NdArray`, `TorchTensor`, or `TensorflowTensor`, you need to specify a dimensionality to be able to perform vector search +- You can override the default column type for every field. To do that, you can pass any [ES field data type](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping-types.html) to `field_name: Type = Field(col_type=...)`. You can see an example of this on the [section on keyword filters](#keyword-filter). + +Additionally, you can pass a `hosts` argument to the `__init__()` method to connect to an ES instance. +By default, it is `http://localhost:9200`. ```python @@ -63,15 +83,18 @@ from docarray.typing import NdArray class SimpleDoc(BaseDoc): - tensor: NdArray = Field(dims=128) - # tensor: NdArray[128] + # specify tensor field with dimensionality 128 + tensor: NdArray[128] + # alternative and equivalent definition: + # tensor: NdArray = Field(dims=128) -doc_index = ElasticDocIndex[SimpleDoc]() +doc_index = ElasticDocIndex[SimpleDoc](hosts='http://localhost:9200') ``` -## Index -Use `.index()` to add `Doc` into the index. You could use the same class as the schema for defining the `Doc`. Alternatively, you need to define the `Doc` following the schema of the index. `.num_docs()` returns the total number of `Doc` in the index. +## Index Documents +Use `.index()` to add Documents into the index. +The`.num_docs()` method returns the total number of Documents in the index. 
```python index_docs = [SimpleDoc(tensor=np.ones(128)) for _ in range(64)] @@ -81,8 +104,8 @@ doc_index.index(index_docs) print(f'number of docs in the index: {doc_index.num_docs()}') ``` -## Access -To access the `Doc`, you need to specify the `id`. You can also pass a list of `id` to access multiple `Doc`. +## Access Documents +To access the `Doc`, you need to specify the `id`. You can also pass a list of `id` to access multiple Documents. ```python # access a single Doc @@ -93,20 +116,26 @@ doc_index[index_docs[16].id, index_docs[17].id] ``` ### Persistence -To access a `Doc` formerly persisted, you can specify `index_name` and the `hosts`. +You can hook into a database index that was persisted during a previous session. +To do so, you need to specify `index_name` and the `hosts`: ```python -doc_index = ElasticDocIndex[SimpleDoc](index_name='previously_stored') +doc_index = ElasticDocIndex[SimpleDoc]( + hosts='http://localhost:9200', index_name='previously_stored' +) doc_index.index(index_docs) -doc_index2 = ElasticDocIndex[SimpleDoc](index_name='previously_stored') +doc_index2 = ElasticDocIndex[SimpleDoc]( + hosts='http://localhost:9200', index_name='previously_stored' +) print(f'number of docs in the persisted index: {doc_index2.num_docs()}') ``` -## Delete -To delete the `Doc`, use the built-in function `del` with the `id` of the `Doc` to be deleted. You can also pass a list of `id` to delete multiple `Doc`. +## Delete Documents +To delete the Documents, use the built-in function `del` with the `id` of the Documents that you want to delete. +You can also pass a list of ids to delete multiple Documents. ```python # delete a single Doc @@ -116,8 +145,18 @@ del doc_index[index_docs[16].id] # delete multiple Docs del doc_index[index_docs[17].id, index_docs[18].id] ``` -## Find Nearest Neighbors -The `.find()` method is used to find the nearest neighbors of a tensor. You need to specify `search_field` that is used when performing the vector search. 
You can use `limit` argument to configurate how much `Doc` to return.
+## Find nearest neighbors
+The `.find()` method is used to find the nearest neighbors of a vector.
+
+You need to specify `search_field` that is used when performing the vector search.
+This is the field that serves as the basis of comparison between your query and your indexed Documents.
+
+You can use the `limit` argument to configure how many Documents to return.
+
+!!! note
+    [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex] is using Elasticsearch v7.10.1 which does not support approximate nearest neighbour algorithms such as HNSW.
+    This can lead to a poor performance when the search involves many vectors.
+    [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex] does not have this limitation.

```python
query = SimpleDoc(tensor=np.ones(128))
@@ -125,11 +164,13 @@ query = SimpleDoc(tensor=np.ones(128))
docs, scores = doc_index.find(query, limit=5, search_field='tensor')
```

-!!! note
-    [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex] is using Elasticsearch v7.10.1 which does not support approximate nearest neighbour algorithms as Hnswlib. This could lead to a poor performance when the search involves too many vectors.

-## Nested Index
-When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTbueVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`.
+## Nested data
+When using the index you can define multiple fields, including nesting Documents inside another Document.
+
+Consider the following example:
+You have `YouTubeVideoDoc` including the `tensor` field calculated based on the description.
+Besides, `YouTubeVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`.
```python from docarray.typing import ImageUrl, VideoUrl, AnyTensor @@ -167,7 +208,10 @@ index_docs = [ doc_index.index(index_docs) ``` -You can use the dunder operator to specify the field defined in the nested data. In the following codes, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` and `video` field. +**You can perform search on any nesting level.** +To do so, use the dunder operator to specify the field defined in the nested data. + +In the following example, you can see how to perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` and `video` field: ```python # example of find nested and flat index @@ -178,10 +222,13 @@ query_doc = YouTubeVideoDoc( video=VideoDoc(url=f'http://example.ai/videos/1024', tensor=np.ones(128)), tensor=np.ones(256), ) + # find by the youtubevideo tensor docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3) + # find by the thumbnail tensor docs, scores = doc_index.find(query_doc, search_field='thumbnail__tensor', limit=3) + # find by the video tensor docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3) ``` @@ -189,18 +236,18 @@ docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3) To delete a nested data, you need to specify the `id`. !!! note -You can only delete `Doc` at the top level. Deletion of the `Doc` on the lower level is not supported yet. + You can only delete `Doc` at the top level. Deletion of the `Doc` on the lower level is not supported yet. ```python # example of delete nested and flat index del doc_index[index_docs[3].id, index_docs[4].id] ``` -## Elasticsearch Query -Besides the vector search, you can also perform other queries supported by Elasticsearch. 
+## Other Elasticsearch queries +Besides the vector search, you can also perform other queries supported by Elasticsearch, such as text search, and various filters. -### Text Search -As in elasticsearch, you could use text search directly on the field of type `str`. +### Text search +As in "pure" Elasticsearch, you can use text search directly on the field of type `str`: ```python class NewsDoc(BaseDoc): @@ -215,20 +262,23 @@ index_docs = [ ] doc_index.index(index_docs) query = 'finance' + # search with text docs, scores = doc_index.text_search(query, search_field='text') ``` ### Query Filter -`filter()` accepts queries that follow [Elasticsearch Query DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) and consists of leaf and compound clauses. +The `filter()` method accepts queries that follow the [Elasticsearch Query DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) and consist of leaf and compound clauses. + +Using this, you can perform [keyword filters](#keyword-filter), [geolocation filters](#geolocation-filter) and [range filters](#range-filter). #### Keyword filter -To filter the docs, you can use `col_type='keyword'` to configurate the keyword search for the fields. +To filter the Documents in your index by keyword, you can use `Field(col_type='keyword')` to enable keyword search for a given fields: ```python class NewsDoc(BaseDoc): text: str - category: str = Field(col_type='keyword') + category: str = Field(col_type='keyword') # enable keyword filtering doc_index = ElasticDocIndex[NewsDoc]() @@ -245,12 +295,12 @@ docs = doc_index.filter(query_filter) ``` #### Geolocation filter -To filter the docs, you can use `col_type='geo_point'` to configurate the keyword search for the fields. +To filter the Documents in your index by geolocation, you can use `Field(col_type='geo_point')` on a given field. 
```python
class NewsDoc(BaseDoc):
    text: str
-    location: dict = Field(col_type='geo_point')
+    location: dict = Field(col_type='geo_point')  # enable geolocation filtering


doc_index = ElasticDocIndex[NewsDoc]()
@@ -279,11 +329,13 @@ docs = doc_index.filter(query)
```

#### Range filter
-You can have [range field types](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/range.html) in your `Doc` schema and set `col_type='integer_range'`(or also `date_range`, etc.) to filter the docs based on the range of the field.
+You can have [range field types](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/range.html) in your Document schema and set `Field(col_type='integer_range')` (or also `date_range`, etc.) to filter the docs based on the range of the field.

```python
class NewsDoc(BaseDoc):
-    time_frame: dict = Field(col_type='date_range', format='yyyy-MM-dd')
+    time_frame: dict = Field(
+        col_type='date_range', format='yyyy-MM-dd'
+    )  # enable range filtering


doc_index = ElasticDocIndex[NewsDoc]()
@@ -307,11 +359,16 @@ query = {
        }
    }
}
+
docs = doc_index.filter(query)
```

-### QueryBuilder
-You can use `QueryBuilder` to build your own query. `find()`, `filter()` and `text_search()` methods and their combination are supported.
+### Hybrid search and query builder
+To combine any of the "atomic" search approaches above, you can use the `QueryBuilder` to build your own hybrid query.
+
+For this the `find()`, `filter()` and `text_search()` methods and their combination are supported.
+
+For example, you can build a hybrid search query that performs range filtering, vector search and text search:

```python
class MyDoc(BaseDoc):
@@ -337,9 +394,9 @@ q = (

docs, _ = doc_index.execute_query(q)
```

-You can also directly pass a query to `execute_query()` method.
+You can also manually build a valid ES query and directly pass it to the `execute_query()` method.
-## Config +## Configuration options ### DBConfig The following configs can be set in `DBConfig`: @@ -352,6 +409,9 @@ The following configs can be set in `DBConfig`: | `index_settings` | Other [index settings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/index-modules.html#index-modules-settings) in a Dict for creating the index | dict | | `index_mappings` | Other [index mappings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping.html) in a Dict for creating the index | dict | +You can pass any of the above as keyword arguments to the `__init__()` method or pass an entire configuration object. +To see how, see [here](user_guide/storing/first_steps.md#configuration-options#customize-configurations). + ### RuntimeConfig The `RuntimeConfig` dataclass of `ElasticDocIndex` consists of `default_column_config` and `chunk_size`. You can change `chunk_size` for batch operations. @@ -369,4 +429,7 @@ class SimpleDoc(BaseDoc): doc_index = ElasticDocIndex[SimpleDoc]() -``` \ No newline at end of file +``` + +You can pass the above as a keyword arguments the `configure()` method or pass an entire configuration object. +To see how, see [here](user_guide/storing/first_steps.md#configuration-options#customize-configurations). \ No newline at end of file diff --git a/docs/user_guide/storing/index_qdrant.md b/docs/user_guide/storing/index_qdrant.md index 4d2a64642f5..d03a12e4e37 100644 --- a/docs/user_guide/storing/index_qdrant.md +++ b/docs/user_guide/storing/index_qdrant.md @@ -10,6 +10,8 @@ The following is a starter script for using the [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex], based on the [Qdrant](https://qdrant.tech/) vector search engine. +For general usage of a Document Index, see the [general user guide](./first_steps.md#document-index). + !!! 
tip "See all configuration options" To see all configuration options for the [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex], you can do the following: From 95df90a0393be650817648ac7e0fdb01677cdfd6 Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 17 Apr 2023 14:34:51 +0200 Subject: [PATCH 31/35] docs: add explanation of vector search Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_steps.md | 47 +++++--- docs/user_guide/storing/index_hnswlib.md | 138 ++++++++++++++++------- 2 files changed, 133 insertions(+), 52 deletions(-) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index d1e045b5748..315e7d7ae3b 100644 --- a/docs/user_guide/storing/first_steps.md +++ b/docs/user_guide/storing/first_steps.md @@ -17,16 +17,35 @@ Concrete examples where this is relevant are neural search application, Augmenti or recommender systems. !!! question "How does vector similarity search work?" - TODO + Without going into too much detail, the idea behind vector similarity search is the following: + + You represent every data point that you have (in our case, a Document) as a _vector_, or _embedding_. + This vector should represent as much semantic information about your data as possible: Similar data points should + be represented by similar vectors. + + These vectors (embeddings) are usually obtained by passing the data through a suitable neural network that has been + trained to produce such semantic representations - this is the _encoding_ step. + + Once you have your vector that represent your data, you can store them, for example in a vector database. + + To perform similarity search, you take your input query and encode it in the same way as the data in your database. + Then, the database will search through the stored vectors and return the ones that are most similar to your query. 
+ This similarity is measured by a _similarity metric_, which can be [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity), + [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance), or any other metric that you can think of. + + If you store a lot of data, performing this similarity computation for every data point in your database is expensive. + Therefore, vector databases usually perform _approximate nearest neighbor (ANN)_ search. + There are various algorithms for doing this, such as [HNSW](https://arxiv.org/abs/1603.09320), but in a nutshell, + they allow you to search through a large database of vectors very quickly, at the expense of a small loss in accuracy. DocArray's Document Index concept achieves this by providing a unified interface to a number of [vector databases](https://learn.microsoft.com/en-us/semantic-kernel/concepts-ai/vectordb). In fact, you can think of Document Index as an **[ORM](https://sqlmodel.tiangolo.com/db-to-code/) for vector databases**. Currently, DocArray supports the following vector databases: -- [Weaviate](https://weaviate.io/) | [Docs](TODO) -- [Qdrant](https://qdrant.tech/) | [Docs](TODO) -- [Elasticsearch](https://www.elastic.co/elasticsearch/) | [Docs v8](TODO), [Docs v7](TODO) -- [HNSWlib](https://github.com/nmslib/hnswlib) | [Docs](TODO) +- [Weaviate](https://weaviate.io/) | [Docs](index_weaviate.md) +- [Qdrant](https://qdrant.tech/) | [Docs](index_qdrant.md) +- [Elasticsearch](https://www.elastic.co/elasticsearch/) v7 and v8 | [Docs](index_elastic.md) +- [HNSWlib](https://github.com/nmslib/hnswlib) | [Docs](index_hnswlib.md) For this user guide you will use the [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) because it doesn't require you to launch a database server. Instead, it will store your data locally. @@ -38,8 +57,8 @@ because it doesn't require you to launch a database server. Instead, it will sto !!! 
note "HNSWLib-specific settings" The following sections explain the general concept of Document Index by using [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) as an example. - For HNSWLib-specific settings, check out the [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) documentation. - TODO link docs + For HNSWLib-specific settings, check out the [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) documentation + [here](index_hnswlib.md). ### Create a Document Index @@ -72,7 +91,7 @@ In this code snippet, `HnswDocumentIndex` takes a schema of the form of `MyDoc`. The Document Index then _creates column for each field in `MyDoc`_. The column types in the backend database are determined the type hints of the fields in the Document. -Optionally, you can customize the database types for every field TODO link to this. +Optionally, you can customize the database types for every field, as you can see [here](#customize-configurations). Most vector databases need to know the dimensionality of the vectors that will be stored. Here, that is automatically inferred from the type hint of the `embedding` field: `NdArray[128]` means that @@ -80,7 +99,7 @@ the database will store vectors with 128 dimensions. !!! note "PyTorch and TensorFlow support" Instead of using `NdArray` you can use `TorchTensor` or `TensorFlowTensor` and the Document Index will handle that - for you. No need to convert your tensors to numpy arrays! + for you. This is supported for all Document Index backends. No need to convert your tensors to numpy arrays manually! **Database location:** @@ -170,7 +189,7 @@ which one to use for the search. The [find()][docarray.index.backends.hnswlib.HnswDocumentIndex.find] method returns a named tuple containing the closest matching documents and their associated similarity scores. -How these scores are calculated depends on the backend, and can usually be configured TODO link. 
+How these scores are calculated depends on the backend, and can usually be [configured](#customize-configurations). **Batched search:** @@ -218,7 +237,7 @@ as well as their batched versions [text_search_batched()][docarray.index.backend The [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implementation does not offer support for filter or text search. -To see how to perform these operations, you can check out other backends that do: TODO add link to those +To see how to perform these operations, you can check out other backends that do. ### Perform hybrid search through the query builder @@ -230,7 +249,7 @@ through [build_query()][docarray.index.backends.hnswlib.HnswDocumentIndex.build_ ```python # prepare a query q_doc = MyDoc(embedding=np.random.rand(128), text='query') -# TODO black doesnt like the code below + query = ( db.build_query() # get empty query object .find(query=q_doc, search_field='embedding') # add vector similarity search @@ -248,7 +267,7 @@ to obtain a combined set of results. What kinds of atomic queries can be combined in this way depends on the backend. Some can combine text search and vector search, others can perform filters and vectors search, etc. -To see what backend can do what, check out the specific docs TODO add links +To see what backend can do what, check out the [specific docs](#document-index). ### Access Documents by id @@ -477,7 +496,7 @@ The `HnswDocumentIndex` above contains two columns which are configured differen All configurations that are not explicitly set will be taken from the `default_column_config` of the `RuntimeConfig`. -For an explanation of the configurations that are tweaked in this example, see the `HnswDocumentIndex` documentation TODO link. +For an explanation of the configurations that are tweaked in this example, see the `HnswDocumentIndex` [documentation](index_hnswlib.md). 
## Document Store diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md index 9cf688250eb..e2e1406c448 100644 --- a/docs/user_guide/storing/index_hnswlib.md +++ b/docs/user_guide/storing/index_hnswlib.md @@ -7,74 +7,136 @@ pip install "docarray[hnswlib]" ``` -[HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implements the index based on [hnswlib](https://github.com/nmslib/hnswlib). This is a lightweight implementation with vectors stored in memory. +[HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] is a lightweight Document Index implementation +that runs fully locally and is best suited for small to medium sized datasets. +It stores vectors on disc in [hnswlib](https://github.com/nmslib/hnswlib), and stores all other data in [SQLite](https://www.sqlite.org/index.html). -## Construct -To construct an index, you need to define the schema first. You can define the schema in the same way as define a `Doc`. The only difference is that you need to define the dimensionality of the vector space by `dim` and the name of the space by `space`. The `dim` argument must be an integer. The `space` argument can be one of `l2`, `ip` or `cosine`. TODO: add links to the detailed explaination +!!! note "Production readiness" + [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] is a great starting point + for small to medium sized datasets, but it is not battle tested in production. If scalability, uptime, etc. are + important to you, we recommend you eventually transition to one of our database backed Document Index implementations: + - [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex] + - [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex] + - [ElasticDocumentIndex][docarray.index.backends.elasticsearch.ElasticDocumentIndex] -`work_dir` is the directory for storing the index. 
If there is an index in the directory, it will be automatically loaded. When the schema of the saved and the defined index do not match, an exception will be raised.

-```python
-import numpy as np
-from pydantic import Field
+## Basic Usage
+
+To see how to create a [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] instance, add Documents,
+perform search, etc. see the [general user guide](./first_steps.md#document-index).
+
+## Configuration
+
+This section lays out the configurations and options that are specific to [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex].
+
+### DBConfig
+
+The `DBConfig` of [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] expects only one argument:
+`work_dir`.
+This is the location where all of the Index's data will be stored: The various HNSWLib indexes, as well as the SQLite database.
+
+You can pass this directly to the constructor:
+
+```python
from docarray import BaseDoc
from docarray.index import HnswDocumentIndex
from docarray.typing import NdArray


-class SimpleSchema(BaseDoc):
-    tensor: NdArray = Field(dim=128, space='cosine')
+class MyDoc(BaseDoc):
+    embedding: NdArray[128]
+    text: str


-doc_index = HnswDocumentIndex[SimpleSchema](work_dir='./tmp')
+db = HnswDocumentIndex[MyDoc](work_dir='./path/to/db')
```

-## Index
-Use `.index()` to add `Doc` into the index. You need to define the `Doc` following the schema of the index. `.num_docs()` returns the total number of `Doc` in the index.
-
-```python
-class SimpleDoc(BaseDoc):
-    tensor: NdArray[128]
+You can specify an existing directory that holds data from a previous session.
+In that case, the Index will load the data from that directory.

+!!! note "HNSWLib file lock"
+    HNSWLib uses a file lock to prevent multiple processes from accessing the same index at the same time.
+    This means that if you try to open an index that is already open in another process, you will get an error.
+ To avoid this, you can specify a different `work_dir` for each process. -index_docs = [SimpleDoc(tensor=np.zeros(128)) for _ in range(64)] +### RuntimeConfig -doc_index.index(index_docs) -print(f'number of docs in the index: {doc_index.num_docs()}') -``` +The `RuntimeConfig` of [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] contains only one entry, +the default mapping from Python types to column configurations. -## Access -To access the `Doc`, you need to specify the `id`. You can also pass a list of `id` to access multiple `Doc`. +You can see in the [section below](#field-wise-configurations) how to override configurations for specific fields. +If you want to set configurations globally, i.e. for all vector fields in your Documents, you can do that using `RuntimeConfig`: ```python -# access a single Doc -doc_index[index_docs[16].id] +import numpy as np -# access multiple Docs -doc_index[index_docs[16].id, index_docs[17].id] +db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + +db.configure( + default_column_config={ + np.ndarray: { + 'dim': -1, + 'index': True, + 'space': 'ip', + 'max_elements': 2048, + 'ef_construction': 100, + 'ef': 15, + 'M': 8, + 'allow_replace_deleted': True, + 'num_threads': 5, + }, + None: {}, + } +) ``` -## Delete -To delete the `Doc`, use the built-in function `del` with the `id` of the `Doc` to be deleted. You can also pass a list of `id` to delete multiple `Doc`. +This will set the default configuration for all vector fields to the one specified in the example above. -```python -# delete a single Doc -del doc_index[index_docs[16].id] +!!! note + Even if your vectors come from PyTorch or TensorFlow, you can and should still use the `np.ndarray` configuration. + This is because all tensors are converted to `np.ndarray` under the hood. -# delete multiple Docs -del doc_index[index_docs[17].id, index_docs[18].id] -``` +For more information on these settings, see [below](#field-wise-configurations). 
-
Fields that are not vector fields (e.g. of type `str` or `int` etc.) do not offer any configuration, as they are simply
+stored as-is in a SQLite database.
+
+### Field-wise configurations
+
+There are various settings that you can tweak for every vector field that you index into HNSWLib.
+
+You pass all of those using the `field: Type = Field(...)` syntax:

```python
-query = SimpleDoc(tensor=np.ones(128))
+from pydantic import Field

-docs, scores = doc_index.find(query, limit=5, search_field='tensor')
+
+class Schema(BaseDoc):
+    tens: NdArray[100] = Field(max_elements=12, space='cosine')
+    tens_two: NdArray[10] = Field(M=4, space='ip')
+
+
+db = HnswDocumentIndex[Schema](work_dir='/tmp/my_db')
```

+In the example above you can see how to configure two different vector fields, with two different sets of settings.
+
+In this way, you can pass [all options that HNSWLib supports](https://github.com/nmslib/hnswlib#api-description):
+
+| Keyword | Description | Default |
+|-------------------|--------------------------------------------------------------------------------------------------------------------------------|---------|
+| `max_elements` | Maximum number of vectors that can be stored | 1024 |
+| `space` | Vector space (similarity metric) the index operates in. Supports 'l2', 'ip', and 'cosine' | 'l2' |
+| `index` | Whether or not an index should be built for this field. 
| True |
+| `ef_construction` | defines a construction time/accuracy trade-off | 200 |
+| `ef` | parameter controlling query time/accuracy trade-off | 10 |
+| `M` | parameter that defines the maximum number of outgoing connections in the graph | 16 |
+| `allow_replace_deleted` | enables replacing of deleted elements with new added ones | True |
+| `num_threads` | sets the number of cpu threads to use | 1 |
+
+You can find more details on these parameters [here](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md).
+
## Nested Index
When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTbueVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`.

From 5c41d204bf85f1f61fc0ee9a8d584eda1582216a Mon Sep 17 00:00:00 2001
From: Johannes Messner
Date: Mon, 17 Apr 2023 14:43:19 +0200
Subject: [PATCH 32/35] docs: add nested stuff

Signed-off-by: Johannes Messner

---
 docs/user_guide/storing/first_steps.md | 80 ++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md
index 315e7d7ae3b..de7be2d5e54 100644
--- a/docs/user_guide/storing/first_steps.md
+++ b/docs/user_guide/storing/first_steps.md
@@ -498,6 +498,86 @@ All configurations that are not explicitly set will be taken from the `default_c
 
 For an explanation of the configurations that are tweaked in this example, see the `HnswDocumentIndex` [documentation](index_hnswlib.md).
 
+### Nested data
+
+The examples above all operate on a simple schema: All fields in `MyDoc` have "basic" types, such as `str` or `NdArray`.
+
+**Index nested data:**
+
+It is, however, also possible to represent nested Documents and store them in a Document Index.
+
+In the following example you can see a complex schema that contains nested Documents. 
+The `YouTubeVideoDoc` contains a `VideoDoc` and an `ImageDoc`, alongside some "basic" fields: + +```python +from docarray.typing import ImageUrl, VideoUrl, AnyTensor + + +# define a nested schema +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: AnyTensor = Field(space='cosine', dim=64) + + +class VideoDoc(BaseDoc): + url: VideoUrl + tensor: AnyTensor = Field(space='cosine', dim=128) + + +class YouTubeVideoDoc(BaseDoc): + title: str + description: str + thumbnail: ImageDoc + video: VideoDoc + tensor: AnyTensor = Field(space='cosine', dim=256) + + +# create a Document Index +doc_index = HnswDocumentIndex[YouTubeVideoDoc](work_dir='./tmp2') + +# create some data +index_docs = [ + YouTubeVideoDoc( + title=f'video {i+1}', + description=f'this is video from author {10*i}', + thumbnail=ImageDoc(url=f'http://example.ai/images/{i}', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/{i}', tensor=np.ones(128)), + tensor=np.ones(256), + ) + for i in range(8) +] + +# index the Documents +doc_index.index(index_docs) +``` + + +**Search nested data:** + +You can perform search on any nesting level. +To do so, use the dunder operator to specify the field defined in the nested data. 
+
In the following example, you can see how to perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the nested `thumbnail` and `video` fields:

```python
# create a query Document
query_doc = YouTubeVideoDoc(
    title=f'video query',
    description=f'this is a query video',
    thumbnail=ImageDoc(url=f'http://example.ai/images/1024', tensor=np.ones(64)),
    video=VideoDoc(url=f'http://example.ai/videos/1024', tensor=np.ones(128)),
    tensor=np.ones(256),
)

# find by the `youtubevideo` tensor; root level
docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3)

# find by the `thumbnail` tensor; nested level
docs, scores = doc_index.find(query_doc, search_field='thumbnail__tensor', limit=3)

# find by the `video` tensor; nested level
docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3)
```
## Document Store
This section show you how to use the `DocArray.store` module. `DocArray.store` module is used to store the `Doc`. 
From 988c17747422ddc3f6b2de2f84d935a63d49c9df Mon Sep 17 00:00:00 2001
From: Johannes Messner
Date: Mon, 17 Apr 2023 15:42:41 +0200
Subject: [PATCH 33/35] docs: fix rendering and links

Signed-off-by: Johannes Messner

---
 docs/user_guide/storing/first_step.md | 21 +++++-
 docs/user_guide/storing/first_steps.md | 82 +++++++----------------
 docs/user_guide/storing/index_elastic.md | 16 ++---
 docs/user_guide/storing/index_hnswlib.md | 3 +-
 docs/user_guide/storing/index_weaviate.md | 6 +-
 5 files changed, 57 insertions(+), 71 deletions(-)

diff --git a/docs/user_guide/storing/first_step.md b/docs/user_guide/storing/first_step.md
index 13ecfe138c0..49aa3d4ace3 100644
--- a/docs/user_guide/storing/first_step.md
+++ b/docs/user_guide/storing/first_step.md
@@ -1,9 +1,9 @@
-# Intro
+# Overview

In the previous sections we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] to represent multi-modal data and send it over the wire.
In this section we will see how to store and persist this data.

-DocArray offers to ways of storing your data:
+DocArray offers two ways of storing your data, each of which has its own documentation section:

1. In a **[Document Store](#document-store)** for simple long-term storage
2. In a **[Document Index](#document-index)** for fast retrieval using vector similarity
@@ -24,3 +24,20 @@ This section covers the following three topics:
- [Store on S3](doc_store/store_s3.md)

## Document Index
+Concrete examples where this is relevant are neural search applications, Augmenting LLMs and Chatbots with domain knowledge ([Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401)),
+or recommender systems.
+
+DocArray's Document Index concept achieves this by providing a unified interface to a number of [vector databases](https://learn.microsoft.com/en-us/semantic-kernel/concepts-ai/vectordb).
+In fact, you can think of Document Index as an **[ORM](https://sqlmodel.tiangolo.com/db-to-code/) for vector databases**.
+
+Currently, DocArray supports the following vector databases:
+
+- [Weaviate](https://weaviate.io/) | [Docs](index_weaviate.md)
+- [Qdrant](https://qdrant.tech/) | [Docs](index_qdrant.md)
+- [Elasticsearch](https://www.elastic.co/elasticsearch/) v7 and v8 | [Docs](index_elastic.md)
+- [HNSWlib](https://github.com/nmslib/hnswlib) | [Docs](index_hnswlib.md)

diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md
index beeb0cacddf..13d0e603a32 100644
--- a/docs/user_guide/storing/first_steps.md
+++ b/docs/user_guide/storing/first_steps.md
@@ -1,13 +1,4 @@
-# Store
-
-If you work with multi-modal data, usually you want to **store** it somewhere.
-
-DocArray offers to ways of storing your data:
-
-1. In a **[Document Index](#document-index)** for fast retrieval using vector similarity
-2. In a **[Document Store](#document-store)** for simple long-term storage
-
-## Document Index
+# Overview

A Document Index lets you store your Documents and search through them using vector similarity.
Currently, DocArray supports the following vector databases: + - [Weaviate](https://weaviate.io/) | [Docs](index_weaviate.md) - [Qdrant](https://qdrant.tech/) | [Docs](index_qdrant.md) - [Elasticsearch](https://www.elastic.co/elasticsearch/) v7 and v8 | [Docs](index_elastic.md) - [HNSWlib](https://github.com/nmslib/hnswlib) | [Docs](index_hnswlib.md) -For this user guide you will use the [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) +For this user guide you will use the [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] because it doesn't require you to launch a database server. Instead, it will store your data locally. !!! note "Using a different vector database" @@ -56,11 +48,11 @@ because it doesn't require you to launch a database server. Instead, it will sto !!! note "HNSWLib-specific settings" The following sections explain the general concept of Document Index by using - [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) as an example. - For HNSWLib-specific settings, check out the [HnswDocumentIndex](docarray.index.backends.hnswlib.HnswDocumentIndex) documentation + [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] as an example. + For HNSWLib-specific settings, check out the [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] documentation [here](index_hnswlib.md). -### Create a Document Index +## Create a Document Index !!! note To use [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex], you need to install extra dependencies with the following command: @@ -109,9 +101,9 @@ usually specify a `host` and a `port` instead. Either way, if the location does not yet contain any data, we start from a blank slate. If the location already contains data from a previous session, it will be accessible through the Document Index. 
-### Index data +## Index data -Now that you have a Document Index, you can add data to it, using the [index()][docarray.index.backends.hnswlib.HnswDocumentIndex.index] method: +Now that you have a Document Index, you can add data to it, using the [index()][docarray.index.abstract.BaseDocIndex.index] method: ```python import numpy as np @@ -139,16 +131,17 @@ need to have compatible schemas. Let's say A is the schema of your Document Index and B is the schema of your data. There are a few rules that determine if a schema A is compatible with a schema B. If _any_ of the following is true, then A and B are compatible: + - A and B are the same class - A and B have the same field names and field types - A and B have the same field names, and, for every field, the type of B is a subclass of the type of A -### Perform vector similarity search +## Perform vector similarity search -Now that you have indexed your data, you can perform vector similarity search using the [find()][docarray.index.backends.hnswlib.HnswDocumentIndex.find] method. +Now that you have indexed your data, you can perform vector similarity search using the [find()][docarray.index.abstract.BaseDocIndex.find] method. -Provided with a Document of type `MyDoc`, [find()][docarray.index.backends.hnswlib.HnswDocumentIndex.find] can find +Provided with a Document of type `MyDoc`, [find()][docarray.index.abstract.BaseDocIndex.find] can find similar Documents in the Document Index. === "Search by Document" @@ -186,14 +179,14 @@ In this particular example you only have one field (`embedding`) that is a vecto In general, you could have multiple fields of type `NdArray` or `TorchTensor` or `TensorFlowTensor`, and you can choose which one to use for the search. 
-The [find()][docarray.index.backends.hnswlib.HnswDocumentIndex.find] method returns a named tuple containing the closest +The [find()][docarray.index.abstract.BaseDocIndex.find] method returns a named tuple containing the closest matching documents and their associated similarity scores. How these scores are calculated depends on the backend, and can usually be [configured](#customize-configurations). **Batched search:** -You can also search for multiple Documents at once, in a batch, using the [find_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.find_batched] method. +You can also search for multiple Documents at once, in a batch, using the [find_batched()][docarray.index.abstract.BaseDocIndex.find_batched] method. === "Search by Documents" @@ -225,26 +218,26 @@ print(f'{matches[0].text=}') print(f'{scores=}') ``` -The [find_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.find_batched] method returns a named tuple containing +The [find_batched()][docarray.index.abstract.BaseDocIndex.find_batched] method returns a named tuple containing a list of `DocList`s, one for each query, containing the closest matching documents; and the associated similarity scores. 
-### Perform filter search and text search +## Perform filter search and text search In addition to vector similarity search, the Document Index interface offers methods for text search and filter search: -[text_search()][docarray.index.backends.hnswlib.HnswDocumentIndex.text_search] and [filter()][docarray.index.backends.hnswlib.HnswDocumentIndex.filter], -as well as their batched versions [text_search_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.text_search_batched] and [filter_batched()][docarray.index.backends.hnswlib.HnswDocumentIndex.filter_batched] +[text_search()][docarray.index.abstract.BaseDocIndex.text_search] and [filter()][docarray.index.abstract.BaseDocIndex.filter], +as well as their batched versions [text_search_batched()][docarray.index.abstract.BaseDocIndex.text_search_batched] and [filter_batched()][docarray.index.abstract.BaseDocIndex.filter_batched] The [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implementation does not offer support for filter or text search. To see how to perform these operations, you can check out other backends that do. -### Perform hybrid search through the query builder +## Perform hybrid search through the query builder Document Index support atomic operations for vector similarity search, text search and filter search. In order to combine these operations into a singe, hybrid search query, you can use the query builder that is accessible -through [build_query()][docarray.index.backends.hnswlib.HnswDocumentIndex.build_query]: +through [build_query()][docarray.index.abstract.BaseDocIndex.build_query]: ```python # prepare a query @@ -269,7 +262,7 @@ What kinds of atomic queries can be combined in this way depends on the backend. Some can combine text search and vector search, others can perform filters and vectors search, etc. To see what backend can do what, check out the [specific docs](#document-index). 
-### Access Documents by id +## Access Documents by id To retrieve a Document from a Document Index, you don't necessarily need to perform some fancy search. @@ -290,7 +283,7 @@ doc = db[ids[0]] # get by single id docs = db[ids] # get by list of ids ``` -### Delete Documents +## Delete Documents In the same way you can access Documents by id, you can delete them: @@ -309,7 +302,7 @@ del db[ids[0]] # del by single id del db[ids[1:]] # del by list of ids ``` -### Customize configurations +## Customize configurations It is DocArray's philosophy that each Document Index should "just work", meaning that it comes with a sane set of default settings that can get you most of the way there. @@ -405,7 +398,7 @@ print(runtime_config) As you can see, `HnswDocumentIndex.RuntimeConfig` is a dataclass that contains only one configuration: `default_column_config`, which is a mapping from python types to database column configurations. -You can customize every field in this configuration using the [configure()][docarray.index.backends.hnswlib.HnswDocumentIndex.configure] method: +You can customize every field in this configuration using the [configure()][docarray.index.abstract.BaseDocIndex.configure] method: === "Pass individual settings" @@ -498,7 +491,7 @@ All configurations that are not explicitly set will be taken from the `default_c For an explanation of the configurations that are tweaked in this example, see the `HnswDocumentIndex` [documentation](index_hnswlib.md). -### Nested data +## Nested data The examples above all operate on a simple schema: All fields in `MyDoc` have "basic" types, such as `str` or `NdArray`. 
@@ -578,28 +571,3 @@ docs, scores = doc_index.find(query_doc, search_field='thumbnail__tensor', limit # find by the `video` tensor; neseted level docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3) ``` - -## Document Store - -In the previous sections we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] to represent multi-modal data and send it over the wire. -In this section we will see how to store and persist this data. - -DocArray offers to ways of storing your data: - -1. In a **[Document Store](#document-store)** for simple long-term storage -2. In a **[Document Index](#document-index)** for fast retrieval using vector similarity - -## Document Store - -[DocList][docarray.array.doc_list.doc_list.DocList] can be persisted using the -[`.push()`][docarray.array.doc_list.pushpull.PushPullMixin.push] and -[`.pull()`][docarray.array.doc_list.pushpull.PushPullMixin.pull] methods. -Under the hood, [DocStore][docarray.store.abstract_doc_store.AbstractDocStore] is used to persist a `DocList`. -You can store your documents on-disk. Alternatively, you can upload them to [AWS S3](https://aws.amazon.com/s3/), -[minio](https://min.io) or [Jina AI Cloud](https://cloud.jina.ai/user/storage). 
- -This section covers the following three topics: - - - [Store](doc_store/store_file.md) of [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] on-disk - - [Store on Jina AI Cloud](doc_store/store_jac.md) - - [Store on S3](doc_store/store_s3.md) diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md index bc97b05c1f8..2876c813591 100644 --- a/docs/user_guide/storing/index_elastic.md +++ b/docs/user_guide/storing/index_elastic.md @@ -1,8 +1,8 @@ # ElasticSearch Document Index DocArray comes with two Document Indexes for [Elasticsearch](https://www.elastic.co/elasticsearch/): -- [ElasticDocIndex](docarray.index.backends.elastic.ElasticDocIndex), based on [Elasticsearch 8](https://github.com/elastic/elasticsearch). -- [ElasticV7DocIndex](docarray.index.backends.elastic.ElasticV7DocIndex), based on [Elasticsearch 7.10](https://www.elastic.co/downloads/past-releases/elasticsearch-7-10-0). +- [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex], based on [Elasticsearch 8](https://github.com/elastic/elasticsearch). +- [ElasticV7DocIndex][docarray.index.backends.elasticv7.ElasticV7DocIndex], based on [Elasticsearch 7.10](https://www.elastic.co/downloads/past-releases/elasticsearch-7-10-0). !!! tip "Should you use ES v7 or v8?" 
[Elasticsearch v8](https://www.elastic.co/blog/whats-new-elastic-8-0-0) is the current version of ES and offers @@ -21,7 +21,7 @@ DocArray comes with two Document Indexes for [Elasticsearch](https://www.elastic pip install elastic-transport ``` - To use [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex], you need to install the following dependencies: + To use [ElasticV7DocIndex][docarray.index.backends.elasticv7.ElasticV7DocIndex], you need to install the following dependencies: ```console pip install elasticsearch==7.10.1 @@ -29,8 +29,8 @@ DocArray comes with two Document Indexes for [Elasticsearch](https://www.elastic ``` -The following examples is based on [ElasticDocIndex](docarray.index.backends.elastic.ElasticDocIndex), -but should also work with [ElasticV7DocIndex](docarray.index.backends.elastic.ElasticV7DocIndex) +The following examples is based on [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex], +but will also work for [ElasticV7DocIndex][docarray.index.backends.elasticv7.ElasticV7DocIndex]. # Start ElasticSearch @@ -154,7 +154,7 @@ This is the field that serves as the basis of comparison between your query and You can use the `limit` argument to configurate how may Documents to return. !!! note - [ElasticV7DocIndex][docarray.index.backends.elastic.ElasticV7DocIndex] is using Elasticsearch v7.10.1 which does not support approximate nearest neighbour algorithms such as HNSW. + [ElasticV7DocIndex][docarray.index.backends.elasticv7.ElasticV7DocIndex] is using Elasticsearch v7.10.1 which does not support approximate nearest neighbour algorithms such as HNSW. This can lead to a poor performance when the search involves many vectors. [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex] does not have this limitation. 
@@ -410,7 +410,7 @@ The following configs can be set in `DBConfig`: | `index_mappings` | Other [index mappings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping.html) in a Dict for creating the index | dict | You can pass any of the above as keyword arguments to the `__init__()` method or pass an entire configuration object. -To see how, see [here](user_guide/storing/first_steps.md#configuration-options#customize-configurations). +To see how, see [here](first_steps.md#configuration-options#customize-configurations). ### RuntimeConfig @@ -432,4 +432,4 @@ doc_index = ElasticDocIndex[SimpleDoc]() ``` You can pass the above as a keyword arguments the `configure()` method or pass an entire configuration object. -To see how, see [here](user_guide/storing/first_steps.md#configuration-options#customize-configurations). \ No newline at end of file +To see how, see [here](first_steps.md#configuration-options#customize-configurations). \ No newline at end of file diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md index e2e1406c448..88530cc2fde 100644 --- a/docs/user_guide/storing/index_hnswlib.md +++ b/docs/user_guide/storing/index_hnswlib.md @@ -15,9 +15,10 @@ It stores vectors on disc in [hnswlib](https://github.com/nmslib/hnswlib), and s [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] is a great starting point for small to medium sized datasets, but it is not battle tested in production. If scalability, uptime, etc. 
are important to you, we recommend you eventually transition to one of our database backed Document Index implementations: + - [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex] - [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex] - - [ElasticDocumentIndex][docarray.index.backends.elasticsearch.ElasticDocumentIndex] + - [ElasticDocumentIndex][docarray.index.backends.elastic.ElasticDocIndex] ## Basic Usage diff --git a/docs/user_guide/storing/index_weaviate.md b/docs/user_guide/storing/index_weaviate.md index 35bf53bf078..f43c387d875 100644 --- a/docs/user_guide/storing/index_weaviate.md +++ b/docs/user_guide/storing/index_weaviate.md @@ -15,13 +15,13 @@ jupyter: # Weaviate Document Index !!! note "Install dependencies" - To use [WeaviateDocumentIndex][docarray.index.backends.qdrant.WeaviateDocumentIndex], you need to install extra dependencies with the following command: + To use [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex], you need to install extra dependencies with the following command: ```console pip install "docarray[weaviate]" ``` -This is the user guide for the [WeaviateDocumentIndex](docarray.index.backends.hnswlib.WeaviateDocumentIndex), +This is the user guide for the [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex], focussing on special features and configurations of Weaviate. For general usage of a Document Index, see the [general user guide](./first_steps.md#document-index). @@ -29,7 +29,7 @@ For general usage of a Document Index, see the [general user guide](./first_step # 1. Start Weaviate service -To use [WeaviateDocumentIndex](docarray.index.backends.hnswlib.WeaviateDocumentIndex), it needs to hook into a running Weaviate service. +To use [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex], it needs to hook into a running Weaviate service. 
There are multiple ways to start a Weaviate instance, depending on your use case. From bbdab4de22225065b4b398210b921c9edf83efdb Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 17 Apr 2023 15:47:51 +0200 Subject: [PATCH 34/35] docs: fix rendering of tabs Signed-off-by: Johannes Messner --- docs/user_guide/storing/first_steps.md | 192 ++++++++++++------------- 1 file changed, 96 insertions(+), 96 deletions(-) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md index 13d0e603a32..ff5675b5c70 100644 --- a/docs/user_guide/storing/first_steps.md +++ b/docs/user_guide/storing/first_steps.md @@ -4,7 +4,7 @@ A Document Index lets you store your Documents and search through them using vec This is useful if you want to store a bunch of data, and at a later point retrieve Documents that are similar to some query that you provide. -Concrete examples where this is relevant are neural search application, Augmenting LLMs and Chatbots with domain knowledge ([Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401))]), +Concrete examples where this is relevant are neural search application, Augmenting LLMs and Chatbots with domain knowledge ([Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401)), or recommender systems. !!! question "How does vector similarity search work?" @@ -146,31 +146,31 @@ similar Documents in the Document Index. 
=== "Search by Document" -```python -# create a query Document -query = MyDoc(embedding=np.random.rand(128), text='query') + ```python + # create a query Document + query = MyDoc(embedding=np.random.rand(128), text='query') -# find similar Documents -matches, scores = db.find(query, search_field='embedding', limit=5) + # find similar Documents + matches, scores = db.find(query, search_field='embedding', limit=5) -print(f'{matches=}') -print(f'{matches.text=}') -print(f'{scores=}') -``` + print(f'{matches=}') + print(f'{matches.text=}') + print(f'{scores=}') + ``` === "Search by raw vector" -```python -# create a query vector -query = np.random.rand(128) + ```python + # create a query vector + query = np.random.rand(128) -# find similar Documents -matches, scores = db.find(query, search_field='embedding', limit=5) + # find similar Documents + matches, scores = db.find(query, search_field='embedding', limit=5) -print(f'{matches=}') -print(f'{matches.text=}') -print(f'{scores=}') -``` + print(f'{matches=}') + print(f'{matches.text=}') + print(f'{scores=}') + ``` To succesfully peform a vector search, you need to specify a `search_field`. This is the field that serves as the basis of comparison between your query and the documents in the Document Index. 
@@ -190,33 +190,33 @@ You can also search for multiple Documents at once, in a batch, using the [find_ === "Search by Documents" -```python -# create some query Documents -queries = DocList[MyDoc]( - MyDoc(embedding=np.random.rand(128), text=f'query {i}') for i in range(3) -) + ```python + # create some query Documents + queries = DocList[MyDoc]( + MyDoc(embedding=np.random.rand(128), text=f'query {i}') for i in range(3) + ) -# find similar Documents -matches, scores = db.find_batched(queries, search_field='embedding', limit=5) + # find similar Documents + matches, scores = db.find_batched(queries, search_field='embedding', limit=5) -print(f'{matches=}') -print(f'{matches[0].text=}') -print(f'{scores=}') -``` + print(f'{matches=}') + print(f'{matches[0].text=}') + print(f'{scores=}') + ``` === "Search by raw vector" -```python -# create some query vectors -query = np.random.rand(3, 128) + ```python + # create some query vectors + query = np.random.rand(3, 128) -# find similar Documents -matches, scores = db.find_batched(query, search_field='embedding', limit=5) + # find similar Documents + matches, scores = db.find_batched(query, search_field='embedding', limit=5) -print(f'{matches=}') -print(f'{matches[0].text=}') -print(f'{scores=}') -``` + print(f'{matches=}') + print(f'{matches[0].text=}') + print(f'{scores=}') + ``` The [find_batched()][docarray.index.abstract.BaseDocIndex.find_batched] method returns a named tuple containing a list of `DocList`s, one for each query, containing the closest matching documents; and the associated similarity scores. 
@@ -349,26 +349,26 @@ You can customize every field in this configuration: === "Pass individual settings" -```python -db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + ```python + db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') -custom_db_config = db._db_config -print(custom_db_config) + custom_db_config = db._db_config + print(custom_db_config) -# > HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') -``` + # > HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') + ``` === "Pass entire configuration" -```python -custom_db_config = HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') + ```python + custom_db_config = HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') -db = HnswDocumentIndex[MyDoc](custom_db_config) + db = HnswDocumentIndex[MyDoc](custom_db_config) -print(db._db_config) + print(db._db_config) -# > HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') -``` + # > HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') + ``` **Runtime configurations** @@ -402,60 +402,60 @@ You can customize every field in this configuration using the [configure()][doca === "Pass individual settings" -```python -db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') - -db.configure( - default_column_config={ - np.ndarray: { - 'dim': -1, - 'index': True, - 'space': 'ip', - 'max_elements': 2048, - 'ef_construction': 100, - 'ef': 15, - 'M': 8, - 'allow_replace_deleted': True, - 'num_threads': 5, - }, - None: {}, - } -) + ```python + db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + + db.configure( + default_column_config={ + np.ndarray: { + 'dim': -1, + 'index': True, + 'space': 'ip', + 'max_elements': 2048, + 'ef_construction': 100, + 'ef': 15, + 'M': 8, + 'allow_replace_deleted': True, + 'num_threads': 5, + }, + None: {}, + } + ) -custom_runtime_config = db._runtime_config -print(custom_runtime_config) + custom_runtime_config = db._runtime_config + print(custom_runtime_config) -# > HnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 
'ip', 'max_elements': 2048, 'ef_construction': 100, 'ef': 15, 'M': 8, 'allow_replace_deleted': True, 'num_threads': 5}, None: {}}) -``` + # > HnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'ip', 'max_elements': 2048, 'ef_construction': 100, 'ef': 15, 'M': 8, 'allow_replace_deleted': True, 'num_threads': 5}, None: {}}) + ``` === "Pass entire configuration" -```python -custom_runtime_config = HnswDocumentIndex.RuntimeConfig( - default_column_config={ - np.ndarray: { - 'dim': -1, - 'index': True, - 'space': 'ip', - 'max_elements': 2048, - 'ef_construction': 100, - 'ef': 15, - 'M': 8, - 'allow_replace_deleted': True, - 'num_threads': 5, - }, - None: {}, - } -) + ```python + custom_runtime_config = HnswDocumentIndex.RuntimeConfig( + default_column_config={ + np.ndarray: { + 'dim': -1, + 'index': True, + 'space': 'ip', + 'max_elements': 2048, + 'ef_construction': 100, + 'ef': 15, + 'M': 8, + 'allow_replace_deleted': True, + 'num_threads': 5, + }, + None: {}, + } + ) -db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') -db.configure(custom_runtime_config) + db.configure(custom_runtime_config) -print(db._runtime_config) + print(db._runtime_config) -# > HHnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'ip', 'max_elements': 2048, 'ef_construction': 100, 'ef': 15, 'M': 8, 'allow_replace_deleted': True, 'num_threads': 5}, None: {}}) -``` + # > HHnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'ip', 'max_elements': 2048, 'ef_construction': 100, 'ef': 15, 'M': 8, 'allow_replace_deleted': True, 'num_threads': 5}, None: {}}) + ``` After this change, the new setting will be applied to _every_ column that corresponds to a `np.ndarray` type. 
From 8ff99794e705cbd9ceac85a387fa481668eb358f Mon Sep 17 00:00:00 2001 From: Johannes Messner Date: Mon, 17 Apr 2023 16:08:28 +0200 Subject: [PATCH 35/35] test: exclude index docs from doctests Signed-off-by: Johannes Messner --- tests/documentation/test_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index ebdbbd1baa6..37f24a7cf66 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -6,7 +6,7 @@ from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 -file_to_skip = ['fastAPI', 'jina'] +file_to_skip = ['fastAPI', 'jina', 'index', 'first_steps.md'] def check_raw_file_full(raw, lang="python", keyword_ignore=[]):