diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 50bd5ed2b85..ea673567d77 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -101,6 +101,7 @@ jobs: python -m pip install --upgrade pip python -m pip install poetry poetry install --all-extras + poetry run pip install elasticsearch==8.6.2 sudo apt-get update sudo apt-get install --no-install-recommends ffmpeg @@ -147,6 +148,7 @@ jobs: python -m pip install poetry rm poetry.lock poetry install --all-extras + poetry run pip install elasticsearch==8.6.2 sudo apt-get update sudo apt-get install --no-install-recommends ffmpeg diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py index 862b2673389..646e23e8cbd 100644 --- a/docarray/index/backends/elastic.py +++ b/docarray/index/backends/elastic.py @@ -58,7 +58,7 @@ class ElasticDocIndex(BaseDocIndex, Generic[TSchema]): def __init__(self, db_config=None, **kwargs): """Initialize ElasticDocIndex""" super().__init__(db_config=db_config, **kwargs) - self._db_config = cast(self.DBConfig, self._db_config) + self._db_config = cast(ElasticDocIndex.DBConfig, self._db_config) # ElasticSearch client creation if self._db_config.index_name is None: @@ -406,7 +406,7 @@ def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: resp = self._client.search(index=self._index_name, **query) docs, scores = self._format_response(resp) - return _FindResult(documents=docs, scores=scores) + return _FindResult(documents=docs, scores=parse_obj_as(NdArray, scores)) def _find( self, query: np.ndarray, limit: int, search_field: str = '' @@ -417,7 +417,7 @@ def _find( docs, scores = self._format_response(resp) - return _FindResult(documents=docs, scores=scores) + return _FindResult(documents=docs, scores=parse_obj_as(NdArray, scores)) def _find_batched( self, @@ -576,7 +576,7 @@ def _form_text_search_body( } return body - def _format_response(self, response: Any) -> Tuple[List[Dict], NdArray]: + def 
_format_response(self, response: Any) -> Tuple[List[Dict], List[Any]]: docs = [] scores = [] for result in response['hits']['hits']: diff --git a/docarray/index/backends/elasticv7.py b/docarray/index/backends/elasticv7.py index 83c35606912..623f11053bb 100644 --- a/docarray/index/backends/elasticv7.py +++ b/docarray/index/backends/elasticv7.py @@ -3,11 +3,13 @@ from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union import numpy as np +from pydantic import parse_obj_as from docarray import BaseDoc from docarray.index import ElasticDocIndex from docarray.index.abstract import BaseDocIndex, _ColumnInfo from docarray.typing import AnyTensor +from docarray.typing.tensor.ndarray import NdArray from docarray.utils.find import _FindResult TSchema = TypeVar('TSchema', bound=BaseDoc) @@ -120,7 +122,7 @@ def execute_query(self, query: Dict[str, Any], *args, **kwargs) -> Any: resp = self._client.search(index=self._index_name, body=query) docs, scores = self._format_response(resp) - return _FindResult(documents=docs, scores=scores) + return _FindResult(documents=docs, scores=parse_obj_as(NdArray, scores)) ############################################### # Helpers # diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py index a15606661e2..467d3f754fb 100644 --- a/docarray/index/backends/hnswlib.py +++ b/docarray/index/backends/hnswlib.py @@ -28,8 +28,8 @@ _raise_not_supported, ) from docarray.proto import DocProto -from docarray.typing import NdArray from docarray.typing.tensor.abstract_tensor import AbstractTensor +from docarray.typing.tensor.ndarray import NdArray from docarray.utils._internal.misc import import_library, is_np_int from docarray.utils.filter import filter_docs from docarray.utils.find import _FindResult, _FindResultBatched diff --git a/docs/.gitignore b/docs/.gitignore index 006c5fe7420..eee951db889 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,6 +1,7 @@ api/* proto/* -../README.md -index.md +<<<<<<< HEAD 
+README.md +#index.md CONTRIBUTING.md \ No newline at end of file diff --git a/docs/api_references/index/backends.md b/docs/api_references/index/backends.md new file mode 100644 index 00000000000..6bfcaf17670 --- /dev/null +++ b/docs/api_references/index/backends.md @@ -0,0 +1,3 @@ +# Backends + +::: docarray.index.backends diff --git a/docs/user_guide/storing/first_step.md b/docs/user_guide/storing/first_step.md index 13ecfe138c0..49aa3d4ace3 100644 --- a/docs/user_guide/storing/first_step.md +++ b/docs/user_guide/storing/first_step.md @@ -1,9 +1,9 @@ -# Intro +# Overview In the previous sections we saw how to use [`BaseDoc`][docarray.base_doc.doc.BaseDoc], [`DocList`][docarray.array.doc_list.doc_list.DocList] and [`DocVec`][docarray.array.doc_vec.doc_vec.DocVec] to represent multi-modal data and send it over the wire. In this section we will see how to store and persist this data. -DocArray offers to ways of storing your data: +DocArray offers to ways of storing your data, each of which have their own documentation sections: 1. In a **[Document Store](#document-store)** for simple long-term storage 2. In a **[Document Index](#document-index)** for fast retrieval using vector similarity @@ -24,3 +24,20 @@ This section covers the following three topics: - [Store on S3](doc_store/store_s3.md) ## Document Index + +A Document Index lets you store your Documents and search through them using vector similarity. + +This is useful if you want to store a bunch of data, and at a later point retrieve Documents that are similar to +some query that you provide. +Concrete examples where this is relevant are neural search application, Augmenting LLMs and Chatbots with domain knowledge ([Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401))]), +or recommender systems. + +DocArray's Document Index concept achieves this by providing a unified interface to a number of [vector databases](https://learn.microsoft.com/en-us/semantic-kernel/concepts-ai/vectordb). 
+In fact, you can think of Document Index as an **[ORM](https://sqlmodel.tiangolo.com/db-to-code/) for vector databases**. + +Currently, DocArray supports the following vector databases: + +- [Weaviate](https://weaviate.io/) | [Docs](index_weaviate.md) +- [Qdrant](https://qdrant.tech/) | [Docs](index_qdrant.md) +- [Elasticsearch](https://www.elastic.co/elasticsearch/) v7 and v8 | [Docs](index_elastic.md) +- [HNSWlib](https://github.com/nmslib/hnswlib) | [Docs](index_hnswlib.md) diff --git a/docs/user_guide/storing/first_steps.md b/docs/user_guide/storing/first_steps.md new file mode 100644 index 00000000000..ff5675b5c70 --- /dev/null +++ b/docs/user_guide/storing/first_steps.md @@ -0,0 +1,573 @@ +# Overview + +A Document Index lets you store your Documents and search through them using vector similarity. + +This is useful if you want to store a bunch of data, and at a later point retrieve Documents that are similar to +some query that you provide. +Concrete examples where this is relevant are neural search application, Augmenting LLMs and Chatbots with domain knowledge ([Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401)), +or recommender systems. + +!!! question "How does vector similarity search work?" + Without going into too much detail, the idea behind vector similarity search is the following: + + You represent every data point that you have (in our case, a Document) as a _vector_, or _embedding_. + This vector should represent as much semantic information about your data as possible: Similar data points should + be represented by similar vectors. + + These vectors (embeddings) are usually obtained by passing the data through a suitable neural network that has been + trained to produce such semantic representations - this is the _encoding_ step. + + Once you have your vector that represent your data, you can store them, for example in a vector database. 
+ + To perform similarity search, you take your input query and encode it in the same way as the data in your database. + Then, the database will search through the stored vectors and return the ones that are most similar to your query. + This similarity is measured by a _similarity metric_, which can be [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity), + [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance), or any other metric that you can think of. + + If you store a lot of data, performing this similarity computation for every data point in your database is expensive. + Therefore, vector databases usually perform _approximate nearest neighbor (ANN)_ search. + There are various algorithms for doing this, such as [HNSW](https://arxiv.org/abs/1603.09320), but in a nutshell, + they allow you to search through a large database of vectors very quickly, at the expense of a small loss in accuracy. + +DocArray's Document Index concept achieves this by providing a unified interface to a number of [vector databases](https://learn.microsoft.com/en-us/semantic-kernel/concepts-ai/vectordb). +In fact, you can think of Document Index as an **[ORM](https://sqlmodel.tiangolo.com/db-to-code/) for vector databases**. + +Currently, DocArray supports the following vector databases: + +- [Weaviate](https://weaviate.io/) | [Docs](index_weaviate.md) +- [Qdrant](https://qdrant.tech/) | [Docs](index_qdrant.md) +- [Elasticsearch](https://www.elastic.co/elasticsearch/) v7 and v8 | [Docs](index_elastic.md) +- [HNSWlib](https://github.com/nmslib/hnswlib) | [Docs](index_hnswlib.md) + +For this user guide you will use the [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] +because it doesn't require you to launch a database server. Instead, it will store your data locally. + +!!! note "Using a different vector database" + You can easily use Weaviate, Qdrant, or Elasticsearch instead, they share the same API! 
+ To do so, check out their respective documentation sections. + +!!! note "HNSWLib-specific settings" + The following sections explain the general concept of Document Index by using + [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] as an example. + For HNSWLib-specific settings, check out the [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] documentation + [here](index_hnswlib.md). + +## Create a Document Index + +!!! note + To use [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex], you need to install extra dependencies with the following command: + + ```console + pip install "docarray[hnswlib]" + ``` + +To create a Document Index, your first need a Document that defines the schema of your index. + +```python +from docarray import BaseDoc +from docarray.index import HnswDocumentIndex +from docarray.typing import NdArray + + +class MyDoc(BaseDoc): + embedding: NdArray[128] + text: str + + +db = HnswDocumentIndex[MyDoc](work_dir='./my_test_db') +``` + +**Schema definition:** + +In this code snippet, `HnswDocumentIndex` takes a schema of the form of `MyDoc`. +The Document Index then _creates column for each field in `MyDoc`_. + +The column types in the backend database are determined the type hints of the fields in the Document. +Optionally, you can customize the database types for every field, as you can see [here](#customize-configurations). + +Most vector databases need to know the dimensionality of the vectors that will be stored. +Here, that is automatically inferred from the type hint of the `embedding` field: `NdArray[128]` means that +the database will store vectors with 128 dimensions. + +!!! note "PyTorch and TensorFlow support" + Instead of using `NdArray` you can use `TorchTensor` or `TensorFlowTensor` and the Document Index will handle that + for you. This is supported for all Document Index backends. No need to convert your tensors to numpy arrays manually! 
+ +**Database location:** + +For `HnswDocumentIndex` you need to specify a `work_dir` where the data will be stored; for other backends you +usually specify a `host` and a `port` instead. + +Either way, if the location does not yet contain any data, we start from a blank slate. +If the location already contains data from a previous session, it will be accessible through the Document Index. + +## Index data + +Now that you have a Document Index, you can add data to it, using the [index()][docarray.index.abstract.BaseDocIndex.index] method: + +```python +import numpy as np +from docarray import DocList + +# create some random data +docs = DocList[MyDoc]( + [MyDoc(embedding=np.random.rand(128), text=f'text {i}') for i in range(100)] +) + +# index the data +db.index(docs) +``` + +That call to [index()][docarray.index.backends.hnswlib.HnswDocumentIndex.index] stores all Documents in `docs` into the Document Index, +ready to be retrieved in the next step. + +As you can see, `DocList[MyDoc]` and `HnswDocumentIndex[MyDoc]` are both parameterized with `MyDoc`. +This means that they share the same schema, and in general, the schema of a Document Index and the data that you want to store +need to have compatible schemas. + +!!! question "When are two schemas compatible?" + The schema of your Document Index and of your data need to be compatible with each other. + + Let's say A is the schema of your Document Index and B is the schema of your data. + There are a few rules that determine if a schema A is compatible with a schema B. + If _any_ of the following is true, then A and B are compatible: + + - A and B are the same class + - A and B have the same field names and field types + - A and B have the same field names, and, for every field, the type of B is a subclass of the type of A + +## Perform vector similarity search + +Now that you have indexed your data, you can perform vector similarity search using the [find()][docarray.index.abstract.BaseDocIndex.find] method. 
+ + +Provided with a Document of type `MyDoc`, [find()][docarray.index.abstract.BaseDocIndex.find] can find +similar Documents in the Document Index. + +=== "Search by Document" + + ```python + # create a query Document + query = MyDoc(embedding=np.random.rand(128), text='query') + + # find similar Documents + matches, scores = db.find(query, search_field='embedding', limit=5) + + print(f'{matches=}') + print(f'{matches.text=}') + print(f'{scores=}') + ``` + +=== "Search by raw vector" + + ```python + # create a query vector + query = np.random.rand(128) + + # find similar Documents + matches, scores = db.find(query, search_field='embedding', limit=5) + + print(f'{matches=}') + print(f'{matches.text=}') + print(f'{scores=}') + ``` + +To succesfully peform a vector search, you need to specify a `search_field`. This is the field that serves as the +basis of comparison between your query and the documents in the Document Index. + +In this particular example you only have one field (`embedding`) that is a vector, so you can trivially choose that one. +In general, you could have multiple fields of type `NdArray` or `TorchTensor` or `TensorFlowTensor`, and you can choose +which one to use for the search. + +The [find()][docarray.index.abstract.BaseDocIndex.find] method returns a named tuple containing the closest +matching documents and their associated similarity scores. + +How these scores are calculated depends on the backend, and can usually be [configured](#customize-configurations). + +**Batched search:** + +You can also search for multiple Documents at once, in a batch, using the [find_batched()][docarray.index.abstract.BaseDocIndex.find_batched] method. 
+ +=== "Search by Documents" + + ```python + # create some query Documents + queries = DocList[MyDoc]( + MyDoc(embedding=np.random.rand(128), text=f'query {i}') for i in range(3) + ) + + # find similar Documents + matches, scores = db.find_batched(queries, search_field='embedding', limit=5) + + print(f'{matches=}') + print(f'{matches[0].text=}') + print(f'{scores=}') + ``` + +=== "Search by raw vector" + + ```python + # create some query vectors + query = np.random.rand(3, 128) + + # find similar Documents + matches, scores = db.find_batched(query, search_field='embedding', limit=5) + + print(f'{matches=}') + print(f'{matches[0].text=}') + print(f'{scores=}') + ``` + +The [find_batched()][docarray.index.abstract.BaseDocIndex.find_batched] method returns a named tuple containing +a list of `DocList`s, one for each query, containing the closest matching documents; and the associated similarity scores. + +## Perform filter search and text search + +In addition to vector similarity search, the Document Index interface offers methods for text search and filter search: +[text_search()][docarray.index.abstract.BaseDocIndex.text_search] and [filter()][docarray.index.abstract.BaseDocIndex.filter], +as well as their batched versions [text_search_batched()][docarray.index.abstract.BaseDocIndex.text_search_batched] and [filter_batched()][docarray.index.abstract.BaseDocIndex.filter_batched] + +The [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] implementation does not offer support for filter +or text search. + +To see how to perform these operations, you can check out other backends that do. + +## Perform hybrid search through the query builder + +Document Index support atomic operations for vector similarity search, text search and filter search. 
In order to combine these operations into a single, hybrid search query
You can also access data by the id that was assigned to every Document
+ +This commonly includes: +- host and port +- index or collection name +- authentication settings +- ... + + +For every backend, you can get the full list of configurations, and their defaults, like this: + +```python +from docarray.index import HnswDocumentIndex + + +db_config = HnswDocumentIndex.DBConfig() +print(db_config) + +# > HnswDocumentIndex.DBConfig(work_dir='.') +``` + +As you can see, `HnswDocumentIndex.DBConfig` is a dataclass that contains only one possible configuration, `work_dir`, +that defaults to `.`. + +You can customize every field in this configuration: + +=== "Pass individual settings" + + ```python + db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + + custom_db_config = db._db_config + print(custom_db_config) + + # > HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') + ``` + +=== "Pass entire configuration" + + ```python + custom_db_config = HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') + + db = HnswDocumentIndex[MyDoc](custom_db_config) + + print(db._db_config) + + # > HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db') + ``` + +**Runtime configurations** + +_Runtime configurations_ are configurations that pertain to the entire DB or DB table (as opposed to just a specific column), +and that you can dynamically change at runtime. + + +This commonly includes: +- default batch size for batching operations +- default mapping from pythong types to DB column types +- default consistency level for various DB operations +- ... 
+ + +For every backend, you can get the full list of configurations, and their defaults, like this: + +```python +from docarray.index import HnswDocumentIndex + + +runtime_config = HnswDocumentIndex.RuntimeConfig() +print(runtime_config) + +# > HnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'l2', 'max_elements': 1024, 'ef_construction': 200, 'ef': 10, 'M': 16, 'allow_replace_deleted': True, 'num_threads': 1}, None: {}}) +``` + +As you can see, `HnswDocumentIndex.RuntimeConfig` is a dataclass that contains only one configuration: +`default_column_config`, which is a mapping from python types to database column configurations. + +You can customize every field in this configuration using the [configure()][docarray.index.abstract.BaseDocIndex.configure] method: + +=== "Pass individual settings" + + ```python + db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + + db.configure( + default_column_config={ + np.ndarray: { + 'dim': -1, + 'index': True, + 'space': 'ip', + 'max_elements': 2048, + 'ef_construction': 100, + 'ef': 15, + 'M': 8, + 'allow_replace_deleted': True, + 'num_threads': 5, + }, + None: {}, + } + ) + + custom_runtime_config = db._runtime_config + print(custom_runtime_config) + + # > HnswDocumentIndex.RuntimeConfig(default_column_config={: {'dim': -1, 'index': True, 'space': 'ip', 'max_elements': 2048, 'ef_construction': 100, 'ef': 15, 'M': 8, 'allow_replace_deleted': True, 'num_threads': 5}, None: {}}) + ``` + +=== "Pass entire configuration" + + ```python + custom_runtime_config = HnswDocumentIndex.RuntimeConfig( + default_column_config={ + np.ndarray: { + 'dim': -1, + 'index': True, + 'space': 'ip', + 'max_elements': 2048, + 'ef_construction': 100, + 'ef': 15, + 'M': 8, + 'allow_replace_deleted': True, + 'num_threads': 5, + }, + None: {}, + } + ) + + db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + + db.configure(custom_runtime_config) + + print(db._runtime_config) + + # > 
HnswDocumentIndex.RuntimeConfig(default_column_config=
+ +In the following example you can see a complex schema that contains nested Documents. +The `YouTubeVideoDoc` contains a `VideoDoc` and an `ImageDoc`, alongside some "basic" fields: + +```python +from docarray.typing import ImageUrl, VideoUrl, AnyTensor + + +# define a nested schema +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: AnyTensor = Field(space='cosine', dim=64) + + +class VideoDoc(BaseDoc): + url: VideoUrl + tensor: AnyTensor = Field(space='cosine', dim=128) + + +class YouTubeVideoDoc(BaseDoc): + title: str + description: str + thumbnail: ImageDoc + video: VideoDoc + tensor: AnyTensor = Field(space='cosine', dim=256) + + +# create a Document Index +doc_index = HnswDocumentIndex[YouTubeVideoDoc](work_dir='./tmp2') + +# create some data +index_docs = [ + YouTubeVideoDoc( + title=f'video {i+1}', + description=f'this is video from author {10*i}', + thumbnail=ImageDoc(url=f'http://example.ai/images/{i}', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/{i}', tensor=np.ones(128)), + tensor=np.ones(256), + ) + for i in range(8) +] + +# index the Documents +doc_index.index(index_docs) +``` + + +**Search nested data:** + +You can perform search on any nesting level. +To do so, use the dunder operator to specify the field defined in the nested data. 
+ +In the following example, you can see how to perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the nested `thumbnail` and `video` fields: + +```python +# create a query Document +query_doc = YouTubeVideoDoc( + title=f'video query', + description=f'this is a query video', + thumbnail=ImageDoc(url=f'http://example.ai/images/1024', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/1024', tensor=np.ones(128)), + tensor=np.ones(256), +) + +# find by the `youtubevideo` tensor; root level +docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3) + +# find by the `thumbnail` tensor; nested level +docs, scores = doc_index.find(query_doc, search_field='thumbnail__tensor', limit=3) + +# find by the `video` tensor; neseted level +docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3) +``` diff --git a/docs/user_guide/storing/index_elastic.md b/docs/user_guide/storing/index_elastic.md new file mode 100644 index 00000000000..2876c813591 --- /dev/null +++ b/docs/user_guide/storing/index_elastic.md @@ -0,0 +1,435 @@ +# ElasticSearch Document Index + +DocArray comes with two Document Indexes for [Elasticsearch](https://www.elastic.co/elasticsearch/): +- [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex], based on [Elasticsearch 8](https://github.com/elastic/elasticsearch). +- [ElasticV7DocIndex][docarray.index.backends.elasticv7.ElasticV7DocIndex], based on [Elasticsearch 7.10](https://www.elastic.co/downloads/past-releases/elasticsearch-7-10-0). + +!!! tip "Should you use ES v7 or v8?" + [Elasticsearch v8](https://www.elastic.co/blog/whats-new-elastic-8-0-0) is the current version of ES and offers + **native vector search (ANN) support**, alongside text and range search. 
+ + [Elasticsearch v7.10](https://www.elastic.co/downloads/past-releases/elasticsearch-7-10-0) can store vectors, but + **does _not_ support native ANN vector search**, but only exhaustive (=slow) vector search, alongside text and range search. + + Some users prefer to use ES v7.10 because it is available under a [different license](https://www.elastic.co/pricing/faq/licensing) compared to ES v8.0.0. + +!!! note "Installation" + To use [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex], you need to install the following dependencies: + + ```console + pip install elasticsearch==8.6.2 + pip install elastic-transport + ``` + + To use [ElasticV7DocIndex][docarray.index.backends.elasticv7.ElasticV7DocIndex], you need to install the following dependencies: + + ```console + pip install elasticsearch==7.10.1 + pip install elastic-transport + ``` + + +The following examples is based on [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex], +but will also work for [ElasticV7DocIndex][docarray.index.backends.elasticv7.ElasticV7DocIndex]. + +# Start ElasticSearch + +You can use docker-compose to create a local Elasticsearch service with the following `docker-compose.yml`. + +```yaml +version: "3.3" +services: + elastic: + image: docker.elastic.co/elasticsearch/elasticsearch:8.6.2 + environment: + - xpack.security.enabled=false + - discovery.type=single-node + - ES_JAVA_OPTS=-Xmx1024m + ports: + - "9200:9200" + networks: + - elastic + +networks: + elastic: + name: elastic +``` + +Run the following command in the folder of the above `docker-compose.yml` to start the service: + +```bash +docker-compose up +``` + +## Construct +To construct an index, you first need to define a schema in the form of a `Document`. 
+ +There are a number of configurations you can pack into your schema: +- Every field in your schema will become one column in the database +- For vector fields, such as `NdArray`, `TorchTensor`, or `TensorflowTensor`, you need to specify a dimensionality to be able to perform vector search +- You can override the default column type for every field. To do that, you can pass any [ES field data type](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping-types.html) to `field_name: Type = Field(col_type=...)`. You can see an example of this on the [section on keyword filters](#keyword-filter). + +Additionally, you can pass a `hosts` argument to the `__init__()` method to connect to an ES instance. +By default, it is `http://localhost:9200`. + + +```python +import numpy as np +from pydantic import Field + +from docarray import BaseDoc +from docarray.index import ElasticDocIndex +from docarray.typing import NdArray + + +class SimpleDoc(BaseDoc): + # specify tensor field with dimensionality 128 + tensor: NdArray[128] + # alternative and equivalent definition: + # tensor: NdArray = Field(dims=128) + + +doc_index = ElasticDocIndex[SimpleDoc](hosts='http://localhost:9200') +``` + +## Index Documents +Use `.index()` to add Documents into the index. +The`.num_docs()` method returns the total number of Documents in the index. + +```python +index_docs = [SimpleDoc(tensor=np.ones(128)) for _ in range(64)] + +doc_index.index(index_docs) + +print(f'number of docs in the index: {doc_index.num_docs()}') +``` + +## Access Documents +To access the `Doc`, you need to specify the `id`. You can also pass a list of `id` to access multiple Documents. + +```python +# access a single Doc +doc_index[index_docs[16].id] + +# access multiple Docs +doc_index[index_docs[16].id, index_docs[17].id] +``` + +### Persistence +You can hood into a database index that was persisted during a previous session. 
+To do so, you need to specify `index_name` and the `hosts`: + +```python +doc_index = ElasticDocIndex[SimpleDoc]( +    hosts='http://localhost:9200', index_name='previously_stored' +) +doc_index.index(index_docs) + +doc_index2 = ElasticDocIndex[SimpleDoc]( +    hosts='http://localhost:9200', index_name='previously_stored' +) + +print(f'number of docs in the persisted index: {doc_index2.num_docs()}') +``` + + +## Delete Documents +To delete the Documents, use the built-in function `del` with the `id` of the Documents that you want to delete. +You can also pass a list of ids to delete multiple Documents. + +```python +# delete a single Doc +del doc_index[index_docs[16].id] + +# delete multiple Docs +del doc_index[index_docs[17].id, index_docs[18].id] +``` + +## Find nearest neighbors +The `.find()` method is used to find the nearest neighbors of a vector. + +You need to specify `search_field` that is used when performing the vector search. +This is the field that serves as the basis of comparison between your query and your indexed Documents. + +You can use the `limit` argument to configure how many Documents to return. + +!!! note +    [ElasticV7DocIndex][docarray.index.backends.elasticv7.ElasticV7DocIndex] is using Elasticsearch v7.10.1 which does not support approximate nearest neighbour algorithms such as HNSW. +    This can lead to poor performance when the search involves many vectors. +    [ElasticDocIndex][docarray.index.backends.elastic.ElasticDocIndex] does not have this limitation. + +```python +query = SimpleDoc(tensor=np.ones(128)) + +docs, scores = doc_index.find(query, limit=5, search_field='tensor') +``` + + +## Nested data +When using the index you can define multiple fields, including nesting Documents inside another Document. + +Consider the following example: +You have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. +Besides, `YouTubeVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`. 
+ +```python +from docarray.typing import ImageUrl, VideoUrl, AnyTensor + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: AnyTensor = Field(similarity='cosine', dims=64) + + +class VideoDoc(BaseDoc): + url: VideoUrl + tensor: AnyTensor = Field(similarity='cosine', dims=128) + + +class YouTubeVideoDoc(BaseDoc): + title: str + description: str + thumbnail: ImageDoc + video: VideoDoc + tensor: AnyTensor = Field(similarity='cosine', dims=256) + + +doc_index = ElasticDocIndex[YouTubeVideoDoc]() +index_docs = [ + YouTubeVideoDoc( + title=f'video {i+1}', + description=f'this is video from author {10*i}', + thumbnail=ImageDoc(url=f'http://example.ai/images/{i}', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/{i}', tensor=np.ones(128)), + tensor=np.ones(256), + ) + for i in range(8) +] +doc_index.index(index_docs) +``` + +**You can perform search on any nesting level.** +To do so, use the dunder operator to specify the field defined in the nested data. + +In the following example, you can see how to perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` and `video` field: + +```python +# example of find nested and flat index +query_doc = YouTubeVideoDoc( + title=f'video query', + description=f'this is a query video', + thumbnail=ImageDoc(url=f'http://example.ai/images/1024', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/1024', tensor=np.ones(128)), + tensor=np.ones(256), +) + +# find by the youtubevideo tensor +docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3) + +# find by the thumbnail tensor +docs, scores = doc_index.find(query_doc, search_field='thumbnail__tensor', limit=3) + +# find by the video tensor +docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3) +``` + +To delete a nested data, you need to specify the `id`. + +!!! note + You can only delete `Doc` at the top level. 
Deletion of the `Doc` on the lower level is not supported yet. + +```python +# example of delete nested and flat index +del doc_index[index_docs[3].id, index_docs[4].id] +``` + +## Other Elasticsearch queries +Besides the vector search, you can also perform other queries supported by Elasticsearch, such as text search, and various filters. + +### Text search +As in "pure" Elasticsearch, you can use text search directly on the field of type `str`: + +```python +class NewsDoc(BaseDoc): + text: str + + +doc_index = ElasticDocIndex[NewsDoc]() +index_docs = [ + NewsDoc(id='0', text='this is a news for sport'), + NewsDoc(id='1', text='this is a news for finance'), + NewsDoc(id='2', text='this is another news for sport'), +] +doc_index.index(index_docs) +query = 'finance' + +# search with text +docs, scores = doc_index.text_search(query, search_field='text') +``` + +### Query Filter +The `filter()` method accepts queries that follow the [Elasticsearch Query DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) and consist of leaf and compound clauses. + +Using this, you can perform [keyword filters](#keyword-filter), [geolocation filters](#geolocation-filter) and [range filters](#range-filter). 
+ +#### Keyword filter +To filter the Documents in your index by keyword, you can use `Field(col_type='keyword')` to enable keyword search for a given field: + +```python +class NewsDoc(BaseDoc): +    text: str +    category: str = Field(col_type='keyword')  # enable keyword filtering + + +doc_index = ElasticDocIndex[NewsDoc]() +index_docs = [ +    NewsDoc(id='0', text='this is a news for sport', category='sport'), +    NewsDoc(id='1', text='this is a news for finance', category='finance'), +    NewsDoc(id='2', text='this is another news for sport', category='sport'), +] +doc_index.index(index_docs) + +# search with filter +query_filter = {'terms': {'category': ['sport']}} +docs = doc_index.filter(query_filter) +``` + +#### Geolocation filter +To filter the Documents in your index by geolocation, you can use `Field(col_type='geo_point')` on a given field. + +```python +class NewsDoc(BaseDoc): +    text: str +    location: dict = Field(col_type='geo_point')  # enable geolocation filtering + + +doc_index = ElasticDocIndex[NewsDoc]() +index_docs = [ +    NewsDoc(text='this is from Berlin', location={'lon': 13.24, 'lat': 50.31}), +    NewsDoc(text='this is from Beijing', location={'lon': 116.22, 'lat': 39.55}), +    NewsDoc(text='this is from San Jose', location={'lon': -121.89, 'lat': 37.34}), +] +doc_index.index(index_docs) + +# filter the eastern hemisphere +query = { +    'bool': { +        'filter': { +            'geo_bounding_box': { +                'location': { +                    'top_left': {'lon': 0, 'lat': 90}, +                    'bottom_right': {'lon': 180, 'lat': 0}, +                } +            } +        } +    } +} + +docs = doc_index.filter(query) +``` + +#### Range filter +You can have [range field types](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/range.html) in your Document schema and set `Field(col_type='integer_range')` (or also `date_range`, etc.) to filter the docs based on the range of the field. 
+ +```python +class NewsDoc(BaseDoc): +    time_frame: dict = Field( +        col_type='date_range', format='yyyy-MM-dd' +    )  # enable range filtering + + +doc_index = ElasticDocIndex[NewsDoc]() +index_docs = [ +    NewsDoc(time_frame={'gte': '2023-01-01', 'lt': '2023-02-01'}), +    NewsDoc(time_frame={'gte': '2023-02-01', 'lt': '2023-03-01'}), +    NewsDoc(time_frame={'gte': '2023-03-01', 'lt': '2023-04-01'}), +] +doc_index.index(index_docs) + +query = { +    'bool': { +        'filter': { +            'range': { +                'time_frame': { +                    'gte': '2023-02-05', +                    'lt': '2023-02-10', +                    'relation': 'contains', +                } +            } +        } +    } +} + +docs = doc_index.filter(query) +``` + +### Hybrid search and query builder +To combine any of the "atomic" search approaches above, you can use the `QueryBuilder` to build your own hybrid query. + +For this the `find()`, `filter()` and `text_search()` methods and their combination are supported. + +For example, you can build a hybrid search query that performs range filtering, vector search and text search: + +```python +class MyDoc(BaseDoc): +    tens: NdArray[10] = Field(similarity='l2_norm') +    num: int +    text: str + + +doc_index = ElasticDocIndex[MyDoc]() +index_docs = [ +    MyDoc(id=f'{i}', tens=np.ones(10) * i, num=int(i / 2), text=f'text {int(i/2)}') +    for i in range(10) +] +doc_index.index(index_docs) + +q = ( +    doc_index.build_query() +    .filter({'range': {'num': {'lte': 3}}}) +    .find(index_docs[-1], search_field='tens') +    .text_search('0', search_field='text') +    .build() +) +docs, _ = doc_index.execute_query(q) +``` + +You can also manually build a valid ES query and directly pass it to the `execute_query()` method. 
+ +## Configuration options + +### DBConfig +The following configs can be set in `DBConfig`: + +| Name | Description | Default | +|-------------------|----------------------------------------------------------------------------------------------------------------------------------------|-------------------------| +| `hosts` | Hostname of the Elasticsearch server | `http://localhost:9200` | +| `es_config` | Other ES [configuration options](https://www.elastic.co/guide/en/elasticsearch/client/python-api/8.6/config.html) in a Dict and pass to `Elasticsearch` client constructor, e.g. `cloud_id`, `api_key` | None | +| `index_name` | Elasticsearch index name, the name of Elasticsearch index object | None | +| `index_settings` | Other [index settings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/index-modules.html#index-modules-settings) in a Dict for creating the index | dict | +| `index_mappings` | Other [index mappings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping.html) in a Dict for creating the index | dict | + +You can pass any of the above as keyword arguments to the `__init__()` method or pass an entire configuration object. +To see how, see [here](first_steps.md#configuration-options#customize-configurations). + +### RuntimeConfig + +The `RuntimeConfig` dataclass of `ElasticDocIndex` consists of `default_column_config` and `chunk_size`. You can change `chunk_size` for batch operations. + +```python +doc_index = ElasticDocIndex[SimpleDoc]() +doc_index.configure(ElasticDocIndex.RuntimeConfig(chunk_size=1000)) +``` + +`default_column_config` is the default configurations for every column type. Since there are many column types in Elasticsearch, you can also consider changing the column config when defining the schema. 
+ +```python +class SimpleDoc(BaseDoc): + tensor: NdArray[128] = Field(similarity='l2_norm', m=32, num_candidates=5000) + + +doc_index = ElasticDocIndex[SimpleDoc]() +``` + +You can pass the above as a keyword arguments the `configure()` method or pass an entire configuration object. +To see how, see [here](first_steps.md#configuration-options#customize-configurations). \ No newline at end of file diff --git a/docs/user_guide/storing/index_hnswlib.md b/docs/user_guide/storing/index_hnswlib.md new file mode 100644 index 00000000000..88530cc2fde --- /dev/null +++ b/docs/user_guide/storing/index_hnswlib.md @@ -0,0 +1,207 @@ +# Hnswlib Document Index + +!!! note + To use [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex], one need to install the extra dependency with the following command + + ```console + pip install "docarray[hnswlib]" + ``` + +[HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] is a lightweight Document Index implementation +that runs fully locally and is best suited for small to medium sized datasets. +It stores vectors on disc in [hnswlib](https://github.com/nmslib/hnswlib), and stores all other data in [SQLite](https://www.sqlite.org/index.html). + +!!! note "Production readiness" + [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] is a great starting point + for small to medium sized datasets, but it is not battle tested in production. If scalability, uptime, etc. are + important to you, we recommend you eventually transition to one of our database backed Document Index implementations: + + - [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex] + - [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex] + - [ElasticDocumentIndex][docarray.index.backends.elastic.ElasticDocIndex] + + +## Basic Usage + +To see how to create a [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] instance, add Documents, +perform search, etc. 
see the [general user guide](./first_steps.md#document-index). + +## Configuration + +This section lays out the configurations and options that are specific to [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex]. + +### DBConfig + +The `DBConfig` of [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] expects only one argument: +`work_dir`. + +This is the location where all of the Index's data will be stored: The various HNSWLib indexes, as well as the SQLite database. + +You can pass this directly to the constructor: + +```python +from docarray import BaseDoc +from docarray.index import HnswDocumentIndex +from docarray.typing import NdArray + + +class MyDoc(BaseDoc): +    embedding: NdArray[128] +    text: str + + +db = HnswDocumentIndex[MyDoc](work_dir='./path/to/db') +``` + +You can specify an existing directory that holds data from a previous session. +In that case, the Index will load the data from that directory. + +!!! note "HNSWLib file lock" +    HNSWLib uses a file lock to prevent multiple processes from accessing the same index at the same time. +    This means that if you try to open an index that is already open in another process, you will get an error. +    To avoid this, you can specify a different `work_dir` for each process. + +### RuntimeConfig + +The `RuntimeConfig` of [HnswDocumentIndex][docarray.index.backends.hnswlib.HnswDocumentIndex] contains only one entry, +the default mapping from Python types to column configurations. + +You can see in the [section below](#field-wise-configurations) how to override configurations for specific fields. +If you want to set configurations globally, i.e. 
for all vector fields in your Documents, you can do that using `RuntimeConfig`: + +```python +import numpy as np + +db = HnswDocumentIndex[MyDoc](work_dir='/tmp/my_db') + +db.configure( + default_column_config={ + np.ndarray: { + 'dim': -1, + 'index': True, + 'space': 'ip', + 'max_elements': 2048, + 'ef_construction': 100, + 'ef': 15, + 'M': 8, + 'allow_replace_deleted': True, + 'num_threads': 5, + }, + None: {}, + } +) +``` + +This will set the default configuration for all vector fields to the one specified in the example above. + +!!! note + Even if your vectors come from PyTorch or TensorFlow, you can and should still use the `np.ndarray` configuration. + This is because all tensors are converted to `np.ndarray` under the hood. + +For more information on these settings, see [below](#field-wise-configurations). + +Fields that are not vector fields (e.g. of type `str` or `int` etc.) do not offer any configuration, as they are simply +stored as-is in a SQLite database. + +### Field-wise configurations + +There are various setting that you can tweak for every vector field that you index into HNSWLib. + +You pass all of those using the `field: Type = Field(...)` syntax: + +```python +from pydantic import Field + + +class Schema(BaseDoc): + tens: NdArray[100] = Field(max_elements=12, space='cosine') + tens_two: NdArray[10] = Field(M=4, space='ip') + + +db = HnswDocumentIndex[Schema](work_dir='/tmp/my_db') +``` + +In the example above you can see how to configure two different vector fields, with two different sets of settings. 
+ +In this way, you can pass [all options that HNSWLib supports](https://github.com/nmslib/hnswlib#api-description): + +| Keyword | Description | Default | +|-------------------|--------------------------------------------------------------------------------------------------------------------------------|---------| +| `max_elements` | Maximum number of vectors that can be stored | 1024 | +| `space` | Vector space (similarity metric) the index operates in. Supports 'l2', 'ip', and 'cosine' | 'l2' | +| `index` | Whether or not an index should be built for this field. | True | +| `ef_construction` | defines a construction time/accuracy trade-off | 200 | +| `ef` | parameter controlling query time/accuracy trade-off | 10 | +| `M` | parameter that defines the maximum number of outgoing connections in the graph | 16 | +| `allow_replace_deleted` | enables replacing of deleted elements with new added ones | True | +| `num_threads` | sets the number of cpu threads to use | 1 | + +You can find more details on these parameters [here](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md). + +## Nested Index +When using the index, you can define multiple fields as well as the nested structure. In the following example, you have `YouTubeVideoDoc` including the `tensor` field calculated based on the description. Besides, `YouTubeVideoDoc` has `thumbnail` and `video` field, each of which has its own `tensor`. 
+ +```python +from docarray.typing import ImageUrl, VideoUrl, AnyTensor + + +class ImageDoc(BaseDoc): + url: ImageUrl + tensor: AnyTensor = Field(space='cosine', dim=64) + + +class VideoDoc(BaseDoc): + url: VideoUrl + tensor: AnyTensor = Field(space='cosine', dim=128) + + +class YouTubeVideoDoc(BaseDoc): + title: str + description: str + thumbnail: ImageDoc + video: VideoDoc + tensor: AnyTensor = Field(space='cosine', dim=256) + + +doc_index = HnswDocumentIndex[YouTubeVideoDoc](work_dir='./tmp2') +index_docs = [ + YouTubeVideoDoc( + title=f'video {i+1}', + description=f'this is video from author {10*i}', + thumbnail=ImageDoc(url=f'http://example.ai/images/{i}', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/{i}', tensor=np.ones(128)), + tensor=np.ones(256), + ) + for i in range(8) +] +doc_index.index(index_docs) +``` + +Use the `search_field` to specify which field to be used when performing the vector search. You can use the dunder operator to specify the field defined in the nested data. In the following codes, you can perform vector search on the `tensor` field of the `YouTubeVideoDoc` or on the `tensor` field of the `thumbnail` and `video` field. + +```python +# example of find nested and flat index +query_doc = YouTubeVideoDoc( + title=f'video query', + description=f'this is a query video', + thumbnail=ImageDoc(url=f'http://example.ai/images/1024', tensor=np.ones(64)), + video=VideoDoc(url=f'http://example.ai/videos/1024', tensor=np.ones(128)), + tensor=np.ones(256), +) +# find by the youtubevideo tensor +docs, scores = doc_index.find(query_doc, search_field='tensor', limit=3) +# find by the thumbnail tensor +docs, scores = doc_index.find(query_doc, search_field='thumbnail__tensor', limit=3) +# find by the video tensor +docs, scores = doc_index.find(query_doc, search_field='video__tensor', limit=3) +``` + +To delete a nested data, you need to specify the `id`. + +!!! note + You can only delete `Doc` at the top level. 
Deletion of the `Doc` on the lower level is not supported yet. + +```python +# example of delete nested and flat index +del doc_index[index_docs[6].id] +``` \ No newline at end of file diff --git a/docs/user_guide/storing/index_qdrant.md b/docs/user_guide/storing/index_qdrant.md new file mode 100644 index 00000000000..d03a12e4e37 --- /dev/null +++ b/docs/user_guide/storing/index_qdrant.md @@ -0,0 +1,114 @@ +# Qdrant Document Index + +!!! note "Install dependencies" + To use [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex], you need to install extra dependencies with the following command: + + ```console + pip install "docarray[qdrant]" + ``` + +The following is a starter script for using the [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex], +based on the [Qdrant](https://qdrant.tech/) vector search engine. + +For general usage of a Document Index, see the [general user guide](./first_steps.md#document-index). + +!!! tip "See all configuration options" + To see all configuration options for the [QdrantDocumentIndex][docarray.index.backends.qdrant.QdrantDocumentIndex], + you can do the following: + + ```python + from docarray.index import QdrantDocumentIndex + + # the following can be passed to the __init__() method + db_config = QdrantDocumentIndex.DBConfig() + print(db_config) # shows default values + + # the following can be passed to the configure() method + runtime_config = QdrantDocumentIndex.RuntimeConfig() + print(runtime_config) # shows default values + ``` + +```python +import numpy as np + +from typing import Optional + +from docarray import BaseDoc +from docarray.index import QdrantDocumentIndex +from docarray.typing import NdArray + +from qdrant_client.http import models + + +class MyDocument(BaseDoc): + title: str + title_embedding: NdArray[786] + image_path: Optional[str] + image_embedding: NdArray[512] + + +# Creating an in-memory Qdrant document index +qdrant_config = 
QdrantDocumentIndex.DBConfig(":memory:") +doc_index = QdrantDocumentIndex[MyDocument](qdrant_config) + +# Indexing the documents +doc_index.index( + [ + MyDocument( + title=f"My document {i}", + title_embedding=np.random.random(786), + image_path=None, + image_embedding=np.random.random(512), + ) + for i in range(100) + ] +) + +# Performing a vector search only +results = doc_index.find( + query=np.random.random(512), + search_field="image_embedding", + limit=3, +) + +# Connecting to a local Qdrant instance with Scalar Quantization enabled, +# and using non-default collection name to store the datapoints +qdrant_config = QdrantDocumentIndex.DBConfig( + "http://localhost:6333", + collection_name="another_collection", + quantization_config=models.ScalarQuantization( + scalar=models.ScalarQuantizationConfig( + type=models.ScalarType.INT8, + quantile=0.99, + always_ram=True, + ), + ), +) +doc_index = QdrantDocumentIndex[MyDocument](qdrant_config) + +# Indexing the documents +doc_index.index( + [ + MyDocument( + title=f"My document {i}", + title_embedding=np.random.random(786), + image_path=None, + image_embedding=np.random.random(512), + ) + for i in range(100) + ] +) + +# Text lookup, without vector search. 
Using the Qdrant filtering mechanisms: +# https://qdrant.tech/documentation/filtering/ +results = doc_index.filter( + filter_query=models.Filter( + must=[ + models.FieldCondition( + key="title", + match=models.MatchText(text="document 2"), + ), + ], + ), +) +``` \ No newline at end of file diff --git a/docs/user_guide/storing/index_weaviate.md b/docs/user_guide/storing/index_weaviate.md new file mode 100644 index 00000000000..f43c387d875 --- /dev/null +++ b/docs/user_guide/storing/index_weaviate.md @@ -0,0 +1,448 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .md + format_name: markdown + format_version: '1.3' + jupytext_version: 1.14.5 + kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# Weaviate Document Index + +!!! note "Install dependencies" + To use [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex], you need to install extra dependencies with the following command: + + ```console + pip install "docarray[weaviate]" + ``` + +This is the user guide for the [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex], +focussing on special features and configurations of Weaviate. + +For general usage of a Document Index, see the [general user guide](./first_steps.md#document-index). + + +# 1. Start Weaviate service + +To use [WeaviateDocumentIndex][docarray.index.backends.weaviate.WeaviateDocumentIndex], it needs to hook into a running Weaviate service. +There are multiple ways to start a Weaviate instance, depending on your use case. + + +## 1.1. Options - Overview + +There are multiple ways to start a Weaviate instance. 
+ +| Instance type | General use case | Configurability | Notes | +| ----- | ----- | ----- | ----- | +| **Weaviate Cloud Services (WCS)** | Development and production | Limited | **Recommended for most users** | +| **Embedded Weaviate** | Experimentation | Limited | Experimental (as of Apr 2023) | +| **Docker-Compose** | Development | Yes | **Recommended for development + customizability** | +| **Kubernetes** | Production | Yes | | + +## 1.2. Instantiation instructions + +### 1.2.1. WCS (managed instance) + +Go to the [WCS console](https://console.weaviate.cloud) and create an instance using the visual interface, following [this guide](https://weaviate.io/developers/wcs/guides/create-instance). + +Weaviate instances on WCS come pre-configured, so no further configuration is required. + +### 1.2.2. Docker-Compose (self-managed) + +Get a configuration file (`docker-compose.yaml`). You can build it using [this interface](https://weaviate.io/developers/weaviate/installation/docker-compose), or download it directly with: + +```bash +curl -o docker-compose.yml "https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?modules=standalone&runtime=docker-compose&weaviate_version=v" +``` + +Where `v` is the actual version, such as `v1.18.3`. + + +```bash +curl -o docker-compose.yml "https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?modules=standalone&runtime=docker-compose&weaviate_version=v1.18.3" +``` + + +#### 1.2.2.1 Start up Weaviate with Docker-Compose + +Then you can start up Weaviate by running from a shell: + +```shell +docker-compose up -d +``` + +#### 1.2.2.2 Shut down Weaviate + +Then you can shut down Weaviate by running from a shell: + +```shell +docker-compose down +``` + +#### Notes + +Unless data persistence or backups are set up, shutting down the Docker instance will remove all its data. 
+ +See documentation on [Persistent volume](https://weaviate.io/developers/weaviate/installation/docker-compose#persistent-volume) and [Backups](https://weaviate.io/developers/weaviate/configuration/backups) to prevent this if persistence is desired. + + +```bash +docker-compose up -d +``` + + +### 1.2.3. Embedded Weaviate (from the application) + +With Embedded Weaviate, Weaviate database server can be launched from the client, using: + +```python +from docarray.index.backends.weaviate import EmbeddedOptions + +embedded_options = EmbeddedOptions() +``` + +## 1.3. Authentication + +Weaviate offers [multiple authentication options](https://weaviate.io/developers/weaviate/configuration/authentication), as well as [authorization options](https://weaviate.io/developers/weaviate/configuration/authorization). + +With DocArray, you can use any of: +- Anonymous access (public instance), +- OIDC with username & password, and +- API-key based authentication. + +To access a Weaviate instance. In general, **Weaviate recommends using API-key based authentication** for balance between security and ease of use. You can create, for example, read-only keys to distribute to certain users, while providing read/write keys to administrators. + +See below for examples of connection to Weaviate for each scenario. + + +## 1.4. 
Connect to Weaviate + +```python +from docarray.index.backends.weaviate import WeaviateDocumentIndex +``` + +### Public instance + + +If using Embedded Weaviate: + +```python +from docarray.index.backends.weaviate import EmbeddedOptions + +dbconfig = WeaviateDocumentIndex.DBConfig(embedded_options=EmbeddedOptions()) +``` + +For all other options: + + +```python +dbconfig = WeaviateDocumentIndex.DBConfig( + host="http://localhost:8080" +) # Replace with your endpoint) +``` + + +### OIDC with username + password + +To authenticate against a Weaviate instance with OIDC username & password: + +```python +dbconfig = WeaviateDocumentIndex.DBConfig( + username="username", # Replace with your username + password="password", # Replace with your password + host="http://localhost:8080", # Replace with your endpoint +) +``` + + +```python +# dbconfig = WeaviateDocumentIndex.DBConfig( +# username="username", # Replace with your username +# password="password", # Replace with your password +# host="http://localhost:8080", # Replace with your endpoint +# ) +``` + + +### API key-based authentication + +To authenticate against a Weaviate instance an API key: + +```python +dbconfig = WeaviateDocumentIndex.DBConfig( + auth_api_key="apikey", # Replace with your own API key + host="http://localhost:8080", # Replace with your endpoint +) +``` + + + + +# 2. Configure Weaviate + +## 2.1. Overview + +**WCS instances come pre-configured**, and as such additional settings are not configurable outside of those chosen at creation, such as whether to enable authentication. + +For other cases, such as **Docker-Compose deployment**, its settings can be modified through the configuration file, such as the `docker-compose.yaml` file. 
+ +Some of the more commonly used settings include: + +- [Persistent volume](https://weaviate.io/developers/weaviate/installation/docker-compose#persistent-volume): Set up data persistence so that data from inside the Docker container is not lost on shutdown +- [Enabling a multi-node setup](https://weaviate.io/developers/weaviate/installation/docker-compose#multi-node-setup) +- [Backups](https://weaviate.io/developers/weaviate/configuration/backups) +- [Authentication (server-side)](https://weaviate.io/developers/weaviate/configuration/authentication) +- [Modules enabled](https://weaviate.io/developers/weaviate/configuration/modules#enable-modules) + +And a list of environment variables is [available on this page](https://weaviate.io/developers/weaviate/config-refs/env-vars). + +## 2.2. DocArray instantiation configuration options + +Additionally, you can specify the below settings when you instantiate a configuration object in DocArray. + +| name | type | explanation | default | example | +| ---- | ---- | ----------- | ------- | ------- | +| **Category: General** | +| host | str | Weaviate instance url | http://localhost:8080 | +| **Category: Authentication** | +| username | str | username known to the specified authentication provider (e.g. WCS) | None | `jp@weaviate.io` | +| password | str | corresponding password | None | `p@ssw0rd` | +| auth_api_key | str | API key known to the Weaviate instance | None | `mys3cretk3y` | +| **Category: Data schema** | +| index_name | str | Class name to use to store the document | `Document` | +| **Category: Embedded Weaviate** | +| embedded_options| EmbeddedOptions | options for embedded weaviate | None | + +The type `EmbeddedOptions` can be specified as described [here](https://weaviate.io/developers/weaviate/installation/embedded#embedded-options) + +## 2.3. Runtime configuration + +Weaviate strongly recommends using batches to perform bulk operations such as importing data, as it will significantly impact performance. 
You can specify a batch configuration as in the below example, and pass it on as runtime configuration. + +```python +batch_config = { + "batch_size": 20, + "dynamic": False, + "timeout_retries": 3, + "num_workers": 1, +} + +runtimeconfig = WeaviateDocumentIndex.RuntimeConfig(batch_config=batch_config) + +dbconfig = WeaviateDocumentIndex.DBConfig( + host="http://localhost:8080" +) # Replace with your endpoint and/or auth settings +store = WeaviateDocumentIndex[Document](db_config=dbconfig) +store.configure(runtimeconfig) # Batch settings being passed on +``` + +| name | type | explanation | default | +| ---- | ---- | ----------- | ------- | +| batch_config | Dict[str, Any] | dictionary to configure the weaviate client's batching logic | see below | + +Read more: +- Weaviate [docs on batching with the Python client](https://weaviate.io/developers/weaviate/client-libraries/python#batching) + + + +## 3. Available column types + +Python data types are mapped to Weaviate type according to the below convention. + +| python type | weaviate type | +| ----------- | ------------- | +| docarray.typing.ID | string | +| str | text | +| int | int | +| float | number | +| bool | boolean | +| np.ndarray | number[] | +| AbstractTensor | number[] | +| bytes | blob | + +You can override this default mapping by passing a `col_type` to the `Field` of a schema. + +For example to map `str` to `string` you can: + +```python +class StringDoc(BaseDoc): + text: str = Field(col_type="string") +``` + +A list of available Weaviate data types [is here](https://weaviate.io/developers/weaviate/config-refs/datatypes). + + +## 4. Adding example data + +Putting it together, we can add data as shown below using Weaviate as the document store. 
+
+```python
+import numpy as np
+from pydantic import Field
+from docarray import BaseDoc
+from docarray.typing import NdArray
+from docarray.index.backends.weaviate import WeaviateDocumentIndex
+
+# Define a document schema
+class Document(BaseDoc):
+    text: str
+    embedding: NdArray[2] = Field(
+        dims=2, is_embedding=True
+    )  # Embedding column -> vector representation of the document
+    file: NdArray[100] = Field(dims=100)
+
+
+# Make a list of 3 docs to index
+docs = [
+    Document(
+        text="Hello world", embedding=np.array([1, 2]), file=np.random.rand(100), id="1"
+    ),
+    Document(
+        text="Hello world, how are you?",
+        embedding=np.array([3, 4]),
+        file=np.random.rand(100),
+        id="2",
+    ),
+    Document(
+        text="Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut",
+        embedding=np.array([5, 6]),
+        file=np.random.rand(100),
+        id="3",
+    ),
+]
+
+batch_config = {
+    "batch_size": 20,
+    "dynamic": False,
+    "timeout_retries": 3,
+    "num_workers": 1,
+}
+
+runtimeconfig = WeaviateDocumentIndex.RuntimeConfig(batch_config=batch_config)
+
+store = WeaviateDocumentIndex[Document](db_config=dbconfig)
+store.configure(runtimeconfig)  # Batch settings being passed on
+store.index(docs)
+```
+
+### 4.1. Notes
+
+- In order to use vector search, you need to specify `is_embedding` for exactly one field.
+  - This is because Weaviate is configured to allow one vector per data object.
+  - If you would like to see Weaviate support multiple vectors per object, [upvote the issue](https://github.com/weaviate/weaviate/issues/2465) which will help to prioritize it.
+- For a field to be considered as an embedding, its type needs to be a subclass of `np.ndarray` or `AbstractTensor` and `is_embedding` needs to be set to `True`.
+  - If `is_embedding` is set to `False` or not provided, the field will be treated as a `number[]`, and as a result, it will not be added to Weaviate's vector index. 
+- It is possible to create a schema without specifying `is_embedding` for any field.
+  - This will however mean that the document will not be vectorized and cannot be searched using vector search.
+
+
+## 5. Query Builder/Hybrid Search
+
+
+### 5.1. Text search
+
+To perform a text search, follow the below syntax.
+
+This will perform a text search for the word "world" in the field "text" and return the first 2 results:
+
+```python
+q = store.build_query().text_search("world", search_field="text").limit(2).build()
+
+docs = store.execute_query(q)
+docs
+```
+
+### 5.2. Vector similarity search
+
+To perform a vector similarity search, follow the below syntax.
+
+This will perform a vector similarity search for the vector [1, 2] and return the first 2 results:
+
+```python
+q = store.build_query().find([1, 2]).limit(2).build()
+
+docs = store.execute_query(q)
+docs
+```
+
+### 5.3. Hybrid search
+
+To perform a hybrid search, follow the below syntax.
+
+This will perform a hybrid search for the word "world" and the vector [1, 2] and return the first 2 results:
+
+**Note**: Hybrid search searches through the object vector and all fields. Accordingly, the `search_field` keyword will have no effect.
+
+```python
+q = (
+    store.build_query()
+    .text_search(
+        "world", search_field=None  # Set as None as it is required but has no effect
+    )
+    .find([1, 2])
+    .limit(2)
+    .build()
+)
+
+docs = store.execute_query(q)
+docs
+```
+
+### 5.4. GraphQL query
+
+You can also perform a raw GraphQL query using any syntax as you might natively in Weaviate. This allows you to run any of the full range of queries that you might wish to.
+
+The below will perform a GraphQL query to obtain the count of `Document` objects.
+
+```python
+graphql_query = """
+{
+  Aggregate {
+    Document {
+      meta {
+        count
+      }
+    }
+  }
+}
+"""
+
+store.execute_query(graphql_query)
+```
+
+Note that running a raw GraphQL query will return Weaviate-type responses, rather than a DocArray object type. 
+ +You can find the documentation for [Weaviate's GraphQL API here](https://weaviate.io/developers/weaviate/api/graphql). + + +## 6. Other notes + +### 6.1. DocArray IDs vs Weaviate IDs + +As you saw earlier, the `id` field is a special field that is used to identify a document in `BaseDoc`. + +```python +Document( + text="Hello world", embedding=np.array([1, 2]), file=np.random.rand(100), id="1" +), +``` + +This is not the same as Weaviate's own `id`, which is a reserved keyword and can't be used as a field name. + +Accordingly, the DocArray document id is stored internally in Weaviate as `docarrayid`. + + +## 7. Shut down Weaviate instance + +```bash +docker-compose down +``` + +----- +----- +----- diff --git a/mkdocs.yml b/mkdocs.yml index 100a4da336a..00d66e44129 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -81,10 +81,10 @@ nav: - Home: README.md - Tutorial/User Guide: - user_guide/intro.md - - Representing data: + - Represent: - user_guide/representing/first_step.md - user_guide/representing/array.md - - Sending: + - Send: - user_guide/sending/first_step.md - Serialization: - user_guide/sending/ser/send_doc.md @@ -93,8 +93,15 @@ nav: - Building API: - user_guide/sending/api/jina.md - user_guide/sending/api/fastAPI.md - - Storing: + + - Store: - user_guide/storing/first_step.md + - DocIndex: + - user_guide/storing/first_steps.md + - user_guide/storing/index_hnswlib.md + - user_guide/storing/index_weaviate.md + - user_guide/storing/index_elastic.md + - user_guide/storing/index_qdrant.md - DocStore: - user_guide/storing/doc_store/store_file.md - user_guide/storing/doc_store/store_jac.md diff --git a/tests/documentation/__init__.py b/tests/documentation/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/documentation/test_docs.py b/tests/documentation/test_docs.py index b071839c88c..37f24a7cf66 100644 --- a/tests/documentation/test_docs.py +++ b/tests/documentation/test_docs.py @@ -4,7 +4,9 @@ from mktestdocs import grab_code_blocks 
from mktestdocs.__main__ import _executors, check_raw_string -file_to_skip = ['fastAPI', 'jina'] +from tests.index.elastic.fixture import start_storage_v8 # noqa: F401 + +file_to_skip = ['fastAPI', 'jina', 'index', 'first_steps.md'] def check_raw_file_full(raw, lang="python", keyword_ignore=[]): diff --git a/tests/index/hnswlib/test_configurations.py b/tests/index/hnswlib/test_configurations.py index dff64fdcc19..80de4fd7ef6 100644 --- a/tests/index/hnswlib/test_configurations.py +++ b/tests/index/hnswlib/test_configurations.py @@ -25,3 +25,21 @@ class Schema(BaseDoc): index.index(docs) assert index.num_docs() == 10 + + +def test_configure_index(tmp_path): + class Schema(BaseDoc): + tens: NdArray[100] = Field(max_elements=12, space='cosine') + tens_two: NdArray[10] = Field(M=4, space='ip') + + index = HnswDocumentIndex[Schema](work_dir=str(tmp_path)) + + assert index._hnsw_indices['tens'].max_elements == 12 + assert index._hnsw_indices['tens'].space == 'cosine' + assert index._hnsw_indices['tens'].M == 16 # default + assert index._hnsw_indices['tens'].dim == 100 + + assert index._hnsw_indices['tens_two'].max_elements == 1024 # default + assert index._hnsw_indices['tens_two'].space == 'ip' + assert index._hnsw_indices['tens_two'].M == 4 + assert index._hnsw_indices['tens_two'].dim == 10