Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
a34d9cc
docs: add serialization for json
nan-wang Apr 10, 2023
2a9de6d
docs: add serialization for binary and protobuf
nan-wang Apr 10, 2023
189787a
docs: add serialization for base64 and bytes
nan-wang Apr 10, 2023
f363a1b
docs: add serialization for csv
nan-wang Apr 10, 2023
aedb7d8
docs: add serialization for dataframe
nan-wang Apr 10, 2023
e223aba
fix: add doctring to documentaion basedoc
samsja Apr 12, 2023
7491246
fix: fix mypy
samsja Apr 12, 2023
e0937f6
Merge branch 'feat-rewrite-v2' into docs-send-doclist-0410
samsja Apr 12, 2023
5a40e79
fix: add docstring doc list
samsja Apr 12, 2023
4e53699
fix: dic doc array docstring
samsja Apr 12, 2023
39c1df9
fix: fix page for doc list serilizaiton
samsja Apr 12, 2023
5de2719
fix: fix docstring
samsja Apr 12, 2023
e9df25b
feat: add docvec
samsja Apr 12, 2023
2853474
docs: add send doc section
samsja Apr 12, 2023
a6910f1
docs: fix docstring
samsja Apr 12, 2023
e73a6c4
refactor: better tree structure for sending
samsja Apr 12, 2023
66fc6db
fix: fix tests
samsja Apr 12, 2023
f32bcca
fix: fix python code snippet ods
samsja Apr 12, 2023
4047c23
fix: fix remove breakpoint
samsja Apr 12, 2023
6821307
feat: add intro
samsja Apr 12, 2023
ce60c65
feat: add ref
samsja Apr 12, 2023
5fae5da
feat: merege dev branch
samsja Apr 12, 2023
e66a900
feat: move fastapi part
samsja Apr 12, 2023
a642abe
fix: fix fastAPI
samsja Apr 12, 2023
840a650
fix: remove uselss mixin
samsja Apr 12, 2023
8c2cf02
faet: add jina section
samsja Apr 13, 2023
c7507bf
fix: compress -> compression
samsja Apr 13, 2023
27b48bf
feat: apply suggestion
samsja Apr 13, 2023
1b1c503
fix: apply alex suggestion
samsja Apr 13, 2023
dde1612
wip
samsja Apr 13, 2023
3d0d745
fix: fix all docstring
samsja Apr 13, 2023
a6c9aa9
fix: fix update docstring
samsja Apr 13, 2023
0f18972
Merge branch 'feat-rewrite-v2' into docs-send-doclist-0410
samsja Apr 13, 2023
70c0f45
fix: fix ruff
samsja Apr 13, 2023
2828cf2
fix: fix smth
samsja Apr 13, 2023
228ddff
feat: apply charllote suggestion
samsja Apr 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions docarray/array/any_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def _set_data_column(
field: str,
values: Union[List, T, 'AbstractTensor'],
):
"""Set all Documents in this [`DocList`][docarray.typing.DocList] using the passed values
"""Set all Documents in this [`DocList`][docarray.array.doc_list.doc_list.DocList] using the passed values

:param field: name of the field to set
:param values: the values to set at the DocList level
Expand All @@ -140,7 +140,7 @@ def to_protobuf(self) -> 'DocListProto':
...

def _to_node_protobuf(self) -> 'NodeProto':
"""Convert a [`DocList`][docarray.typing.DocList] into a NodeProto protobuf message.
"""Convert a [`DocList`][docarray.array.doc_list.doc_list.DocList] into a NodeProto protobuf message.
This function should be called when a DocList
is nested into another Document that needs to be converted into a protobuf

Expand All @@ -157,13 +157,11 @@ def traverse_flat(
) -> Union[List[Any], 'AbstractTensor']:
"""
Return a List of the accessed objects when applying the `access_path`. If this
results in a nested list or list of [`DocList`s][docarray.typing.DocList], the list will be flattened
results in a nested list or list of [`DocList`s][docarray.array.doc_list.doc_list.DocList], the list will be flattened
on the first level. The access path is a string that consists of attribute
names, concatenated and `"__"`-separated. It describes the path from the first
level to an arbitrary one, e.g. `'content__image__url'`.

:param access_path: a string that represents the access path (`"__"`-separated).
:return: list of the accessed objects, flattened if nested.

```python
from docarray import BaseDoc, DocList, Text
Expand Down Expand Up @@ -210,7 +208,8 @@ class Book(BaseDoc):
chapters = docs.traverse_flat(access_path='chapters') # list of 30 strings
```

If your [`DocList`][docarray.typing.DocList] is in doc_vec mode, and you want to access a field of

If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of
type [`AnyTensor`][docarray.typing.AnyTensor], the doc_vec tensor will be returned instead of a list:

```python
Expand All @@ -232,6 +231,9 @@ class Image(BaseDoc):
access_path='tensor'
) # tensor of shape (2, 3, 224, 224)
```

:param access_path: a string that represents the access path ("__"-separated).
:return: list of the accessed objects, flattened if nested.
"""
...

Expand Down Expand Up @@ -263,7 +265,7 @@ def _flatten_one_level(sequence: List[Any]) -> List[Any]:

def summary(self):
"""
Print a summary of this [`DocList`][docarray.typing.DocList] object and a summary of the schema of its
Print a summary of this [`DocList`][docarray.array.doc_list.doc_list.DocList] object and a summary of the schema of its
Document type.
"""
DocArraySummary(self).summary()
Expand All @@ -275,13 +277,13 @@ def _batch(
show_progress: bool = False,
) -> Generator[T, None, None]:
"""
Creates a `Generator` that yields [`DocList`][docarray.typing.DocList] of size `batch_size`.
Creates a `Generator` that yields [`DocList`][docarray.array.doc_list.doc_list.DocList] of size `batch_size`.
Note, that the last batch might be smaller than `batch_size`.

:param batch_size: Size of each generated batch.
:param shuffle: If set, shuffle the Documents before dividing into minibatches.
:param show_progress: if set, show a progress bar when batching documents.
:yield: a Generator of [`DocList`][docarray.typing.DocList], each in the length of `batch_size`
:yield: a Generator of [`DocList`][docarray.array.doc_list.doc_list.DocList], each of length `batch_size` (the last one may be smaller)
"""
from rich.progress import track

Expand Down
1 change: 1 addition & 0 deletions docarray/array/doc_list/doc_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class Image(BaseDoc):

# You can also set fields, with `docs.tensor = np.random.random([10, 100])`:


import numpy as np

docs.tensor = np.random.random([10, 100])
Expand Down
29 changes: 16 additions & 13 deletions docarray/array/doc_list/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def from_bytes(

:param data: Bytes from which to deserialize
:param protocol: protocol that was used to serialize
:param compress: compress algorithm that was used to serialize
:param compress: compression algorithm that was used to serialize, one of `lz4`, `bz2`, `lzma`, `zlib`, `gzip`
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:return: the deserialized `DocList`
"""
Expand Down Expand Up @@ -247,7 +247,7 @@ def to_bytes(
For more Pythonic code, please use ``bytes(...)``.

:param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf'
:param compress: compress algorithm to use
:param compress: compression algorithm to use, one of `lz4`, `bz2`, `lzma`, `zlib`, `gzip`
:param file_ctx: File or filename or serialized bytes where the data is stored.
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:return: the binary serialization in bytes or None if file_ctx is passed where to store
Expand Down Expand Up @@ -277,7 +277,7 @@ def from_base64(

:param data: Base64 string to deserialize
:param protocol: protocol that was used to serialize
:param compress: compress algorithm that was used to serialize
:param compress: compression algorithm that was used to serialize, one of `lz4`, `bz2`, `lzma`, `zlib`, `gzip`
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:return: the deserialized `DocList`
"""
Expand All @@ -297,7 +297,7 @@ def to_base64(
"""Serialize itself into base64 encoded string.

:param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf'
:param compress: compress algorithm to use
:param compress: compression algorithm to use, one of `lz4`, `bz2`, `lzma`, `zlib`, `gzip`
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:return: the serialized `DocList` as a base64 encoded string
"""
Expand Down Expand Up @@ -566,7 +566,7 @@ def _load_binary_all(
):
"""Read a `DocList` object from a binary file
:param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf'
:param compress: compress algorithm to use
:param compress: compression algorithm to use, one of `lz4`, `bz2`, `lzma`, `zlib`, `gzip`
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:return: a `DocList`
"""
Expand Down Expand Up @@ -646,7 +646,7 @@ def _load_binary_stream(
"""Yield `Document` objects from a binary file

:param protocol: protocol to use. It can be 'pickle' or 'protobuf'
:param compress: compress algorithm to use
:param compress: compression algorithm to use, one of `lz4`, `bz2`, `lzma`, `zlib`, `gzip`
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:return: a generator of `Document` objects
"""
Expand Down Expand Up @@ -702,20 +702,23 @@ def load_binary(
) -> Union[T, Generator['T_doc', None, None]]:
"""Load doc_list elements from a compressed binary file.

:param file: File or filename or serialized bytes where the data is stored.
:param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf'
:param compress: compress algorithm to use
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:param streaming: if `True` returns a generator over `Document` objects.
In case protocol is pickle the `Documents` are streamed from disk to save memory usage
:return: a `DocList` object

!!! note
If `file` is `str` it can specify `protocol` and `compress` as file extensions.
This functionality assumes `file=file_name.$protocol.$compress` where `$protocol` and `$compress` refer to a
string interpolation of the respective `protocol` and `compress` methods.
For example if `file=my_docarray.protobuf.lz4` then the binary data will be loaded assuming `protocol=protobuf`
and `compress=lz4`.

:param file: File or filename or serialized bytes where the data is stored.
:param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf'
:param compress: compression algorithm to use, one of `lz4`, `bz2`, `lzma`, `zlib`, `gzip`
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:param streaming: if `True` returns a generator over `Document` objects.

:return: a `DocList` object, or a generator over `Document` objects if `streaming` is `True`

"""
load_protocol: Optional[str] = protocol
load_compress: Optional[str] = compress
Expand Down Expand Up @@ -765,7 +768,7 @@ def save_binary(

:param file: File or filename to which the data is saved.
:param protocol: protocol to use. It can be 'pickle-array', 'protobuf-array', 'pickle' or 'protobuf'
:param compress: compress algorithm to use
:param compress: compression algorithm to use, one of `lz4`, `bz2`, `lzma`, `zlib`, `gzip`
:param show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`

!!! note
Expand Down
81 changes: 80 additions & 1 deletion docarray/base_doc/doc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
import os
from typing import TYPE_CHECKING, Any, Dict, Optional, Type, TypeVar
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Optional,
Type,
TypeVar,
Union,
no_type_check,
)

import orjson
from pydantic import BaseModel, Field
Expand All @@ -12,11 +22,16 @@
from docarray.typing.tensor.abstract_tensor import AbstractTensor

if TYPE_CHECKING:
from pydantic import Protocol
from pydantic.types import StrBytes
from pydantic.typing import AbstractSetIntStr, MappingIntStrAny

from docarray.array.doc_vec.column_storage import ColumnStorageView

_console: Console = Console()

T = TypeVar('T', bound='BaseDoc')
T_update = TypeVar('T_update', bound='UpdateMixin')


class BaseDoc(BaseModel, IOMixin, UpdateMixin, BaseNode):
Expand Down Expand Up @@ -141,3 +156,67 @@ def _docarray_to_json_compatible(self) -> Dict:
:return: A dictionary of the BaseDoc object
"""
return self.dict()

########################################################################################################################################################
### This section exists only for documentation purposes; it will be removed once https://github.com/mkdocstrings/griffe/issues/138 is fixed ###
########################################################################################################################################################

def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model.

    This override adds no behaviour of its own: every argument is handed
    unchanged to the parent class's `json()`.

    :param include: fields to include, as per `dict()`
    :param exclude: fields to exclude, as per `dict()`
    :param encoder: optional function to supply as `default` to `json.dumps()`
    :param dumps_kwargs: other arguments, as per `json.dumps()`
    :return: the JSON representation as a string

    The remaining keyword-only options (`by_alias`, `skip_defaults`,
    `exclude_unset`, `exclude_defaults`, `exclude_none`, `models_as_dict`)
    are passed through to the parent implementation as given.
    """
    # Gather the explicitly named options once so the delegation below
    # stays a single, easy-to-audit call.
    model_options = {
        'include': include,
        'exclude': exclude,
        'by_alias': by_alias,
        'skip_defaults': skip_defaults,
        'exclude_unset': exclude_unset,
        'exclude_defaults': exclude_defaults,
        'exclude_none': exclude_none,
        'encoder': encoder,
        'models_as_dict': models_as_dict,
    }
    return super().json(**model_options, **dumps_kwargs)

@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: Optional[str] = None,
    encoding: str = 'utf8',
    proto: Optional['Protocol'] = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc.

    This override adds no behaviour of its own: every argument is handed
    unchanged to the parent class's `parse_raw`.

    :param b: the raw string or bytes to parse
    :param content_type: content type of `b`; may be left as `None`
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use; may be left as `None`
    :param allow_pickle: allow pickle protocol. Only enable this for trusted
        input: unpickling data can execute arbitrary code.
    :return: a document
    """
    # Explicit two-argument super: delegate to the next `parse_raw` after
    # `BaseDoc` in the MRO while keeping `cls` as the class being built.
    return super(BaseDoc, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
2 changes: 1 addition & 1 deletion docarray/base_doc/mixins/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def to_bytes(
For more Pythonic code, please use ``bytes(...)``.

:param protocol: protocol to use. It can be 'pickle' or 'protobuf'
:param compress: compress algorithm to use
:param compress: compression algorithm to use
:return: the binary serialization in bytes
"""
import pickle
Expand Down
38 changes: 21 additions & 17 deletions docarray/base_doc/mixins/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ def update(self, other: T):
Updates self with the content of other. Changes are applied to self.
Updating one Document with another consists in the following:
- setting data properties of the second Document to the first Document
if they are not None
if they are not None:

- Concatenating lists and updating sets
- Updating recursively Documents and DocArrays
- Updating Dictionaries of the left with the right
Expand All @@ -38,30 +39,33 @@ def update(self, other: T):
so they behave as regular types and the value of `self` is updated
with the value of `other`

EXAMPLE USAGE

.. code-block:: python
---

```python
from typing import List, Optional

from docarray import BaseDoc
from docarray.documents import Text
from docarray import BaseDoc


class MyDocument(BaseDoc):
content: str
title: Optional[str] = None
tags_: List
class MyDocument(BaseDoc):
content: str
title: Optional[str] = None
tags_: List


doc1 = MyDocument(
content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
doc1 = MyDocument(
content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']
doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']
```

---
:param other: The Document with which to update the contents of this Document
"""
if type(self) != type(other):
Expand Down
1 change: 1 addition & 0 deletions docs/api_references/array/da.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@

::: docarray.array.doc_list.doc_list.DocList
::: docarray.array.doc_list.io.IOMixinArray
::: docarray.array.doc_list.pushpull.PushPullMixin
3 changes: 3 additions & 0 deletions docs/api_references/base_doc/base_doc.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# BaseDoc

::: docarray.base_doc.doc.BaseDoc
::: docarray.base_doc.mixins.io.IOMixin
::: docarray.base_doc.mixins.update.UpdateMixin

Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# Use DocArray with FastAPI
# FastAPI

FastAPI is a high-performance web framework for building APIs with Python. It's designed to be easy to use and supports asynchronous programming.
Since [`DocArray` documents are Pydantic Models (with a twist)](../user_guide/representing/first_step.md) they can be easily integrated with FastAPI,
[FastAPI](https://fastapi.tiangolo.com/) is a high-performance web framework for building APIs with Python based on Python type hints. It's designed to be easy to use and supports asynchronous programming.
Since [`DocArray` documents are Pydantic Models (with a twist)](../../representing/first_step.md) they can be easily integrated with FastAPI,
and provide a seamless and efficient way to work with multimodal data in FastAPI-powered APIs.

!!! note
You need to install FastAPI to follow this section:
```
pip install fastapi
```


First, you should define schemas for your input and/or output Documents:
```python
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Jina

# Create an audio to text app with Jina and DocArray V2

This is how you can build an Audio to Text app using Jina, DocArray and Whisper.
Expand Down
Loading