Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docarray/document/mixins/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
class BlobDataMixin:
"""Provide helper functions for :class:`Document` to handle binary data."""

def load_uri_to_blob(self: 'T', **kwargs) -> 'T':
    """Convert :attr:`.uri` to :attr:`.blob` inplace.

    Internally it downloads from the URI and sets :attr:`.blob`.

    :param kwargs: keyword arguments forwarded to :func:`_uri_to_blob`, such as ``timeout``
    :return: itself after processed
    """
    # Forward caller options (e.g. ``timeout``) untouched; the helper decides
    # which of them it understands.
    self.blob = _uri_to_blob(self.uri, **kwargs)
    return self

def convert_blob_to_datauri(
Expand Down
8 changes: 5 additions & 3 deletions docarray/document/mixins/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
from contextlib import nullcontext


def _uri_to_blob(uri: str) -> bytes:
def _uri_to_blob(uri: str, **kwargs) -> bytes:
"""Convert uri to blob
Internally it reads uri into blob.

:param uri: the uri of Document
:param kwargs: keyword arguments to pass to `urlopen` such as timeout
:return: blob bytes.
"""
timeout = kwargs.get('timeout', None)
if urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}:
req = urllib.request.Request(uri, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as fp:
urlopen_kwargs = {'timeout': timeout} if timeout is not None else {}
with urllib.request.urlopen(req, **urlopen_kwargs) as fp:
return fp.read()
elif os.path.exists(uri):
with open(uri, 'rb') as fp:
Expand Down
4 changes: 3 additions & 1 deletion docarray/document/mixins/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,17 +175,19 @@ def load_uri_to_image_tensor(
width: Optional[int] = None,
height: Optional[int] = None,
channel_axis: int = -1,
**kwargs,
) -> 'T':
"""Convert the image-like :attr:`.uri` into :attr:`.tensor`

:param width: the width of the image tensor.
:param height: the height of the tensor.
:param channel_axis: the axis id of the color channel, ``-1`` indicates the color channel info at the last axis
:param kwargs: keyword arguments to pass to `:meth:_uri_to_blob` such as timeout

:return: itself after processed
"""

buffer = _uri_to_blob(self.uri)
buffer = _uri_to_blob(self.uri, **kwargs)
tensor = _to_image_tensor(io.BytesIO(buffer), width=width, height=height)
self.tensor = _move_channel_axis(tensor, original_channel_axis=channel_axis)
return self
Expand Down
5 changes: 3 additions & 2 deletions docarray/document/mixins/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
class TextDataMixin:
"""Provide helper functions for :class:`Document` to support text data."""

def load_uri_to_text(self: 'T', charset: str = 'utf-8', **kwargs) -> 'T':
    """Convert :attr:`.uri` to :attr:`.text` inplace.

    The content behind :attr:`.uri` is read as bytes and decoded with ``charset``.

    :param charset: charset may be any character set registered with IANA
    :param kwargs: keyword arguments forwarded to :func:`_uri_to_blob`, such as ``timeout``
    :return: itself after processed
    """
    # Forward caller options (e.g. ``timeout``) untouched to the loader.
    blob = _uri_to_blob(self.uri, **kwargs)
    self.text = blob.decode(charset)
    return self

Expand Down