From adb3ce1853131b3248352e85412116025f1c3019 Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Thu, 22 Sep 2022 22:14:25 +0200 Subject: [PATCH 1/2] feat: allow pass parameters to load_uri_to* --- docarray/document/mixins/blob.py | 5 +++-- docarray/document/mixins/helper.py | 6 +++--- docarray/document/mixins/image.py | 4 +++- docarray/document/mixins/text.py | 5 +++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docarray/document/mixins/blob.py b/docarray/document/mixins/blob.py index 910c4119d65..ec2bcbaf726 100644 --- a/docarray/document/mixins/blob.py +++ b/docarray/document/mixins/blob.py @@ -9,13 +9,14 @@ class BlobDataMixin: """Provide helper functions for :class:`Document` to handle binary data.""" - def load_uri_to_blob(self: 'T') -> 'T': + def load_uri_to_blob(self: 'T', **kwargs) -> 'T': """Convert :attr:`.uri` to :attr:`.blob` inplace. Internally it downloads from the URI and set :attr:`blob`. + :param kwargs: keyword arguments to pass to `:meth:_uri_to_blob` such as timeout :return: itself after processed """ - self.blob = _uri_to_blob(self.uri) + self.blob = _uri_to_blob(self.uri, **kwargs) return self def convert_blob_to_datauri( diff --git a/docarray/document/mixins/helper.py b/docarray/document/mixins/helper.py index 043275b445f..14a9788cb3c 100644 --- a/docarray/document/mixins/helper.py +++ b/docarray/document/mixins/helper.py @@ -4,16 +4,16 @@ from contextlib import nullcontext -def _uri_to_blob(uri: str) -> bytes: +def _uri_to_blob(uri: str, **kwargs) -> bytes: """Convert uri to blob Internally it reads uri into blob. - :param uri: the uri of Document + :param kwargs: keyword arguments to pass to `urlopen` such as timeout :return: blob bytes. """ if urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}: req = urllib.request.Request(uri, headers={'User-Agent': 'Mozilla/5.0'}) - with urllib.request.urlopen(req) as fp: + with urllib.request.urlopen(req, **kwargs) as fp: return fp.read() elif os.path.exists(uri): with open(uri, 'rb') as fp: diff --git a/docarray/document/mixins/image.py b/docarray/document/mixins/image.py index 5092718495b..d6a9aeef548 100644 --- a/docarray/document/mixins/image.py +++ b/docarray/document/mixins/image.py @@ -175,17 +175,19 @@ def load_uri_to_image_tensor( width: Optional[int] = None, height: Optional[int] = None, channel_axis: int = -1, + **kwargs, ) -> 'T': """Convert the image-like :attr:`.uri` into :attr:`.tensor` :param width: the width of the image tensor. :param height: the height of the tensor. :param channel_axis: the axis id of the color channel, ``-1`` indicates the color channel info at the last axis + :param kwargs: keyword arguments to pass to `:meth:_uri_to_blob` such as timeout :return: itself after processed """ - buffer = _uri_to_blob(self.uri) + buffer = _uri_to_blob(self.uri, **kwargs) tensor = _to_image_tensor(io.BytesIO(buffer), width=width, height=height) self.tensor = _move_channel_axis(tensor, original_channel_axis=channel_axis) return self diff --git a/docarray/document/mixins/text.py b/docarray/document/mixins/text.py index 8d859031fdb..de99e2c554d 100644 --- a/docarray/document/mixins/text.py +++ b/docarray/document/mixins/text.py @@ -12,13 +12,14 @@ class TextDataMixin: """Provide helper functions for :class:`Document` to support text data.""" - def load_uri_to_text(self: 'T', charset: str = 'utf-8') -> 'T': + def load_uri_to_text(self: 'T', charset: str = 'utf-8', **kwargs) -> 'T': """Convert :attr:`.uri` to :attr`.text` inplace. :param charset: charset may be any character set registered with IANA + :param kwargs: keyword arguments to pass to `:meth:_uri_to_blob` such as timeout :return: itself after processed """ - blob = _uri_to_blob(self.uri) + blob = _uri_to_blob(self.uri, **kwargs) self.text = blob.decode(charset) return self From a4ad460e80dd0a7202a55b4b102b9298917f3da2 Mon Sep 17 00:00:00 2001 From: Joan Fontanals Martinez Date: Fri, 23 Sep 2022 10:00:18 +0200 Subject: [PATCH 2/2] refactor: only read timeout --- docarray/document/mixins/helper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docarray/document/mixins/helper.py b/docarray/document/mixins/helper.py index 14a9788cb3c..465cf3f9ff0 100644 --- a/docarray/document/mixins/helper.py +++ b/docarray/document/mixins/helper.py @@ -11,9 +11,11 @@ def _uri_to_blob(uri: str, **kwargs) -> bytes: :param kwargs: keyword arguments to pass to `urlopen` such as timeout :return: blob bytes. """ + timeout = kwargs.get('timeout', None) if urllib.parse.urlparse(uri).scheme in {'http', 'https', 'data'}: req = urllib.request.Request(uri, headers={'User-Agent': 'Mozilla/5.0'}) - with urllib.request.urlopen(req, **kwargs) as fp: + urlopen_kwargs = {'timeout': timeout} if timeout is not None else {} + with urllib.request.urlopen(req, **urlopen_kwargs) as fp: return fp.read() elif os.path.exists(uri): with open(uri, 'rb') as fp: