# NOTE(review): this module is the new-side code reconstructed from a
# whitespace-flattened patch touching docarray/helper.py and
# tests/units/test_helper.py. The same patch also changes
# docarray/display/document_summary.py (`isinstance(value, str)` becomes
# `isinstance(value, (str, bytes))` inside `__rich_console__`); that hunk is a
# fragment of a method that is not visible here, so it is not reproduced.
import contextlib
import glob
import itertools
import os
import re
import tempfile
from types import LambdaType  # presumably used elsewhere in docarray/helper.py
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Generator,
    List,
    Optional,
    Type,
    Union,
)

if TYPE_CHECKING:
    from docarray import BaseDocument


def get_paths(
    patterns: Union[str, List[str]],
    recursive: bool = True,
    size: Optional[int] = None,
    exclude_regex: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Yield the paths described by `patterns`.

    Patterns are expanded lazily, one after the other, with `glob.iglob`. A path
    that is matched by several patterns is yielded once per matching pattern.

    EXAMPLE USAGE

    .. code-block:: python

        from typing import Optional
        from docarray import BaseDocument, DocumentArray
        from docarray.helper import get_paths
        from docarray.typing import TextUrl, ImageUrl


        class Banner(BaseDocument):
            text_url: TextUrl
            image_url: Optional[ImageUrl]


        # you can call it in the constructor
        da = DocumentArray[Banner](
            [Banner(text_url=url) for url in get_paths(patterns='*.txt')]
        )

        # and call it after construction to set the urls
        da.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test'))

        for doc in da:
            assert doc.text_url.endswith('.txt')
            assert doc.image_url.endswith('.jpg')

    :param patterns: a single pattern or a list of patterns. A pattern may
        contain simple shell-style wildcards, e.g. ``'*.py'`` or
        ``['*.zip', '*.gz']``. A leading ``~`` is expanded to the user's home
        directory.
    :param recursive: if true, the pattern ``**`` will match any files and zero
        or more directories and subdirectories.
    :param size: the maximum number of paths to yield. ``None`` means no limit;
        zero or a negative value yields nothing.
    :param exclude_regex: if set, paths that match this regular expression are
        not yielded. Matching uses `re.match`, i.e. it is anchored at the
        beginning of the path exactly as produced by the glob expansion;
        prefix the expression with ``.*`` to exclude on a substring.
    :raises ValueError: if `exclude_regex` is not a valid regular expression
        (raised when the generator is first iterated).
    :yield: the matching paths

    """
    if isinstance(patterns, str):
        patterns = [patterns]

    regex_to_exclude = None
    if exclude_regex:
        try:
            regex_to_exclude = re.compile(exclude_regex)
        except re.error as err:
            raise ValueError(f'`{exclude_regex}` is not a valid regex.') from err

    # Checked up front: testing the limit only after a `yield` would let one
    # path through even when the caller asked for at most zero.
    if size is not None and size <= 0:
        return

    candidates = itertools.chain.from_iterable(
        glob.iglob(os.path.expanduser(pattern), recursive=recursive)
        for pattern in patterns
    )
    if regex_to_exclude is not None:
        candidates = (
            path for path in candidates if not regex_to_exclude.match(path)
        )

    # `islice` with `None` as the stop value passes everything through, and
    # with a positive stop it never pulls more paths than it yields.
    yield from itertools.islice(candidates, size)


# ---------------------------------------------------------------------------
# Tests (tests/units/test_helper.py in the patch). They run inside a temporary
# fixture tree so the outcome does not depend on the current directory.
# ---------------------------------------------------------------------------


@contextlib.contextmanager
def _in_fixture_dir() -> Generator[None, None, None]:
    """Run the enclosed code with the cwd set to a small temporary file tree.

    The tree contains ``a.py``, ``__init__.py``, ``notes.txt`` and
    ``sub/b.py``. The previous working directory is restored before the
    temporary directory is removed.
    """
    previous_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as root:
        os.makedirs(os.path.join(root, 'sub'))
        relative_files = (
            'a.py',
            '__init__.py',
            'notes.txt',
            os.path.join('sub', 'b.py'),
        )
        for relative in relative_files:
            with open(os.path.join(root, relative), 'w', encoding='utf-8'):
                pass
        os.chdir(root)
        try:
            yield
        finally:
            os.chdir(previous_cwd)


def test_get_paths():
    with _in_fixture_dir():
        paths = list(get_paths(patterns='*.py'))

    assert sorted(paths) == ['__init__.py', 'a.py']


def test_get_paths_recursive():
    with _in_fixture_dir():
        paths_rec = list(get_paths(patterns='**', recursive=True))
        paths_not_rec = list(get_paths(patterns='**', recursive=False))

    assert len(paths_rec) > len(paths_not_rec)


def test_get_paths_exclude():
    with _in_fixture_dir():
        paths = list(get_paths(patterns='*.py'))
        paths_wo_init = list(
            get_paths(patterns='*.py', exclude_regex='__init__.[a-z]*')
        )

    assert len(paths_wo_init) < len(paths)
    assert '__init__.py' not in paths_wo_init


def test_get_paths_size():
    with _in_fixture_dir():
        assert list(get_paths(patterns='*.py', size=0)) == []
        assert len(list(get_paths(patterns='*.py', size=1))) == 1
        assert len(list(get_paths(patterns='*.py', size=10))) == 2