Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docarray/display/document_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def __rich_console__(
or value is None
):
continue
elif isinstance(value, str):
elif isinstance(value, (str, bytes)):
col_2 = str(value)[:50]
if len(value) > 50:
col_2 += f' ... (length: {len(value)})'
Expand Down
90 changes: 89 additions & 1 deletion docarray/helper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
import glob
import itertools
import os
import re
from types import LambdaType
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
List,
Optional,
Type,
Union,
)

if TYPE_CHECKING:
from docarray import BaseDocument
Expand Down Expand Up @@ -150,3 +164,77 @@ def _is_lambda_or_partial_or_local_function(func: Callable[[Any], Any]) -> bool:
or not hasattr(func, '__qualname__')
or ('<locals>' in func.__qualname__)
)


def get_paths(
    patterns: Union[str, List[str]],
    recursive: bool = True,
    size: Optional[int] = None,
    exclude_regex: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Yield file paths described by `patterns`.

    Because this is a generator, nothing (including the validation of
    `exclude_regex`) runs until the first item is requested.

    EXAMPLE USAGE

    .. code-block:: python

        from typing import Optional
        from docarray import BaseDocument, DocumentArray
        from docarray.helper import get_paths
        from docarray.typing import TextUrl, ImageUrl


        class Banner(BaseDocument):
            text_url: TextUrl
            image_url: Optional[ImageUrl]


        # you can call it in the constructor
        da = DocumentArray[Banner](
            [Banner(text_url=url) for url in get_paths(patterns='*.txt')]
        )

        # and call it after construction to set the urls
        da.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test'))

        for doc in da:
            assert doc.text_url.endswith('.txt')
            assert doc.image_url.endswith('.jpg')

    :param patterns: a single glob pattern or a list of glob patterns. A
        pattern may contain simple shell-style wildcards,
        e.g. ``'*.py'`` or ``['*.zip', '*.gz']``. A leading ``~`` is expanded
        to the user's home directory.
    :param recursive: If recursive is true, the pattern ``'**'`` will match any
        files and zero or more directories and subdirectories
    :param size: the maximum number of file paths to yield. ``None`` means
        no limit; a value of zero or less yields nothing.
    :param exclude_regex: if set, paths that match this regular expression
        are not included. The expression is applied with ``re.match``, i.e.
        it is anchored at the beginning of the path as produced by ``glob``.
    :raises ValueError: if `exclude_regex` is not a valid regular expression.
    :yield: file paths

    """

    if isinstance(patterns, str):
        patterns = [patterns]

    regex_to_exclude = None
    if exclude_regex:
        try:
            regex_to_exclude = re.compile(exclude_regex)
        except re.error as err:
            # Keep the original regex error attached as the cause.
            raise ValueError(f'`{exclude_regex}` is not a valid regex.') from err

    # A non-positive limit means "no files"; without this guard the first
    # match would be yielded before the limit is ever checked.
    if size is not None and size <= 0:
        return

    # Lazily chain the matches of every pattern, in the order given.
    matched_paths = itertools.chain.from_iterable(
        glob.iglob(os.path.expanduser(p), recursive=recursive) for p in patterns
    )

    num_yielded = 0
    for file_path in matched_paths:
        if regex_to_exclude is not None and regex_to_exclude.match(file_path):
            continue

        yield file_path

        # Stop right after the last allowed path so no further globbing is done.
        num_yielded += 1
        if size is not None and num_yielded >= size:
            return
22 changes: 22 additions & 0 deletions tests/units/test_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
_dict_to_access_paths,
_is_access_path_valid,
_update_nested_dicts,
get_paths,
)


Expand Down Expand Up @@ -109,3 +110,24 @@ def test_update_nested_dict():

_update_nested_dicts(d1, d2)
assert d1 == {'text': 'hello', 'image': {'tensor': None, 'url': 'some.png'}}


def test_get_paths():
    """Every path found for the ``*.py`` pattern must carry the ``.py`` suffix."""
    found = [p for p in get_paths(patterns='*.py')]
    for candidate in found:
        assert candidate.endswith('.py')


def test_get_paths_recursive():
    """A recursive ``**`` search must find more paths than a non-recursive one."""
    recursive_hits = [*get_paths(patterns='**', recursive=True)]
    flat_hits = [*get_paths(patterns='**', recursive=False)]

    assert len(flat_hits) < len(recursive_hits)


def test_get_paths_exclude():
    """Excluding ``__init__`` files never adds paths and drops ``__init__.py``."""
    everything = [*get_paths(patterns='*.py')]
    without_init = [
        p for p in get_paths(patterns='*.py', exclude_regex='__init__.[a-z]*')
    ]

    assert len(everything) >= len(without_init)
    assert '__init__.py' not in without_init