From 20e14f6f71c59d7bd7c25efe7b45620178056a04 Mon Sep 17 00:00:00 2001
From: anna-charlotte <charlotte.gerhaher@jina.ai>
Date: Tue, 21 Mar 2023 21:10:54 +0100
Subject: [PATCH 1/8] feat: add from_files()

Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
---
 docarray/array/array/io.py                 | 71 ++++++++++++++++++++++
 docarray/display/document_summary.py       |  2 +-
 tests/units/array/test_array_from_files.py | 67 ++++++++++++++++++++
 3 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 tests/units/array/test_array_from_files.py

diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py
index ef674639ab3..a19e45ce1d1 100644
--- a/docarray/array/array/io.py
+++ b/docarray/array/array/io.py
@@ -726,3 +726,74 @@ def save_binary(
             file_ctx=file_ctx,
             show_progress=show_progress,
         )
+
+
+def from_files(
+    patterns: Union[str, List[str]],
+    doc_type: Type['BaseDocument'],
+    url_field: str,
+    content_field: Optional[str] = None,
+    recursive: bool = True,
+    size: Optional[int] = None,
+    sampling_rate: Optional[float] = None,
+    to_dataturi: bool = False,
+    exclude_regex: Optional[str] = None,
+    *args,
+    **kwargs,
+) -> Generator['BaseDocument', None, None]:
+    """Creates an iterator over a list of file path or the content of the files.
+
+    :param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]'
+    :param doc_type: type of document to create and store file to
+    :param url_field: stores url to this field
+    :param content_field: stores content of url to this field
+    :param recursive: If recursive is true, the pattern '**' will match any files
+        and zero or more directories and subdirectories
+    :param size: the maximum number of the files
+    :param sampling_rate: the sampling rate between [0, 1]
+    :param to_dataturi: if set, then the Document.uri will be filled with DataURI instead of the plan URI
+    :param exclude_regex: if set, then filenames that match to this pattern are not included.
+    :yield: file paths or binary content
+
+    .. note::
+        This function should not be directly used, use :meth:`Flow.index_files`, :meth:`Flow.search_files` instead
+    """
+    import glob
+    import itertools
+    import random
+    import re
+
+    def _iter_file_exts(ps):
+        return itertools.chain.from_iterable(
+            glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps
+        )
+
+    num_docs = 0
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    regex = None
+    if exclude_regex:
+        try:
+            regex = re.compile(exclude_regex)
+        except re.error:
+            raise ValueError(f'`{exclude_regex}` is not a valid regex.')
+
+    for g in _iter_file_exts(patterns):
+        if os.path.isdir(g):
+            continue
+        if regex and regex.match(g):
+            continue
+
+        if sampling_rate is None or random.random() < sampling_rate:
+            if content_field is None:
+                doc = doc_type(**{url_field: g})
+            else:
+                with open(g, 'r') as fp:
+                    doc = doc_type(**{content_field: fp.read(), url_field: g})
+            # if to_dataturi:
+            #     doc.convert_uri_to_datauri()
+            yield doc
+            num_docs += 1
+        if size is not None and num_docs >= size:
+            break
diff --git a/docarray/display/document_summary.py b/docarray/display/document_summary.py
index fd0598ded32..b54423a4df0 100644
--- a/docarray/display/document_summary.py
+++ b/docarray/display/document_summary.py
@@ -130,7 +130,7 @@ def __rich_console__(
                 or value is None
             ):
                 continue
-            elif isinstance(value, str):
+            elif isinstance(value, (str, bytes)):
                 col_2 = str(value)[:50]
                 if len(value) > 50:
                     col_2 += f' ... (length: {len(value)})'
diff --git a/tests/units/array/test_array_from_files.py b/tests/units/array/test_array_from_files.py
new file mode 100644
index 00000000000..70c098916d9
--- /dev/null
+++ b/tests/units/array/test_array_from_files.py
@@ -0,0 +1,67 @@
+import pytest
+
+from docarray import BaseDocument, DocumentArray
+from docarray.array.array.io import from_files
+from docarray.documents import ImageDoc
+from docarray.typing import TextUrl
+from tests.units.typing.url.test_image_url import PATH_TO_IMAGE_DATA
+
+
+@pytest.mark.parametrize(
+    'patterns, recursive, size, sampling_rate',
+    [
+        (f'{PATH_TO_IMAGE_DATA}/*.*', True, None, None),
+        (f'{PATH_TO_IMAGE_DATA}/*.*', False, None, None),
+        (f'{PATH_TO_IMAGE_DATA}/*.*', True, 2, None),
+        (f'{PATH_TO_IMAGE_DATA}/*.*', True, None, 0.5),
+    ],
+)
+def test_from_files(patterns, recursive, size, sampling_rate):
+    da = DocumentArray[ImageDoc](
+        list(
+            from_files(
+                url_field='url',
+                doc_type=ImageDoc,
+                patterns=patterns,
+                recursive=recursive,
+                size=size,
+                sampling_rate=sampling_rate,
+            )
+        )
+    )
+    if size:
+        assert len(da) <= size
+    for doc in da:
+        doc.summary()
+        assert doc.url is not None
+
+
+@pytest.mark.parametrize(
+    'patterns, size',
+    [
+        ('*.*', 2),
+    ],
+)
+def test_from_files_with_storing_file_content(patterns, size):
+    class MyDoc(BaseDocument):
+        url: TextUrl
+        some_text: str
+
+    da = DocumentArray[MyDoc](
+        list(
+            from_files(
+                url_field='url',
+                content_field='some_text',
+                doc_type=MyDoc,
+                patterns=patterns,
+                size=size,
+            )
+        )
+    )
+    if size:
+        assert len(da) <= size
+    for doc in da:
+        doc.summary()
+        assert isinstance(doc, MyDoc)
+        assert doc.url is not None
+        assert doc.some_text is not None

From 42edce0be9a500e19da792cd8e216c3f3b130b11 Mon Sep 17 00:00:00 2001
From: anna-charlotte <charlotte.gerhaher@jina.ai>
Date: Wed, 22 Mar 2023 08:15:19 +0100
Subject: [PATCH 2/8] feat: add da classmethod from_files()

Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
---
 docarray/array/array/io.py                 | 37 +++++++++++++++++++---
 tests/units/array/test_array_from_files.py | 31 ++++++++++++++++--
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py
index a19e45ce1d1..c700de6e610 100644
--- a/docarray/array/array/io.py
+++ b/docarray/array/array/io.py
@@ -727,6 +727,37 @@ def save_binary(
             show_progress=show_progress,
         )
 
+    @classmethod
+    def from_files(
+        cls,
+        patterns: Union[str, List[str]],
+        url_field: str,
+        content_field: Optional[str] = None,
+        recursive: bool = True,
+        size: Optional[int] = None,
+        sampling_rate: Optional[float] = None,
+        to_dataturi: bool = False,
+        exclude_regex: Optional[str] = None,
+    ) -> 'DocumentArray':
+        from docarray import DocumentArray
+
+        doc_type = cls.document_type
+        da = DocumentArray.__class_getitem__(doc_type)()
+        da.extend(
+            docs=from_files(
+                patterns=patterns,
+                doc_type=doc_type,
+                url_field=url_field,
+                content_field=content_field,
+                recursive=recursive,
+                size=size,
+                sampling_rate=sampling_rate,
+                to_dataturi=to_dataturi,
+                exclude_regex=exclude_regex,
+            )
+        )
+        return da
+
 
 def from_files(
     patterns: Union[str, List[str]],
@@ -738,8 +769,6 @@ def from_files(
     sampling_rate: Optional[float] = None,
     to_dataturi: bool = False,
     exclude_regex: Optional[str] = None,
-    *args,
-    **kwargs,
 ) -> Generator['BaseDocument', None, None]:
     """Creates an iterator over a list of file path or the content of the files.
 
@@ -763,7 +792,7 @@ def from_files(
     import random
     import re
 
-    def _iter_file_exts(ps):
+    def _iter_file_extensions(ps):
         return itertools.chain.from_iterable(
             glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps
         )
@@ -779,7 +808,7 @@ def _iter_file_exts(ps):
         except re.error:
             raise ValueError(f'`{exclude_regex}` is not a valid regex.')
 
-    for g in _iter_file_exts(patterns):
+    for g in _iter_file_extensions(patterns):
         if os.path.isdir(g):
             continue
         if regex and regex.match(g):
diff --git a/tests/units/array/test_array_from_files.py b/tests/units/array/test_array_from_files.py
index 70c098916d9..d4fb18e2e00 100644
--- a/tests/units/array/test_array_from_files.py
+++ b/tests/units/array/test_array_from_files.py
@@ -50,10 +50,10 @@ class MyDoc(BaseDocument):
     da = DocumentArray[MyDoc](
         list(
             from_files(
+                patterns=patterns,
+                doc_type=MyDoc,
                 url_field='url',
                 content_field='some_text',
-                doc_type=MyDoc,
-                patterns=patterns,
                 size=size,
             )
         )
@@ -65,3 +65,30 @@ class MyDoc(BaseDocument):
         assert isinstance(doc, MyDoc)
         assert doc.url is not None
         assert doc.some_text is not None
+
+
+@pytest.mark.parametrize(
+    'patterns, size',
+    [
+        ('*.*', 2),
+    ],
+)
+def test_document_array_from_files(patterns, size):
+    class MyDoc(BaseDocument):
+        url: TextUrl
+        some_text: str
+
+    da = DocumentArray[MyDoc].from_files(
+        patterns=patterns,
+        url_field='url',
+        content_field='some_text',
+        size=size,
+    )
+
+    if size:
+        assert len(da) <= size
+    for doc in da:
+        doc.summary()
+        assert isinstance(doc, MyDoc)
+        assert doc.url is not None
+        assert doc.some_text is not None

From f6b6a5920d413a2d4944c0de61437f9f36340fe6 Mon Sep 17 00:00:00 2001
From: anna-charlotte <charlotte.gerhaher@jina.ai>
Date: Wed, 22 Mar 2023 08:37:47 +0100
Subject: [PATCH 3/8] docs: update docstring

Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
---
 docarray/array/array/io.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py
index c700de6e610..4a343cb5dfd 100644
--- a/docarray/array/array/io.py
+++ b/docarray/array/array/io.py
@@ -1,10 +1,14 @@
 import base64
 import csv
+import glob
 import io
+import itertools
 import json
 import os
 import pathlib
 import pickle
+import random
+import re
 from abc import abstractmethod
 from contextlib import nullcontext
 from itertools import compress
@@ -770,27 +774,23 @@ def from_files(
     to_dataturi: bool = False,
     exclude_regex: Optional[str] = None,
 ) -> Generator['BaseDocument', None, None]:
-    """Creates an iterator over a list of file path or the content of the files.
+    """Yield Documents with stored urls and optionally the file content.
 
     :param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]'
-    :param doc_type: type of document to create and store file to
-    :param url_field: stores url to this field
-    :param content_field: stores content of url to this field
+    :param doc_type: type of document to create and store url (and content) to
+    :param url_field: field to store url to
+    :param content_field: If not None, the file content will be stored to this field.
     :param recursive: If recursive is true, the pattern '**' will match any files
         and zero or more directories and subdirectories
     :param size: the maximum number of the files
     :param sampling_rate: the sampling rate between [0, 1]
-    :param to_dataturi: if set, then the Document.uri will be filled with DataURI instead of the plan URI
+    :param to_dataturi: if true, the url will be transformed to the datauri and then stored to `url_field`
     :param exclude_regex: if set, then filenames that match to this pattern are not included.
-    :yield: file paths or binary content
+    :yield: Documents with stored file paths and optionally the file content
 
     .. note::
         This function should not be directly used, use :meth:`Flow.index_files`, :meth:`Flow.search_files` instead
     """
-    import glob
-    import itertools
-    import random
-    import re
 
     def _iter_file_extensions(ps):
         return itertools.chain.from_iterable(

From c833cee0a2e1317be8b2c588e595295bf05fcf32 Mon Sep 17 00:00:00 2001
From: anna-charlotte <charlotte.gerhaher@jina.ai>
Date: Wed, 22 Mar 2023 09:07:42 +0100
Subject: [PATCH 4/8] docs: add example usage

Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
---
 docarray/array/array/io.py                 | 47 ++++++++++++++++++++++
 tests/units/array/test_array_from_files.py | 44 +++++++++++++-------
 2 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py
index 4a343cb5dfd..2257d38e4c4 100644
--- a/docarray/array/array/io.py
+++ b/docarray/array/array/io.py
@@ -743,6 +743,53 @@ def from_files(
         to_dataturi: bool = False,
         exclude_regex: Optional[str] = None,
     ) -> 'DocumentArray':
+        """
+        Load a DocumentArray from file paths described by `patterns` following the
+        schema defined in the :attr:`~docarray.DocumentArray.document_type` attribute.
+        One file path will be mapped to one Document, and will be stored to `url_field`.
+        Additionally, if `content_field` is not None, the content of the file will be
+        stored in there. The type of `content_field` should be str or bytes.
+
+        For nested fields use "__"-separated access paths, such as 'image__url'.
+
+        EXAMPLE USAGE:
+
+        .. code-block:: python
+
+                from docarray import BaseDocument, DocumentArray
+                from docarray.typing import TextUrl
+
+
+                class MyDoc(BaseDocument):
+                    url: TextUrl
+                    some_text: str
+
+
+                da = DocumentArray[MyDoc].from_files(
+                    patterns='path/to/files/*.txt', url_field='url', content_field='some_text', size=3
+                )
+
+                assert len(da) == 3
+                for doc in da:
+                    assert isinstance(doc, MyDoc)
+                    assert doc.url is not None
+                    assert doc.some_text is not None
+
+        :param patterns: The pattern may contain simple shell-style wildcards,
+            e.g. '\*.py', '[\*.zip, \*.gz]'
+        :param url_field: field to store url to
+        :param content_field: If not None, the file content will be stored to this field.
+        :param recursive: If recursive is true, the pattern '**' will match any files
+            and zero or more directories and subdirectories
+        :param size: the maximum number of the files
+        :param sampling_rate: the sampling rate between [0, 1]
+        :param to_dataturi: if true, the url will be transformed to the datauri and then
+            stored to `url_field`
+        :param exclude_regex: if set, then filenames that match to this pattern are not
+            included.
+
+        :return: DocumentArray where each Document stores one file path and optionally the file content.
+        """
         from docarray import DocumentArray
 
         doc_type = cls.document_type
diff --git a/tests/units/array/test_array_from_files.py b/tests/units/array/test_array_from_files.py
index d4fb18e2e00..48023babba5 100644
--- a/tests/units/array/test_array_from_files.py
+++ b/tests/units/array/test_array_from_files.py
@@ -4,6 +4,7 @@
 from docarray.array.array.io import from_files
 from docarray.documents import ImageDoc
 from docarray.typing import TextUrl
+from tests import TOYDATA_DIR
 from tests.units.typing.url.test_image_url import PATH_TO_IMAGE_DATA
 
 
@@ -37,12 +38,12 @@ def test_from_files(patterns, recursive, size, sampling_rate):
 
 
 @pytest.mark.parametrize(
-    'patterns, size',
+    'patterns',
     [
-        ('*.*', 2),
+        (f'{TOYDATA_DIR}/*.txt'),
     ],
 )
-def test_from_files_with_storing_file_content(patterns, size):
+def test_from_files_with_storing_file_content(patterns):
     class MyDoc(BaseDocument):
         url: TextUrl
         some_text: str
@@ -54,26 +55,23 @@ class MyDoc(BaseDocument):
                 doc_type=MyDoc,
                 url_field='url',
                 content_field='some_text',
-                size=size,
             )
         )
     )
-    if size:
-        assert len(da) <= size
+    assert len(da) == 1
     for doc in da:
-        doc.summary()
         assert isinstance(doc, MyDoc)
         assert doc.url is not None
         assert doc.some_text is not None
 
 
 @pytest.mark.parametrize(
-    'patterns, size',
+    'patterns',
     [
-        ('*.*', 2),
+        (f'{TOYDATA_DIR}/*.txt'),
     ],
 )
-def test_document_array_from_files(patterns, size):
+def test_document_array_from_files(patterns):
     class MyDoc(BaseDocument):
         url: TextUrl
         some_text: str
@@ -82,13 +80,31 @@ class MyDoc(BaseDocument):
         patterns=patterns,
         url_field='url',
         content_field='some_text',
-        size=size,
     )
 
-    if size:
-        assert len(da) <= size
+    assert len(da) == 1
     for doc in da:
-        doc.summary()
         assert isinstance(doc, MyDoc)
         assert doc.url is not None
         assert doc.some_text is not None
+
+
+def test_from_files_with_nested_fields():
+    class MyDoc(BaseDocument):
+        url: TextUrl
+        some_text: str
+
+    class NestedDoc(BaseDocument):
+        my_doc: MyDoc
+
+    da = DocumentArray[NestedDoc].from_files(
+        patterns=f'{TOYDATA_DIR}/*.txt',
+        url_field='my_doc__url',
+        content_field='my_doc__some_text',
+    )
+
+    assert len(da) == 1
+    for doc in da:
+        assert isinstance(doc, NestedDoc)
+        assert doc.my_doc.url is not None
+        assert doc.my_doc.some_text is not None

From c1c9db5f33c35db389e2fc37af72004fa16204cf Mon Sep 17 00:00:00 2001
From: anna-charlotte <charlotte.gerhaher@jina.ai>
Date: Wed, 22 Mar 2023 18:30:36 +0100
Subject: [PATCH 5/8] fix: add get_paths, rm from_files

Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
---
 docarray/array/array/io.py                 | 147 ---------------------
 docarray/helper.py                         |  63 ++++++++-
 tests/units/array/test_array_from_files.py | 110 ---------------
 tests/units/test_helper.py                 |  23 ++++
 4 files changed, 85 insertions(+), 258 deletions(-)
 delete mode 100644 tests/units/array/test_array_from_files.py

diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py
index 2257d38e4c4..ef674639ab3 100644
--- a/docarray/array/array/io.py
+++ b/docarray/array/array/io.py
@@ -1,14 +1,10 @@
 import base64
 import csv
-import glob
 import io
-import itertools
 import json
 import os
 import pathlib
 import pickle
-import random
-import re
 from abc import abstractmethod
 from contextlib import nullcontext
 from itertools import compress
@@ -730,146 +726,3 @@ def save_binary(
             file_ctx=file_ctx,
             show_progress=show_progress,
         )
-
-    @classmethod
-    def from_files(
-        cls,
-        patterns: Union[str, List[str]],
-        url_field: str,
-        content_field: Optional[str] = None,
-        recursive: bool = True,
-        size: Optional[int] = None,
-        sampling_rate: Optional[float] = None,
-        to_dataturi: bool = False,
-        exclude_regex: Optional[str] = None,
-    ) -> 'DocumentArray':
-        """
-        Load a DocumentArray from file paths described by `patterns` following the
-        schema defined in the :attr:`~docarray.DocumentArray.document_type` attribute.
-        One file path will be mapped to one Document, and will be stored to `url_field`.
-        Additionally, if `content_field` is not None, the content of the file will be
-        stored in there. The type of `content_field` should be str or bytes.
-
-        For nested fields use "__"-separated access paths, such as 'image__url'.
-
-        EXAMPLE USAGE:
-
-        .. code-block:: python
-
-                from docarray import BaseDocument, DocumentArray
-                from docarray.typing import TextUrl
-
-
-                class MyDoc(BaseDocument):
-                    url: TextUrl
-                    some_text: str
-
-
-                da = DocumentArray[MyDoc].from_files(
-                    patterns='path/to/files/*.txt', url_field='url', content_field='some_text', size=3
-                )
-
-                assert len(da) == 3
-                for doc in da:
-                    assert isinstance(doc, MyDoc)
-                    assert doc.url is not None
-                    assert doc.some_text is not None
-
-        :param patterns: The pattern may contain simple shell-style wildcards,
-            e.g. '\*.py', '[\*.zip, \*.gz]'
-        :param url_field: field to store url to
-        :param content_field: If not None, the file content will be stored to this field.
-        :param recursive: If recursive is true, the pattern '**' will match any files
-            and zero or more directories and subdirectories
-        :param size: the maximum number of the files
-        :param sampling_rate: the sampling rate between [0, 1]
-        :param to_dataturi: if true, the url will be transformed to the datauri and then
-            stored to `url_field`
-        :param exclude_regex: if set, then filenames that match to this pattern are not
-            included.
-
-        :return: DocumentArray where each Document stores one file path and optionally the file content.
-        """
-        from docarray import DocumentArray
-
-        doc_type = cls.document_type
-        da = DocumentArray.__class_getitem__(doc_type)()
-        da.extend(
-            docs=from_files(
-                patterns=patterns,
-                doc_type=doc_type,
-                url_field=url_field,
-                content_field=content_field,
-                recursive=recursive,
-                size=size,
-                sampling_rate=sampling_rate,
-                to_dataturi=to_dataturi,
-                exclude_regex=exclude_regex,
-            )
-        )
-        return da
-
-
-def from_files(
-    patterns: Union[str, List[str]],
-    doc_type: Type['BaseDocument'],
-    url_field: str,
-    content_field: Optional[str] = None,
-    recursive: bool = True,
-    size: Optional[int] = None,
-    sampling_rate: Optional[float] = None,
-    to_dataturi: bool = False,
-    exclude_regex: Optional[str] = None,
-) -> Generator['BaseDocument', None, None]:
-    """Yield Documents with stored urls and optionally the file content.
-
-    :param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]'
-    :param doc_type: type of document to create and store url (and content) to
-    :param url_field: field to store url to
-    :param content_field: If not None, the file content will be stored to this field.
-    :param recursive: If recursive is true, the pattern '**' will match any files
-        and zero or more directories and subdirectories
-    :param size: the maximum number of the files
-    :param sampling_rate: the sampling rate between [0, 1]
-    :param to_dataturi: if true, the url will be transformed to the datauri and then stored to `url_field`
-    :param exclude_regex: if set, then filenames that match to this pattern are not included.
-    :yield: Documents with stored file paths and optionally the file content
-
-    .. note::
-        This function should not be directly used, use :meth:`Flow.index_files`, :meth:`Flow.search_files` instead
-    """
-
-    def _iter_file_extensions(ps):
-        return itertools.chain.from_iterable(
-            glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps
-        )
-
-    num_docs = 0
-    if isinstance(patterns, str):
-        patterns = [patterns]
-
-    regex = None
-    if exclude_regex:
-        try:
-            regex = re.compile(exclude_regex)
-        except re.error:
-            raise ValueError(f'`{exclude_regex}` is not a valid regex.')
-
-    for g in _iter_file_extensions(patterns):
-        if os.path.isdir(g):
-            continue
-        if regex and regex.match(g):
-            continue
-
-        if sampling_rate is None or random.random() < sampling_rate:
-            if content_field is None:
-                doc = doc_type(**{url_field: g})
-            else:
-                with open(g, 'r') as fp:
-                    doc = doc_type(**{content_field: fp.read(), url_field: g})
-            # if to_dataturi:
-            #     doc.convert_uri_to_datauri()
-            yield doc
-            num_docs += 1
-        if size is not None and num_docs >= size:
-            break
diff --git a/docarray/helper.py b/docarray/helper.py
index 5c92c731acc..0cea6bf1622 100644
--- a/docarray/helper.py
+++ b/docarray/helper.py
@@ -1,5 +1,19 @@
+import glob
+import itertools
+import os
+import re
 from types import LambdaType
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    List,
+    Optional,
+    Type,
+    Union,
+)
 
 if TYPE_CHECKING:
     from docarray import BaseDocument
@@ -150,3 +164,50 @@ def _is_lambda_or_partial_or_local_function(func: Callable[[Any], Any]) -> bool:
         or not hasattr(func, '__qualname__')
         or ('<locals>' in func.__qualname__)
     )
+
+
+def get_paths(
+    patterns: Union[str, List[str]],
+    recursive: bool = True,
+    size: Optional[int] = None,
+    exclude_regex: Optional[str] = None,
+) -> Generator[str, None, None]:
+    """
+    Yield file paths described by `patterns`.
+
+    :param patterns: The pattern may contain simple shell-style wildcards,
+        e.g. '\*.py', '[\*.zip, \*.gz]'
+    :param recursive: If recursive is true, the pattern '**' will match any
+        files and zero or more directories and subdirectories
+    :param size: the maximum number of paths to yield
+    :param exclude_regex: if set, paths that this regex matches from the
+        start of the path string (``re.match``) are not included.
+    :yield: file paths
+
+    """
+
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    regex_to_exclude = None
+    if exclude_regex:
+        try:
+            regex_to_exclude = re.compile(exclude_regex)
+        except re.error:
+            raise ValueError(f'`{exclude_regex}` is not a valid regex.')
+
+    def _iter_file_extensions(ps):
+        return itertools.chain.from_iterable(
+            glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps
+        )
+
+    num_docs = 0
+    for file_path in _iter_file_extensions(patterns):
+        if regex_to_exclude and regex_to_exclude.match(file_path):
+            continue
+
+        yield file_path
+
+        num_docs += 1
+        if size is not None and num_docs >= size:
+            break
diff --git a/tests/units/array/test_array_from_files.py b/tests/units/array/test_array_from_files.py
deleted file mode 100644
index 48023babba5..00000000000
--- a/tests/units/array/test_array_from_files.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import pytest
-
-from docarray import BaseDocument, DocumentArray
-from docarray.array.array.io import from_files
-from docarray.documents import ImageDoc
-from docarray.typing import TextUrl
-from tests import TOYDATA_DIR
-from tests.units.typing.url.test_image_url import PATH_TO_IMAGE_DATA
-
-
-@pytest.mark.parametrize(
-    'patterns, recursive, size, sampling_rate',
-    [
-        (f'{PATH_TO_IMAGE_DATA}/*.*', True, None, None),
-        (f'{PATH_TO_IMAGE_DATA}/*.*', False, None, None),
-        (f'{PATH_TO_IMAGE_DATA}/*.*', True, 2, None),
-        (f'{PATH_TO_IMAGE_DATA}/*.*', True, None, 0.5),
-    ],
-)
-def test_from_files(patterns, recursive, size, sampling_rate):
-    da = DocumentArray[ImageDoc](
-        list(
-            from_files(
-                url_field='url',
-                doc_type=ImageDoc,
-                patterns=patterns,
-                recursive=recursive,
-                size=size,
-                sampling_rate=sampling_rate,
-            )
-        )
-    )
-    if size:
-        assert len(da) <= size
-    for doc in da:
-        doc.summary()
-        assert doc.url is not None
-
-
-@pytest.mark.parametrize(
-    'patterns',
-    [
-        (f'{TOYDATA_DIR}/*.txt'),
-    ],
-)
-def test_from_files_with_storing_file_content(patterns):
-    class MyDoc(BaseDocument):
-        url: TextUrl
-        some_text: str
-
-    da = DocumentArray[MyDoc](
-        list(
-            from_files(
-                patterns=patterns,
-                doc_type=MyDoc,
-                url_field='url',
-                content_field='some_text',
-            )
-        )
-    )
-    assert len(da) == 1
-    for doc in da:
-        assert isinstance(doc, MyDoc)
-        assert doc.url is not None
-        assert doc.some_text is not None
-
-
-@pytest.mark.parametrize(
-    'patterns',
-    [
-        (f'{TOYDATA_DIR}/*.txt'),
-    ],
-)
-def test_document_array_from_files(patterns):
-    class MyDoc(BaseDocument):
-        url: TextUrl
-        some_text: str
-
-    da = DocumentArray[MyDoc].from_files(
-        patterns=patterns,
-        url_field='url',
-        content_field='some_text',
-    )
-
-    assert len(da) == 1
-    for doc in da:
-        assert isinstance(doc, MyDoc)
-        assert doc.url is not None
-        assert doc.some_text is not None
-
-
-def test_from_files_with_nested_fields():
-    class MyDoc(BaseDocument):
-        url: TextUrl
-        some_text: str
-
-    class NestedDoc(BaseDocument):
-        my_doc: MyDoc
-
-    da = DocumentArray[NestedDoc].from_files(
-        patterns=f'{TOYDATA_DIR}/*.txt',
-        url_field='my_doc__url',
-        content_field='my_doc__some_text',
-    )
-
-    assert len(da) == 1
-    for doc in da:
-        assert isinstance(doc, NestedDoc)
-        assert doc.my_doc.url is not None
-        assert doc.my_doc.some_text is not None
diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py
index 69649074bbd..4645a13ad7b 100644
--- a/tests/units/test_helper.py
+++ b/tests/units/test_helper.py
@@ -10,6 +10,7 @@
     _dict_to_access_paths,
     _is_access_path_valid,
     _update_nested_dicts,
+    get_paths,
 )
 
 
@@ -109,3 +110,25 @@ def test_update_nested_dict():
 
     _update_nested_dicts(d1, d2)
     assert d1 == {'text': 'hello', 'image': {'tensor': None, 'url': 'some.png'}}
+
+
+def test_get_paths():
+    paths = list(get_paths(patterns='*.py'))
+    for path in paths:
+        assert path.endswith('.py')
+
+
+def test_get_paths_recursive():
+    paths_rec = list(get_paths(patterns='**', recursive=True))
+    paths_not_rec = list(get_paths(patterns='**', recursive=False))
+
+    assert len(paths_rec) > len(paths_not_rec)
+
+
+def test_get_paths_exclude():
+    paths = list(get_paths(patterns='*.py'))
+    paths_wo_init = list(get_paths(patterns='*.py', exclude_regex='__init__.[a-z]*'))
+
+    assert len(paths_wo_init) < len(paths)
+    assert '__init__.py' in paths
+    assert '__init__.py' not in paths_wo_init

From 440a2bd5ab109eec34c9b84c6ea6e5cfb2a3b78c Mon Sep 17 00:00:00 2001
From: anna-charlotte <charlotte.gerhaher@jina.ai>
Date: Wed, 22 Mar 2023 19:45:19 +0100
Subject: [PATCH 6/8] fix: add print to debug ci

Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
---
 docarray/helper.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docarray/helper.py b/docarray/helper.py
index 0cea6bf1622..3db9154b658 100644
--- a/docarray/helper.py
+++ b/docarray/helper.py
@@ -203,6 +203,7 @@ def _iter_file_extensions(ps):
 
     num_docs = 0
     for file_path in _iter_file_extensions(patterns):
+        print(f"file_path = {file_path}")
         if regex_to_exclude and regex_to_exclude.match(file_path):
             continue
 

From fc1425723cd586b05bac14e899f37bdb7d6a6cb6 Mon Sep 17 00:00:00 2001
From: anna-charlotte <charlotte.gerhaher@jina.ai>
Date: Wed, 22 Mar 2023 20:07:29 +0100
Subject: [PATCH 7/8] fix: test

Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
---
 tests/units/test_helper.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py
index 4645a13ad7b..cd3131eb0ae 100644
--- a/tests/units/test_helper.py
+++ b/tests/units/test_helper.py
@@ -129,6 +129,5 @@ def test_get_paths_exclude():
     paths = list(get_paths(patterns='*.py'))
     paths_wo_init = list(get_paths(patterns='*.py', exclude_regex='__init__.[a-z]*'))
 
-    assert len(paths_wo_init) < len(paths)
-    assert '__init__.py' in paths
+    assert len(paths_wo_init) <= len(paths)
     assert '__init__.py' not in paths_wo_init

From 2792ae6935016777ec613131e9d61b261f83c967 Mon Sep 17 00:00:00 2001
From: anna-charlotte <charlotte.gerhaher@jina.ai>
Date: Thu, 23 Mar 2023 12:32:33 +0100
Subject: [PATCH 8/8] fix: apply suggestions from code review

Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
---
 docarray/helper.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/docarray/helper.py b/docarray/helper.py
index 3db9154b658..9c843a3c675 100644
--- a/docarray/helper.py
+++ b/docarray/helper.py
@@ -175,6 +175,33 @@ def get_paths(
     """
     Yield file paths described by `patterns`.
 
+    EXAMPLE USAGE
+
+    .. code-block:: python
+
+        from typing import Optional
+        from docarray import BaseDocument, DocumentArray
+        from docarray.helper import get_paths
+        from docarray.typing import TextUrl, ImageUrl
+
+
+        class Banner(BaseDocument):
+            text_url: TextUrl
+            image_url: Optional[ImageUrl]
+
+
+        # you can call it in the constructor
+        da = DocumentArray[Banner](
+            [Banner(text_url=url) for url in get_paths(patterns='*.txt')]
+        )
+
+        # and call it after construction to set the urls
+        da.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test'))
+
+        for doc in da:
+            assert doc.image_url.endswith('.jpg')
+            assert doc.text_url.endswith('.txt')
+
     :param patterns: The pattern may contain simple shell-style wildcards,
         e.g. '\*.py', '[\*.zip, \*.gz]'
     :param recursive: If recursive is true, the pattern '**' will match any
@@ -203,7 +230,6 @@ def _iter_file_extensions(ps):
 
     num_docs = 0
     for file_path in _iter_file_extensions(patterns):
-        print(f"file_path = {file_path}")
         if regex_to_exclude and regex_to_exclude.match(file_path):
             continue