From 20e14f6f71c59d7bd7c25efe7b45620178056a04 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 21 Mar 2023 21:10:54 +0100 Subject: [PATCH 1/8] feat: add from_files() Signed-off-by: anna-charlotte --- docarray/array/array/io.py | 71 ++++++++++++++++++++++ docarray/display/document_summary.py | 2 +- tests/units/array/test_array_from_files.py | 67 ++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 tests/units/array/test_array_from_files.py diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index ef674639ab3..a19e45ce1d1 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -726,3 +726,74 @@ def save_binary( file_ctx=file_ctx, show_progress=show_progress, ) + + +def from_files( + patterns: Union[str, List[str]], + doc_type: Type['BaseDocument'], + url_field: str, + content_field: Optional[str] = None, + recursive: bool = True, + size: Optional[int] = None, + sampling_rate: Optional[float] = None, + to_dataturi: bool = False, + exclude_regex: Optional[str] = None, + *args, + **kwargs, +) -> Generator['BaseDocument', None, None]: + """Creates an iterator over a list of file path or the content of the files. + + :param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]' + :param doc_type: type of document to create and store file to + :param url_field: stores url to this field + :param content_field: stores content of url to this field + :param recursive: If recursive is true, the pattern '**' will match any files + and zero or more directories and subdirectories + :param size: the maximum number of the files + :param sampling_rate: the sampling rate between [0, 1] + :param to_dataturi: if set, then the Document.uri will be filled with DataURI instead of the plan URI + :param exclude_regex: if set, then filenames that match to this pattern are not included. + :yield: file paths or binary content + + .. 
note:: + This function should not be directly used, use :meth:`Flow.index_files`, :meth:`Flow.search_files` instead + """ + import glob + import itertools + import random + import re + + def _iter_file_exts(ps): + return itertools.chain.from_iterable( + glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps + ) + + num_docs = 0 + if isinstance(patterns, str): + patterns = [patterns] + + regex = None + if exclude_regex: + try: + regex = re.compile(exclude_regex) + except re.error: + raise ValueError(f'`{exclude_regex}` is not a valid regex.') + + for g in _iter_file_exts(patterns): + if os.path.isdir(g): + continue + if regex and regex.match(g): + continue + + if sampling_rate is None or random.random() < sampling_rate: + if content_field is None: + doc = doc_type(**{url_field: g}) + else: + with open(g, 'r') as fp: + doc = doc_type(**{content_field: fp.read(), url_field: g}) + # if to_dataturi: + # doc.convert_uri_to_datauri() + yield doc + num_docs += 1 + if size is not None and num_docs >= size: + break diff --git a/docarray/display/document_summary.py b/docarray/display/document_summary.py index fd0598ded32..b54423a4df0 100644 --- a/docarray/display/document_summary.py +++ b/docarray/display/document_summary.py @@ -130,7 +130,7 @@ def __rich_console__( or value is None ): continue - elif isinstance(value, str): + elif isinstance(value, (str, bytes)): col_2 = str(value)[:50] if len(value) > 50: col_2 += f' ... 
(length: {len(value)})' diff --git a/tests/units/array/test_array_from_files.py b/tests/units/array/test_array_from_files.py new file mode 100644 index 00000000000..70c098916d9 --- /dev/null +++ b/tests/units/array/test_array_from_files.py @@ -0,0 +1,67 @@ +import pytest + +from docarray import BaseDocument, DocumentArray +from docarray.array.array.io import from_files +from docarray.documents import ImageDoc +from docarray.typing import TextUrl +from tests.units.typing.url.test_image_url import PATH_TO_IMAGE_DATA + + +@pytest.mark.parametrize( + 'patterns, recursive, size, sampling_rate', + [ + (f'{PATH_TO_IMAGE_DATA}/*.*', True, None, None), + (f'{PATH_TO_IMAGE_DATA}/*.*', False, None, None), + (f'{PATH_TO_IMAGE_DATA}/*.*', True, 2, None), + (f'{PATH_TO_IMAGE_DATA}/*.*', True, None, 0.5), + ], +) +def test_from_files(patterns, recursive, size, sampling_rate): + da = DocumentArray[ImageDoc]( + list( + from_files( + url_field='url', + doc_type=ImageDoc, + patterns=patterns, + recursive=recursive, + size=size, + sampling_rate=sampling_rate, + ) + ) + ) + if size: + assert len(da) <= size + for doc in da: + doc.summary() + assert doc.url is not None + + +@pytest.mark.parametrize( + 'patterns, size', + [ + ('*.*', 2), + ], +) +def test_from_files_with_storing_file_content(patterns, size): + class MyDoc(BaseDocument): + url: TextUrl + some_text: str + + da = DocumentArray[MyDoc]( + list( + from_files( + url_field='url', + content_field='some_text', + doc_type=MyDoc, + patterns=patterns, + size=size, + ) + ) + ) + if size: + assert len(da) <= size + for doc in da: + doc.summary() + assert isinstance(doc, MyDoc) + assert doc.url is not None + assert doc.some_text is not None From 42edce0be9a500e19da792cd8e216c3f3b130b11 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 22 Mar 2023 08:15:19 +0100 Subject: [PATCH 2/8] feat: add da classmethod from_files() Signed-off-by: anna-charlotte --- docarray/array/array/io.py | 37 +++++++++++++++++++--- 
tests/units/array/test_array_from_files.py | 31 ++++++++++++++++-- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index a19e45ce1d1..c700de6e610 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -727,6 +727,37 @@ def save_binary( show_progress=show_progress, ) + @classmethod + def from_files( + cls, + patterns: Union[str, List[str]], + url_field: str, + content_field: Optional[str] = None, + recursive: bool = True, + size: Optional[int] = None, + sampling_rate: Optional[float] = None, + to_dataturi: bool = False, + exclude_regex: Optional[str] = None, + ) -> 'DocumentArray': + from docarray import DocumentArray + + doc_type = cls.document_type + da = DocumentArray.__class_getitem__(doc_type)() + da.extend( + docs=from_files( + patterns=patterns, + doc_type=doc_type, + url_field=url_field, + content_field=content_field, + recursive=recursive, + size=size, + sampling_rate=sampling_rate, + to_dataturi=to_dataturi, + exclude_regex=exclude_regex, + ) + ) + return da + def from_files( patterns: Union[str, List[str]], @@ -738,8 +769,6 @@ def from_files( sampling_rate: Optional[float] = None, to_dataturi: bool = False, exclude_regex: Optional[str] = None, - *args, - **kwargs, ) -> Generator['BaseDocument', None, None]: """Creates an iterator over a list of file path or the content of the files. 
@@ -763,7 +792,7 @@ def from_files( import random import re - def _iter_file_exts(ps): + def _iter_file_extensions(ps): return itertools.chain.from_iterable( glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps ) @@ -779,7 +808,7 @@ def _iter_file_exts(ps): except re.error: raise ValueError(f'`{exclude_regex}` is not a valid regex.') - for g in _iter_file_exts(patterns): + for g in _iter_file_extensions(patterns): if os.path.isdir(g): continue if regex and regex.match(g): diff --git a/tests/units/array/test_array_from_files.py b/tests/units/array/test_array_from_files.py index 70c098916d9..d4fb18e2e00 100644 --- a/tests/units/array/test_array_from_files.py +++ b/tests/units/array/test_array_from_files.py @@ -50,10 +50,10 @@ class MyDoc(BaseDocument): da = DocumentArray[MyDoc]( list( from_files( + patterns=patterns, + doc_type=MyDoc, url_field='url', content_field='some_text', - doc_type=MyDoc, - patterns=patterns, size=size, ) ) @@ -65,3 +65,30 @@ class MyDoc(BaseDocument): assert isinstance(doc, MyDoc) assert doc.url is not None assert doc.some_text is not None + + +@pytest.mark.parametrize( + 'patterns, size', + [ + ('*.*', 2), + ], +) +def test_document_array_from_files(patterns, size): + class MyDoc(BaseDocument): + url: TextUrl + some_text: str + + da = DocumentArray[MyDoc].from_files( + patterns=patterns, + url_field='url', + content_field='some_text', + size=size, + ) + + if size: + assert len(da) <= size + for doc in da: + doc.summary() + assert isinstance(doc, MyDoc) + assert doc.url is not None + assert doc.some_text is not None From f6b6a5920d413a2d4944c0de61437f9f36340fe6 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 22 Mar 2023 08:37:47 +0100 Subject: [PATCH 3/8] docs: update docstring Signed-off-by: anna-charlotte --- docarray/array/array/io.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index c700de6e610..4a343cb5dfd 100644 
--- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -1,10 +1,14 @@ import base64 import csv +import glob import io +import itertools import json import os import pathlib import pickle +import random +import re from abc import abstractmethod from contextlib import nullcontext from itertools import compress @@ -770,27 +774,23 @@ def from_files( to_dataturi: bool = False, exclude_regex: Optional[str] = None, ) -> Generator['BaseDocument', None, None]: - """Creates an iterator over a list of file path or the content of the files. + """Yield Documents with stored urls and optionally the file content. :param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]' - :param doc_type: type of document to create and store file to - :param url_field: stores url to this field - :param content_field: stores content of url to this field + :param doc_type: type of document to create and store url (and content) to + :param url_field: field to store url to + :param content_field: If not None, the file content will be stored to this field. :param recursive: If recursive is true, the pattern '**' will match any files and zero or more directories and subdirectories :param size: the maximum number of the files :param sampling_rate: the sampling rate between [0, 1] - :param to_dataturi: if set, then the Document.uri will be filled with DataURI instead of the plan URI + :param to_dataturi: if true, the url will be transformed to the datauri and then stored to `url_field` :param exclude_regex: if set, then filenames that match to this pattern are not included. - :yield: file paths or binary content + :yield: Documents with stored file paths and optionally the file content .. 
note:: This function should not be directly used, use :meth:`Flow.index_files`, :meth:`Flow.search_files` instead """ - import glob - import itertools - import random - import re def _iter_file_extensions(ps): return itertools.chain.from_iterable( From c833cee0a2e1317be8b2c588e595295bf05fcf32 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 22 Mar 2023 09:07:42 +0100 Subject: [PATCH 4/8] docs: add example usage Signed-off-by: anna-charlotte --- docarray/array/array/io.py | 47 ++++++++++++++++++++++ tests/units/array/test_array_from_files.py | 44 +++++++++++++------- 2 files changed, 77 insertions(+), 14 deletions(-) diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index 4a343cb5dfd..2257d38e4c4 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -743,6 +743,53 @@ def from_files( to_dataturi: bool = False, exclude_regex: Optional[str] = None, ) -> 'DocumentArray': + """ + Load a DocumentArray from file paths described by `patterns` following the + schema defined in the :attr:`~docarray.DocumentArray.document_type` attribute. + One file path will be mapped to one Document, and will be stored to `url_field`. + Additionally, if `content_field` is not None, the content of the file will be + stored in there. The type of `content_field` should be str or bytes. + + For nested fields use "__"-separated access paths, such as 'image__url'. + + EXAMPLE USAGE: + + .. code-block:: python + + from docarray import BaseDocument, DocumentArray + from docarray.typing import TextUrl + + + class MyDoc(BaseDocument): + url: TextUrl + some_text: str + + + da = DocumentArray[MyDoc].from_files( + patterns='path/to/files/*.txt', url_field='url', content_field='some_text', size=3 + ) + + assert len(da) == 3 + for doc in da: + assert isinstance(doc, MyDoc) + assert doc.url is not None + assert doc.some_text is not None + + :param patterns: The pattern may contain simple shell-style wildcards, + e.g. 
'\*.py', '[\*.zip, \*.gz]' + :param url_field: field to store url to + :param content_field: If not None, the file content will be stored to this field. + :param recursive: If recursive is true, the pattern '**' will match any files + and zero or more directories and subdirectories + :param size: the maximum number of the files + :param sampling_rate: the sampling rate between [0, 1] + :param to_dataturi: if true, the url will be transformed to the datauri and then + stored to `url_field` + :param exclude_regex: if set, then filenames that match to this pattern are not + included. + + :return: DocumentArray where each Document stores one file path and optionally the file content. + """ from docarray import DocumentArray doc_type = cls.document_type diff --git a/tests/units/array/test_array_from_files.py b/tests/units/array/test_array_from_files.py index d4fb18e2e00..48023babba5 100644 --- a/tests/units/array/test_array_from_files.py +++ b/tests/units/array/test_array_from_files.py @@ -4,6 +4,7 @@ from docarray.array.array.io import from_files from docarray.documents import ImageDoc from docarray.typing import TextUrl +from tests import TOYDATA_DIR from tests.units.typing.url.test_image_url import PATH_TO_IMAGE_DATA @@ -37,12 +38,12 @@ def test_from_files(patterns, recursive, size, sampling_rate): @pytest.mark.parametrize( - 'patterns, size', + 'patterns', [ - ('*.*', 2), + (f'{TOYDATA_DIR}/*.txt'), ], ) -def test_from_files_with_storing_file_content(patterns, size): +def test_from_files_with_storing_file_content(patterns): class MyDoc(BaseDocument): url: TextUrl some_text: str @@ -54,26 +55,23 @@ class MyDoc(BaseDocument): doc_type=MyDoc, url_field='url', content_field='some_text', - size=size, ) ) ) - if size: - assert len(da) <= size + assert len(da) == 1 for doc in da: - doc.summary() assert isinstance(doc, MyDoc) assert doc.url is not None assert doc.some_text is not None @pytest.mark.parametrize( - 'patterns, size', + 'patterns', [ - ('*.*', 2), + 
(f'{TOYDATA_DIR}/*.txt'), ], ) -def test_document_array_from_files(patterns, size): +def test_document_array_from_files(patterns): class MyDoc(BaseDocument): url: TextUrl some_text: str @@ -82,13 +80,31 @@ class MyDoc(BaseDocument): patterns=patterns, url_field='url', content_field='some_text', - size=size, ) - if size: - assert len(da) <= size + assert len(da) == 1 for doc in da: - doc.summary() assert isinstance(doc, MyDoc) assert doc.url is not None assert doc.some_text is not None + + +def test_from_files_with_nested_fields(): + class MyDoc(BaseDocument): + url: TextUrl + some_text: str + + class NestedDoc(BaseDocument): + my_doc: MyDoc + + da = DocumentArray[NestedDoc].from_files( + patterns=f'{TOYDATA_DIR}/*.txt', + url_field='my_doc__url', + content_field='my_doc__some_text', + ) + + assert len(da) == 1 + for doc in da: + assert isinstance(doc, NestedDoc) + assert doc.my_doc.url is not None + assert doc.my_doc.some_text is not None From c1c9db5f33c35db389e2fc37af72004fa16204cf Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 22 Mar 2023 18:30:36 +0100 Subject: [PATCH 5/8] fix: add get_paths, rm from_files Signed-off-by: anna-charlotte --- docarray/array/array/io.py | 147 --------------------- docarray/helper.py | 63 ++++++++- tests/units/array/test_array_from_files.py | 110 --------------- tests/units/test_helper.py | 23 ++++ 4 files changed, 85 insertions(+), 258 deletions(-) delete mode 100644 tests/units/array/test_array_from_files.py diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py index 2257d38e4c4..ef674639ab3 100644 --- a/docarray/array/array/io.py +++ b/docarray/array/array/io.py @@ -1,14 +1,10 @@ import base64 import csv -import glob import io -import itertools import json import os import pathlib import pickle -import random -import re from abc import abstractmethod from contextlib import nullcontext from itertools import compress @@ -730,146 +726,3 @@ def save_binary( file_ctx=file_ctx, show_progress=show_progress, ) - 
- @classmethod - def from_files( - cls, - patterns: Union[str, List[str]], - url_field: str, - content_field: Optional[str] = None, - recursive: bool = True, - size: Optional[int] = None, - sampling_rate: Optional[float] = None, - to_dataturi: bool = False, - exclude_regex: Optional[str] = None, - ) -> 'DocumentArray': - """ - Load a DocumentArray from file paths described by `patterns` following the - schema defined in the :attr:`~docarray.DocumentArray.document_type` attribute. - One file path will be mapped to one Document, and will be stored to `url_field`. - Additionally, if `content_field` is not None, the content of the file will be - stored in there. The type of `content_field` should be str or bytes. - - For nested fields use "__"-separated access paths, such as 'image__url'. - - EXAMPLE USAGE: - - .. code-block:: python - - from docarray import BaseDocument, DocumentArray - from docarray.typing import TextUrl - - - class MyDoc(BaseDocument): - url: TextUrl - some_text: str - - - da = DocumentArray[MyDoc].from_files( - patterns='path/to/files/*.txt', url_field='url', content_field='some_text', size=3 - ) - - assert len(da) == 3 - for doc in da: - assert isinstance(doc, MyDoc) - assert doc.url is not None - assert doc.some_text is not None - - :param patterns: The pattern may contain simple shell-style wildcards, - e.g. '\*.py', '[\*.zip, \*.gz]' - :param url_field: field to store url to - :param content_field: If not None, the file content will be stored to this field. - :param recursive: If recursive is true, the pattern '**' will match any files - and zero or more directories and subdirectories - :param size: the maximum number of the files - :param sampling_rate: the sampling rate between [0, 1] - :param to_dataturi: if true, the url will be transformed to the datauri and then - stored to `url_field` - :param exclude_regex: if set, then filenames that match to this pattern are not - included. 
- - :return: DocumentArray where each Document stores one file path and optionally the file content. - """ - from docarray import DocumentArray - - doc_type = cls.document_type - da = DocumentArray.__class_getitem__(doc_type)() - da.extend( - docs=from_files( - patterns=patterns, - doc_type=doc_type, - url_field=url_field, - content_field=content_field, - recursive=recursive, - size=size, - sampling_rate=sampling_rate, - to_dataturi=to_dataturi, - exclude_regex=exclude_regex, - ) - ) - return da - - -def from_files( - patterns: Union[str, List[str]], - doc_type: Type['BaseDocument'], - url_field: str, - content_field: Optional[str] = None, - recursive: bool = True, - size: Optional[int] = None, - sampling_rate: Optional[float] = None, - to_dataturi: bool = False, - exclude_regex: Optional[str] = None, -) -> Generator['BaseDocument', None, None]: - """Yield Documents with stored urls and optionally the file content. - - :param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]' - :param doc_type: type of document to create and store url (and content) to - :param url_field: field to store url to - :param content_field: If not None, the file content will be stored to this field. - :param recursive: If recursive is true, the pattern '**' will match any files - and zero or more directories and subdirectories - :param size: the maximum number of the files - :param sampling_rate: the sampling rate between [0, 1] - :param to_dataturi: if true, the url will be transformed to the datauri and then stored to `url_field` - :param exclude_regex: if set, then filenames that match to this pattern are not included. - :yield: Documents with stored file paths and optionally the file content - - .. 
note:: - This function should not be directly used, use :meth:`Flow.index_files`, :meth:`Flow.search_files` instead - """ - - def _iter_file_extensions(ps): - return itertools.chain.from_iterable( - glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps - ) - - num_docs = 0 - if isinstance(patterns, str): - patterns = [patterns] - - regex = None - if exclude_regex: - try: - regex = re.compile(exclude_regex) - except re.error: - raise ValueError(f'`{exclude_regex}` is not a valid regex.') - - for g in _iter_file_extensions(patterns): - if os.path.isdir(g): - continue - if regex and regex.match(g): - continue - - if sampling_rate is None or random.random() < sampling_rate: - if content_field is None: - doc = doc_type(**{url_field: g}) - else: - with open(g, 'r') as fp: - doc = doc_type(**{content_field: fp.read(), url_field: g}) - # if to_dataturi: - # doc.convert_uri_to_datauri() - yield doc - num_docs += 1 - if size is not None and num_docs >= size: - break diff --git a/docarray/helper.py b/docarray/helper.py index 5c92c731acc..0cea6bf1622 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -1,5 +1,19 @@ +import glob +import itertools +import os +import re from types import LambdaType -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Generator, + List, + Optional, + Type, + Union, +) if TYPE_CHECKING: from docarray import BaseDocument @@ -150,3 +164,50 @@ def _is_lambda_or_partial_or_local_function(func: Callable[[Any], Any]) -> bool: or not hasattr(func, '__qualname__') or ('' in func.__qualname__) ) + + +def get_paths( + patterns: Union[str, List[str]], + recursive: bool = True, + size: Optional[int] = None, + exclude_regex: Optional[str] = None, +) -> Generator[str, None, None]: + """ + Yield file paths described by `patterns`. + + :param patterns: The pattern may contain simple shell-style wildcards, + e.g. 
'\*.py', '[\*.zip, \*.gz]' + :param recursive: If recursive is true, the pattern '**' will match any + files and zero or more directories and subdirectories + :param size: the maximum number of the files + :param exclude_regex: if set, then filenames that match to this pattern + are not included. + :yield: file paths + + """ + + if isinstance(patterns, str): + patterns = [patterns] + + regex_to_exclude = None + if exclude_regex: + try: + regex_to_exclude = re.compile(exclude_regex) + except re.error: + raise ValueError(f'`{exclude_regex}` is not a valid regex.') + + def _iter_file_extensions(ps): + return itertools.chain.from_iterable( + glob.iglob(os.path.expanduser(p), recursive=recursive) for p in ps + ) + + num_docs = 0 + for file_path in _iter_file_extensions(patterns): + if regex_to_exclude and regex_to_exclude.match(file_path): + continue + + yield file_path + + num_docs += 1 + if size is not None and num_docs >= size: + break diff --git a/tests/units/array/test_array_from_files.py b/tests/units/array/test_array_from_files.py deleted file mode 100644 index 48023babba5..00000000000 --- a/tests/units/array/test_array_from_files.py +++ /dev/null @@ -1,110 +0,0 @@ -import pytest - -from docarray import BaseDocument, DocumentArray -from docarray.array.array.io import from_files -from docarray.documents import ImageDoc -from docarray.typing import TextUrl -from tests import TOYDATA_DIR -from tests.units.typing.url.test_image_url import PATH_TO_IMAGE_DATA - - -@pytest.mark.parametrize( - 'patterns, recursive, size, sampling_rate', - [ - (f'{PATH_TO_IMAGE_DATA}/*.*', True, None, None), - (f'{PATH_TO_IMAGE_DATA}/*.*', False, None, None), - (f'{PATH_TO_IMAGE_DATA}/*.*', True, 2, None), - (f'{PATH_TO_IMAGE_DATA}/*.*', True, None, 0.5), - ], -) -def test_from_files(patterns, recursive, size, sampling_rate): - da = DocumentArray[ImageDoc]( - list( - from_files( - url_field='url', - doc_type=ImageDoc, - patterns=patterns, - recursive=recursive, - size=size, - 
sampling_rate=sampling_rate, - ) - ) - ) - if size: - assert len(da) <= size - for doc in da: - doc.summary() - assert doc.url is not None - - -@pytest.mark.parametrize( - 'patterns', - [ - (f'{TOYDATA_DIR}/*.txt'), - ], -) -def test_from_files_with_storing_file_content(patterns): - class MyDoc(BaseDocument): - url: TextUrl - some_text: str - - da = DocumentArray[MyDoc]( - list( - from_files( - patterns=patterns, - doc_type=MyDoc, - url_field='url', - content_field='some_text', - ) - ) - ) - assert len(da) == 1 - for doc in da: - assert isinstance(doc, MyDoc) - assert doc.url is not None - assert doc.some_text is not None - - -@pytest.mark.parametrize( - 'patterns', - [ - (f'{TOYDATA_DIR}/*.txt'), - ], -) -def test_document_array_from_files(patterns): - class MyDoc(BaseDocument): - url: TextUrl - some_text: str - - da = DocumentArray[MyDoc].from_files( - patterns=patterns, - url_field='url', - content_field='some_text', - ) - - assert len(da) == 1 - for doc in da: - assert isinstance(doc, MyDoc) - assert doc.url is not None - assert doc.some_text is not None - - -def test_from_files_with_nested_fields(): - class MyDoc(BaseDocument): - url: TextUrl - some_text: str - - class NestedDoc(BaseDocument): - my_doc: MyDoc - - da = DocumentArray[NestedDoc].from_files( - patterns=f'{TOYDATA_DIR}/*.txt', - url_field='my_doc__url', - content_field='my_doc__some_text', - ) - - assert len(da) == 1 - for doc in da: - assert isinstance(doc, NestedDoc) - assert doc.my_doc.url is not None - assert doc.my_doc.some_text is not None diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py index 69649074bbd..4645a13ad7b 100644 --- a/tests/units/test_helper.py +++ b/tests/units/test_helper.py @@ -10,6 +10,7 @@ _dict_to_access_paths, _is_access_path_valid, _update_nested_dicts, + get_paths, ) @@ -109,3 +110,25 @@ def test_update_nested_dict(): _update_nested_dicts(d1, d2) assert d1 == {'text': 'hello', 'image': {'tensor': None, 'url': 'some.png'}} + + +def test_get_paths(): + 
paths = list(get_paths(patterns='*.py')) + for path in paths: + assert path.endswith('.py') + + +def test_get_paths_recursive(): + paths_rec = list(get_paths(patterns='**', recursive=True)) + paths_not_rec = list(get_paths(patterns='**', recursive=False)) + + assert len(paths_rec) > len(paths_not_rec) + + +def test_get_paths_exclude(): + paths = list(get_paths(patterns='*.py')) + paths_wo_init = list(get_paths(patterns='*.py', exclude_regex='__init__.[a-z]*')) + + assert len(paths_wo_init) < len(paths) + assert '__init__.py' in paths + assert '__init__.py' not in paths_wo_init From 440a2bd5ab109eec34c9b84c6ea6e5cfb2a3b78c Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 22 Mar 2023 19:45:19 +0100 Subject: [PATCH 6/8] fix: add print to debug ci Signed-off-by: anna-charlotte --- docarray/helper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docarray/helper.py b/docarray/helper.py index 0cea6bf1622..3db9154b658 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -203,6 +203,7 @@ def _iter_file_extensions(ps): num_docs = 0 for file_path in _iter_file_extensions(patterns): + print(f"file_path = {file_path}") if regex_to_exclude and regex_to_exclude.match(file_path): continue From fc1425723cd586b05bac14e899f37bdb7d6a6cb6 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 22 Mar 2023 20:07:29 +0100 Subject: [PATCH 7/8] fix: test Signed-off-by: anna-charlotte --- tests/units/test_helper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py index 4645a13ad7b..cd3131eb0ae 100644 --- a/tests/units/test_helper.py +++ b/tests/units/test_helper.py @@ -129,6 +129,5 @@ def test_get_paths_exclude(): paths = list(get_paths(patterns='*.py')) paths_wo_init = list(get_paths(patterns='*.py', exclude_regex='__init__.[a-z]*')) - assert len(paths_wo_init) < len(paths) - assert '__init__.py' in paths + assert len(paths_wo_init) <= len(paths) assert '__init__.py' not in paths_wo_init From 
2792ae6935016777ec613131e9d61b261f83c967 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 23 Mar 2023 12:32:33 +0100 Subject: [PATCH 8/8] fix: apply suggestions from code review Signed-off-by: anna-charlotte --- docarray/helper.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/docarray/helper.py b/docarray/helper.py index 3db9154b658..9c843a3c675 100644 --- a/docarray/helper.py +++ b/docarray/helper.py @@ -175,6 +175,33 @@ def get_paths( """ Yield file paths described by `patterns`. + EXAMPLE USAGE + + .. code-block:: python + + from typing import Optional + from docarray import BaseDocument, DocumentArray + from docarray.helper import get_paths + from docarray.typing import TextUrl, ImageUrl + + + class Banner(BaseDocument): + text_url: TextUrl + image_url: Optional[ImageUrl] + + + # you can call it in the constructor + da = DocumentArray[Banner]( + [Banner(text_url=url) for url in get_paths(patterns='*.txt')] + ) + + # and call it after construction to set the urls + da.image_url = list(get_paths(patterns='*.jpg', exclude_regex='test')) + + for doc in da: + assert doc.image_url.endswith('.jpg') + assert doc.text_url.endswith('.txt') + :param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]' :param recursive: If recursive is true, the pattern '**' will match any @@ -203,7 +230,6 @@ def _iter_file_extensions(ps): num_docs = 0 for file_path in _iter_file_extensions(patterns): - print(f"file_path = {file_path}") if regex_to_exclude and regex_to_exclude.match(file_path): continue