Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 35 additions & 22 deletions docarray/array/doc_list/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pickle
from abc import abstractmethod
from contextlib import nullcontext
from io import StringIO, TextIOWrapper
from itertools import compress
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -361,40 +362,52 @@ def from_csv(
'unix' (for csv file generated on UNIX systems).
:return: DocList
"""
from docarray import DocList

if cls.doc_type == AnyDoc:
raise TypeError(
'There is no document schema defined. '
'Please specify the DocList\'s Document type using `DocList[MyDoc]`.'
)

if file_path.startswith('http'):
import urllib.request

with urllib.request.urlopen(file_path) as f:
file = StringIO(f.read().decode(encoding))
return cls._from_csv_file(file, dialect)
else:
with open(file_path, 'r', encoding=encoding) as fp:
return cls._from_csv_file(fp, dialect)

@classmethod
def _from_csv_file(
    cls, file: Union[StringIO, TextIOWrapper], dialect: Union[str, csv.Dialect]
) -> 'DocList':
    """Build a DocList from an already-open CSV text stream.

    Shared backend for :meth:`from_csv`, which opens either a local file
    (``TextIOWrapper``) or an in-memory buffer of remote content
    (``StringIO``) and delegates parsing here.

    :param file: open text stream positioned at the CSV header row.
    :param dialect: a ``csv.Dialect`` instance or dialect name, passed
        through to ``csv.DictReader``.
    :return: a ``DocList`` of ``cls.doc_type`` documents, one per CSV row.
    :raises TypeError: if the stream provides no field names (empty input).
    :raises ValueError: if any column name is not a valid access path of
        the document schema.
    """
    # Local import to avoid a circular dependency at module load time.
    from docarray import DocList

    rows = csv.DictReader(file, dialect=dialect)

    doc_type = cls.doc_type
    docs = DocList.__class_getitem__(doc_type)()

    # DictReader.fieldnames is None for an empty stream; normalize to a
    # list of strings so the emptiness check below covers both cases.
    field_names: List[str] = (
        [] if rows.fieldnames is None else [str(f) for f in rows.fieldnames]
    )
    if len(field_names) == 0:
        raise TypeError("No field names are given.")

    # Every CSV column must map onto an access path ('a__b' style) of the
    # target document schema; report the offending columns otherwise.
    valid_paths = _all_access_paths_valid(
        doc_type=doc_type, access_paths=field_names
    )
    if not all(valid_paths):
        raise ValueError(
            f'Column names do not match the schema of the DocList\'s '
            f'document type ({cls.doc_type.__name__}): '
            f'{list(compress(field_names, [not v for v in valid_paths]))}'
        )

    for access_path2val in rows:
        doc_dict: Dict[Any, Any] = _access_path_dict_to_nested_dict(access_path2val)
        docs.append(doc_type.parse_obj(doc_dict))

    return docs

Expand Down
4 changes: 4 additions & 0 deletions tests/toydata/books.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
title,author,year
Harry Potter and the Philosopher's Stone,J. K. Rowling,1997
Klara and the sun,Kazuo Ishiguro,2020
A little life,Hanya Yanagihara,2015
13 changes: 13 additions & 0 deletions tests/units/array/test_array_from_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,16 @@ def test_from_csv_without_schema_raise_exception():
def test_from_csv_with_wrong_schema_raise_exception(nested_doc):
    # docs.csv columns do not match the nested_doc fixture's schema, so
    # from_csv must reject them with the column-mismatch ValueError.
    with pytest.raises(ValueError, match='Column names do not match the schema'):
        DocList[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv'))


def test_from_remote_csv_file():
    # NOTE(review): this test performs real network I/O and the URL is pinned
    # to a feature branch (feat-csv-from-remote-file) — it will break if the
    # branch is deleted or the network is unavailable. Consider pinning to a
    # commit SHA or serving tests/toydata/books.csv from a local HTTP fixture.
    remote_url = 'https://github.com/docarray/docarray/blob/feat-csv-from-remote-file/tests/toydata/books.csv?raw=true'

    # Schema matching the books.csv header: title,author,year
    class Book(BaseDoc):
        title: str
        author: str
        year: int

    books = DocList[Book].from_csv(file_path=remote_url)

    # books.csv contains three data rows after the header.
    assert len(books) == 3