diff --git a/.github/README-img/9nn.png b/.github/README-img/9nn.png new file mode 100644 index 00000000000..ee3e9236c79 Binary files /dev/null and b/.github/README-img/9nn.png differ diff --git a/.github/README-img/sprite.png b/.github/README-img/sprite.png new file mode 100644 index 00000000000..dde4e54a1cf Binary files /dev/null and b/.github/README-img/sprite.png differ diff --git a/.github/README-img/tsne.gif b/.github/README-img/tsne.gif new file mode 100644 index 00000000000..2760ec6ab59 Binary files /dev/null and b/.github/README-img/tsne.gif differ diff --git a/README.md b/README.md index 47a3f97288d..5682eabdb39 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The data structure for unstructured data. 🌌 **All data types**: super-expressive data structure for representing complicated/mixed/nested text, image, video, audio, 3D mesh data. -🧑🔬 **Data science powerhouse**: easy-to-use functions for facilitating data scientists work on embedding, matching, visualizing unstructured data via Torch/Tensorflow/ONNX/PaddlePaddle. +🧑🔬 **Data science powerhouse**: easy-to-use functions for facilitating data scientists work on embedding, matching, visualizing, evaluating unstructured data via Torch/Tensorflow/ONNX/PaddlePaddle. 🚡 **Portable**: ready to wire with efficient conversion from/to Protobuf, binary, JSON, CSV, dataframe. @@ -21,4 +21,120 @@ pip install docarray ``` To install full dependencies, please use `pip install docarray[full]`. -## [Documentation](https://docarray.jina.ai) \ No newline at end of file +## [Documentation](https://docarray.jina.ai) + +## Get Started + +Let's use DocArray and ResNet50 to build a meme image search on [Totally Looks Like](https://sites.google.com/view/totally-looks-like-dataset). This dataset contains 6016 image-pairs stored in `/left` and `/right`. Images that shares the same filename are labeled as perceptually similar. 
For example,

| `/left` | `/right` | `/left` | `/right` |
|---------|----------|---------|----------|
|

Our problem is: given an image from `/left`, find its most-similar image in `/right` (without looking at the filename, of course).

### Load images

First load images and preprocess them with standard computer vision techniques:

```python
from docarray import DocumentArray, Document

left_da = DocumentArray.from_files('left/*.jpg')
```

To get a feeling of the data you will handle, plot them in one sprite image:

```python
left_da.plot_image_sprites()
```


### Apply preprocessing

Let's do some standard computer vision preprocessing:

```python
def preproc(d: Document):
    return (d.load_uri_to_image_blob() # load
            .set_image_blob_normalization() # normalize color
            .set_image_blob_channel_axis(-1, 0)) # switch color axis

left_da.apply(preproc)
```

Did I mention `apply` works in parallel?

### Embed images

Now convert images into embeddings using a pretrained ResNet50:

```python
import torchvision
model = torchvision.models.resnet50(pretrained=True) # load ResNet50
left_da.embed(model, device='cuda') # embed via GPU to speed up
```

### Visualize embeddings

You can visualize the embeddings via tSNE in an interactive embedding projector:

```python
left_da.plot_embeddings()
```

Fun is fun, but recall our goal is to match left images against right images, and so far we have only handled the left. Let's repeat the same procedure for the right:

```python
right_da = (DocumentArray.from_files('right/*.jpg')
            .apply(preproc)
            .embed(model, device='cuda'))
```

### Match nearest neighbours

We can now match the left to the right and take the top-9 results.

```python
left_da.match(right_da, limit=9)
```

Let's inspect what's inside `left_da` now:

```python
for d in left_da:
    for m in d.matches:
        print(d.uri, m.uri, m.scores['cosine'].value)
```

```text
left/02262.jpg right/03459.jpg 0.21102
left/02262.jpg right/02964.jpg 0.13871843
left/02262.jpg right/02103.jpg 0.18265384
left/02262.jpg right/04520.jpg 0.16477376
...
```

Better to see it. 
+ +```python +(DocumentArray(left_da[12].matches, copy=True) + .apply(lambda d: d.set_image_blob_channel_axis(0, -1) + .set_image_blob_inv_normalization()) + .plot_image_sprites('result.png')) +``` + + + +### Quantitative evaluation + + diff --git a/docarray/array/mixins/evaluation.py b/docarray/array/mixins/evaluation.py index abaf9b55fef..3024adbf553 100644 --- a/docarray/array/mixins/evaluation.py +++ b/docarray/array/mixins/evaluation.py @@ -53,7 +53,7 @@ def evaluate( metric_name = metric_name or metric_fn.__name__ results = [] for d, gd in zip(self, other): - if not strict or hash_fn(d) != hash_fn(gd): + if strict and hash_fn(d) != hash_fn(gd): raise ValueError( f'Document {d} from the left-hand side and ' f'{gd} from the right-hand are not hashed to the same value. ' diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py index eab0cc31ede..a971fc53cd8 100644 --- a/docarray/array/mixins/io/pushpull.py +++ b/docarray/array/mixins/io/pushpull.py @@ -50,7 +50,13 @@ def read(self, n=-1): self._p_bar.update(self._task_id, advance=len(chunk)) return chunk - dict_data = {'file': ('DocumentArray', bytes(self)), 'token': token} + dict_data = { + 'file': ( + 'DocumentArray', + self.to_bytes(protocol='protobuf', compress='lz4'), + ), + 'token': token, + } (data, ctype) = requests.packages.urllib3.filepost.encode_multipart_formdata( dict_data @@ -97,7 +103,7 @@ def pull(cls: Type['T'], token: str, show_progress: bool = False) -> 'T': if show_progress: progress.update(task_id, advance=len(chunk)) - return cls.load_binary(f.getvalue()) + return cls.from_bytes(f.getvalue(), protocol='protobuf', compress='lz4') def _get_progressbar(show_progress): diff --git a/docarray/array/mixins/parallel.py b/docarray/array/mixins/parallel.py index d375b48096e..eaaf5d66815 100644 --- a/docarray/array/mixins/parallel.py +++ b/docarray/array/mixins/parallel.py @@ -44,8 +44,9 @@ def apply(self: 'T', *args, **kwargs) -> 'T': # noqa: DAR201 :return: a new 
:class:`DocumentArray` """ - new_da = type(self)() - new_da.extend(self.map(*args, **kwargs)) + from ... import DocumentArray + + new_da = DocumentArray(self.map(*args, **kwargs)) self.clear() self.extend(new_da) return self @@ -118,7 +119,9 @@ def apply_batch(self: 'T', *args, **kwargs) -> 'T': # noqa: DAR201 :return: a new :class:`DocumentArray` """ - new_da = type(self)() + from ... import DocumentArray + + new_da = DocumentArray() for _b in self.map_batch(*args, **kwargs): new_da.extend(_b) self.clear() diff --git a/tests/unit/document/test_converters.py b/tests/unit/document/test_converters.py index 93cef070bff..37923f84e1e 100644 --- a/tests/unit/document/test_converters.py +++ b/tests/unit/document/test_converters.py @@ -34,7 +34,7 @@ def test_audio_convert_pipe(pytestconfig, tmpdir): def test_image_convert_pipe(pytestconfig): for d in from_files(f'{pytestconfig.rootdir}/.github/**/*.png'): ( - d.convert_uri_to_image_blob() + d.load_uri_to_image_blob() .convert_uri_to_datauri() .set_image_blob_shape((64, 64)) .set_image_blob_normalization()