docarray · hanxiao · Jan 3, 2022 · Dec 23, 2021 · Dec 23, 2021 · Dec 24, 2021
diff --git a/.github/README-img/9nn.png b/.github/README-img/9nn.png
diff --git a/.github/README-img/sprite.png b/.github/README-img/sprite.png
diff --git a/.github/README-img/tsne.gif b/.github/README-img/tsne.gif
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ The data structure for unstructured data.
 
 🌌 **All data types**: super-expressive data structure for representing complicated/mixed/nested text, image, video, audio, 3D mesh data.
 
-🧑‍🔬 **Data science powerhouse**: easy-to-use functions for facilitating data scientists work on embedding, matching, visualizing unstructured data via Torch/Tensorflow/ONNX/PaddlePaddle.
+🧑‍🔬 **Data science powerhouse**: easy-to-use functions that help data scientists embed, match, visualize, and evaluate unstructured data via Torch/Tensorflow/ONNX/PaddlePaddle.
 
 🚡 **Portable**: ready to wire with efficient conversion from/to Protobuf, binary, JSON, CSV, dataframe.
 
@@ -21,4 +21,120 @@ pip install docarray
 ```
 To install full dependencies, please use `pip install docarray[full]`.
 
-## [Documentation](https://docarray.jina.ai)
+## [Documentation](https://docarray.jina.ai)
+
+## Get Started
+
+Let's use DocArray and ResNet50 to build a meme image search on [Totally Looks Like](https://sites.google.com/view/totally-looks-like-dataset). This dataset contains 6016 image pairs stored in `/left` and `/right`. Images that share the same filename are labeled as perceptually similar. For example:
+
+| `/left` | `/right` | `/left` | `/right` |
+|---------|----------|---------|----------|
+|
+
+Our problem: given an image from `/left`, find its most similar image in `/right` (without looking at the filename, of course).
+
+### Load images
+
+First, load the images into a `DocumentArray`:
+
+```python
+from docarray import DocumentArray, Document
+
+left_da = DocumentArray.from_files('left/*.jpg')
+```
+
+To get a feel for the data you will handle, plot the images in one sprite image:
+
+```python
+left_da.plot_image_sprites()
+```
+
+<p align="center">
+<a href="https://docs.jina.ai"><img src="https://github.com/jina-ai/docarray/blob/master/.github/README-img/sprite.png?raw=true" alt="Load totally looks like dataset with docarray API" width="70%"></a>
+</p>
+
+### Apply preprocessing
+
+Let's do some standard computer vision preprocessing:
+
+```python
+def preproc(d: Document):
+    return (d.load_uri_to_image_blob()  # load
+             .set_image_blob_normalization()  # normalize color 
+             .set_image_blob_channel_axis(-1, 0))  # switch color axis
+
+left_da.apply(preproc)
+```
+
+Did I mention that `apply` works in parallel?
+
+### Embed images
+
+Now convert images into embeddings using a pretrained ResNet50:
+
+```python
+import torchvision
+model = torchvision.models.resnet50(pretrained=True)  # load ResNet50
+left_da.embed(model, device='cuda')  # embed via GPU to speedup
+```
+
+### Visualize embeddings
+
+You can visualize the embeddings via t-SNE in an interactive embedding projector:
+
+```python
+left_da.plot_embeddings()
+```
+
+<p align="center">
+<a href="https://docs.jina.ai"><img src="https://github.com/jina-ai/docarray/blob/master/.github/README-img/tsne.gif?raw=true" alt="Visualizing embedding via tSNE and embedding projector" width="90%"></a>
+</p>
+
+Fun is fun, but recall our goal is to match left images against right images and so far we have only handled the left. Let's repeat the same procedure for the right:
+
+```python
+right_da = (DocumentArray.from_files('right/*.jpg')
+                         .apply(preproc)
+                         .embed(model, device='cuda'))
+```
+
+### Match nearest neighbours
+
+We can now match the left to the right and take the top-9 results.
+
+```python
+left_da.match(right_da, limit=9)
+```
+
+Let's inspect what's inside `left_da` now:
+
+```python
+for d in left_da:
+    for m in d.matches:
+        print(d.uri, m.uri, m.scores['cosine'].value)
+```
+
+```text
+left/02262.jpg right/03459.jpg 0.21102
+left/02262.jpg right/02964.jpg 0.13871843
+left/02262.jpg right/02103.jpg 0.18265384
+left/02262.jpg right/04520.jpg 0.16477376
+...
+```
+
+Better to see it:
+
+```python
+(DocumentArray(left_da[12].matches, copy=True)
+    .apply(lambda d: d.set_image_blob_channel_axis(0, -1)
+                      .set_image_blob_inv_normalization())
+    .plot_image_sprites('result.png'))
+```
+
+<p align="center">
+<a href="https://docs.jina.ai"><img src="https://github.com/jina-ai/docarray/blob/master/.github/README-img/9nn.png?raw=true" alt="Visualizing top-9 matches using DocArray API" width="50%"></a>
+</p>
+
+### Quantitative evaluation
+
+
diff --git a/docarray/array/mixins/evaluation.py b/docarray/array/mixins/evaluation.py
@@ -53,7 +53,7 @@ def evaluate(
         metric_name = metric_name or metric_fn.__name__
         results = []
         for d, gd in zip(self, other):
-            if not strict or hash_fn(d) != hash_fn(gd):
+            if strict and hash_fn(d) != hash_fn(gd):
                 raise ValueError(
                     f'Document {d} from the left-hand side and '
                     f'{gd} from the right-hand are not hashed to the same value. '

diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py
@@ -50,7 +50,13 @@ def read(self, n=-1):
                     self._p_bar.update(self._task_id, advance=len(chunk))
                 return chunk
 
-        dict_data = {'file': ('DocumentArray', bytes(self)), 'token': token}
+        dict_data = {
+            'file': (
+                'DocumentArray',
+                self.to_bytes(protocol='protobuf', compress='lz4'),
+            ),
+            'token': token,
+        }
 
         (data, ctype) = requests.packages.urllib3.filepost.encode_multipart_formdata(
             dict_data
@@ -97,7 +103,7 @@ def pull(cls: Type['T'], token: str, show_progress: bool = False) -> 'T':
                     if show_progress:
                         progress.update(task_id, advance=len(chunk))
 
-                return cls.load_binary(f.getvalue())
+                return cls.from_bytes(f.getvalue(), protocol='protobuf', compress='lz4')
 
 
 def _get_progressbar(show_progress):

diff --git a/docarray/array/mixins/parallel.py b/docarray/array/mixins/parallel.py
@@ -44,8 +44,9 @@ def apply(self: 'T', *args, **kwargs) -> 'T':
         # noqa: DAR201
         :return: a new :class:`DocumentArray`
         """
-        new_da = type(self)()
-        new_da.extend(self.map(*args, **kwargs))
+        from ... import DocumentArray
+
+        new_da = DocumentArray(self.map(*args, **kwargs))
         self.clear()
         self.extend(new_da)
         return self
@@ -118,7 +119,9 @@ def apply_batch(self: 'T', *args, **kwargs) -> 'T':
         # noqa: DAR201
         :return: a new :class:`DocumentArray`
         """
-        new_da = type(self)()
+        from ... import DocumentArray
+
+        new_da = DocumentArray()
         for _b in self.map_batch(*args, **kwargs):
             new_da.extend(_b)
         self.clear()

diff --git a/tests/unit/document/test_converters.py b/tests/unit/document/test_converters.py
@@ -34,7 +34,7 @@ def test_audio_convert_pipe(pytestconfig, tmpdir):
 def test_image_convert_pipe(pytestconfig):
     for d in from_files(f'{pytestconfig.rootdir}/.github/**/*.png'):
         (
-            d.convert_uri_to_image_blob()
+            d.load_uri_to_image_blob()
             .convert_uri_to_datauri()
             .set_image_blob_shape((64, 64))
             .set_image_blob_normalization()