diff --git a/benchmark_serializing.py b/benchmark_serializing.py
new file mode 100644
index 00000000000..5ac73f540fe
--- /dev/null
+++ b/benchmark_serializing.py
@@ -0,0 +1,99 @@
+"""Benchmark two protobuf layouts for shipping a DocumentArray.
+
+``BytesWrapper`` carries the whole array as one opaque ``bytes`` field,
+while ``DocsWrapper`` carries one ``DocumentProto`` per document in a
+repeated field.  Running this module prints, for each layout, the
+serialization time, the deserialization time and the serialized size.
+"""
+import os
+import sys
+import time
+
+from tests import random_docs
+
+from docarray import DocumentArray, Document
+
+from docarray.proto.dummy_pb2 import BytesWrapper, DocsWrapper
+
+DOC_SIZE = 1024 * 100
+DOC_COUNT = 10000
+# Alternative input, kept for reference: DOC_COUNT documents that each
+# carry DOC_SIZE random bytes.
+# da = DocumentArray(
+#     [
+#         Document(buffer=bytes(bytearray(os.urandom(DOC_SIZE))))
+#         for _ in range(DOC_COUNT)
+#     ]
+# )
+da = random_docs(DOC_COUNT)
+
+
+def serialize_bytes_wrapper():
+    """Pack ``da.to_bytes()`` into a ``BytesWrapper`` and serialize it."""
+    return BytesWrapper(docs=da.to_bytes()).SerializeToString()
+
+
+def deserialize_bytes_wrapper(proto_byte_array):
+    """Parse a serialized ``BytesWrapper`` and load its ``docs`` payload.
+
+    :param proto_byte_array: output of :func:`serialize_bytes_wrapper`
+    :return: the result of ``DocumentArray.load_binary`` on the payload
+    """
+    loaded_bw = BytesWrapper()
+    loaded_bw.ParseFromString(proto_byte_array)
+    return DocumentArray.load_binary(loaded_bw.docs)
+
+
+def serialize_doc_wrapper():
+    """Append ``to_protobuf()`` of every item in ``da`` and serialize."""
+    dw = DocsWrapper()
+    for d in da:
+        dw.docs.append(d.to_protobuf())
+    return dw.SerializeToString()
+
+
+def deserialize_doc_wrapper(proto_byte_array):
+    """Parse a serialized ``DocsWrapper`` and return its repeated ``docs``.
+
+    The returned items are still protobuf messages; the caller converts
+    them with ``Document.from_protobuf``.
+
+    :param proto_byte_array: output of :func:`serialize_doc_wrapper`
+    :return: the ``docs`` field of the parsed ``DocsWrapper``
+    """
+    loaded_dw = DocsWrapper()
+    loaded_dw.ParseFromString(proto_byte_array)
+    return loaded_dw.docs
+
+
+# time.perf_counter() is the highest-resolution clock available and is
+# meant for measuring short durations; time.time() follows the system
+# clock and can jump when that clock is adjusted.
+start_bw_serializer = time.perf_counter()
+proto_byte_array = serialize_bytes_wrapper()
+end_bw_serializer = time.perf_counter()
+
+start_bw_deserializer = time.perf_counter()
+loaded_da = deserialize_bytes_wrapper(proto_byte_array)
+end_bw_deserializer = time.perf_counter()
+
+print(
+    f'Byte array proto serialization took {end_bw_serializer-start_bw_serializer}, deserialization took {end_bw_deserializer-start_bw_deserializer} and serialized size is {sys.getsizeof(proto_byte_array)} - loaded da has {len(loaded_da)} docs'
+)
+
+start_dw_serializer = time.perf_counter()
+proto_byte_array = serialize_doc_wrapper()
+end_dw_serializer = time.perf_counter()
+
+start_dw_deserializer = time.perf_counter()
+new_da = DocumentArray()
+loaded_da = deserialize_doc_wrapper(proto_byte_array)
+# Rebuilding the DocumentArray is part of the measured deserialization cost.
+for d in loaded_da:
+    new_da.append(Document.from_protobuf(d))
+
+end_dw_deserializer = time.perf_counter()
+
+print(
+    f'Doc array proto serialization took {end_dw_serializer-start_dw_serializer}, deserialization took {end_dw_deserializer-start_dw_deserializer} and serialized size is {sys.getsizeof(proto_byte_array)} - loaded da has {len(new_da)} docs'
+)
diff --git a/docarray/proto/dummy.proto b/docarray/proto/dummy.proto
new file mode 100644
index 00000000000..4958df8d4af
--- /dev/null
+++ b/docarray/proto/dummy.proto
@@ -0,0 +1,10 @@
+syntax = "proto3";
+import "docarray.proto";
+
+message BytesWrapper {
+  bytes docs = 1;
+}
+
+message DocsWrapper {
+  repeated docarray.DocumentProto docs = 1;
+}
\ No newline at end of file
diff --git a/docarray/proto/dummy_pb2.py b/docarray/proto/dummy_pb2.py
new file mode 100644
index 00000000000..79291d3ea4f
--- /dev/null
+++ b/docarray/proto/dummy_pb2.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: dummy.proto
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import message as _message
+from google.protobuf import reflection as _reflection
+from google.protobuf import symbol_database as _symbol_database
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+import docarray.proto.docarray_pb2 as docarray__pb2
+
+
+DESCRIPTOR = _descriptor.FileDescriptor(
+  name='dummy.proto',
+  package='',
+  syntax='proto3',
+  serialized_options=None,
+  create_key=_descriptor._internal_create_key,
+  serialized_pb=b'\n\x0b\x64ummy.proto\x1a\x0e\x64ocarray.proto\"\x1c\n\x0c\x42ytesWrapper\x12\x0c\n\x04\x64ocs\x18\x01 \x01(\x0c\"4\n\x0b\x44ocsWrapper\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProtob\x06proto3'
+  ,
+  dependencies=[docarray__pb2.DESCRIPTOR,])
+
+
+
+
+_BYTESWRAPPER = _descriptor.Descriptor(
+  name='BytesWrapper',
+  full_name='BytesWrapper',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  create_key=_descriptor._internal_create_key,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='docs', full_name='BytesWrapper.docs', index=0,
+      number=1, type=12, cpp_type=9, label=1,
+      has_default_value=False, default_value=b"",
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=31,
+  serialized_end=59,
+)
+
+
+_DOCSWRAPPER = _descriptor.Descriptor(
+  name='DocsWrapper',
+  full_name='DocsWrapper',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  create_key=_descriptor._internal_create_key,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='docs', full_name='DocsWrapper.docs', index=0,
+      number=1, type=11, cpp_type=10, label=3,
+      has_default_value=False, default_value=[],
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  serialized_options=None,
+  is_extendable=False,
+  syntax='proto3',
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=61,
+  serialized_end=113,
+)
+
+_DOCSWRAPPER.fields_by_name['docs'].message_type = docarray__pb2._DOCUMENTPROTO
+DESCRIPTOR.message_types_by_name['BytesWrapper'] = _BYTESWRAPPER
+DESCRIPTOR.message_types_by_name['DocsWrapper'] = _DOCSWRAPPER
+_sym_db.RegisterFileDescriptor(DESCRIPTOR)
+
+BytesWrapper = _reflection.GeneratedProtocolMessageType('BytesWrapper', (_message.Message,), {
+  'DESCRIPTOR' : _BYTESWRAPPER,
+  '__module__' : 'dummy_pb2'
+  # @@protoc_insertion_point(class_scope:BytesWrapper)
+  })
+_sym_db.RegisterMessage(BytesWrapper)
+
+DocsWrapper = _reflection.GeneratedProtocolMessageType('DocsWrapper', (_message.Message,), {
+  'DESCRIPTOR' : _DOCSWRAPPER,
+  '__module__' : 'dummy_pb2'
+  # @@protoc_insertion_point(class_scope:DocsWrapper)
+  })
+_sym_db.RegisterMessage(DocsWrapper)
+
+
+# @@protoc_insertion_point(module_scope)