From bb5fa0838bbf1ab6de6fb781b53ef87fdcc252c9 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 30 Aug 2022 16:02:57 +0100 Subject: [PATCH 1/7] fix: adapt benchmark script to latest docarray --- scripts/benchmarking.py | 74 +++++++++++++++++++++++++------------- scripts/docker-compose.yml | 12 ++++--- 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/scripts/benchmarking.py b/scripts/benchmarking.py index 0d2dc057844..ad623b451e8 100644 --- a/scripts/benchmarking.py +++ b/scripts/benchmarking.py @@ -12,7 +12,7 @@ from rich.console import Console from rich.table import Table -n_index_values = [1_000_000] +n_index_values = [1_000] n_query = 1 D = 128 TENSOR_SHAPE = (512, 256) @@ -98,12 +98,12 @@ def recall(predicted, relevant, eval_at): if args.default_hnsw: storage_backends = [ - # ('memory', None), - # ('sqlite', None), - # ( - # 'annlite', - # {'n_dim': D}, - # ), + ('memory', None), + ('sqlite', None), + ( + 'annlite', + {'n_dim': D, 'columns': [('i', 'int')]}, + ), ( 'qdrant', { @@ -117,13 +117,15 @@ def recall(predicted, relevant, eval_at): { 'n_dim': D, 'port': '41234', + 'columns': ('i', 'int'), }, ), ( 'elasticsearch', { 'n_dim': D, - 'port': '41235', + 'hosts': 'http://localhost:41235', + 'columns': [('i', 'int')], }, ), ( @@ -131,22 +133,24 @@ def recall(predicted, relevant, eval_at): { 'n_dim': D, 'port': '41236', + 'columns': [('i', 'int')], }, ), ] else: storage_backends = [ - # ('memory', None), - # ('sqlite', None), - # ( - # 'annlite', - # { - # 'n_dim': D, - # 'ef_construction': 100, - # 'ef_search': 100, - # 'max_connection': 16, - # }, - # ), + ('memory', None), + ('sqlite', None), + ( + 'annlite', + { + 'n_dim': D, + 'ef_construction': 100, + 'ef_search': 100, + 'max_connection': 16, + 'columns': [('i', 'int')], + }, + ), ( 'qdrant', { @@ -165,15 +169,32 @@ def recall(predicted, relevant, eval_at): 'ef_construction': 100, 'max_connections': 16, 'port': '41234', + 'columns': [('i', 'int')], }, ), ( 'elasticsearch', - {'n_dim': D, 'ef_construction': 100, 'm': 16, 'port': '41235'}, + { + 'n_dim': D, + 'ef_construction': 100, + 'm': 16, + 'hosts': 'http://localhost:41235', + 'columns': [('i', 'int')], + }, ), ('redis', {'n_dim': D, 'ef_construction': 100, 'm': 16, 'port': '41236'}), ] +storage_backend_filters = { + 'memory': {'tags__i': {'$eq': 0}}, + 'sqlite': {'tags__i': {'$eq': 0}}, + 'annlite': {'i': {'$eq': 0}}, + 'qdrant': {'tags__i': {'$eq': 0}}, + 'weaviate': {'path': 'i', 'operator': 'Equal', 'valueInt': 0}, + 'elasticsearch': {'match': {'i': 0}}, + 'redis': {'i': {'$eq': 0}}, +} + table = Table( title=f'DocArray Benchmarking n_index={n_index_values[-1]} n_query={n_query} D={D} K={K}' ) @@ -236,13 +257,15 @@ def recall(predicted, relevant, eval_at): f'finding {n_query} docs by vector averaged {n_vector_queries} times ...' ) if backend == 'sqlite': - find_by_vector_time, result = find_by_vector(da, vector_queries[0]) + find_by_vector_time, result = find_by_vector( + da, vector_queries[0].squeeze() + ) recall_at_k = recall(result, ground_truth[0], K) else: recall_at_k_values = [] find_by_vector_times = [] for i, query in enumerate(vector_queries): - find_by_vector_time, results = find_by_vector(da, query) + find_by_vector_time, results = find_by_vector(da, query.squeeze()) find_by_vector_times.append(find_by_vector_time) if backend == 'memory': ground_truth.append(results) @@ -256,7 +279,9 @@ def recall(predicted, relevant, eval_at): ) console.print(f'finding {n_query} docs by condition ...') - find_by_condition_time, _ = find_by_condition(da, {'tags__i': {'$eq': 0}}) + find_by_condition_time, _ = find_by_condition( + da, storage_backend_filters[backend] + ) if idx == len(n_index_values) - 1: table.add_row( @@ -290,7 +315,8 @@ def recall(predicted, relevant, eval_at): find_by_vector_values[str(n_index)].append(find_by_vector_time) create_values[str(n_index)].append(create_time) except Exception as e: - console.print(f'Storage Backend {backend} failed: {e}') + console.print(f'Storage Backend {backend} failed') + raise e find_df = pd.DataFrame(find_by_vector_values) find_df.index = [backend for backend, _ in storage_backends] diff --git a/scripts/docker-compose.yml b/scripts/docker-compose.yml index 4d917761b60..638375158ae 100644 --- a/scripts/docker-compose.yml +++ b/scripts/docker-compose.yml @@ -3,7 +3,7 @@ services: weaviate: image: semitechnologies/weaviate:1.13.2 ports: - - "41234:41234" + - "41234:8080" environment: CONTEXTIONARY_URL: contextionary:9999 QUERY_DEFAULTS_LIMIT: 25 @@ -12,7 +12,7 @@ services: qdrant: image: qdrant/qdrant:v0.7.0 ports: - - "41233:41233" + - "41233:6333" ulimits: # Only required for tests, as there are a lot of collections created nofile: soft: 65535 @@ -23,8 +23,12 @@ services: - xpack.security.enabled=false - discovery.type=single-node ports: - - "41235:41235" + - "41235:9200" redis: image: redislabs/redisearch:2.6.0 ports: - - "41236:41236" + - "41236:6379" + +networks: + elastic: + name: elastic From 5809d3f6091183488304c50390c7451c7263373c Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 30 Aug 2022 16:20:52 +0100 Subject: [PATCH 2/7] chore: increase index size --- scripts/benchmarking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmarking.py b/scripts/benchmarking.py index ad623b451e8..0880a4778f3 100644 --- a/scripts/benchmarking.py +++ b/scripts/benchmarking.py @@ -12,7 +12,7 @@ from rich.console import Console from rich.table import Table -n_index_values = [1_000] +n_index_values = [1_000_000] n_query = 1 D = 128 TENSOR_SHAPE = (512, 256) From 012669877fbe83c73b84bacfc15406255768824f Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Wed, 31 Aug 2022 16:22:49 +0100 Subject: [PATCH 3/7] chore: add exclude backends --- scripts/benchmarking.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/benchmarking.py b/scripts/benchmarking.py index 0880a4778f3..75f0b32eef7 100644 --- a/scripts/benchmarking.py +++ b/scripts/benchmarking.py @@ -12,6 +12,8 @@ from rich.console import Console from rich.table import Table +np.random.seed(123) + n_index_values = [1_000_000] n_query = 1 D = 128 @@ -26,6 +28,12 @@ help='Whether to use default HNSW configurations', action='store_true', ) + +parser.add_argument( + '--exclude-backends', + help='list of comma separated backends to exclude from the benchmarks', + type=str, +) args = parser.parse_args() times = {} @@ -185,6 +193,12 @@ def recall(predicted, relevant, eval_at): ('redis', {'n_dim': D, 'ef_construction': 100, 'm': 16, 'port': '41236'}), ] +storage_backends = [ + (backend, config) + for backend, config in storage_backends + if backend not in args.exclude_backends.split(',') +] + storage_backend_filters = { 'memory': {'tags__i': {'$eq': 0}}, 'sqlite': {'tags__i': {'$eq': 0}}, From 14fc604bd9609a4e09f7374df6d883f3b46708bf Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Thu, 1 Sep 2022 10:33:16 +0100 Subject: [PATCH 4/7] fix: ignore errors in drop --- scripts/benchmarking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmarking.py b/scripts/benchmarking.py index 75f0b32eef7..18bf29494c4 100644 --- a/scripts/benchmarking.py +++ b/scripts/benchmarking.py @@ -334,7 +334,7 @@ def recall(predicted, relevant, eval_at): find_df = pd.DataFrame(find_by_vector_values) find_df.index = [backend for backend, _ in storage_backends] -find_df = find_df.drop(['sqlite']) +find_df = find_df.drop(['sqlite'], errors='ignore') print(find_df) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17, 5)) From c3f2a7d8352dc133584999a6c63a9969e81ab282 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 20 Sep 2022 07:47:36 +0100 Subject: [PATCH 5/7] chore: ef_runtime for redis --- scripts/benchmarking.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/benchmarking.py b/scripts/benchmarking.py index 18bf29494c4..70fc010f9dc 100644 --- a/scripts/benchmarking.py +++ b/scripts/benchmarking.py @@ -190,7 +190,16 @@ def recall(predicted, relevant, eval_at): 'columns': [('i', 'int')], }, ), - ('redis', {'n_dim': D, 'ef_construction': 100, 'm': 16, 'port': '41236'}), + ( + 'redis', + { + 'n_dim': D, + 'ef_construction': 100, + 'm': 16, + 'ef_runtime': 100, + 'port': '41236', + }, + ), ] storage_backends = [ From d5605fc8052570d0d3e467d0543e49f1a915aa0b Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 20 Sep 2022 07:58:35 +0100 Subject: [PATCH 6/7] fix: fix exclude --- scripts/benchmarking.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/benchmarking.py b/scripts/benchmarking.py index 70fc010f9dc..e5b55fb1df6 100644 --- a/scripts/benchmarking.py +++ b/scripts/benchmarking.py @@ -105,6 +105,7 @@ def recall(predicted, relevant, eval_at): if args.default_hnsw: + print('here') storage_backends = [ ('memory', None), ('sqlite', None), @@ -205,7 +206,7 @@ def recall(predicted, relevant, eval_at): storage_backends = [ (backend, config) for backend, config in storage_backends - if backend not in args.exclude_backends.split(',') + if backend not in (args.exclude_backends or '').split(',') ] storage_backend_filters = { From a65992a34b5f5de2242c3998d2883caf1ecaa273 Mon Sep 17 00:00:00 2001 From: Alaeddine Abdessalem Date: Tue, 20 Sep 2022 08:25:15 +0100 Subject: [PATCH 7/7] chore: remove print --- scripts/benchmarking.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/benchmarking.py b/scripts/benchmarking.py index e5b55fb1df6..453a1afa276 100644 --- a/scripts/benchmarking.py +++ b/scripts/benchmarking.py @@ -105,7 +105,6 @@ def recall(predicted, relevant, eval_at): if args.default_hnsw: - print('here') storage_backends = [ ('memory', None), ('sqlite', None),