diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index b1e89fdf0bf62..9723f0584035e 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -39,25 +39,34 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); - // Single stream synchronous (sequential kernels) execution + // Single stream (sequential kernels) execution template float runSequential(void (*kernel)(chunk_t*, size_t, T...), - std::pair& chunkRanges, + std::pair& chunkRanges, int nLaunches, int dimGrid, int dimBlock, T&... args); - // Multi-streams asynchronous executions on whole memory + // Multi-streams asynchronous executions template std::vector runConcurrent(void (*kernel)(chunk_t*, size_t, T...), - std::vector>& chunkRanges, + std::vector>& chunkRanges, int nLaunches, int dimStreams, int nBlocks, int nThreads, T&... args); + // Single stream executions on all chunks at a time by same kernel + template + float runDistributed(void (*kernel)(chunk_t**, size_t*, T...), + std::vector>& chunkRanges, + int nLaunches, + int nBlocks, + int nThreads, + T&... 
args); + // Main interface void globalInit(); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 4b28c19e2be66..d1e83f8bb282b 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -72,7 +72,8 @@ inline std::ostream& operator<<(std::ostream& os, Test test) enum class Mode { Sequential, - Concurrent + Concurrent, + Distributed }; inline std::ostream& operator<<(std::ostream& os, Mode mode) @@ -84,6 +85,9 @@ inline std::ostream& operator<<(std::ostream& os, Mode mode) case Mode::Concurrent: os << "concurrent"; break; + case Mode::Distributed: + os << "distributed"; + break; } return os; } @@ -138,17 +142,11 @@ inline std::string getTestName(Mode mode, Test test, KernelConfig blocks) return tname; } -template -inline chunk_t* getPartPtr(chunk_t* scratchPtr, float chunkReservedGB, int partNumber) -{ - return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * partNumber); -} - // Return pointer to custom offset (GB) template -inline chunk_t* getCustomPtr(chunk_t* scratchPtr, int partNumber) +inline chunk_t* getCustomPtr(chunk_t* scratchPtr, float startGB) { - return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partNumber)); + return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * startGB)); } inline float computeThroughput(Test test, float result, float chunkSizeGB, int ntests) @@ -160,9 +158,9 @@ inline float computeThroughput(Test test, float result, float chunkSizeGB, int n } template -inline size_t getBufferCapacity(int chunkReservedGB) +inline size_t getBufferCapacity(float chunkReservedGB) { - return static_cast(GB * chunkReservedGB / sizeof(chunk_t)); + return static_cast((GB * chunkReservedGB) / sizeof(chunk_t)); } // LCG: https://rosettacode.org/wiki/Linear_congruential_generator @@ -202,7 +200,7 @@ struct benchmarkOpts { 
std::vector modes = {Mode::Sequential, Mode::Concurrent}; std::vector pools = {KernelConfig::Single, KernelConfig::Multi}; std::vector dtypes = {"char", "int", "ulong"}; - std::vector> testChunks; + std::vector> testChunks; float chunkReservedGB = 1.f; float threadPoolFraction = 1.f; float freeMemoryFractionToAllocate = 0.95f; @@ -235,10 +233,10 @@ struct gpuState { float chunkReservedGB; // Size of each partition (GB) // General containers and state - chunk_t* scratchPtr; // Pointer to scratch buffer - size_t scratchSize; // Size of scratch area (B) - std::vector partAddrOnHost; // Pointers to scratch partitions on host vector - std::vector> testChunks; // Vector of definitions for arbitrary chunks + chunk_t* scratchPtr; // Pointer to scratch buffer + size_t scratchSize; // Size of scratch area (B) + std::vector partAddrOnHost; // Pointers to scratch partitions on host vector + std::vector> testChunks; // Vector of definitions for arbitrary chunks // Static info size_t totalMemory; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e2baa83b5fdfe..8413894073bb5 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -13,6 +13,7 @@ /// \author: mconcas@cern.ch #include "../Shared/Kernels.h" +#include #include #include @@ -32,6 +33,31 @@ double bytesToconfig(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / GB; } +bool checkTestChunks(std::vector>& chunks, size_t availMemSizeGB) +{ + if (!chunks.size()) { + return true; + } + + bool check{false}; + + sort(chunks.begin(), chunks.end()); + for (auto iChunk{0}; iChunk < chunks.size(); ++iChunk) { // Check boundaries + if (chunks[iChunk].first + chunks[iChunk].second > availMemSizeGB) { + check = false; + break; + } + if (iChunk > 0) { // Check intersections + if (chunks[iChunk].first < chunks[iChunk - 1].first + chunks[iChunk - 1].second) { + check = false; + break; + } + } + check = true; + } + return check; +} 
+ // CUDA does not support operations: // https://forums.developer.nvidia.com/t/swizzling-float4-arithmetic-support/217 #ifndef __HIPCC__ @@ -113,6 +139,69 @@ __global__ void rand_read_k( } chunkPtr[threadIdx.x] = sink; // writing done once } + +// Distributed read +template +__global__ void read_dist_k( + chunk_t** block_ptr, + size_t* block_size) +{ + chunk_t sink{0}; + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + sink += ptr[i]; + } + ptr[threadIdx.x] = sink; +} + +// Distributed write +template +__global__ void write_dist_k( + chunk_t** block_ptr, + size_t* block_size) +{ + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + ptr[i] = 0; + } +} + +template <> +__global__ void write_dist_k( + int4** block_ptr, + size_t* block_size) +{ + int4* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + ptr[i] = {0, 1, 0, 0}; + } +} + +// Distributed copy +template +__global__ void copy_dist_k( + chunk_t** block_ptr, + size_t* block_size) +{ + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + size_t offset = n / 2; + for (size_t i = threadIdx.x; i < offset; i += blockDim.x) { + ptr[i] = ptr[offset + i]; + } +} + +// Distributed Random read +template +__global__ void rand_read_dist_k( + chunk_t** block_ptr, + size_t* block_size) +{ +} + } // namespace gpu void printDeviceProp(int deviceId) @@ -238,7 +327,7 @@ void printDeviceProp(int deviceId) template template float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T...), - std::pair& chunk, + std::pair& chunk, int nLaunches, int nBlocks, int nThreads, @@ -248,8 +337,8 @@ float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T... 
cudaEvent_t start, stop; cudaStream_t stream; GPUCHECK(cudaStreamCreate(&stream)); - GPUCHECK(cudaSetDevice(mOptions.deviceId)); + chunk_t* chunkPtr = getCustomPtr(mState.scratchPtr, chunk.first); // Warm up @@ -275,7 +364,7 @@ float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T... template template std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, size_t, T...), - std::vector>& chunkRanges, + std::vector>& chunkRanges, int nLaunches, int dimStreams, int nBlocks, @@ -283,7 +372,7 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, T&... args) { auto nChunks = chunkRanges.size(); - std::vector results(nChunks); + std::vector results(nChunks + 1); // last spot is for the host time std::vector starts(nChunks), stops(nChunks); std::vector streams(dimStreams); @@ -302,6 +391,7 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, chunk_t* chunkPtr = getCustomPtr(mState.scratchPtr, chunk.first); (*kernel)<<>>(chunkPtr, getBufferCapacity(chunk.second), args...); } + auto start = std::chrono::high_resolution_clock::now(); for (auto iChunk{0}; iChunk < nChunks; ++iChunk) { auto& chunk = chunkRanges[iChunk]; @@ -319,14 +409,90 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, GPUCHECK(cudaEventDestroy(starts[iChunk])); GPUCHECK(cudaEventDestroy(stops[iChunk])); } + GPUCHECK(cudaDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff_t{end - start}; for (auto iStream{0}; iStream < dimStreams; ++iStream) { GPUCHECK(cudaStreamDestroy(streams[iStream])); } + results[nChunks] = diff_t.count(); // register host time on latest spot return results; } +template +template +float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T...), + std::vector>& chunkRanges, + int nLaunches, + int nBlocks, + int nThreads, + T&... 
args) +{ + std::vector chunkPtrs(chunkRanges.size()); // Pointers to the beginning of each chunk + std::vector ptrPerBlocks(nBlocks); // Pointers for each block + std::vector perBlockCapacity(nBlocks); // Capacity of sub-buffer for block + + float totChunkGB{0.f}; + int totComputedBlocks{0}; + + for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { + chunkPtrs[iChunk] = getCustomPtr(mState.scratchPtr, chunkRanges[iChunk].first); + totChunkGB += chunkRanges[iChunk].second; + } + int index{0}; + for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { + float percFromMem = chunkRanges[iChunk].second / totChunkGB; + int blocksPerChunk = percFromMem * nBlocks; + totComputedBlocks += blocksPerChunk; + for (int iBlock{0}; iBlock < blocksPerChunk; ++iBlock, ++index) { + float memPerBlock = chunkRanges[iChunk].second / blocksPerChunk; + ptrPerBlocks[index] = getCustomPtr(chunkPtrs[iChunk], iBlock * memPerBlock); + perBlockCapacity[index] = getBufferCapacity(memPerBlock); + } + } + + if (totComputedBlocks != nBlocks) { + std::cout << " │ - \033[1;33mWarning: Sum of used blocks (" << totComputedBlocks + << ") is different from requested one (" << nBlocks << ")!\e[0m" + << std::endl; + } + + // Setup + chunk_t** block_ptr; + size_t* block_size; + GPUCHECK(cudaMalloc(reinterpret_cast(&block_ptr), nBlocks * sizeof(chunk_t*))); + GPUCHECK(cudaMalloc(reinterpret_cast(&block_size), nBlocks * sizeof(size_t))); + GPUCHECK(cudaMemcpy(block_ptr, ptrPerBlocks.data(), nBlocks * sizeof(chunk_t*), cudaMemcpyHostToDevice)); + GPUCHECK(cudaMemcpy(block_size, perBlockCapacity.data(), nBlocks * sizeof(size_t), cudaMemcpyHostToDevice)); + + float milliseconds{0.f}; + cudaEvent_t start, stop; + cudaStream_t stream; + + GPUCHECK(cudaSetDevice(mOptions.deviceId)); + GPUCHECK(cudaStreamCreate(&stream)); + GPUCHECK(cudaEventCreate(&start)); + GPUCHECK(cudaEventCreate(&stop)); + + // Warm up + (*kernel)<<>>(block_ptr, block_size, args...); + + GPUCHECK(cudaEventRecord(start)); + for (auto 
iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(block_ptr, block_size, args...); // NOLINT: clang-tidy false-positive + } + GPUCHECK(cudaEventRecord(stop)); // record checkpoint + GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions + GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); + GPUCHECK(cudaEventDestroy(start)); + GPUCHECK(cudaEventDestroy(stop)); + GPUCHECK(cudaStreamDestroy(stream)); + GPUCHECK(cudaFree(block_ptr)); + GPUCHECK(cudaFree(block_size)); + return milliseconds; +} + template void GPUbenchmark::printDevices() { @@ -354,6 +520,10 @@ void GPUbenchmark::globalInit() mState.iterations = mOptions.kernelLaunches; mState.streams = mOptions.streams; mState.testChunks = mOptions.testChunks; + if (!checkTestChunks(mState.testChunks, mOptions.freeMemoryFractionToAllocate * free / GB)) { + std::cerr << "Failed to configure memory chunks: check arbitrary chunks boundaries." << std::endl; + exit(1); + } mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; @@ -368,7 +538,6 @@ void GPUbenchmark::globalInit() std::cout << " ◈ Running on: \033[1;31m" << props.name << "\e[0m" << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); - // mState.computeScratchPtrs(); GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl @@ -394,28 +563,46 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) : dimGrid}; auto capacity{mState.getChunkCapacity()}; void (*kernel)(chunk_t*, size_t); + void (*kernel_distributed)(chunk_t**, size_t*); - switch (test) { - case Test::Read: { - kernel = &gpu::read_k; - break; - } - case Test::Write: { - kernel = &gpu::write_k; - break; + if (mode != Mode::Distributed) { + switch (test) { + case Test::Read: { + kernel = &gpu::read_k; + break; + } + 
case Test::Write: { + kernel = &gpu::write_k; + break; + } + case Test::Copy: { + kernel = &gpu::copy_k; + break; + } } - case Test::Copy: { - kernel = &gpu::copy_k; - break; + } else { + switch (test) { + case Test::Read: { + kernel_distributed = &gpu::read_dist_k; + break; + } + case Test::Write: { + kernel_distributed = &gpu::write_dist_k; + break; + } + case Test::Copy: { + kernel_distributed = &gpu::copy_dist_k; + break; + } } } for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { std::cout << " ├ " << mode << " " << test << " " << config << " block(s) (" << measurement + 1 << "/" << mOptions.nTests << "): \n" << " │ - blocks per kernel: " << nBlocks << "/" << dimGrid << "\n" - << " │ - threads per block: " << (int)nThreads << "\n" - << " │ - per chunk throughput:\n"; + << " │ - threads per block: " << (int)nThreads << "\n"; if (mode == Mode::Sequential) { + std::cout << " │ - per chunk throughput:\n"; for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { // loop over single chunks separately auto& chunk = mState.testChunks[iChunk]; auto result = runSequential(kernel, @@ -428,7 +615,8 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) << ": [" << chunk.first << "-" << chunk.first + chunk.second << ") \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n"; mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches()); } - } else { + } else if (mode == Mode::Concurrent) { + std::cout << " │ - per chunk throughput:\n"; auto results = runConcurrent(kernel, mState.testChunks, mState.getNKernelLaunches(), @@ -436,7 +624,7 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) nBlocks, nThreads); float sum{0}; - for (auto iChunk{0}; iChunk < results.size(); ++iChunk) { + for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { auto& chunk = mState.testChunks[iChunk]; auto throughput = computeThroughput(test, results[iChunk], 
chunk.second, mState.getNKernelLaunches()); sum += throughput; @@ -447,6 +635,28 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) if (mState.testChunks.size() > 1) { std::cout << " │ - total throughput: \e[1m" << sum << " GB/s \e[0m" << std::endl; } + + // Add throughput computed via system time measurement + float tot{0}; + for (auto& chunk : mState.testChunks) { + tot += chunk.second; + } + + std::cout << " │ - total throughput with host time: \e[1m" << computeThroughput(test, results[mState.testChunks.size()], tot, mState.getNKernelLaunches()) + << " GB/s \e[0m (" << std::setw(2) << results[mState.testChunks.size()] / 1000 << " s)" << std::endl; + } else if (mode == Mode::Distributed) { + auto result = runDistributed(kernel_distributed, + mState.testChunks, + mState.getNKernelLaunches(), + nBlocks, + nThreads); + float tot{0}; + for (auto& chunk : mState.testChunks) { + tot += chunk.second; + } + auto throughput = computeThroughput(test, result, tot, mState.getNKernelLaunches()); + std::cout << " │ └ throughput: \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n"; + mResultWriter.get()->storeBenchmarkEntry(test, 0, result, tot, mState.getNKernelLaunches()); } mResultWriter.get()->snapshotBenchmark(); } diff --git a/GPU/GPUbenchmark/cuda/benchmark.cu b/GPU/GPUbenchmark/cuda/benchmark.cu index db90dd4888c11..3471865334b6f 100644 --- a/GPU/GPUbenchmark/cuda/benchmark.cu +++ b/GPU/GPUbenchmark/cuda/benchmark.cu @@ -13,7 +13,7 @@ /// #include "../Shared/Kernels.h" -#define VERSION "version 0.1-pr#6773" +#define VERSION "version 0.2" bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) { @@ -28,7 +28,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "device,d", bpo::value()->default_value(0), "Id of the device to run test on, EPN targeted.")( "test,t", bpo::value>()->multitoken()->default_value(std::vector{"read", "write", "copy"}, "read write copy"), "Tests to be 
performed.")( "kind,k", bpo::value>()->multitoken()->default_value(std::vector{"char", "int", "ulong", "int4"}, "char int ulong int4"), "Test data type to be used.")( - "mode,m", bpo::value>()->multitoken()->default_value(std::vector{"seq", "con"}, "seq con"), "Mode: sequential or concurrent.")( + "mode,m", bpo::value>()->multitoken()->default_value(std::vector{"seq", "con", "dis"}, "seq con dis"), "Mode: sequential, concurrent or distributed.")( "blockPool,b", bpo::value>()->multitoken()->default_value(std::vector{"sb", "mb", "ab"}, "sb mb ab"), "Pool strategy: single, multi or all blocks.")( "threadPool,e", bpo::value()->default_value(1.f), "Fraction of blockDim.x to use (aka: rounded fraction of thread pool).")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( @@ -93,6 +93,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.modes.push_back(Mode::Sequential); } else if (mode == "con") { conf.modes.push_back(Mode::Concurrent); + } else if (mode == "dis") { + conf.modes.push_back(Mode::Distributed); } else { std::cerr << "Unkonwn mode: " << mode << std::endl; exit(1); @@ -117,7 +119,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) for (auto& aChunk : vm["arbitrary"].as>()) { const size_t sep = aChunk.find(':'); if (sep != std::string::npos) { - conf.testChunks.emplace_back(std::stoi(aChunk.substr(0, sep)), std::stoi(aChunk.substr(sep + 1))); + conf.testChunks.emplace_back(std::stof(aChunk.substr(0, sep)), std::stof(aChunk.substr(sep + 1))); } }