From 71700972034de2a38dcbd8802b5fc6cd394c7f28 Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Tue, 2 Nov 2021 16:23:20 +0100
Subject: [PATCH 1/8] Add boundaries protection + float

---
 GPU/GPUbenchmark/Shared/Kernels.h  |  4 ++--
 GPU/GPUbenchmark/Shared/Utils.h    | 24 ++++++++-------------
 GPU/GPUbenchmark/cuda/Kernels.cu   | 34 +++++++++++++++++++++++++++---
 GPU/GPUbenchmark/cuda/benchmark.cu |  4 ++--
 4 files changed, 44 insertions(+), 22 deletions(-)
diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h
index b1e89fdf0bf62..fd6f1fd93bd37 100644
--- a/GPU/GPUbenchmark/Shared/Kernels.h
+++ b/GPU/GPUbenchmark/Shared/Kernels.h
@@ -42,7 +42,7 @@ class GPUbenchmark final
   // Single stream synchronous (sequential kernels) execution
   template <typename... T>
   float runSequential(void (*kernel)(chunk_t*, size_t, T...),
-                      std::pair<int, int>& chunkRanges,
+                      std::pair<float, float>& chunkRanges,
                       int nLaunches,
                       int dimGrid,
                       int dimBlock,
@@ -51,7 +51,7 @@ class GPUbenchmark final
   // Multi-streams asynchronous executions on whole memory
   template <typename... T>
   std::vector<float> runConcurrent(void (*kernel)(chunk_t*, size_t, T...),
-                                   std::vector<std::pair<int, int>>& chunkRanges,
+                                   std::vector<std::pair<float, float>>& chunkRanges,
                                    int nLaunches,
                                    int dimStreams,
                                    int nBlocks,
diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h
index 4b28c19e2be66..7c8271119c5fb 100644
--- a/GPU/GPUbenchmark/Shared/Utils.h
+++ b/GPU/GPUbenchmark/Shared/Utils.h
@@ -138,17 +138,11 @@ inline std::string getTestName(Mode mode, Test test, KernelConfig blocks)
   return tname;
 }
 
-template <class chunk_t>
-inline chunk_t* getPartPtr(chunk_t* scratchPtr, float chunkReservedGB, int partNumber)
-{
-  return reinterpret_cast<chunk_t*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * chunkReservedGB) * partNumber);
-}
-
 // Return pointer to custom offset (GB)
 template <class chunk_t>
-inline chunk_t* getCustomPtr(chunk_t* scratchPtr, int partNumber)
+inline chunk_t* getCustomPtr(chunk_t* scratchPtr, float startGB)
 {
-  return reinterpret_cast<chunk_t*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * partNumber));
+  return reinterpret_cast<chunk_t*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * startGB));
 }
 
 inline float computeThroughput(Test test, float result, float chunkSizeGB, int ntests)
@@ -160,9 +154,9 @@ inline float computeThroughput(Test test, float result, float chunkSizeGB, int n
 }
 
 template <class chunk_t>
-inline size_t getBufferCapacity(int chunkReservedGB)
+inline size_t getBufferCapacity(float chunkReservedGB)
 {
-  return static_cast<size_t>(GB * chunkReservedGB / sizeof(chunk_t));
+  return static_cast<size_t>((GB * chunkReservedGB) / sizeof(chunk_t));
 }
 
 // LCG: https://rosettacode.org/wiki/Linear_congruential_generator
@@ -202,7 +196,7 @@ struct benchmarkOpts {
   std::vector<Mode> modes = {Mode::Sequential, Mode::Concurrent};
   std::vector<KernelConfig> pools = {KernelConfig::Single, KernelConfig::Multi};
   std::vector<std::string> dtypes = {"char", "int", "ulong"};
-  std::vector<std::pair<int, int>> testChunks;
+  std::vector<std::pair<float, float>> testChunks;
   float chunkReservedGB = 1.f;
   float threadPoolFraction = 1.f;
   float freeMemoryFractionToAllocate = 0.95f;
@@ -235,10 +229,10 @@ struct gpuState {
   float chunkReservedGB; // Size of each partition (GB)
 
   // General containers and state
-  chunk_t* scratchPtr;                         // Pointer to scratch buffer
-  size_t scratchSize;                          // Size of scratch area (B)
-  std::vector<chunk_t*> partAddrOnHost;        // Pointers to scratch partitions on host vector
-  std::vector<std::pair<int, int>> testChunks; // Vector of definitions for arbitrary chunks
+  chunk_t* scratchPtr;                             // Pointer to scratch buffer
+  size_t scratchSize;                              // Size of scratch area (B)
+  std::vector<chunk_t*> partAddrOnHost;            // Pointers to scratch partitions on host vector
+  std::vector<std::pair<float, float>> testChunks; // Vector of definitions for arbitrary chunks
 
   // Static info
   size_t totalMemory;
diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index e2baa83b5fdfe..7ec12e3111251 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -32,6 +32,31 @@
 double bytesToconfig(size_t s) { return (double)s / (1024.0); }
 double bytesToGB(size_t s) { return (double)s / GB; }
 
+bool checkTestChunks(std::vector<std::pair<float, float>>& chunks, size_t availMemSizeGB)
+{
+  if (!chunks.size()) {
+    return true;
+  }
+
+  bool check{false};
+
+  sort(chunks.begin(), chunks.end());
+  for (auto iChunk{0}; iChunk < chunks.size(); ++iChunk) { // Check boundaries
+    if (chunks[iChunk].first + chunks[iChunk].second > availMemSizeGB) {
+      check = false;
+      break;
+    }
+    if (iChunk > 0) { // Check intersections
+      if (chunks[iChunk].first < chunks[iChunk - 1].first + chunks[iChunk - 1].second) {
+        check = false;
+        break;
+      }
+    }
+    check = true;
+  }
+  return check;
+}
+
 // CUDA does not support <type4> operations:
 // https://forums.developer.nvidia.com/t/swizzling-float4-arithmetic-support/217
 #ifndef __HIPCC__
@@ -238,7 +263,7 @@ void printDeviceProp(int deviceId)
 template <class chunk_t>
 template <typename... T>
 float GPUbenchmark<chunk_t>::runSequential(void (*kernel)(chunk_t*, size_t, T...),
-                                           std::pair<int, int>& chunk,
+                                           std::pair<float, float>& chunk,
                                            int nLaunches,
                                            int nBlocks,
                                            int nThreads,
@@ -275,7 +300,7 @@ float GPUbenchmark<chunk_t>::runSequential(void (*kernel)(chunk_t*, size_t, T...
 template <class chunk_t>
 template <typename... T>
 std::vector<float> GPUbenchmark<chunk_t>::runConcurrent(void (*kernel)(chunk_t*, size_t, T...),
-                                                        std::vector<std::pair<int, int>>& chunkRanges,
+                                                        std::vector<std::pair<float, float>>& chunkRanges,
                                                         int nLaunches,
                                                         int dimStreams,
                                                         int nBlocks,
@@ -354,6 +379,10 @@ void GPUbenchmark<chunk_t>::globalInit()
   mState.iterations = mOptions.kernelLaunches;
   mState.streams = mOptions.streams;
   mState.testChunks = mOptions.testChunks;
+  if (!checkTestChunks(mOptions.testChunks, free / GB)) {
+    std::cerr << "Failed to configure memory chunks: check arbitrary chunks boundaries." << std::endl;
+    exit(1);
+  }
   mState.nMultiprocessors = props.multiProcessorCount;
   mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor;
   mState.nMaxThreadsPerDimension = props.maxThreadsDim[0];
@@ -368,7 +397,6 @@ void GPUbenchmark<chunk_t>::globalInit()
   std::cout << " ◈ Running on: \033[1;31m" << props.name << "\e[0m" << std::endl;
   // Allocate scratch on GPU
   GPUCHECK(cudaMalloc(reinterpret_cast<void**>(&mState.scratchPtr), mState.scratchSize));
-  // mState.computeScratchPtrs();
   GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize))
 
   std::cout << "   ├ Buffer type: \e[1m" << getType<chunk_t>() << "\e[0m" << std::endl
diff --git a/GPU/GPUbenchmark/cuda/benchmark.cu b/GPU/GPUbenchmark/cuda/benchmark.cu
index db90dd4888c11..1e84d430c1039 100644
--- a/GPU/GPUbenchmark/cuda/benchmark.cu
+++ b/GPU/GPUbenchmark/cuda/benchmark.cu
@@ -13,7 +13,7 @@
 ///
 
 #include "../Shared/Kernels.h"
-#define VERSION "version 0.1-pr#6773"
+#define VERSION "version 0.2"
 
 bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[])
 {
@@ -117,7 +117,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[])
   for (auto& aChunk : vm["arbitrary"].as<std::vector<std::string>>()) {
     const size_t sep = aChunk.find(':');
     if (sep != std::string::npos) {
-      conf.testChunks.emplace_back(std::stoi(aChunk.substr(0, sep)), std::stoi(aChunk.substr(sep + 1)));
+      conf.testChunks.emplace_back(std::stof(aChunk.substr(0, sep)), std::stof(aChunk.substr(sep + 1)));
     }
   }
 

From 619788357708b647e69e65bf032228b91416f253 Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Tue, 2 Nov 2021 19:09:01 +0100
Subject: [PATCH 2/8] Add Host-based throughput computation

---
 GPU/GPUbenchmark/cuda/Kernels.cu | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index 7ec12e3111251..d579d22dc5b59 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -13,6 +13,7 @@
 /// \author: mconcas@cern.ch
 
 #include "../Shared/Kernels.h"
+#include <chrono>
 #include <cstdio>
 #include <numeric>
 
@@ -308,7 +309,7 @@ std::vector<float> GPUbenchmark<chunk_t>::runConcurrent(void (*kernel)(chunk_t*,
                                                         T&... args)
 {
   auto nChunks = chunkRanges.size();
-  std::vector<float> results(nChunks);
+  std::vector<float> results(nChunks + 1); // last spot is for the host time
   std::vector<cudaEvent_t> starts(nChunks), stops(nChunks);
   std::vector<cudaStream_t> streams(dimStreams);
 
@@ -327,6 +328,7 @@ std::vector<float> GPUbenchmark<chunk_t>::runConcurrent(void (*kernel)(chunk_t*,
     chunk_t* chunkPtr = getCustomPtr<chunk_t>(mState.scratchPtr, chunk.first);
     (*kernel)<<<nBlocks, nThreads, 0, streams[iChunk % dimStreams]>>>(chunkPtr, getBufferCapacity<chunk_t>(chunk.second), args...);
   }
+  auto start = std::chrono::high_resolution_clock::now();
 
   for (auto iChunk{0}; iChunk < nChunks; ++iChunk) {
     auto& chunk = chunkRanges[iChunk];
@@ -344,11 +346,16 @@ std::vector<float> GPUbenchmark<chunk_t>::runConcurrent(void (*kernel)(chunk_t*,
     GPUCHECK(cudaEventDestroy(starts[iChunk]));
     GPUCHECK(cudaEventDestroy(stops[iChunk]));
   }
+  GPUCHECK(cudaDeviceSynchronize());
+
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double, std::milli> diff_t{end - start};
 
   for (auto iStream{0}; iStream < dimStreams; ++iStream) {
     GPUCHECK(cudaStreamDestroy(streams[iStream]));
   }
 
+  results[nChunks] = diff_t.count(); // register host time on latest spot
   return results;
 }
 
@@ -464,7 +471,7 @@ void GPUbenchmark<chunk_t>::runTest(Test test, Mode mode, KernelConfig config)
                                    nBlocks,
                                    nThreads);
       float sum{0};
-      for (auto iChunk{0}; iChunk < results.size(); ++iChunk) {
+      for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) {
         auto& chunk = mState.testChunks[iChunk];
         auto throughput = computeThroughput(test, results[iChunk], chunk.second, mState.getNKernelLaunches());
         sum += throughput;
@@ -475,6 +482,15 @@ void GPUbenchmark<chunk_t>::runTest(Test test, Mode mode, KernelConfig config)
       if (mState.testChunks.size() > 1) {
         std::cout << "   │   - total throughput: \e[1m" << sum << " GB/s \e[0m" << std::endl;
       }
+
+      // Add throughput computed via system time measurement
+      float tot{0};
+      for (auto& chunk : mState.testChunks) {
+        tot += chunk.second;
+      }
+
+      std::cout << "   │   - total throughput with host time: \e[1m" << computeThroughput(test, results[mState.testChunks.size()], tot, mState.getNKernelLaunches())
+                << " GB/s \e[0m (" << std::setw(2) << results[mState.testChunks.size()] / 1000 << " s)" << std::endl;
     }
     mResultWriter.get()->snapshotBenchmark();
   }

From 337f68233d4d3bf9d23e70860d7f9f608753cb3c Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Tue, 2 Nov 2021 19:16:57 +0100
Subject: [PATCH 3/8] Fix boundary checking bug

---
 GPU/GPUbenchmark/cuda/Kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index d579d22dc5b59..3ff0fd5c8fc77 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -386,7 +386,7 @@ void GPUbenchmark<chunk_t>::globalInit()
   mState.iterations = mOptions.kernelLaunches;
   mState.streams = mOptions.streams;
   mState.testChunks = mOptions.testChunks;
-  if (!checkTestChunks(mOptions.testChunks, free / GB)) {
+  if (!checkTestChunks(mOptions.testChunks, mOptions.freeMemoryFractionToAllocate * free / GB)) {
     std::cerr << "Failed to configure memory chunks: check arbitrary chunks boundaries." << std::endl;
     exit(1);
   }

From 63700abc37043f721de2541b26ab1b8ecd2530c7 Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Tue, 2 Nov 2021 23:24:13 +0100
Subject: [PATCH 4/8] Add proto function for distributed benchmark

---
 GPU/GPUbenchmark/Shared/Kernels.h  |  13 ++-
 GPU/GPUbenchmark/Shared/Utils.h    |   6 +-
 GPU/GPUbenchmark/cuda/Kernels.cu   | 153 ++++++++++++++++++++++++++---
 GPU/GPUbenchmark/cuda/benchmark.cu |   6 +-
 4 files changed, 159 insertions(+), 19 deletions(-)

diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h
index fd6f1fd93bd37..9723f0584035e 100644
--- a/GPU/GPUbenchmark/Shared/Kernels.h
+++ b/GPU/GPUbenchmark/Shared/Kernels.h
@@ -39,7 +39,7 @@ class GPUbenchmark final
   template <typename... T>
   float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args);
 
-  // Single stream synchronous (sequential kernels) execution
+  // Single stream (sequential kernels) execution
   template <typename... T>
   float runSequential(void (*kernel)(chunk_t*, size_t, T...),
                       std::pair<float, float>& chunkRanges,
@@ -48,7 +48,7 @@ class GPUbenchmark final
                       int dimBlock,
                       T&... args);
 
-  // Multi-streams asynchronous executions on whole memory
+  // Multi-streams asynchronous executions
   template <typename... T>
   std::vector<float> runConcurrent(void (*kernel)(chunk_t*, size_t, T...),
                                    std::vector<std::pair<float, float>>& chunkRanges,
@@ -58,6 +58,15 @@ class GPUbenchmark final
                                    int nThreads,
                                    T&... args);
 
+  // Single stream executions on all chunks at a time by same kernel
+  template <typename... T>
+  float runDistributed(void (*kernel)(chunk_t**, size_t*, T...),
+                       std::vector<std::pair<float, float>>& chunkRanges,
+                       int nLaunches,
+                       int nBlocks,
+                       int nThreads,
+                       T&... args);
+
   // Main interface
   void globalInit();     // Allocate scratch buffers and compute runtime parameters
   void run();            // Execute all specified callbacks
diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h
index 7c8271119c5fb..d1e83f8bb282b 100644
--- a/GPU/GPUbenchmark/Shared/Utils.h
+++ b/GPU/GPUbenchmark/Shared/Utils.h
@@ -72,7 +72,8 @@ inline std::ostream& operator<<(std::ostream& os, Test test)
 
 enum class Mode {
   Sequential,
-  Concurrent
+  Concurrent,
+  Distributed
 };
 
 inline std::ostream& operator<<(std::ostream& os, Mode mode)
@@ -84,6 +85,9 @@ inline std::ostream& operator<<(std::ostream& os, Mode mode)
     case Mode::Concurrent:
       os << "concurrent";
       break;
+    case Mode::Distributed:
+      os << "distributed";
+      break;
   }
   return os;
 }
diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index 3ff0fd5c8fc77..90e82aa03eb2b 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -139,6 +139,46 @@ __global__ void rand_read_k(
   }
   chunkPtr[threadIdx.x] = sink; // writing done once
 }
+
+// Distributed read
+template <class chunk_t>
+__global__ void read_dist_k(
+  chunk_t** chunkPtr,
+  size_t* chunkSize)
+{
+}
+
+// Distributed write
+template <class chunk_t>
+__global__ void write_dist_k(
+  chunk_t** chunkPtr,
+  size_t* chunkSize)
+{
+}
+
+template <>
+__global__ void write_dist_k(
+  int4** chunkPtr,
+  size_t* chunkSize)
+{
+}
+
+// Distributed copy
+template <class chunk_t>
+__global__ void copy_dist_k(
+  chunk_t** chunkPtr,
+  size_t* chunkSize)
+{
+}
+
+// Distributed Random read
+template <class chunk_t>
+__global__ void rand_read_dist_k(
+  chunk_t** chunkPtr,
+  size_t* chunkSize)
+{
+}
+
 } // namespace gpu
 
 void printDeviceProp(int deviceId)
@@ -274,8 +314,8 @@ float GPUbenchmark<chunk_t>::runSequential(void (*kernel)(chunk_t*, size_t, T...
   cudaEvent_t start, stop;
   cudaStream_t stream;
   GPUCHECK(cudaStreamCreate(&stream));
-
   GPUCHECK(cudaSetDevice(mOptions.deviceId));
+
   chunk_t* chunkPtr = getCustomPtr<chunk_t>(mState.scratchPtr, chunk.first);
 
   // Warm up
@@ -359,6 +399,62 @@ std::vector<float> GPUbenchmark<chunk_t>::runConcurrent(void (*kernel)(chunk_t*,
   return results;
 }
 
+template <class chunk_t>
+template <typename... T>
+float GPUbenchmark<chunk_t>::runDistributed(void (*kernel)(chunk_t**, size_t*, T...),
+                                            std::vector<std::pair<float, float>>& chunkRanges,
+                                            int nLaunches,
+                                            int nBlocks,
+                                            int nThreads,
+                                            T&... args)
+{
+  std::vector<chunk_t*> chunkPtrs(chunkRanges.size());
+  std::vector<chunk_t*> ptrPerBlocks(nBlocks);
+
+  float totChunkGB{0.f};
+  for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) {
+    chunkPtrs[iChunk] = getCustomPtr<chunk_t>(mState.scratchPtr, chunkRanges[iChunk].first);
+    totChunkGB += chunkRanges[iChunk].second;
+  }
+  float memPerBlock = (float)totChunkGB / (float)nBlocks;
+  std::cout << "Mem per block: " << memPerBlock << std::endl;
+  int index{0};
+  for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) {
+    int blocks = chunkRanges[iChunk].second / memPerBlock;
+    for (size_t iBlock{0}; iBlock < blocks; ++iBlock) {
+      ptrPerBlocks[index] = getCustomPtr<chunk_t>(chunkPtrs[iChunk], iBlock * memPerBlock);
+      index++;
+    }
+  }
+  std::cout << "starting ptr: " << mState.scratchPtr << std::endl;
+  for (auto& ptr : ptrPerBlocks) {
+    std::cout << ptr << std::endl;
+  }
+
+  float milliseconds{0.f};
+  cudaEvent_t start, stop;
+  cudaStream_t stream;
+
+  GPUCHECK(cudaStreamCreate(&stream));
+  GPUCHECK(cudaSetDevice(mOptions.deviceId));
+  GPUCHECK(cudaEventCreate(&start));
+  GPUCHECK(cudaEventCreate(&stop));
+  // Warm up
+  (*kernel)<<<nBlocks, nThreads, 0, stream>>>(ptrPerBlocks.data(), nullptr, args...);
+
+  GPUCHECK(cudaEventRecord(start));
+  for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) {                               // Schedule all the requested kernel launches
+    (*kernel)<<<nBlocks, nThreads, 0, stream>>>(ptrPerBlocks.data(), nullptr, args...); // NOLINT: clang-tidy false-positive
+  }
+  GPUCHECK(cudaEventRecord(stop));      // record checkpoint
+  GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions
+  GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop));
+  GPUCHECK(cudaEventDestroy(start));
+  GPUCHECK(cudaEventDestroy(stop));
+  GPUCHECK(cudaStreamDestroy(stream));
+  return milliseconds;
+}
+
 template <class chunk_t>
 void GPUbenchmark<chunk_t>::printDevices()
 {
@@ -386,7 +482,7 @@ void GPUbenchmark<chunk_t>::globalInit()
   mState.iterations = mOptions.kernelLaunches;
   mState.streams = mOptions.streams;
   mState.testChunks = mOptions.testChunks;
-  if (!checkTestChunks(mOptions.testChunks, mOptions.freeMemoryFractionToAllocate * free / GB)) {
+  if (!checkTestChunks(mState.testChunks, mOptions.freeMemoryFractionToAllocate * free / GB)) {
     std::cerr << "Failed to configure memory chunks: check arbitrary chunks boundaries." << std::endl;
     exit(1);
   }
@@ -429,19 +525,37 @@ void GPUbenchmark<chunk_t>::runTest(Test test, Mode mode, KernelConfig config)
                                                                                       : dimGrid};
   auto capacity{mState.getChunkCapacity()};
   void (*kernel)(chunk_t*, size_t);
+  void (*kernel_distributed)(chunk_t**, size_t*);
 
-  switch (test) {
-    case Test::Read: {
-      kernel = &gpu::read_k<chunk_t>;
-      break;
-    }
-    case Test::Write: {
-      kernel = &gpu::write_k<chunk_t>;
-      break;
+  if (mode != Mode::Distributed) {
+    switch (test) {
+      case Test::Read: {
+        kernel = &gpu::read_k<chunk_t>;
+        break;
+      }
+      case Test::Write: {
+        kernel = &gpu::write_k<chunk_t>;
+        break;
+      }
+      case Test::Copy: {
+        kernel = &gpu::copy_k<chunk_t>;
+        break;
+      }
     }
-    case Test::Copy: {
-      kernel = &gpu::copy_k<chunk_t>;
-      break;
+  } else {
+    switch (test) {
+      case Test::Read: {
+        kernel_distributed = &gpu::read_dist_k<chunk_t>;
+        break;
+      }
+      case Test::Write: {
+        kernel_distributed = &gpu::write_dist_k<chunk_t>;
+        break;
+      }
+      case Test::Copy: {
+        kernel_distributed = &gpu::copy_dist_k<chunk_t>;
+        break;
+      }
     }
   }
 
@@ -463,7 +577,7 @@ void GPUbenchmark<chunk_t>::runTest(Test test, Mode mode, KernelConfig config)
                   << ": [" << chunk.first << "-" << chunk.first + chunk.second << ") \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n";
         mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches());
       }
-    } else {
+    } else if (mode == Mode::Concurrent) {
       auto results = runConcurrent(kernel,
                                    mState.testChunks,
                                    mState.getNKernelLaunches(),
@@ -491,6 +605,17 @@ void GPUbenchmark<chunk_t>::runTest(Test test, Mode mode, KernelConfig config)
 
       std::cout << "   │   - total throughput with host time: \e[1m" << computeThroughput(test, results[mState.testChunks.size()], tot, mState.getNKernelLaunches())
                 << " GB/s \e[0m (" << std::setw(2) << results[mState.testChunks.size()] / 1000 << " s)" << std::endl;
+    } else if (mode == Mode::Distributed) {
+
+      auto result = runDistributed(kernel_distributed,
+                                   mState.testChunks,
+                                   mState.getNKernelLaunches(),
+                                   nBlocks,
+                                   nThreads);
+      //   auto throughput = computeThroughput(test, result, chunk.second, mState.getNKernelLaunches());
+      //   std::cout << "   │     " << ((mState.testChunks.size() - iChunk != 1) ? "├ " : "└ ") << iChunk + 1 << "/" << mState.testChunks.size()
+      //             << ": [" << chunk.first << "-" << chunk.first + chunk.second << ") \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n";
+      //   mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches());
     }
     mResultWriter.get()->snapshotBenchmark();
   }
diff --git a/GPU/GPUbenchmark/cuda/benchmark.cu b/GPU/GPUbenchmark/cuda/benchmark.cu
index 1e84d430c1039..e98f2fc4c9e37 100644
--- a/GPU/GPUbenchmark/cuda/benchmark.cu
+++ b/GPU/GPUbenchmark/cuda/benchmark.cu
@@ -28,7 +28,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[])
     "device,d", bpo::value<int>()->default_value(0), "Id of the device to run test on, EPN targeted.")(
     "test,t", bpo::value<std::vector<std::string>>()->multitoken()->default_value(std::vector<std::string>{"read", "write", "copy"}, "read write copy"), "Tests to be performed.")(
     "kind,k", bpo::value<std::vector<std::string>>()->multitoken()->default_value(std::vector<std::string>{"char", "int", "ulong", "int4"}, "char int ulong int4"), "Test data type to be used.")(
-    "mode,m", bpo::value<std::vector<std::string>>()->multitoken()->default_value(std::vector<std::string>{"seq", "con"}, "seq con"), "Mode: sequential or concurrent.")(
+    "mode,m", bpo::value<std::vector<std::string>>()->multitoken()->default_value(std::vector<std::string>{"seq", "con", "dis"}, "seq con dis"), "Mode: sequential, concurrent or distributed.")(
     "blockPool,b", bpo::value<std::vector<std::string>>()->multitoken()->default_value(std::vector<std::string>{"sb", "mb", "ab"}, "sb mb ab"), "Pool strategy: single, multi or all blocks.")(
     "threadPool,e", bpo::value<float>()->default_value(1.f), "Fraction of blockDim.x to use (aka: rounded fraction of thread pool).")(
     "chunkSize,c", bpo::value<float>()->default_value(1.f), "Size of scratch partitions (GB).")(
@@ -93,7 +93,9 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[])
       conf.modes.push_back(Mode::Sequential);
     } else if (mode == "con") {
       conf.modes.push_back(Mode::Concurrent);
-    } else {
+    } else if (mode == "dis") {
+      conf.modes.push_back(Mode::Distributed);
+    }else {
       std::cerr << "Unkonwn mode: " << mode << std::endl;
       exit(1);
     }

From cbbe3e47f132d971afe26dc8c33088d6b475087d Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Wed, 3 Nov 2021 18:43:25 +0100
Subject: [PATCH 5/8] Add read distributed kernel (segfaults)

---
 GPU/GPUbenchmark/cuda/Kernels.cu | 50 +++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index 90e82aa03eb2b..fa96e8f9f4242 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -143,9 +143,16 @@ __global__ void rand_read_k(
 // Distributed read
 template <class chunk_t>
 __global__ void read_dist_k(
-  chunk_t** chunkPtr,
-  size_t* chunkSize)
+  chunk_t** blockPtrs,
+  size_t* blockSizes)
 {
+  chunk_t sink{0};
+  auto* ptr = blockPtrs[blockIdx.x];
+  auto n = blockSizes[blockIdx.x];
+  for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
+    sink += ptr[i];
+  }
+  blockPtrs[blockIdx.x][threadIdx.x] = sink;
 }
 
 // Distributed write
@@ -408,27 +415,34 @@ float GPUbenchmark<chunk_t>::runDistributed(void (*kernel)(chunk_t**, size_t*, T
                                             int nThreads,
                                             T&... args)
 {
-  std::vector<chunk_t*> chunkPtrs(chunkRanges.size());
-  std::vector<chunk_t*> ptrPerBlocks(nBlocks);
+  std::vector<chunk_t*> chunkPtrs(chunkRanges.size()); // Pointers to the beginning of each chunk
+  std::vector<chunk_t*> ptrPerBlocks(nBlocks);         // Pointers for each block
+  std::vector<size_t> perBlockCapacity(nBlocks);       // Capacity of sub-buffer for block
 
   float totChunkGB{0.f};
+  int totComputedBlocks{0};
+
   for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) {
     chunkPtrs[iChunk] = getCustomPtr<chunk_t>(mState.scratchPtr, chunkRanges[iChunk].first);
     totChunkGB += chunkRanges[iChunk].second;
   }
-  float memPerBlock = (float)totChunkGB / (float)nBlocks;
-  std::cout << "Mem per block: " << memPerBlock << std::endl;
   int index{0};
   for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) {
-    int blocks = chunkRanges[iChunk].second / memPerBlock;
-    for (size_t iBlock{0}; iBlock < blocks; ++iBlock) {
+    float percFromMem = chunkRanges[iChunk].second / totChunkGB;
+    int blocksPerChunk = percFromMem * nBlocks;
+    totComputedBlocks += blocksPerChunk;
+    for (int iBlock{0}; iBlock < blocksPerChunk; ++iBlock, ++index) {
+      float memPerBlock = chunkRanges[iChunk].second / blocksPerChunk;
       ptrPerBlocks[index] = getCustomPtr<chunk_t>(chunkPtrs[iChunk], iBlock * memPerBlock);
-      index++;
+      perBlockCapacity[index] = getBufferCapacity<chunk_t>(memPerBlock);
     }
   }
-  std::cout << "starting ptr: " << mState.scratchPtr << std::endl;
-  for (auto& ptr : ptrPerBlocks) {
-    std::cout << ptr << std::endl;
+
+  if (totComputedBlocks != nBlocks) {
+    std::cout << "   │   - \033[1;33mWarning: total number of estimated blocks (" << totComputedBlocks
+              << ") is different from requested one (" << nBlocks
+              << "). This may cause imbalance in the test!\e[0m"
+              << std::endl;
   }
 
   float milliseconds{0.f};
@@ -439,12 +453,13 @@ float GPUbenchmark<chunk_t>::runDistributed(void (*kernel)(chunk_t**, size_t*, T
   GPUCHECK(cudaSetDevice(mOptions.deviceId));
   GPUCHECK(cudaEventCreate(&start));
   GPUCHECK(cudaEventCreate(&stop));
+
   // Warm up
-  (*kernel)<<<nBlocks, nThreads, 0, stream>>>(ptrPerBlocks.data(), nullptr, args...);
+  (*kernel)<<<totComputedBlocks, nThreads, 0, stream>>>(ptrPerBlocks.data(), perBlockCapacity.data(), args...);
 
   GPUCHECK(cudaEventRecord(start));
-  for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) {                               // Schedule all the requested kernel launches
-    (*kernel)<<<nBlocks, nThreads, 0, stream>>>(ptrPerBlocks.data(), nullptr, args...); // NOLINT: clang-tidy false-positive
+  for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) {                                                         // Schedule all the requested kernel launches
+    (*kernel)<<<totComputedBlocks, nThreads, 0, stream>>>(ptrPerBlocks.data(), perBlockCapacity.data(), args...); // NOLINT: clang-tidy false-positive
   }
   GPUCHECK(cudaEventRecord(stop));      // record checkpoint
   GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions
@@ -562,9 +577,9 @@ void GPUbenchmark<chunk_t>::runTest(Test test, Mode mode, KernelConfig config)
   for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) {
     std::cout << "   ├ " << mode << " " << test << " " << config << " block(s) (" << measurement + 1 << "/" << mOptions.nTests << "): \n"
               << "   │   - blocks per kernel: " << nBlocks << "/" << dimGrid << "\n"
-              << "   │   - threads per block: " << (int)nThreads << "\n"
-              << "   │   - per chunk throughput:\n";
+              << "   │   - threads per block: " << (int)nThreads << "\n";
     if (mode == Mode::Sequential) {
+      std::cout << "   │   - per chunk throughput:\n";
       for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { // loop over single chunks separately
         auto& chunk = mState.testChunks[iChunk];
         auto result = runSequential(kernel,
@@ -578,6 +593,7 @@ void GPUbenchmark<chunk_t>::runTest(Test test, Mode mode, KernelConfig config)
         mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches());
       }
     } else if (mode == Mode::Concurrent) {
+      std::cout << "   │   - per chunk throughput:\n";
       auto results = runConcurrent(kernel,
                                    mState.testChunks,
                                    mState.getNKernelLaunches(),

From d7fba08abc7470b94d8c83e90cacf93ea4027b7a Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Wed, 3 Nov 2021 20:29:28 +0100
Subject: [PATCH 6/8] Fix read distributed kernel

---
 GPU/GPUbenchmark/cuda/Kernels.cu | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index fa96e8f9f4242..701777d737216 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -142,17 +142,16 @@ __global__ void rand_read_k(
 
 // Distributed read
 template <class chunk_t>
-__global__ void read_dist_k(
-  chunk_t** blockPtrs,
-  size_t* blockSizes)
+__global__ void read_dist_k(chunk_t** block_ptr,
+                            size_t* block_size)
 {
   chunk_t sink{0};
-  auto* ptr = blockPtrs[blockIdx.x];
-  auto n = blockSizes[blockIdx.x];
+  chunk_t* ptr = block_ptr[blockIdx.x];
+  size_t n = block_size[blockIdx.x];
   for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
     sink += ptr[i];
   }
-  blockPtrs[blockIdx.x][threadIdx.x] = sink;
+  block_ptr[blockIdx.x][threadIdx.x] = sink;
 }
 
 // Distributed write
@@ -445,6 +444,15 @@ float GPUbenchmark<chunk_t>::runDistributed(void (*kernel)(chunk_t**, size_t*, T
               << std::endl;
   }
 
+  // Setup
+
+  chunk_t** block_ptr;
+  size_t* block_size;
+  GPUCHECK(cudaMalloc(reinterpret_cast<void**>(&block_ptr), 60 * sizeof(chunk_t*)));
+  GPUCHECK(cudaMalloc(reinterpret_cast<void**>(&block_size), 60 * sizeof(size_t)));
+  GPUCHECK(cudaMemcpy(block_ptr, ptrPerBlocks.data(), nBlocks * sizeof(chunk_t*), cudaMemcpyHostToDevice));
+  GPUCHECK(cudaMemcpy(block_size, perBlockCapacity.data(), nBlocks * sizeof(size_t), cudaMemcpyHostToDevice));
+
   float milliseconds{0.f};
   cudaEvent_t start, stop;
   cudaStream_t stream;
@@ -455,11 +463,11 @@ float GPUbenchmark<chunk_t>::runDistributed(void (*kernel)(chunk_t**, size_t*, T
   GPUCHECK(cudaEventCreate(&stop));
 
   // Warm up
-  (*kernel)<<<totComputedBlocks, nThreads, 0, stream>>>(ptrPerBlocks.data(), perBlockCapacity.data(), args...);
+  (*kernel)<<<totComputedBlocks, nThreads, 0, stream>>>(block_ptr, block_size, args...);
 
   GPUCHECK(cudaEventRecord(start));
-  for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) {                                                         // Schedule all the requested kernel launches
-    (*kernel)<<<totComputedBlocks, nThreads, 0, stream>>>(ptrPerBlocks.data(), perBlockCapacity.data(), args...); // NOLINT: clang-tidy false-positive
+  for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) {                                  // Schedule all the requested kernel launches
+    (*kernel)<<<totComputedBlocks, nThreads, 0, stream>>>(block_ptr, block_size, args...); // NOLINT: clang-tidy false-positive
   }
   GPUCHECK(cudaEventRecord(stop));      // record checkpoint
   GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions

From 271919e4d4956779ca52e0179f4c454e149f27bc Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Wed, 3 Nov 2021 21:00:27 +0100
Subject: [PATCH 7/8] Compute throughput for distributed case

---
 GPU/GPUbenchmark/cuda/Kernels.cu | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index 701777d737216..8aba6981bf2a8 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -445,7 +445,6 @@ float GPUbenchmark<chunk_t>::runDistributed(void (*kernel)(chunk_t**, size_t*, T
   }
 
   // Setup
-
   chunk_t** block_ptr;
   size_t* block_size;
   GPUCHECK(cudaMalloc(reinterpret_cast<void**>(&block_ptr), 60 * sizeof(chunk_t*)));
@@ -630,16 +629,18 @@ void GPUbenchmark<chunk_t>::runTest(Test test, Mode mode, KernelConfig config)
       std::cout << "   │   - total throughput with host time: \e[1m" << computeThroughput(test, results[mState.testChunks.size()], tot, mState.getNKernelLaunches())
                 << " GB/s \e[0m (" << std::setw(2) << results[mState.testChunks.size()] / 1000 << " s)" << std::endl;
     } else if (mode == Mode::Distributed) {
-
       auto result = runDistributed(kernel_distributed,
                                    mState.testChunks,
                                    mState.getNKernelLaunches(),
                                    nBlocks,
                                    nThreads);
-      //   auto throughput = computeThroughput(test, result, chunk.second, mState.getNKernelLaunches());
-      //   std::cout << "   │     " << ((mState.testChunks.size() - iChunk != 1) ? "├ " : "└ ") << iChunk + 1 << "/" << mState.testChunks.size()
-      //             << ": [" << chunk.first << "-" << chunk.first + chunk.second << ") \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n";
-      //   mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches());
+      float tot{0};
+      for (auto& chunk : mState.testChunks) {
+        tot += chunk.second;
+      }
+      auto throughput = computeThroughput(test, result, tot, mState.getNKernelLaunches());
+      std::cout << "   │     └ throughput: \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n";
+      mResultWriter.get()->storeBenchmarkEntry(test, 0, result, tot, mState.getNKernelLaunches());
     }
     mResultWriter.get()->snapshotBenchmark();
   }

From 3bbedec4c8023a294dfae8c74396c11794696c6c Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Wed, 3 Nov 2021 21:26:16 +0100
Subject: [PATCH 8/8] Add rest of the kernels

---
 GPU/GPUbenchmark/cuda/Kernels.cu   | 44 ++++++++++++++++++++----------
 GPU/GPUbenchmark/cuda/benchmark.cu |  2 +-
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu
index 8aba6981bf2a8..8413894073bb5 100644
--- a/GPU/GPUbenchmark/cuda/Kernels.cu
+++ b/GPU/GPUbenchmark/cuda/Kernels.cu
@@ -142,8 +142,9 @@ __global__ void rand_read_k(
 
 // Distributed read
 template <class chunk_t>
-__global__ void read_dist_k(chunk_t** block_ptr,
-                            size_t* block_size)
+__global__ void read_dist_k(
+  chunk_t** block_ptr,
+  size_t* block_size)
 {
   chunk_t sink{0};
   chunk_t* ptr = block_ptr[blockIdx.x];
@@ -151,37 +152,53 @@ __global__ void read_dist_k(chunk_t** block_ptr,
   for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
     sink += ptr[i];
   }
-  block_ptr[blockIdx.x][threadIdx.x] = sink;
+  ptr[threadIdx.x] = sink;
 }
 
 // Distributed write
 template <class chunk_t>
 __global__ void write_dist_k(
-  chunk_t** chunkPtr,
-  size_t* chunkSize)
+  chunk_t** block_ptr,
+  size_t* block_size)
 {
+  chunk_t* ptr = block_ptr[blockIdx.x];
+  size_t n = block_size[blockIdx.x];
+  for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
+    ptr[i] = 0;
+  }
 }
 
 template <>
 __global__ void write_dist_k(
-  int4** chunkPtr,
-  size_t* chunkSize)
+  int4** block_ptr,
+  size_t* block_size)
 {
+  int4* ptr = block_ptr[blockIdx.x];
+  size_t n = block_size[blockIdx.x];
+  for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
+    ptr[i] = {0, 1, 0, 0};
+  }
 }
 
 // Distributed copy
 template <class chunk_t>
 __global__ void copy_dist_k(
-  chunk_t** chunkPtr,
-  size_t* chunkSize)
+  chunk_t** block_ptr,
+  size_t* block_size)
 {
+  chunk_t* ptr = block_ptr[blockIdx.x];
+  size_t n = block_size[blockIdx.x];
+  size_t offset = n / 2;
+  for (size_t i = threadIdx.x; i < offset; i += blockDim.x) {
+    ptr[i] = ptr[offset + i];
+  }
 }
 
 // Distributed Random read
 template <class chunk_t>
 __global__ void rand_read_dist_k(
-  chunk_t** chunkPtr,
-  size_t* chunkSize)
+  chunk_t** block_ptr,
+  size_t* block_size)
 {
 }
 
@@ -438,9 +455,8 @@ float GPUbenchmark<chunk_t>::runDistributed(void (*kernel)(chunk_t**, size_t*, T
   }
 
   if (totComputedBlocks != nBlocks) {
-    std::cout << "   │   - \033[1;33mWarning: total number of estimated blocks (" << totComputedBlocks
-              << ") is different from requested one (" << nBlocks
-              << "). This may cause imbalance in the test!\e[0m"
+    std::cout << "   │   - \033[1;33mWarning: Sum of used blocks (" << totComputedBlocks
+              << ") is different from requested one (" << nBlocks << ")!\e[0m"
               << std::endl;
   }
 
diff --git a/GPU/GPUbenchmark/cuda/benchmark.cu b/GPU/GPUbenchmark/cuda/benchmark.cu
index e98f2fc4c9e37..3471865334b6f 100644
--- a/GPU/GPUbenchmark/cuda/benchmark.cu
+++ b/GPU/GPUbenchmark/cuda/benchmark.cu
@@ -95,7 +95,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[])
       conf.modes.push_back(Mode::Concurrent);
     } else if (mode == "dis") {
       conf.modes.push_back(Mode::Distributed);
-    }else {
+    } else {
       std::cerr << "Unkonwn mode: " << mode << std::endl;
       exit(1);
     }