From 71700972034de2a38dcbd8802b5fc6cd394c7f28 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 2 Nov 2021 16:23:20 +0100 Subject: [PATCH 1/8] Add boundaries protection + float --- GPU/GPUbenchmark/Shared/Kernels.h | 4 ++-- GPU/GPUbenchmark/Shared/Utils.h | 24 ++++++++------------- GPU/GPUbenchmark/cuda/Kernels.cu | 34 +++++++++++++++++++++++++++--- GPU/GPUbenchmark/cuda/benchmark.cu | 4 ++-- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index b1e89fdf0bf62..fd6f1fd93bd37 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -42,7 +42,7 @@ class GPUbenchmark final // Single stream synchronous (sequential kernels) execution template float runSequential(void (*kernel)(chunk_t*, size_t, T...), - std::pair& chunkRanges, + std::pair& chunkRanges, int nLaunches, int dimGrid, int dimBlock, @@ -51,7 +51,7 @@ class GPUbenchmark final // Multi-streams asynchronous executions on whole memory template std::vector runConcurrent(void (*kernel)(chunk_t*, size_t, T...), - std::vector>& chunkRanges, + std::vector>& chunkRanges, int nLaunches, int dimStreams, int nBlocks, diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 4b28c19e2be66..7c8271119c5fb 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -138,17 +138,11 @@ inline std::string getTestName(Mode mode, Test test, KernelConfig blocks) return tname; } -template -inline chunk_t* getPartPtr(chunk_t* scratchPtr, float chunkReservedGB, int partNumber) -{ - return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * partNumber); -} - // Return pointer to custom offset (GB) template -inline chunk_t* getCustomPtr(chunk_t* scratchPtr, int partNumber) +inline chunk_t* getCustomPtr(chunk_t* scratchPtr, float startGB) { - return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partNumber)); + return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * startGB)); } inline float computeThroughput(Test test, float result, float chunkSizeGB, int ntests) @@ -160,9 +154,9 @@ inline float computeThroughput(Test test, float result, float chunkSizeGB, int n } template -inline size_t getBufferCapacity(int chunkReservedGB) +inline size_t getBufferCapacity(float chunkReservedGB) { - return static_cast(GB * chunkReservedGB / sizeof(chunk_t)); + return static_cast((GB * chunkReservedGB) / sizeof(chunk_t)); } // LCG: https://rosettacode.org/wiki/Linear_congruential_generator @@ -202,7 +196,7 @@ struct benchmarkOpts { std::vector modes = {Mode::Sequential, Mode::Concurrent}; std::vector pools = {KernelConfig::Single, KernelConfig::Multi}; std::vector dtypes = {"char", "int", "ulong"}; - std::vector> testChunks; + std::vector> testChunks; float chunkReservedGB = 1.f; float threadPoolFraction = 1.f; float freeMemoryFractionToAllocate = 0.95f; @@ -235,10 +229,10 @@ struct gpuState { float chunkReservedGB; // Size of each partition (GB) // General containers and state - chunk_t* scratchPtr; // Pointer to scratch buffer - size_t scratchSize; // Size of scratch area (B) - std::vector partAddrOnHost; // Pointers to scratch partitions on host vector - std::vector> testChunks; // Vector of definitions for arbitrary chunks + chunk_t* scratchPtr; // Pointer to scratch buffer + size_t scratchSize; // Size of scratch area (B) + std::vector partAddrOnHost; // Pointers to scratch partitions on host vector + std::vector> testChunks; // Vector of definitions for arbitrary chunks // Static info size_t totalMemory; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e2baa83b5fdfe..7ec12e3111251 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -32,6 +32,31 @@ double bytesToconfig(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / GB; } +bool checkTestChunks(std::vector>& chunks, size_t availMemSizeGB) +{ + if (!chunks.size()) { + return true; + } + + bool check{false}; + + sort(chunks.begin(), chunks.end()); + for (auto iChunk{0}; iChunk < chunks.size(); ++iChunk) { // Check boundaries + if (chunks[iChunk].first + chunks[iChunk].second > availMemSizeGB) { + check = false; + break; + } + if (iChunk > 0) { // Check intersections + if (chunks[iChunk].first < chunks[iChunk - 1].first + chunks[iChunk - 1].second) { + check = false; + break; + } + } + check = true; + } + return check; +} + // CUDA does not support operations: // https://forums.developer.nvidia.com/t/swizzling-float4-arithmetic-support/217 #ifndef __HIPCC__ @@ -238,7 +263,7 @@ void printDeviceProp(int deviceId) template template float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T...), - std::pair& chunk, + std::pair& chunk, int nLaunches, int nBlocks, int nThreads, @@ -275,7 +300,7 @@ float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T... template template std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, size_t, T...), - std::vector>& chunkRanges, + std::vector>& chunkRanges, int nLaunches, int dimStreams, int nBlocks, @@ -354,6 +379,10 @@ void GPUbenchmark::globalInit() mState.iterations = mOptions.kernelLaunches; mState.streams = mOptions.streams; mState.testChunks = mOptions.testChunks; + if (!checkTestChunks(mOptions.testChunks, free / GB)) { + std::cerr << "Failed to configure memory chunks: check arbitrary chunks boundaries." << std::endl; + exit(1); + } mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; @@ -368,7 +397,6 @@ void GPUbenchmark::globalInit() std::cout << " ◈ Running on: \033[1;31m" << props.name << "\e[0m" << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); - // mState.computeScratchPtrs(); GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl diff --git a/GPU/GPUbenchmark/cuda/benchmark.cu b/GPU/GPUbenchmark/cuda/benchmark.cu index db90dd4888c11..1e84d430c1039 100644 --- a/GPU/GPUbenchmark/cuda/benchmark.cu +++ b/GPU/GPUbenchmark/cuda/benchmark.cu @@ -13,7 +13,7 @@ /// #include "../Shared/Kernels.h" -#define VERSION "version 0.1-pr#6773" +#define VERSION "version 0.2" bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) { @@ -117,7 +117,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) for (auto& aChunk : vm["arbitrary"].as>()) { const size_t sep = aChunk.find(':'); if (sep != std::string::npos) { - conf.testChunks.emplace_back(std::stoi(aChunk.substr(0, sep)), std::stoi(aChunk.substr(sep + 1))); + conf.testChunks.emplace_back(std::stof(aChunk.substr(0, sep)), std::stof(aChunk.substr(sep + 1))); } } From 619788357708b647e69e65bf032228b91416f253 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 2 Nov 2021 19:09:01 +0100 Subject: [PATCH 2/8] Add Host-based throughput computation --- GPU/GPUbenchmark/cuda/Kernels.cu | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 7ec12e3111251..d579d22dc5b59 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -13,6 +13,7 @@ /// \author: mconcas@cern.ch #include "../Shared/Kernels.h" +#include #include #include @@ -308,7 +309,7 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, T&... args) { auto nChunks = chunkRanges.size(); - std::vector results(nChunks); + std::vector results(nChunks + 1); // last spot is for the host time std::vector starts(nChunks), stops(nChunks); std::vector streams(dimStreams); @@ -327,6 +328,7 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, chunk_t* chunkPtr = getCustomPtr(mState.scratchPtr, chunk.first); (*kernel)<<>>(chunkPtr, getBufferCapacity(chunk.second), args...); } + auto start = std::chrono::high_resolution_clock::now(); for (auto iChunk{0}; iChunk < nChunks; ++iChunk) { auto& chunk = chunkRanges[iChunk]; @@ -344,11 +346,16 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, GPUCHECK(cudaEventDestroy(starts[iChunk])); GPUCHECK(cudaEventDestroy(stops[iChunk])); } + GPUCHECK(cudaDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff_t{end - start}; for (auto iStream{0}; iStream < dimStreams; ++iStream) { GPUCHECK(cudaStreamDestroy(streams[iStream])); } + results[nChunks] = diff_t.count(); // register host time on latest spot return results; } @@ -464,7 +471,7 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) nBlocks, nThreads); float sum{0}; - for (auto iChunk{0}; iChunk < results.size(); ++iChunk) { + for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { auto& chunk = mState.testChunks[iChunk]; auto throughput = computeThroughput(test, results[iChunk], chunk.second, mState.getNKernelLaunches()); sum += throughput; @@ -475,6 +482,15 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) if (mState.testChunks.size() > 1) { std::cout << " │ - total throughput: \e[1m" << sum << " GB/s \e[0m" << std::endl; } + + // Add throughput computed via system time measurement + float tot{0}; + for (auto& chunk : mState.testChunks) { + tot += chunk.second; + } + + std::cout << " │ - total throughput with host time: \e[1m" << computeThroughput(test, results[mState.testChunks.size()], tot, mState.getNKernelLaunches()) + << " GB/s \e[0m (" << std::setw(2) << results[mState.testChunks.size()] / 1000 << " s)" << std::endl; } mResultWriter.get()->snapshotBenchmark(); } From 337f68233d4d3bf9d23e70860d7f9f608753cb3c Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 2 Nov 2021 19:16:57 +0100 Subject: [PATCH 3/8] Fix boundary checking bug --- GPU/GPUbenchmark/cuda/Kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index d579d22dc5b59..3ff0fd5c8fc77 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -386,7 +386,7 @@ void GPUbenchmark::globalInit() mState.iterations = mOptions.kernelLaunches; mState.streams = mOptions.streams; mState.testChunks = mOptions.testChunks; - if (!checkTestChunks(mOptions.testChunks, free / GB)) { + if (!checkTestChunks(mOptions.testChunks, mOptions.freeMemoryFractionToAllocate * free / GB)) { std::cerr << "Failed to configure memory chunks: check arbitrary chunks boundaries." << std::endl; exit(1); } From 63700abc37043f721de2541b26ab1b8ecd2530c7 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 2 Nov 2021 23:24:13 +0100 Subject: [PATCH 4/8] Add proto function for distributed benchmark --- GPU/GPUbenchmark/Shared/Kernels.h | 13 ++- GPU/GPUbenchmark/Shared/Utils.h | 6 +- GPU/GPUbenchmark/cuda/Kernels.cu | 153 ++++++++++++++++++++++++++--- GPU/GPUbenchmark/cuda/benchmark.cu | 6 +- 4 files changed, 159 insertions(+), 19 deletions(-) diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index fd6f1fd93bd37..9723f0584035e 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -39,7 +39,7 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); - // Single stream synchronous (sequential kernels) execution + // Single stream (sequential kernels) execution template float runSequential(void (*kernel)(chunk_t*, size_t, T...), std::pair& chunkRanges, @@ -48,7 +48,7 @@ class GPUbenchmark final int dimBlock, T&... args); - // Multi-streams asynchronous executions on whole memory + // Multi-streams asynchronous executions template std::vector runConcurrent(void (*kernel)(chunk_t*, size_t, T...), std::vector>& chunkRanges, @@ -58,6 +58,15 @@ class GPUbenchmark final int nThreads, T&... args); + // Single stream executions on all chunks at a time by same kernel + template + float runDistributed(void (*kernel)(chunk_t**, size_t*, T...), + std::vector>& chunkRanges, + int nLaunches, + int nBlocks, + int nThreads, + T&... args); + // Main interface void globalInit(); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 7c8271119c5fb..d1e83f8bb282b 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -72,7 +72,8 @@ inline std::ostream& operator<<(std::ostream& os, Test test) enum class Mode { Sequential, - Concurrent + Concurrent, + Distributed }; inline std::ostream& operator<<(std::ostream& os, Mode mode) @@ -84,6 +85,9 @@ inline std::ostream& operator<<(std::ostream& os, Mode mode) case Mode::Concurrent: os << "concurrent"; break; + case Mode::Distributed: + os << "distributed"; + break; } return os; } diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 3ff0fd5c8fc77..90e82aa03eb2b 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -139,6 +139,46 @@ __global__ void rand_read_k( } chunkPtr[threadIdx.x] = sink; // writing done once } + +// Distributed read +template +__global__ void read_dist_k( + chunk_t** chunkPtr, + size_t* chunkSize) +{ +} + +// Distributed write +template +__global__ void write_dist_k( + chunk_t** chunkPtr, + size_t* chunkSize) +{ +} + +template <> +__global__ void write_dist_k( + int4** chunkPtr, + size_t* chunkSize) +{ +} + +// Distributed copy +template +__global__ void copy_dist_k( + chunk_t** chunkPtr, + size_t* chunkSize) +{ +} + +// Distributed Random read +template +__global__ void rand_read_dist_k( + chunk_t** chunkPtr, + size_t* chunkSize) +{ +} + } // namespace gpu void printDeviceProp(int deviceId) @@ -274,8 +314,8 @@ float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T... cudaEvent_t start, stop; cudaStream_t stream; GPUCHECK(cudaStreamCreate(&stream)); - GPUCHECK(cudaSetDevice(mOptions.deviceId)); + chunk_t* chunkPtr = getCustomPtr(mState.scratchPtr, chunk.first); // Warm up @@ -359,6 +399,62 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, return results; } +template +template +float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T...), + std::vector>& chunkRanges, + int nLaunches, + int nBlocks, + int nThreads, + T&... args) +{ + std::vector chunkPtrs(chunkRanges.size()); + std::vector ptrPerBlocks(nBlocks); + + float totChunkGB{0.f}; + for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { + chunkPtrs[iChunk] = getCustomPtr(mState.scratchPtr, chunkRanges[iChunk].first); + totChunkGB += chunkRanges[iChunk].second; + } + float memPerBlock = (float)totChunkGB / (float)nBlocks; + std::cout << "Mem per block: " << memPerBlock << std::endl; + int index{0}; + for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { + int blocks = chunkRanges[iChunk].second / memPerBlock; + for (size_t iBlock{0}; iBlock < blocks; ++iBlock) { + ptrPerBlocks[index] = getCustomPtr(chunkPtrs[iChunk], iBlock * memPerBlock); + index++; + } + } + std::cout << "starting ptr: " << mState.scratchPtr << std::endl; + for (auto& ptr : ptrPerBlocks) { + std::cout << ptr << std::endl; + } + + float milliseconds{0.f}; + cudaEvent_t start, stop; + cudaStream_t stream; + + GPUCHECK(cudaStreamCreate(&stream)); + GPUCHECK(cudaSetDevice(mOptions.deviceId)); + GPUCHECK(cudaEventCreate(&start)); + GPUCHECK(cudaEventCreate(&stop)); + // Warm up + (*kernel)<<>>(ptrPerBlocks.data(), nullptr, args...); + + GPUCHECK(cudaEventRecord(start)); + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(ptrPerBlocks.data(), nullptr, args...); // NOLINT: clang-tidy false-positive + } + GPUCHECK(cudaEventRecord(stop)); // record checkpoint + GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions + GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); + GPUCHECK(cudaEventDestroy(start)); + GPUCHECK(cudaEventDestroy(stop)); + GPUCHECK(cudaStreamDestroy(stream)); + return milliseconds; +} + template void GPUbenchmark::printDevices() { @@ -386,7 +482,7 @@ void GPUbenchmark::globalInit() mState.iterations = mOptions.kernelLaunches; mState.streams = mOptions.streams; mState.testChunks = mOptions.testChunks; - if (!checkTestChunks(mOptions.testChunks, mOptions.freeMemoryFractionToAllocate * free / GB)) { + if (!checkTestChunks(mState.testChunks, mOptions.freeMemoryFractionToAllocate * free / GB)) { std::cerr << "Failed to configure memory chunks: check arbitrary chunks boundaries." << std::endl; exit(1); } @@ -429,19 +525,37 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) : dimGrid}; auto capacity{mState.getChunkCapacity()}; void (*kernel)(chunk_t*, size_t); + void (*kernel_distributed)(chunk_t**, size_t*); - switch (test) { - case Test::Read: { - kernel = &gpu::read_k; - break; - } - case Test::Write: { - kernel = &gpu::write_k; - break; + if (mode != Mode::Distributed) { + switch (test) { + case Test::Read: { + kernel = &gpu::read_k; + break; + } + case Test::Write: { + kernel = &gpu::write_k; + break; + } + case Test::Copy: { + kernel = &gpu::copy_k; + break; + } } - case Test::Copy: { - kernel = &gpu::copy_k; - break; + } else { + switch (test) { + case Test::Read: { + kernel_distributed = &gpu::read_dist_k; + break; + } + case Test::Write: { + kernel_distributed = &gpu::write_dist_k; + break; + } + case Test::Copy: { + kernel_distributed = &gpu::copy_dist_k; + break; + } } } @@ -463,7 +577,7 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) << ": [" << chunk.first << "-" << chunk.first + chunk.second << ") \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n"; mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches()); } - } else { + } else if (mode == Mode::Concurrent) { auto results = runConcurrent(kernel, mState.testChunks, mState.getNKernelLaunches(), @@ -491,6 +605,17 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) std::cout << " │ - total throughput with host time: \e[1m" << computeThroughput(test, results[mState.testChunks.size()], tot, mState.getNKernelLaunches()) << " GB/s \e[0m (" << std::setw(2) << results[mState.testChunks.size()] / 1000 << " s)" << std::endl; + } else if (mode == Mode::Distributed) { + + auto result = runDistributed(kernel_distributed, + mState.testChunks, + mState.getNKernelLaunches(), + nBlocks, + nThreads); + // auto throughput = computeThroughput(test, result, chunk.second, mState.getNKernelLaunches()); + // std::cout << " │ " << ((mState.testChunks.size() - iChunk != 1) ? "├ " : "└ ") << iChunk + 1 << "/" << mState.testChunks.size() + // << ": [" << chunk.first << "-" << chunk.first + chunk.second << ") \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n"; + // mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches()); } mResultWriter.get()->snapshotBenchmark(); } diff --git a/GPU/GPUbenchmark/cuda/benchmark.cu b/GPU/GPUbenchmark/cuda/benchmark.cu index 1e84d430c1039..e98f2fc4c9e37 100644 --- a/GPU/GPUbenchmark/cuda/benchmark.cu +++ b/GPU/GPUbenchmark/cuda/benchmark.cu @@ -28,7 +28,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "device,d", bpo::value()->default_value(0), "Id of the device to run test on, EPN targeted.")( "test,t", bpo::value>()->multitoken()->default_value(std::vector{"read", "write", "copy"}, "read write copy"), "Tests to be performed.")( "kind,k", bpo::value>()->multitoken()->default_value(std::vector{"char", "int", "ulong", "int4"}, "char int ulong int4"), "Test data type to be used.")( - "mode,m", bpo::value>()->multitoken()->default_value(std::vector{"seq", "con"}, "seq con"), "Mode: sequential or concurrent.")( + "mode,m", bpo::value>()->multitoken()->default_value(std::vector{"seq", "con", "dis"}, "seq con dis"), "Mode: sequential, concurrent or distributed.")( "blockPool,b", bpo::value>()->multitoken()->default_value(std::vector{"sb", "mb", "ab"}, "sb mb ab"), "Pool strategy: single, multi or all blocks.")( "threadPool,e", bpo::value()->default_value(1.f), "Fraction of blockDim.x to use (aka: rounded fraction of thread pool).")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( @@ -93,7 +93,9 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.modes.push_back(Mode::Sequential); } else if (mode == "con") { conf.modes.push_back(Mode::Concurrent); - } else { + } else if (mode == "dis") { + conf.modes.push_back(Mode::Distributed); + }else { std::cerr << "Unkonwn mode: " << mode << std::endl; exit(1); } From cbbe3e47f132d971afe26dc8c33088d6b475087d Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 3 Nov 2021 18:43:25 +0100 Subject: [PATCH 5/8] Add read distributed kernel (segfaults) --- GPU/GPUbenchmark/cuda/Kernels.cu | 50 +++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 90e82aa03eb2b..fa96e8f9f4242 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -143,9 +143,16 @@ __global__ void rand_read_k( // Distributed read template __global__ void read_dist_k( - chunk_t** chunkPtr, - size_t* chunkSize) + chunk_t** blockPtrs, + size_t* blockSizes) { + chunk_t sink{0}; + auto* ptr = blockPtrs[blockIdx.x]; + auto n = blockSizes[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + sink += ptr[i]; + } + blockPtrs[blockIdx.x][threadIdx.x] = sink; } // Distributed write @@ -408,27 +415,34 @@ float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T int nThreads, T&... args) { - std::vector chunkPtrs(chunkRanges.size()); - std::vector ptrPerBlocks(nBlocks); + std::vector chunkPtrs(chunkRanges.size()); // Pointers to the beginning of each chunk + std::vector ptrPerBlocks(nBlocks); // Pointers for each block + std::vector perBlockCapacity(nBlocks); // Capacity of sub-buffer for block float totChunkGB{0.f}; + int totComputedBlocks{0}; + for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { chunkPtrs[iChunk] = getCustomPtr(mState.scratchPtr, chunkRanges[iChunk].first); totChunkGB += chunkRanges[iChunk].second; } - float memPerBlock = (float)totChunkGB / (float)nBlocks; - std::cout << "Mem per block: " << memPerBlock << std::endl; int index{0}; for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { - int blocks = chunkRanges[iChunk].second / memPerBlock; - for (size_t iBlock{0}; iBlock < blocks; ++iBlock) { + float percFromMem = chunkRanges[iChunk].second / totChunkGB; + int blocksPerChunk = percFromMem * nBlocks; + totComputedBlocks += blocksPerChunk; + for (int iBlock{0}; iBlock < blocksPerChunk; ++iBlock, ++index) { + float memPerBlock = chunkRanges[iChunk].second / blocksPerChunk; ptrPerBlocks[index] = getCustomPtr(chunkPtrs[iChunk], iBlock * memPerBlock); - index++; + perBlockCapacity[index] = getBufferCapacity(memPerBlock); } } - std::cout << "starting ptr: " << mState.scratchPtr << std::endl; - for (auto& ptr : ptrPerBlocks) { - std::cout << ptr << std::endl; + + if (totComputedBlocks != nBlocks) { + std::cout << " │ - \033[1;33mWarning: total number of estimated blocks (" << totComputedBlocks + << ") is different from requested one (" << nBlocks + << "). This may cause imbalance in the test!\e[0m" + << std::endl; } float milliseconds{0.f}; @@ -439,12 +453,13 @@ float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T GPUCHECK(cudaSetDevice(mOptions.deviceId)); GPUCHECK(cudaEventCreate(&start)); GPUCHECK(cudaEventCreate(&stop)); + // Warm up - (*kernel)<<>>(ptrPerBlocks.data(), nullptr, args...); + (*kernel)<<>>(ptrPerBlocks.data(), perBlockCapacity.data(), args...); GPUCHECK(cudaEventRecord(start)); - for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches - (*kernel)<<>>(ptrPerBlocks.data(), nullptr, args...); // NOLINT: clang-tidy false-positive + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(ptrPerBlocks.data(), perBlockCapacity.data(), args...); // NOLINT: clang-tidy false-positive } GPUCHECK(cudaEventRecord(stop)); // record checkpoint GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions @@ -562,9 +577,9 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { std::cout << " ├ " << mode << " " << test << " " << config << " block(s) (" << measurement + 1 << "/" << mOptions.nTests << "): \n" << " │ - blocks per kernel: " << nBlocks << "/" << dimGrid << "\n" - << " │ - threads per block: " << (int)nThreads << "\n" - << " │ - per chunk throughput:\n"; + << " │ - threads per block: " << (int)nThreads << "\n"; if (mode == Mode::Sequential) { + std::cout << " │ - per chunk throughput:\n"; for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { // loop over single chunks separately auto& chunk = mState.testChunks[iChunk]; auto result = runSequential(kernel, @@ -578,6 +593,7 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches()); } } else if (mode == Mode::Concurrent) { + std::cout << " │ - per chunk throughput:\n"; auto results = runConcurrent(kernel, mState.testChunks, mState.getNKernelLaunches(), From d7fba08abc7470b94d8c83e90cacf93ea4027b7a Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 3 Nov 2021 20:29:28 +0100 Subject: [PATCH 6/8] Fix read distributed kernel --- GPU/GPUbenchmark/cuda/Kernels.cu | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index fa96e8f9f4242..701777d737216 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -142,17 +142,16 @@ __global__ void rand_read_k( // Distributed read template -__global__ void read_dist_k( - chunk_t** blockPtrs, - size_t* blockSizes) +__global__ void read_dist_k(chunk_t** block_ptr, + size_t* block_size) { chunk_t sink{0}; - auto* ptr = blockPtrs[blockIdx.x]; - auto n = blockSizes[blockIdx.x]; + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; for (size_t i = threadIdx.x; i < n; i += blockDim.x) { sink += ptr[i]; } - blockPtrs[blockIdx.x][threadIdx.x] = sink; + block_ptr[blockIdx.x][threadIdx.x] = sink; } // Distributed write @@ -445,6 +444,15 @@ float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T << std::endl; } + // Setup + + chunk_t** block_ptr; + size_t* block_size; + GPUCHECK(cudaMalloc(reinterpret_cast(&block_ptr), 60 * sizeof(chunk_t*))); + GPUCHECK(cudaMalloc(reinterpret_cast(&block_size), 60 * sizeof(size_t))); + GPUCHECK(cudaMemcpy(block_ptr, ptrPerBlocks.data(), nBlocks * sizeof(chunk_t*), cudaMemcpyHostToDevice)); + GPUCHECK(cudaMemcpy(block_size, perBlockCapacity.data(), nBlocks * sizeof(size_t), cudaMemcpyHostToDevice)); + float milliseconds{0.f}; cudaEvent_t start, stop; cudaStream_t stream; @@ -455,11 +463,11 @@ float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T GPUCHECK(cudaEventCreate(&stop)); // Warm up - (*kernel)<<>>(ptrPerBlocks.data(), perBlockCapacity.data(), args...); + (*kernel)<<>>(block_ptr, block_size, args...); GPUCHECK(cudaEventRecord(start)); - for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches - (*kernel)<<>>(ptrPerBlocks.data(), perBlockCapacity.data(), args...); // NOLINT: clang-tidy false-positive + for (auto iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(block_ptr, block_size, args...); // NOLINT: clang-tidy false-positive } GPUCHECK(cudaEventRecord(stop)); // record checkpoint GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions From 271919e4d4956779ca52e0179f4c454e149f27bc Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 3 Nov 2021 21:00:27 +0100 Subject: [PATCH 7/8] Compute throughput for distributed case --- GPU/GPUbenchmark/cuda/Kernels.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 701777d737216..8aba6981bf2a8 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -445,7 +445,6 @@ float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T } // Setup - chunk_t** block_ptr; size_t* block_size; GPUCHECK(cudaMalloc(reinterpret_cast(&block_ptr), 60 * sizeof(chunk_t*))); @@ -630,16 +629,18 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) std::cout << " │ - total throughput with host time: \e[1m" << computeThroughput(test, results[mState.testChunks.size()], tot, mState.getNKernelLaunches()) << " GB/s \e[0m (" << std::setw(2) << results[mState.testChunks.size()] / 1000 << " s)" << std::endl; } else if (mode == Mode::Distributed) { - auto result = runDistributed(kernel_distributed, mState.testChunks, mState.getNKernelLaunches(), nBlocks, nThreads); - // auto throughput = computeThroughput(test, result, chunk.second, mState.getNKernelLaunches()); - // std::cout << " │ " << ((mState.testChunks.size() - iChunk != 1) ? "├ " : "└ ") << iChunk + 1 << "/" << mState.testChunks.size() - // << ": [" << chunk.first << "-" << chunk.first + chunk.second << ") \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n"; - // mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches()); + float tot{0}; + for (auto& chunk : mState.testChunks) { + tot += chunk.second; + } + auto throughput = computeThroughput(test, result, tot, mState.getNKernelLaunches()); + std::cout << " │ └ throughput: \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n"; + mResultWriter.get()->storeBenchmarkEntry(test, 0, result, tot, mState.getNKernelLaunches()); } mResultWriter.get()->snapshotBenchmark(); } From 3bbedec4c8023a294dfae8c74396c11794696c6c Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Wed, 3 Nov 2021 21:26:16 +0100 Subject: [PATCH 8/8] Add rest of the kernels --- GPU/GPUbenchmark/cuda/Kernels.cu | 44 ++++++++++++++++++++---------- GPU/GPUbenchmark/cuda/benchmark.cu | 2 +- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 8aba6981bf2a8..8413894073bb5 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -142,8 +142,9 @@ __global__ void rand_read_k( // Distributed read template -__global__ void read_dist_k(chunk_t** block_ptr, - size_t* block_size) +__global__ void read_dist_k( + chunk_t** block_ptr, + size_t* block_size) { chunk_t sink{0}; chunk_t* ptr = block_ptr[blockIdx.x]; @@ -151,37 +152,53 @@ __global__ void read_dist_k(chunk_t** block_ptr, for (size_t i = threadIdx.x; i < n; i += blockDim.x) { sink += ptr[i]; } - block_ptr[blockIdx.x][threadIdx.x] = sink; + ptr[threadIdx.x] = sink; } // Distributed write template __global__ void write_dist_k( - chunk_t** chunkPtr, - size_t* chunkSize) + chunk_t** block_ptr, + size_t* block_size) { + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + ptr[i] = 0; + } } template <> __global__ void write_dist_k( - int4** chunkPtr, - size_t* chunkSize) + int4** block_ptr, + size_t* block_size) { + int4* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + ptr[i] = {0, 1, 0, 0}; + } } // Distributed copy template __global__ void copy_dist_k( - chunk_t** chunkPtr, - size_t* chunkSize) + chunk_t** block_ptr, + size_t* block_size) { + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + size_t offset = n / 2; + for (size_t i = threadIdx.x; i < offset; i += blockDim.x) { + ptr[i] = ptr[offset + i]; + } } // Distributed Random read template __global__ void rand_read_dist_k( - chunk_t** chunkPtr, - size_t* chunkSize) + chunk_t** block_ptr, + size_t* block_size) { } @@ -438,9 +455,8 @@ float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T } if (totComputedBlocks != nBlocks) { - std::cout << " │ - \033[1;33mWarning: total number of estimated blocks (" << totComputedBlocks - << ") is different from requested one (" << nBlocks - << "). This may cause imbalance in the test!\e[0m" + std::cout << " │ - \033[1;33mWarning: Sum of used blocks (" << totComputedBlocks + << ") is different from requested one (" << nBlocks << ")!\e[0m" << std::endl; } diff --git a/GPU/GPUbenchmark/cuda/benchmark.cu b/GPU/GPUbenchmark/cuda/benchmark.cu index e98f2fc4c9e37..3471865334b6f 100644 --- a/GPU/GPUbenchmark/cuda/benchmark.cu +++ b/GPU/GPUbenchmark/cuda/benchmark.cu @@ -95,7 +95,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.modes.push_back(Mode::Concurrent); } else if (mode == "dis") { conf.modes.push_back(Mode::Distributed); - }else { + } else { std::cerr << "Unkonwn mode: " << mode << std::endl; exit(1); }