diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index b1e89fdf0bf62..9723f0584035e 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -39,25 +39,34 @@ class GPUbenchmark final template float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args); - // Single stream synchronous (sequential kernels) execution + // Single stream (sequential kernels) execution template float runSequential(void (*kernel)(chunk_t*, size_t, T...), - std::pair& chunkRanges, + std::pair& chunkRanges, int nLaunches, int dimGrid, int dimBlock, T&... args); - // Multi-streams asynchronous executions on whole memory + // Multi-streams asynchronous executions template std::vector runConcurrent(void (*kernel)(chunk_t*, size_t, T...), - std::vector>& chunkRanges, + std::vector>& chunkRanges, int nLaunches, int dimStreams, int nBlocks, int nThreads, T&... args); + // Single stream executions on all chunks at a time by same kernel + template + float runDistributed(void (*kernel)(chunk_t**, size_t*, T...), + std::vector>& chunkRanges, + int nLaunches, + int nBlocks, + int nThreads, + T&... 
args); + // Main interface void globalInit(); // Allocate scratch buffers and compute runtime parameters void run(); // Execute all specified callbacks diff --git a/GPU/GPUbenchmark/Shared/Utils.h b/GPU/GPUbenchmark/Shared/Utils.h index 4b28c19e2be66..d1e83f8bb282b 100644 --- a/GPU/GPUbenchmark/Shared/Utils.h +++ b/GPU/GPUbenchmark/Shared/Utils.h @@ -72,7 +72,8 @@ inline std::ostream& operator<<(std::ostream& os, Test test) enum class Mode { Sequential, - Concurrent + Concurrent, + Distributed }; inline std::ostream& operator<<(std::ostream& os, Mode mode) @@ -84,6 +85,9 @@ inline std::ostream& operator<<(std::ostream& os, Mode mode) case Mode::Concurrent: os << "concurrent"; break; + case Mode::Distributed: + os << "distributed"; + break; } return os; } @@ -138,17 +142,11 @@ inline std::string getTestName(Mode mode, Test test, KernelConfig blocks) return tname; } -template -inline chunk_t* getPartPtr(chunk_t* scratchPtr, float chunkReservedGB, int partNumber) -{ - return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * chunkReservedGB) * partNumber); -} - // Return pointer to custom offset (GB) template -inline chunk_t* getCustomPtr(chunk_t* scratchPtr, int partNumber) +inline chunk_t* getCustomPtr(chunk_t* scratchPtr, float startGB) { - return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * partNumber)); + return reinterpret_cast(reinterpret_cast(scratchPtr) + static_cast(GB * startGB)); } inline float computeThroughput(Test test, float result, float chunkSizeGB, int ntests) @@ -160,9 +158,9 @@ inline float computeThroughput(Test test, float result, float chunkSizeGB, int n } template -inline size_t getBufferCapacity(int chunkReservedGB) +inline size_t getBufferCapacity(float chunkReservedGB) { - return static_cast(GB * chunkReservedGB / sizeof(chunk_t)); + return static_cast((GB * chunkReservedGB) / sizeof(chunk_t)); } // LCG: https://rosettacode.org/wiki/Linear_congruential_generator @@ -202,7 +200,7 @@ struct benchmarkOpts { 
std::vector modes = {Mode::Sequential, Mode::Concurrent}; std::vector pools = {KernelConfig::Single, KernelConfig::Multi}; std::vector dtypes = {"char", "int", "ulong"}; - std::vector> testChunks; + std::vector> testChunks; float chunkReservedGB = 1.f; float threadPoolFraction = 1.f; float freeMemoryFractionToAllocate = 0.95f; @@ -235,10 +233,10 @@ struct gpuState { float chunkReservedGB; // Size of each partition (GB) // General containers and state - chunk_t* scratchPtr; // Pointer to scratch buffer - size_t scratchSize; // Size of scratch area (B) - std::vector partAddrOnHost; // Pointers to scratch partitions on host vector - std::vector> testChunks; // Vector of definitions for arbitrary chunks + chunk_t* scratchPtr; // Pointer to scratch buffer + size_t scratchSize; // Size of scratch area (B) + std::vector partAddrOnHost; // Pointers to scratch partitions on host vector + std::vector> testChunks; // Vector of definitions for arbitrary chunks // Static info size_t totalMemory; diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index e2baa83b5fdfe..8413894073bb5 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -13,6 +13,7 @@ /// \author: mconcas@cern.ch #include "../Shared/Kernels.h" +#include #include #include @@ -32,6 +33,31 @@ double bytesToconfig(size_t s) { return (double)s / (1024.0); } double bytesToGB(size_t s) { return (double)s / GB; } +bool checkTestChunks(std::vector>& chunks, size_t availMemSizeGB) +{ + if (!chunks.size()) { + return true; + } + + bool check{false}; + + sort(chunks.begin(), chunks.end()); + for (auto iChunk{0}; iChunk < chunks.size(); ++iChunk) { // Check boundaries + if (chunks[iChunk].first + chunks[iChunk].second > availMemSizeGB) { + check = false; + break; + } + if (iChunk > 0) { // Check intersections + if (chunks[iChunk].first < chunks[iChunk - 1].first + chunks[iChunk - 1].second) { + check = false; + break; + } + } + check = true; + } + return check; +} 
+ // CUDA does not support operations: // https://forums.developer.nvidia.com/t/swizzling-float4-arithmetic-support/217 #ifndef __HIPCC__ @@ -113,6 +139,69 @@ __global__ void rand_read_k( } chunkPtr[threadIdx.x] = sink; // writing done once } + +// Distributed read +template +__global__ void read_dist_k( + chunk_t** block_ptr, + size_t* block_size) +{ + chunk_t sink{0}; + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + sink += ptr[i]; + } + ptr[threadIdx.x] = sink; +} + +// Distributed write +template +__global__ void write_dist_k( + chunk_t** block_ptr, + size_t* block_size) +{ + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + ptr[i] = 0; + } +} + +template <> +__global__ void write_dist_k( + int4** block_ptr, + size_t* block_size) +{ + int4* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + for (size_t i = threadIdx.x; i < n; i += blockDim.x) { + ptr[i] = {0, 1, 0, 0}; + } +} + +// Distributed copy +template +__global__ void copy_dist_k( + chunk_t** block_ptr, + size_t* block_size) +{ + chunk_t* ptr = block_ptr[blockIdx.x]; + size_t n = block_size[blockIdx.x]; + size_t offset = n / 2; + for (size_t i = threadIdx.x; i < offset; i += blockDim.x) { + ptr[i] = ptr[offset + i]; + } +} + +// Distributed Random read +template +__global__ void rand_read_dist_k( + chunk_t** block_ptr, + size_t* block_size) +{ +} + } // namespace gpu void printDeviceProp(int deviceId) @@ -238,7 +327,7 @@ void printDeviceProp(int deviceId) template template float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T...), - std::pair& chunk, + std::pair& chunk, int nLaunches, int nBlocks, int nThreads, @@ -248,8 +337,8 @@ float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T... 
cudaEvent_t start, stop; cudaStream_t stream; GPUCHECK(cudaStreamCreate(&stream)); - GPUCHECK(cudaSetDevice(mOptions.deviceId)); + chunk_t* chunkPtr = getCustomPtr(mState.scratchPtr, chunk.first); // Warm up @@ -275,7 +364,7 @@ float GPUbenchmark::runSequential(void (*kernel)(chunk_t*, size_t, T... template template std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, size_t, T...), - std::vector>& chunkRanges, + std::vector>& chunkRanges, int nLaunches, int dimStreams, int nBlocks, @@ -283,7 +372,7 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, T&... args) { auto nChunks = chunkRanges.size(); - std::vector results(nChunks); + std::vector results(nChunks + 1); // last spot is for the host time std::vector starts(nChunks), stops(nChunks); std::vector streams(dimStreams); @@ -302,6 +391,7 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, chunk_t* chunkPtr = getCustomPtr(mState.scratchPtr, chunk.first); (*kernel)<<>>(chunkPtr, getBufferCapacity(chunk.second), args...); } + auto start = std::chrono::high_resolution_clock::now(); for (auto iChunk{0}; iChunk < nChunks; ++iChunk) { auto& chunk = chunkRanges[iChunk]; @@ -319,14 +409,90 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, GPUCHECK(cudaEventDestroy(starts[iChunk])); GPUCHECK(cudaEventDestroy(stops[iChunk])); } + GPUCHECK(cudaDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff_t{end - start}; for (auto iStream{0}; iStream < dimStreams; ++iStream) { GPUCHECK(cudaStreamDestroy(streams[iStream])); } + results[nChunks] = diff_t.count(); // register host time on latest spot return results; } +template +template +float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T...), + std::vector>& chunkRanges, + int nLaunches, + int nBlocks, + int nThreads, + T&... 
args) +{ + std::vector chunkPtrs(chunkRanges.size()); // Pointers to the beginning of each chunk + std::vector ptrPerBlocks(nBlocks); // Pointers for each block + std::vector perBlockCapacity(nBlocks); // Capacity of sub-buffer for block + + float totChunkGB{0.f}; + int totComputedBlocks{0}; + + for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { + chunkPtrs[iChunk] = getCustomPtr(mState.scratchPtr, chunkRanges[iChunk].first); + totChunkGB += chunkRanges[iChunk].second; + } + int index{0}; + for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { + float percFromMem = chunkRanges[iChunk].second / totChunkGB; + int blocksPerChunk = percFromMem * nBlocks; + totComputedBlocks += blocksPerChunk; + for (int iBlock{0}; iBlock < blocksPerChunk; ++iBlock, ++index) { + float memPerBlock = chunkRanges[iChunk].second / blocksPerChunk; + ptrPerBlocks[index] = getCustomPtr(chunkPtrs[iChunk], iBlock * memPerBlock); + perBlockCapacity[index] = getBufferCapacity(memPerBlock); + } + } + + if (totComputedBlocks != nBlocks) { + std::cout << " │ - \033[1;33mWarning: Sum of used blocks (" << totComputedBlocks + << ") is different from requested one (" << nBlocks << ")!\e[0m" + << std::endl; + } + + // Setup + chunk_t** block_ptr; + size_t* block_size; + GPUCHECK(cudaMalloc(reinterpret_cast(&block_ptr), nBlocks * sizeof(chunk_t*))); + GPUCHECK(cudaMalloc(reinterpret_cast(&block_size), nBlocks * sizeof(size_t))); + GPUCHECK(cudaMemcpy(block_ptr, ptrPerBlocks.data(), nBlocks * sizeof(chunk_t*), cudaMemcpyHostToDevice)); + GPUCHECK(cudaMemcpy(block_size, perBlockCapacity.data(), nBlocks * sizeof(size_t), cudaMemcpyHostToDevice)); + + float milliseconds{0.f}; + cudaEvent_t start, stop; + cudaStream_t stream; + + GPUCHECK(cudaSetDevice(mOptions.deviceId)); + GPUCHECK(cudaStreamCreate(&stream)); + GPUCHECK(cudaEventCreate(&start)); + GPUCHECK(cudaEventCreate(&stop)); + + // Warm up + (*kernel)<<>>(block_ptr, block_size, args...); + + GPUCHECK(cudaEventRecord(start)); + for (auto 
iLaunch{0}; iLaunch < nLaunches; ++iLaunch) { // Schedule all the requested kernel launches + (*kernel)<<>>(block_ptr, block_size, args...); // NOLINT: clang-tidy false-positive + } + GPUCHECK(cudaEventRecord(stop)); // record checkpoint + GPUCHECK(cudaEventSynchronize(stop)); // synchronize executions + GPUCHECK(cudaEventElapsedTime(&milliseconds, start, stop)); + GPUCHECK(cudaEventDestroy(start)); + GPUCHECK(cudaEventDestroy(stop)); + GPUCHECK(cudaStreamDestroy(stream)); + GPUCHECK(cudaFree(block_ptr)); + GPUCHECK(cudaFree(block_size)); + return milliseconds; +} + template void GPUbenchmark::printDevices() { @@ -354,6 +520,10 @@ void GPUbenchmark::globalInit() mState.iterations = mOptions.kernelLaunches; mState.streams = mOptions.streams; mState.testChunks = mOptions.testChunks; + if (!checkTestChunks(mState.testChunks, mOptions.freeMemoryFractionToAllocate * free / GB)) { + std::cerr << "Failed to configure memory chunks: check arbitrary chunks boundaries." << std::endl; + exit(1); + } mState.nMultiprocessors = props.multiProcessorCount; mState.nMaxThreadsPerBlock = props.maxThreadsPerMultiProcessor; mState.nMaxThreadsPerDimension = props.maxThreadsDim[0]; @@ -368,7 +538,6 @@ void GPUbenchmark::globalInit() std::cout << " ◈ Running on: \033[1;31m" << props.name << "\e[0m" << std::endl; // Allocate scratch on GPU GPUCHECK(cudaMalloc(reinterpret_cast(&mState.scratchPtr), mState.scratchSize)); - // mState.computeScratchPtrs(); GPUCHECK(cudaMemset(mState.scratchPtr, 0, mState.scratchSize)) std::cout << " ├ Buffer type: \e[1m" << getType() << "\e[0m" << std::endl @@ -394,28 +563,46 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) : dimGrid}; auto capacity{mState.getChunkCapacity()}; void (*kernel)(chunk_t*, size_t); + void (*kernel_distributed)(chunk_t**, size_t*); - switch (test) { - case Test::Read: { - kernel = &gpu::read_k; - break; - } - case Test::Write: { - kernel = &gpu::write_k; - break; + if (mode != Mode::Distributed) { + switch (test) { + case Test::Read: { + kernel = &gpu::read_k; + break; + } + 
case Test::Write: { + kernel = &gpu::write_k; + break; + } + case Test::Copy: { + kernel = &gpu::copy_k; + break; + } } - case Test::Copy: { - kernel = &gpu::copy_k; - break; + } else { + switch (test) { + case Test::Read: { + kernel_distributed = &gpu::read_dist_k; + break; + } + case Test::Write: { + kernel_distributed = &gpu::write_dist_k; + break; + } + case Test::Copy: { + kernel_distributed = &gpu::copy_dist_k; + break; + } } } for (auto measurement{0}; measurement < mOptions.nTests; ++measurement) { std::cout << " ├ " << mode << " " << test << " " << config << " block(s) (" << measurement + 1 << "/" << mOptions.nTests << "): \n" << " │ - blocks per kernel: " << nBlocks << "/" << dimGrid << "\n" - << " │ - threads per block: " << (int)nThreads << "\n" - << " │ - per chunk throughput:\n"; + << " │ - threads per block: " << (int)nThreads << "\n"; if (mode == Mode::Sequential) { + std::cout << " │ - per chunk throughput:\n"; for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { // loop over single chunks separately auto& chunk = mState.testChunks[iChunk]; auto result = runSequential(kernel, @@ -428,7 +615,8 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) << ": [" << chunk.first << "-" << chunk.first + chunk.second << ") \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n"; mResultWriter.get()->storeBenchmarkEntry(test, iChunk, result, chunk.second, mState.getNKernelLaunches()); } - } else { + } else if (mode == Mode::Concurrent) { + std::cout << " │ - per chunk throughput:\n"; auto results = runConcurrent(kernel, mState.testChunks, mState.getNKernelLaunches(), @@ -436,7 +624,7 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) nBlocks, nThreads); float sum{0}; - for (auto iChunk{0}; iChunk < results.size(); ++iChunk) { + for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { auto& chunk = mState.testChunks[iChunk]; auto throughput = computeThroughput(test, results[iChunk], 
chunk.second, mState.getNKernelLaunches()); sum += throughput; @@ -447,6 +635,28 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) if (mState.testChunks.size() > 1) { std::cout << " │ - total throughput: \e[1m" << sum << " GB/s \e[0m" << std::endl; } + + // Add throughput computed via system time measurement + float tot{0}; + for (auto& chunk : mState.testChunks) { + tot += chunk.second; + } + + std::cout << " │ - total throughput with host time: \e[1m" << computeThroughput(test, results[mState.testChunks.size()], tot, mState.getNKernelLaunches()) + << " GB/s \e[0m (" << std::setw(2) << results[mState.testChunks.size()] / 1000 << " s)" << std::endl; + } else if (mode == Mode::Distributed) { + auto result = runDistributed(kernel_distributed, + mState.testChunks, + mState.getNKernelLaunches(), + nBlocks, + nThreads); + float tot{0}; + for (auto& chunk : mState.testChunks) { + tot += chunk.second; + } + auto throughput = computeThroughput(test, result, tot, mState.getNKernelLaunches()); + std::cout << " │ └ throughput: \e[1m" << throughput << " GB/s \e[0m(" << result * 1e-3 << " s)\n"; + mResultWriter.get()->storeBenchmarkEntry(test, 0, result, tot, mState.getNKernelLaunches()); } mResultWriter.get()->snapshotBenchmark(); } diff --git a/GPU/GPUbenchmark/cuda/benchmark.cu b/GPU/GPUbenchmark/cuda/benchmark.cu index db90dd4888c11..3471865334b6f 100644 --- a/GPU/GPUbenchmark/cuda/benchmark.cu +++ b/GPU/GPUbenchmark/cuda/benchmark.cu @@ -13,7 +13,7 @@ /// #include "../Shared/Kernels.h" -#define VERSION "version 0.1-pr#6773" +#define VERSION "version 0.2" bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) { @@ -28,7 +28,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) "device,d", bpo::value()->default_value(0), "Id of the device to run test on, EPN targeted.")( "test,t", bpo::value>()->multitoken()->default_value(std::vector{"read", "write", "copy"}, "read write copy"), "Tests to be 
performed.")( "kind,k", bpo::value>()->multitoken()->default_value(std::vector{"char", "int", "ulong", "int4"}, "char int ulong int4"), "Test data type to be used.")( - "mode,m", bpo::value>()->multitoken()->default_value(std::vector{"seq", "con"}, "seq con"), "Mode: sequential or concurrent.")( + "mode,m", bpo::value>()->multitoken()->default_value(std::vector{"seq", "con", "dis"}, "seq con dis"), "Mode: sequential, concurrent or distributed.")( "blockPool,b", bpo::value>()->multitoken()->default_value(std::vector{"sb", "mb", "ab"}, "sb mb ab"), "Pool strategy: single, multi or all blocks.")( "threadPool,e", bpo::value()->default_value(1.f), "Fraction of blockDim.x to use (aka: rounded fraction of thread pool).")( "chunkSize,c", bpo::value()->default_value(1.f), "Size of scratch partitions (GB).")( @@ -93,6 +93,8 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) conf.modes.push_back(Mode::Sequential); } else if (mode == "con") { conf.modes.push_back(Mode::Concurrent); + } else if (mode == "dis") { + conf.modes.push_back(Mode::Distributed); } else { std::cerr << "Unkonwn mode: " << mode << std::endl; exit(1); @@ -117,7 +119,7 @@ bool parseArgs(o2::benchmark::benchmarkOpts& conf, int argc, const char* argv[]) for (auto& aChunk : vm["arbitrary"].as>()) { const size_t sep = aChunk.find(':'); if (sep != std::string::npos) { - conf.testChunks.emplace_back(std::stoi(aChunk.substr(0, sep)), std::stoi(aChunk.substr(sep + 1))); + conf.testChunks.emplace_back(std::stof(aChunk.substr(0, sep)), std::stof(aChunk.substr(sep + 1))); } }