Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions GPU/GPUbenchmark/Shared/Kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,34 @@ class GPUbenchmark final
// Declaration only (the definition is not shown here).
// Parameters: a pointer to a GPUbenchmark member function returning void, a C string,
// and a pack of arguments whose types match that member function's parameter list.
// NOTE(review): presumably the C string labels the measurement and the returned float
// is the measured duration of the call — confirm against the definition.
template <typename... T>
float measure(void (GPUbenchmark::*)(T...), const char*, T&&... args);

// Single stream (sequential kernels) execution.
//   kernel      : function pointer receiving a chunk_t*, a size_t and the extra T... arguments
//   chunkRanges : one chunk definition as a pair of floats
//                 (NOTE(review): presumably {offset, size} expressed in GB — confirm against the definition)
//   nLaunches, dimGrid, dimBlock :
//                 NOTE(review): presumably number of kernel launches and the launch
//                 grid/block dimensions — confirm against the definition
//   args        : extra arguments, taken by lvalue reference, matching the kernel's T... parameters
// NOTE(review): the returned float is presumably the measured time — confirm.
template <typename... T>
float runSequential(void (*kernel)(chunk_t*, size_t, T...),
                    std::pair<float, float>& chunkRanges,
                    int nLaunches,
                    int dimGrid,
                    int dimBlock,
                    T&... args);

// Multi-streams asynchronous executions.
//   kernel      : function pointer receiving a chunk_t*, a size_t and the extra T... arguments
//   chunkRanges : list of chunk definitions, each a pair of floats
//                 (NOTE(review): presumably {offset, size} expressed in GB — confirm against the definition)
//   nLaunches, dimStreams, nBlocks, nThreads :
//                 NOTE(review): presumably number of launches, number of streams and the
//                 launch configuration — confirm against the definition
//   args        : extra arguments, taken by lvalue reference, matching the kernel's T... parameters
// Returns a vector of floats (NOTE(review): presumably one measurement per chunk/stream — confirm).
template <typename... T>
std::vector<float> runConcurrent(void (*kernel)(chunk_t*, size_t, T...),
                                 std::vector<std::pair<float, float>>& chunkRanges,
                                 int nLaunches,
                                 int dimStreams,
                                 int nBlocks,
                                 int nThreads,
                                 T&... args);

// Single stream executions on all chunks at a time by same kernel.
// Unlike the single-chunk kernel signatures, this kernel takes a chunk_t** and a size_t*
// instead of one chunk_t* and one size_t.
// NOTE(review): presumably these are arrays holding one base pointer and one size per entry
// of chunkRanges, and the returned float is the measured time — confirm against the definition.
template <typename... T>
float runDistributed(void (*kernel)(chunk_t**, size_t*, T...),
std::vector<std::pair<float, float>>& chunkRanges,
int nLaunches,
int nBlocks,
int nThreads,
T&... args);

// Main interface
// NOTE(review): presumably globalInit() has to be called before run(), since it is the
// step that sets up the scratch buffers and runtime parameters — confirm against the definitions.
void globalInit(); // Allocate scratch buffers and compute runtime parameters
void run(); // Execute all specified callbacks
Expand Down
30 changes: 14 additions & 16 deletions GPU/GPUbenchmark/Shared/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ inline std::ostream& operator<<(std::ostream& os, Test test)

// Execution mode of a benchmark.
// Distributed is appended after the existing enumerators, so Sequential and
// Concurrent keep their underlying values (0 and 1).
enum class Mode {
  Sequential,
  Concurrent,
  Distributed
};

inline std::ostream& operator<<(std::ostream& os, Mode mode)
Expand All @@ -84,6 +85,9 @@ inline std::ostream& operator<<(std::ostream& os, Mode mode)
case Mode::Concurrent:
os << "concurrent";
break;
case Mode::Distributed:
os << "distributed";
break;
}
return os;
}
Expand Down Expand Up @@ -138,17 +142,11 @@ inline std::string getTestName(Mode mode, Test test, KernelConfig blocks)
return tname;
}

// Address of partition number `partNumber` inside the scratch buffer.
// Every partition is `GB * chunkReservedGB` bytes wide (the product is truncated
// to size_t before being multiplied by the partition index).
template <class chunk_t>
inline chunk_t* getPartPtr(chunk_t* scratchPtr, float chunkReservedGB, int partNumber)
{
  const auto partitionBytes = static_cast<size_t>(GB * chunkReservedGB);
  char* const bufferStart = reinterpret_cast<char*>(scratchPtr);
  // Offset is computed in bytes, then the result is viewed again as chunk_t*.
  return reinterpret_cast<chunk_t*>(bufferStart + partitionBytes * partNumber);
}

// Return pointer to custom offset (GB)
template <class chunk_t>
inline chunk_t* getCustomPtr(chunk_t* scratchPtr, int partNumber)
inline chunk_t* getCustomPtr(chunk_t* scratchPtr, float startGB)
{
return reinterpret_cast<chunk_t*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * partNumber));
return reinterpret_cast<chunk_t*>(reinterpret_cast<char*>(scratchPtr) + static_cast<size_t>(GB * startGB));
}

inline float computeThroughput(Test test, float result, float chunkSizeGB, int ntests)
Expand All @@ -160,9 +158,9 @@ inline float computeThroughput(Test test, float result, float chunkSizeGB, int n
}

template <class chunk_t>
inline size_t getBufferCapacity(int chunkReservedGB)
inline size_t getBufferCapacity(float chunkReservedGB)
{
return static_cast<size_t>(GB * chunkReservedGB / sizeof(chunk_t));
return static_cast<size_t>((GB * chunkReservedGB) / sizeof(chunk_t));
}

// LCG: https://rosettacode.org/wiki/Linear_congruential_generator
Expand Down Expand Up @@ -202,7 +200,7 @@ struct benchmarkOpts {
std::vector<Mode> modes = {Mode::Sequential, Mode::Concurrent};
std::vector<KernelConfig> pools = {KernelConfig::Single, KernelConfig::Multi};
std::vector<std::string> dtypes = {"char", "int", "ulong"};
// Chunk definitions as pairs of floats, so fractional values can be expressed.
// NOTE(review): presumably each pair is {offset, size} in GB — confirm against the option parser.
std::vector<std::pair<float, float>> testChunks;
float chunkReservedGB = 1.f;
float threadPoolFraction = 1.f;
float freeMemoryFractionToAllocate = 0.95f;
Expand Down Expand Up @@ -235,10 +233,10 @@ struct gpuState {
float chunkReservedGB; // Size of each partition (GB)

// General containers and state
chunk_t* scratchPtr;                             // Pointer to scratch buffer
size_t scratchSize;                              // Size of scratch area (B)
std::vector<chunk_t*> partAddrOnHost;            // Pointers to scratch partitions on host vector
std::vector<std::pair<float, float>> testChunks; // Vector of definitions for arbitrary chunks (float pairs, so fractional values are allowed)

// Static info
size_t totalMemory;
Expand Down
Loading