From 8d6f98123b0746a08459166d2f76747ed091effc Mon Sep 17 00:00:00 2001
From: Pavan Yalamanchili
Date: Mon, 19 Jun 2017 11:15:53 -0700
Subject: [PATCH 1/4] Convert CUDA JIT to use nvrtc instead of nvvm

---
 src/backend/cuda/Array.cpp          |   4 +-
 src/backend/cuda/CMakeLists.txt     | 128 ++----
 src/backend/cuda/JIT/BinaryNode.hpp |  64 +--
 src/backend/cuda/JIT/BufferNode.hpp | 136 ++-----
 src/backend/cuda/JIT/Node.hpp       |  26 +-
 src/backend/cuda/JIT/ScalarNode.hpp |  28 +-
 src/backend/cuda/JIT/UnaryNode.hpp  |  54 +--
 src/backend/cuda/JIT/arith.cu       |  44 ---
 src/backend/cuda/JIT/cast.cu        | 104 -----
 src/backend/cuda/JIT/exp.cu         | 100 -----
 src/backend/cuda/JIT/hyper.cu       |  41 --
 src/backend/cuda/JIT/logic.cu       | 109 -----
 src/backend/cuda/JIT/numeric.cu     | 191 ---------
 src/backend/cuda/JIT/trig.cu        |  62 ---
 src/backend/cuda/arith.hpp          |   1 -
 src/backend/cuda/binary.hpp         | 356 +++++++----------
 src/backend/cuda/cast.hpp           |  86 +++-
 src/backend/cuda/complex.hpp        |  72 +---
 src/backend/cuda/jit.cpp            | 590 ++++++++--------------------
 src/backend/cuda/kernel/jit.cuh     | 205 ++++++++++
 src/backend/cuda/logic.hpp          |   3 +-
 src/backend/cuda/types.cpp          |  99 ++---
 src/backend/cuda/types.hpp          |   6 +-
 src/backend/cuda/unary.hpp          | 244 ++++-------
 24 files changed, 837 insertions(+), 1916 deletions(-)
 delete mode 100644 src/backend/cuda/JIT/arith.cu
 delete mode 100644 src/backend/cuda/JIT/cast.cu
 delete mode 100644 src/backend/cuda/JIT/exp.cu
 delete mode 100644 src/backend/cuda/JIT/hyper.cu
 delete mode 100644 src/backend/cuda/JIT/logic.cu
 delete mode 100644 src/backend/cuda/JIT/numeric.cu
 delete mode 100644 src/backend/cuda/JIT/trig.cu
 create mode 100644 src/backend/cuda/kernel/jit.cuh

diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp
index 495f35807e..da98060331 100644
--- a/src/backend/cuda/Array.cpp
+++ b/src/backend/cuda/Array.cpp
@@ -33,8 +33,8 @@ namespace cuda
 template<typename T>
 Node_ptr bufferNodePtr()
 {
-    Node_ptr node(reinterpret_cast<Node *>(new BufferNode<T>(irname<T>(), afShortName<T>())));
-    return node;
+    return Node_ptr(reinterpret_cast<Node *>(new BufferNode<T>(getFullName<T>(),
+                                                               shortname<T>(true))));
 }
 
 template<typename T>
diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
index eb4b27b44b..e48dda4446 100644
--- a/src/backend/cuda/CMakeLists.txt
+++ b/src/backend/cuda/CMakeLists.txt
@@ -3,10 +3,6 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 FIND_PACKAGE(CUDA 7.0 REQUIRED)
 
 INCLUDE(CLKernelToH)
-INCLUDE(FindNVVM)
-
-OPTION(USE_LIBDEVICE "Use libdevice for CUDA JIT" ON)
-SET(CUDA_LIBDEVICE_DIR "${CUDA_NVVM_HOME}/libdevice" CACHE PATH "Path where libdevice compute files are located" FORCE)
 
 MARK_AS_ADVANCED(
     CUDA_BUILD_CUBIN
@@ -168,14 +164,10 @@ FILE(GLOB jit_sources
 FILE(GLOB kernel_headers
     "kernel/*.hpp")
 
-FILE(GLOB ptx_sources
-    "JIT/*.cu")
-
 LIST(SORT cuda_headers)
 LIST(SORT cuda_sources)
 LIST(SORT jit_sources)
 LIST(SORT kernel_headers)
-LIST(SORT ptx_sources)
 
 SOURCE_GROUP(backend\\cuda\\Headers FILES ${cuda_headers})
 SOURCE_GROUP(backend\\cuda\\Sources FILES ${cuda_sources})
@@ -219,10 +211,23 @@ FILE(GLOB cpp_sources
 
 LIST(SORT cpp_sources)
 
+SET(jit_kernel_headers
+    "kernel_headers")
+
+FILE(GLOB jit_src "kernel/jit.cuh")
+CL_KERNEL_TO_H(
+    SOURCES ${jit_src}
+    VARNAME jit_files
+    EXTENSION "hpp"
+    OUTPUT_DIR ${jit_kernel_headers}
+    TARGETS jit_kernel_targets
+    NAMESPACE "cuda"
+    )
+
+
 SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources})
 
 INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/kernel/thrust_sort_by_key/CMakeLists.txt")
-
 INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_by_key/CMakeLists.txt")
 
 LIST(LENGTH COMPUTE_VERSIONS COMPUTE_COUNT)
@@ 
-242,89 +247,8 @@ SET(OLD_CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}) IF(${CUDA_VERSION_MAJOR} GREATER 7) # CUDA 8 or newer SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --keep-device-functions") ENDIF() -CUDA_COMPILE_PTX(ptx_files ${ptx_sources}) SET(CUDA_NVCC_FLAGS ${OLD_CUDA_NVCC_FLAGS}) -set(cuda_ptx "") -foreach(ptx_src_file ${ptx_sources}) - - get_filename_component(_name "${ptx_src_file}" NAME_WE) - - # CUDA_COMPILE_PTX from CMake 3.7 has new features that require this change - # TODO Fix this with a more complete solution - IF(CMAKE_VERSION VERSION_LESS 3.7) # Before 3.7 - SET(NAME_APPEND "") - ELSE(CMAKE_VERSION VERSION_LESS 3.7) # 3.7 and newer - SET(NAME_APPEND "_1") - ENDIF(CMAKE_VERSION VERSION_LESS 3.7) - - set(_gen_file_name - "${PROJECT_BINARY_DIR}/src/backend/cuda/cuda_compile_ptx${NAME_APPEND}_generated_${_name}.cu.ptx") - set(_out_file_name - "${PROJECT_BINARY_DIR}/src/backend/cuda/${_name}.ptx") - - ADD_CUSTOM_COMMAND( - OUTPUT "${_out_file_name}" - DEPENDS "${_gen_file_name}" - COMMAND ${CMAKE_COMMAND} -E copy "${_gen_file_name}" "${_out_file_name}") - - list(APPEND cuda_ptx "${_out_file_name}") -endforeach() - -SET( ptx_headers - "ptx_headers") - -CL_KERNEL_TO_H( - SOURCES ${cuda_ptx} - VARNAME kernel_files - EXTENSION "hpp" - OUTPUT_DIR ${ptx_headers} - TARGETS ptx_targets - NAMESPACE "cuda" - NULLTERM TRUE - ) - -SET(libdevice_bc "") -IF (USE_LIBDEVICE) - SET(libdevice_computes "") - LIST(APPEND libdevice_computes "20" "30" "35" "50") - FOREACH(libdevice_compute ${libdevice_computes}) - SET(_libdevice_bc_file "${CUDA_LIBDEVICE_DIR}/libdevice.compute_${libdevice_compute}.10.bc") - SET(_libdevice_bc_copy "${PROJECT_BINARY_DIR}/src/backend/cuda/compute_${libdevice_compute}.bc") - IF (EXISTS ${_libdevice_bc_file}) - ADD_CUSTOM_COMMAND( - OUTPUT "${_libdevice_bc_copy}" - DEPENDS "${_libdevice_bc_file}" - COMMAND ${CMAKE_COMMAND} -E copy "${_libdevice_bc_file}" "${_libdevice_bc_copy}") - LIST(APPEND libdevice_bc ${_libdevice_bc_copy}) - ADD_DEFINITIONS(-D"__LIBDEVICE_COMPUTE_${libdevice_compute}") - ENDIF() - ENDFOREACH() -ENDIF() - -LIST(LENGTH libdevice_bc libdevice_bc_len) - -IF (${libdevice_bc_len} GREATER 0) - - SET(libdevice_headers - "libdevice_headers") - - CL_KERNEL_TO_H( - SOURCES ${libdevice_bc} - VARNAME libdevice_files - EXTENSION "hpp" - OUTPUT_DIR ${libdevice_headers} - TARGETS libdevice_targets - NAMESPACE "cuda" - BINARY TRUE - ) - - MESSAGE(STATUS "LIBDEVICE found.") - ADD_DEFINITIONS(-DUSE_LIBDEVICE) -ELSE() - MESSAGE(STATUS "LIBDEVICE not found on system. 
CUDA JIT may be slower") -ENDIF() - IF("${APPLE}") ADD_DEFINITIONS(-D__STRICT_ANSI__) ELSE() @@ -407,10 +331,11 @@ ENDIF(NOT CUDA_CUDA_LIBRARY) SET(CUDA_ADD_LIBRARY_OPTIONS "") IF(UNIX) - # These flags enable C++11 and disable invalid offsetof warning - SET(CUDA_ADD_LIBRARY_OPTIONS "-std=c++11 -Xcudafe \"--diag_suppress=1427\"") + # These flags enable C++11 and disable invalid offsetof warning + SET(CUDA_ADD_LIBRARY_OPTIONS "-std=c++11 -Xcudafe \"--diag_suppress=1427\"") ENDIF(UNIX) + MY_CUDA_ADD_LIBRARY(afcuda SHARED ${cuda_headers} ${cuda_sources} @@ -425,23 +350,26 @@ MY_CUDA_ADD_LIBRARY(afcuda SHARED ${scan_by_key_sources} OPTIONS ${CUDA_GENERATE_CODE} ${CUDA_ADD_LIBRARY_OPTIONS}) -ADD_DEPENDENCIES(afcuda ${ptx_targets}) - -IF (${libdevice_bc_len} GREATER 0) - ADD_DEPENDENCIES(afcuda ${libdevice_targets}) -ENDIF() +FIND_LIBRARY ( + CUDA_nvrtc_LIBRARY + NAMES "nvrtc" + PATHS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES "lib64" "lib/x64" "lib" + DOC "CUDA NVRTC Library" + NO_DEFAULT_PATH + ) -TARGET_LINK_LIBRARIES(afcuda - PRIVATE ${CUDA_CUBLAS_LIBRARIES} +TARGET_LINK_LIBRARIES(afcuda PRIVATE ${CUDA_CUBLAS_LIBRARIES} PRIVATE ${CUDA_LIBRARIES} PRIVATE ${FreeImage_LIBS} PRIVATE ${CUDA_CUFFT_LIBRARIES} PRIVATE ${CUDA_cusparse_LIBRARY} PRIVATE ${CUDA_cusolver_LIBRARY} - PRIVATE ${CUDA_nvvm_LIBRARY} + PRIVATE ${CUDA_nvrtc_LIBRARY} PRIVATE ${CUDA_CUDA_LIBRARY} ) +ADD_DEPENDENCIES(afcuda ${jit_kernel_targets}) LIST(LENGTH GRAPHICS_DEPENDENCIES GRAPHICS_DEPENDENCIES_LEN) IF(${GRAPHICS_DEPENDENCIES_LEN} GREATER 0) ADD_DEPENDENCIES(afcuda ${GRAPHICS_DEPENDENCIES}) diff --git a/src/backend/cuda/JIT/BinaryNode.hpp b/src/backend/cuda/JIT/BinaryNode.hpp index 115c892119..8edb88098d 100644 --- a/src/backend/cuda/JIT/BinaryNode.hpp +++ b/src/backend/cuda/JIT/BinaryNode.hpp @@ -20,68 +20,34 @@ namespace JIT class BinaryNode : public Node { private: - const std::string m_op_str; - const int m_op; - const int m_call_type; + std::string m_op_str; + int m_op; public: BinaryNode(const char *out_type_str, const char *name_str, - const std::string &op_str, - Node_ptr lhs, Node_ptr rhs, int op, int call_type) + const char *op_str, + Node_ptr lhs, Node_ptr rhs, int op) : Node(out_type_str, name_str, std::max(lhs->getHeight(), rhs->getHeight()) + 1, {lhs, rhs}), m_op_str(op_str), - m_op(op), - m_call_type(call_type) + m_op(op) { } void genKerName(std::stringstream &kerStream, Node_ids ids) { - // Make the hex representation of enum part of the Kernel name - kerStream << "_" << std::setw(2) << std::setfill('0') << std::hex << m_op; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.child_ids[0]; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.child_ids[1]; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec; + // Make the dec representation of enum part of the Kernel name + kerStream << "_" << std::setw(3) << std::setfill('0') << std::dec << m_op; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.child_ids[0]; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.child_ids[1]; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec; } - void genFuncs(std::stringstream &kerStream, str_map_t &declStrs, Node_ids ids, bool is_linear) + void genFuncs(std::stringstream &kerStream, Node_ids ids) { - if (m_call_type == 0) { - std::stringstream declStream; - declStream << "declare " << m_type_str << " " << m_op_str - << "(" << m_children[0]->getTypeStr() << " , " - << m_children[1]->getTypeStr() << 
")\n"; - declStrs[declStream.str()] = true; - - kerStream << "%val" << ids.id << " = call " - << m_type_str << " " - << m_op_str << "(" - << m_children[0]->getTypeStr() << " " - << "%val" << ids.child_ids[0] << ", " - << m_children[1]->getTypeStr() << " " - << "%val" << ids.child_ids[1] << ")\n"; - - } else { - if (m_call_type == 1) { - // arithmetic operations - kerStream << "%val" << ids.id << " = " - << m_op_str << " " - << m_type_str << " " - << "%val" << ids.child_ids[0] << ", " - << "%val" << ids.child_ids[1] << "\n"; - } else { - // logical operators - kerStream << "%tmp" << ids.id << " = " - << m_op_str << " " - << m_children[0]->getTypeStr() << " " - << "%val" << ids.child_ids[0] << ", " - << "%val" << ids.child_ids[1] << "\n"; - - kerStream << "%val" << ids.id << " = " - << "zext i1 %tmp" << ids.id << " to i8\n"; - - } - } + kerStream << m_type_str << " val" << ids.id << " = " + << m_op_str << "(val" << ids.child_ids[0] + << ", val" << ids.child_ids[1] << ");" + << "\n"; } }; diff --git a/src/backend/cuda/JIT/BufferNode.hpp b/src/backend/cuda/JIT/BufferNode.hpp index c6a03d2899..6c4c2fb0a2 100644 --- a/src/backend/cuda/JIT/BufferNode.hpp +++ b/src/backend/cuda/JIT/BufferNode.hpp @@ -10,7 +10,6 @@ #pragma once #include "Node.hpp" #include -#include #include namespace cuda @@ -19,25 +18,17 @@ namespace cuda namespace JIT { - template - static inline std::string toString(T val) - { - std::stringstream s; - s << val; - return s.str(); - } template class BufferNode : public Node { private: - // Keep the shared pointer for reference counting std::shared_ptr m_data; Param m_param; unsigned m_bytes; + bool m_linear_buffer; std::once_flag m_set_data_flag; - bool m_linear_buffer; public: BufferNode(const char *type_str, @@ -46,6 +37,12 @@ namespace JIT { } + bool isBuffer() { return true; } + + ~BufferNode() + { + } + void setData(Param param, std::shared_ptr data, const unsigned bytes, bool is_linear) { std::call_once(m_set_data_flag, [this, param, data, bytes, is_linear]() { @@ -65,90 +62,58 @@ namespace JIT return m_linear_buffer && same_dims; } - bool isBuffer() { return true; } - void genKerName(std::stringstream &kerStream, Node_ids ids) { kerStream << "_" << m_name_str; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec; } - void genParams(std::stringstream &kerStream, - std::stringstream &annStream, - int id, - bool is_linear) + void genParams(std::stringstream &kerStream, int id, bool is_linear) { - kerStream << m_type_str << "* %in" << id << ",\n"; - annStream << m_type_str << "*,\n"; - - if (!is_linear) { - kerStream << "i32 %dim0" << id << "," - << "i32 %dim1" << id << "," - << "i32 %dim2" << id << "," - << "i32 %dim3" << id << "," - << "\n" - << "i32 %str1" << id << "," - << "i32 %str2" << id << "," - << "i32 %str3" << id << "," - << "\n"; + if (is_linear) { + kerStream << m_type_str << " *in" << id << "_ptr,\n"; + } else { + kerStream << "Param<" << m_type_str << "> in" << id + << ",\n"; + } + } - annStream << "i32, i32, i32, i32,\n" - << "i32, i32, i32,\n"; + void setArgs(std::vector &args, bool is_linear) + { + if (is_linear) { + args.push_back((void *)&m_param.ptr); + } else { + args.push_back((void *)&m_param); } } void genOffsets(std::stringstream &kerStream, int id, bool is_linear) { - if (!is_linear) { - kerStream << "%b3" << id << " = icmp slt i32 %id3, %dim3" << id << "\n"; - kerStream << "%b2" << id << " = icmp slt i32 %id2, %dim2" << id << "\n"; - 
kerStream << "%b1" << id << " = icmp slt i32 %id1, %dim1" << id << "\n"; - kerStream << "%b0" << id << " = icmp slt i32 %id0, %dim0" << id << "\n"; - - kerStream << "%c3" << id << " = zext i1 %b3" << id << " to i32\n"; - kerStream << "%c2" << id << " = zext i1 %b2" << id << " to i32\n"; - kerStream << "%c1" << id << " = zext i1 %b1" << id << " to i32\n"; - kerStream << "%c0" << id << " = zext i1 %b0" << id << " to i32\n"; - - kerStream << "%d3" << id << " = mul i32 %c3" << id << ", %id3\n"; - kerStream << "%d2" << id << " = mul i32 %c2" << id << ", %id2\n"; - kerStream << "%d1" << id << " = mul i32 %c1" << id << ", %id1\n"; - kerStream << "%d0" << id << " = mul i32 %c0" << id << ", %id0\n"; - - kerStream << "%off3i" << id << " = mul i32 %d3" << id - << ", %str3" << id << "\n"; - - kerStream << "%off2i" << id << " = mul i32 %d2" << id - << ", %str2" << id << "\n"; - - kerStream << "%off1i" << id << " = mul i32 %d1" << id - << ", %str1" << id << "\n"; - - kerStream << "%off23i" << id << " = add i32 %off2i" - << id << ", %off3i" << id << "\n"; - - kerStream << "%off123i" << id << " = add i32 %off23i" - << id << ", %off1i" << id << "\n"; - - kerStream << "%idxa" << id << " = add i32 %off123i" - << id << ", %d0" << id << "\n"; - - kerStream << "%idx" << id << " = sext i32 %idxa" << id <<" to i64\n\n"; + std::string idx_str = std::string("int idx") + std::to_string(id); + + if (is_linear) { + kerStream << idx_str << " = idx;\n"; + } else { + std::string info_str = std::string("in") + std::to_string(id);; + kerStream << idx_str << " = " + << "(id3 < " << info_str << ".dims[3]) * " + << info_str << ".strides[3] * id3 + " + << "(id2 < " << info_str << ".dims[2]) * " + << info_str << ".strides[2] * id2 + " + << "(id1 < " << info_str << ".dims[1]) * " + << info_str << ".strides[1] * id1 + " + << "(id0 < " << info_str << ".dims[0]) * " + << "id0;" + << "\n"; + kerStream << m_type_str << " *in" << id << "_ptr = in" << id << ".ptr;\n"; } } - void genFuncs(std::stringstream &kerStream, str_map_t &declStrs, Node_ids ids, bool is_linear) + void genFuncs(std::stringstream &kerStream, Node_ids ids) { - kerStream << "%inIdx" << ids.id << " = " - << "getelementptr inbounds " << m_type_str << "* %in" << ids.id - << ", i64 %idx"; - - if (!is_linear) kerStream << ids.id; - kerStream << "\n"; - - kerStream << "%val" << ids.id << " = " << "load " - << m_type_str << "* %inIdx" << ids.id << "\n\n"; - + kerStream << m_type_str << " val" << ids.id << " = " + << "in" << ids.id << "_ptr[idx" << ids.id << "];" + << "\n"; } void getInfo(unsigned &len, unsigned &buf_count, unsigned &bytes) @@ -158,21 +123,6 @@ namespace JIT bytes += m_bytes; return; } - - void setArgs(std::vector &args, bool is_linear) - { - args.push_back((void *)&(m_param.ptr)); - - if (!is_linear) { - args.push_back((void *)&m_param.dims[0]); - args.push_back((void *)&m_param.dims[1]); - args.push_back((void *)&m_param.dims[2]); - args.push_back((void *)&m_param.dims[3]); - args.push_back((void *)&m_param.strides[1]); - args.push_back((void *)&m_param.strides[2]); - args.push_back((void *)&m_param.strides[3]); - } - } }; } diff --git a/src/backend/cuda/JIT/Node.hpp b/src/backend/cuda/JIT/Node.hpp index 035692550d..feae91ca98 100644 --- a/src/backend/cuda/JIT/Node.hpp +++ b/src/backend/cuda/JIT/Node.hpp @@ -8,19 +8,22 @@ ********************************************************/ #pragma once +#include #include - -#include -#include #include #include +#include +#include namespace cuda { namespace JIT { + class Node; + using std::shared_ptr; + typedef 
shared_ptr Node_ptr; typedef struct { @@ -28,9 +31,6 @@ namespace JIT std::vector child_ids; } Node_ids; - typedef std::unordered_map str_map_t; - typedef str_map_t::iterator str_map_iter; - typedef std::shared_ptr Node_ptr; typedef std::unordered_map Node_map_t; typedef Node_map_t::iterator Node_map_iter; @@ -66,16 +66,11 @@ namespace JIT } virtual void genKerName(std::stringstream &kerStream, Node_ids ids) {} - virtual void genParams (std::stringstream &kerStream, - std::stringstream &annStream, - int id, bool is_linear) {} + virtual void genParams (std::stringstream &kerStream, int id, bool is_linear) {} virtual void genOffsets (std::stringstream &kerStream, int id, bool is_linear) {} - virtual void genFuncs (std::stringstream &kerStream, str_map_t &declStrs, - Node_ids id, bool is_linear) - {} + virtual void genFuncs (std::stringstream &kerStream, Node_ids) {} - virtual void setArgs(std::vector &args, bool is_linear) {} - virtual bool isLinear(dim_t dims[4]) { return true; } + virtual void setArgs (std::vector &args, bool is_linear) { } virtual void getInfo(unsigned &len, unsigned &buf_count, unsigned &bytes) { @@ -83,9 +78,8 @@ namespace JIT } virtual bool isBuffer() { return false; } - + virtual bool isLinear(dim_t dims[4]) { return true; } std::string getTypeStr() { return m_type_str; } - int getHeight() { return m_height; } std::string getNameStr() { return m_name_str; } diff --git a/src/backend/cuda/JIT/ScalarNode.hpp b/src/backend/cuda/JIT/ScalarNode.hpp index 264b7be61e..aae0496ec3 100644 --- a/src/backend/cuda/JIT/ScalarNode.hpp +++ b/src/backend/cuda/JIT/ScalarNode.hpp @@ -8,9 +8,9 @@ ********************************************************/ #pragma once -#include #include "Node.hpp" #include +#include #include namespace cuda @@ -18,15 +18,17 @@ namespace cuda namespace JIT { - template + + template class ScalarNode : public Node { private: - T m_val; + const T m_val; + public: ScalarNode(T val) - : Node(irname(), afShortName(false), 0, {}), + : Node(getFullName(), shortname(false), 0, {}), m_val(val) { } @@ -34,23 +36,27 @@ namespace JIT void genKerName(std::stringstream &kerStream, Node_ids ids) { kerStream << "_" << m_name_str; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec; } - void genParams(std::stringstream &kerStream, - std::stringstream &annStream, - int id, - bool is_linear) + void genParams(std::stringstream &kerStream, int id, bool is_linear) { - kerStream << m_type_str << " %val" << id << ", " << std::endl; - annStream << m_type_str << ",\n"; + kerStream << m_type_str << " scalar" << id << ", " << "\n"; } void setArgs(std::vector &args, bool is_linear) { args.push_back((void *)&m_val); } + + void genFuncs(std::stringstream &kerStream, Node_ids ids) + { + kerStream << m_type_str << " val" << ids.id << " = " + << "scalar" << ids.id << ";" + << "\n"; + } }; + } } diff --git a/src/backend/cuda/JIT/UnaryNode.hpp b/src/backend/cuda/JIT/UnaryNode.hpp index ac334c14b8..72e148289f 100644 --- a/src/backend/cuda/JIT/UnaryNode.hpp +++ b/src/backend/cuda/JIT/UnaryNode.hpp @@ -22,62 +22,30 @@ namespace JIT private: const std::string m_op_str; const int m_op; - const bool m_is_check; public: UnaryNode(const char *out_type_str, const char *name_str, - const std::string &op_str, - Node_ptr child, int op, bool is_check=false) + const char *op_str, + Node_ptr child, int op) : Node(out_type_str, name_str, child->getHeight() + 1, {child}), m_op_str(op_str), - 
m_op(op), - m_is_check(is_check) + m_op(op) { } void genKerName(std::stringstream &kerStream, Node_ids ids) { - // Make the hex representation of enum part of the Kernel name - kerStream << "_" << std::setw(2) << std::setfill('0') << std::hex << m_op; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.child_ids[0]; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec; + // Make the dec representation of enum part of the Kernel name + kerStream << "_" << std::setw(3) << std::setfill('0') << std::dec << m_op; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.child_ids[0]; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec; } - void genFuncs(std::stringstream &kerStream, str_map_t &declStrs, Node_ids ids, bool is_linear) + void genFuncs(std::stringstream &kerStream, Node_ids ids) { - std::stringstream declStream; - - if (m_is_check) { - declStream << "declare " << "i32 " << m_op_str - << "(" << m_children[0]->getTypeStr() << ")\n"; - } else { - declStream << "declare " << m_type_str << " " << m_op_str - << "(" << m_children[0]->getTypeStr() << ")\n"; - } - - declStrs[declStream.str()] = true; - - if (m_is_check) { - kerStream << "%tmp" << ids.id << " = call i32 " - << m_op_str << "(" - << m_children[0]->getTypeStr() << " " - << "%val" << ids.child_ids[0] << ")\n"; - - if (m_type_str[0] == 'i') { - kerStream << "%val" << ids.id << " = " - << "trunc i32 %tmp" << ids.id << " to " << m_type_str << "\n"; - } else { - kerStream << "%val" << ids.id << " = " - << "sitofp i32 %tmp" << ids.id << " to " << m_type_str << "\n"; - } - - } else { - kerStream << "%val" << ids.id << " = call " - << m_type_str << " " - << m_op_str << "(" - << m_children[0]->getTypeStr() << " " - << "%val" << ids.child_ids[0] << ")\n"; - } + kerStream << m_type_str << " val" << ids.id << " = " + << m_op_str << "(val" << ids.child_ids[0] << ");" + << "\n"; } }; diff --git a/src/backend/cuda/JIT/arith.cu b/src/backend/cuda/JIT/arith.cu deleted file mode 100644 index adfa9e9068..0000000000 --- a/src/backend/cuda/JIT/arith.cu +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define ARITH_BASIC(fn, op, T) \ - __device__ T ___##fn(T a, T b) \ - { \ - return a op b; \ - } \ - - -#define ARITH(fn, op) \ - ARITH_BASIC(fn, op, float) \ - ARITH_BASIC(fn, op, double) \ - ARITH_BASIC(fn, op, int) \ - ARITH_BASIC(fn, op, uint) \ - ARITH_BASIC(fn, op, char) \ - ARITH_BASIC(fn, op, uchar) \ - ARITH_BASIC(fn, op, intl) \ - ARITH_BASIC(fn, op, uintl) \ - ARITH_BASIC(fn, op, short) \ - ARITH_BASIC(fn, op, ushort) \ - \ - __device__ cfloat ___##fn(cfloat a, cfloat b) \ - { \ - return cuC##fn##f(a, b); \ - } \ - \ - __device__ cdouble ___##fn(cdouble a, cdouble b) \ - { \ - return cuC##fn(a, b); \ - } \ - -ARITH(add, +) -ARITH(sub, -) -ARITH(mul, *) -ARITH(div, /) diff --git a/src/backend/cuda/JIT/cast.cu b/src/backend/cuda/JIT/cast.cu deleted file mode 100644 index 8905955145..0000000000 --- a/src/backend/cuda/JIT/cast.cu +++ /dev/null @@ -1,104 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. 
- * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define CAST_BASIC(FN, To, Ti) __device__ To FN(Ti in) { return (To) in; } - -#define CAST_BASIC_BOOL(FN, To, Ti) __device__ To FN(Ti in) { return (To)(in != 0); } - -#define CAST(T, X) \ - CAST_BASIC(___mk##X, T, float) \ - CAST_BASIC(___mk##X, T, double) \ - CAST_BASIC(___mk##X, T, int) \ - CAST_BASIC(___mk##X, T, uint) \ - CAST_BASIC(___mk##X, T, char) \ - CAST_BASIC(___mk##X, T, uchar) \ - CAST_BASIC(___mk##X, T, intl) \ - CAST_BASIC(___mk##X, T, uintl) \ - CAST_BASIC(___mk##X, T, short) \ - CAST_BASIC(___mk##X, T, ushort) \ - -CAST(float , S) -CAST(double, D) -CAST(int , I) -CAST(intl , X) -CAST(short , P) -CAST(uint , U) -CAST(uchar , V) -CAST(uintl , Y) -CAST(ushort, Q) - -CAST_BASIC_BOOL(___mkJ, char, float) -CAST_BASIC_BOOL(___mkJ, char, double) -CAST_BASIC_BOOL(___mkJ, char, int) -CAST_BASIC_BOOL(___mkJ, char, uint) -CAST_BASIC_BOOL(___mkJ, char, char) -CAST_BASIC_BOOL(___mkJ, char, uchar) -CAST_BASIC_BOOL(___mkJ, char, intl) -CAST_BASIC_BOOL(___mkJ, char, uintl) -CAST_BASIC_BOOL(___mkJ, char, short) -CAST_BASIC_BOOL(___mkJ, char, ushort) - -#define CPLX_BASIC(FN, To, Tr, Ti) \ - __device__ To FN(Ti in) \ - { \ - To out = {(Tr)in, 0}; \ - return out; \ - } \ - -#define CPLX_CAST(T, Tr, X) \ - CPLX_BASIC(___mk##X, T, Tr, float) \ - CPLX_BASIC(___mk##X, T, Tr, double) \ - CPLX_BASIC(___mk##X, T, Tr, int) \ - CPLX_BASIC(___mk##X, T, Tr, uint) \ - CPLX_BASIC(___mk##X, T, Tr, char) \ - CPLX_BASIC(___mk##X, T, Tr, uchar) \ - CPLX_BASIC(___mk##X, T, Tr, uintl) \ - CPLX_BASIC(___mk##X, T, Tr, intl) \ - CPLX_BASIC(___mk##X, T, Tr, ushort) \ - CPLX_BASIC(___mk##X, T, Tr, short) \ - -CPLX_CAST(cfloat, float, C) -CPLX_CAST(cdouble, double, Z) - -__device__ cfloat ___mkC(cfloat C) -{ - return C; -} - -__device__ cfloat ___mkC(cdouble C) -{ - cfloat res = {C.x, C.y}; - return res; -} - -__device__ cdouble ___mkZ(cdouble C) -{ - return C; -} - -__device__ cdouble ___mkZ(cfloat C) -{ - cdouble res = {C.x, C.y}; - return res; -} - -__device__ float ___real(cfloat in) { return in.x; } -__device__ double ___real(cdouble in) { return in.x; } - - -__device__ float ___imag(cfloat in) { return in.y; } -__device__ double ___imag(cdouble in) { return in.y; } - -__device__ cfloat ___cplx(float l, float r) { cfloat out = {l, r}; return out; } -__device__ cdouble ___cplx(double l, double r) { cdouble out = {l, r}; return out; } - -__device__ cfloat ___conj(cfloat in) { return cuConjf(in); } -__device__ cdouble ___conj(cdouble in) { return cuConj (in); } diff --git a/src/backend/cuda/JIT/exp.cu b/src/backend/cuda/JIT/exp.cu deleted file mode 100644 index 3f110b4328..0000000000 --- a/src/backend/cuda/JIT/exp.cu +++ /dev/null @@ -1,100 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. 
- * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -__device__ double sigmoid(double in) -{ - return (1.0) / (1 + exp(-in)); -} - -__device__ float sigmoidf(float in) -{ - return (1.0) / (1 + expf(-in)); -} - -#define MATH_BASIC(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return fn##f((float)a); \ - } \ - - -#define MATH(fn) \ - MATH_BASIC(fn, float) \ - MATH_BASIC(fn, int) \ - MATH_BASIC(fn, uint) \ - MATH_BASIC(fn, char) \ - MATH_BASIC(fn, uchar) \ - MATH_BASIC(fn, uintl) \ - MATH_BASIC(fn, intl) \ - MATH_BASIC(fn, ushort) \ - MATH_BASIC(fn, short) \ - __device__ double ___##fn(double a) \ - { \ - return fn(a); \ - } \ - - -MATH(exp) -MATH(expm1) -MATH(erf) -MATH(erfc) -MATH(sigmoid) - -MATH(log) -MATH(log10) -MATH(log1p) -MATH(log2) - -MATH(sqrt) -MATH(cbrt) - -#define MATH2_BASIC(fn, T) \ - __device__ T ___##fn(T a, T b) \ - { \ - return fn##f((float)a, (float)b); \ - } \ - -#define MATH2(fn) \ - MATH2_BASIC(fn, float) \ - MATH2_BASIC(fn, int) \ - MATH2_BASIC(fn, uint) \ - MATH2_BASIC(fn, char) \ - MATH2_BASIC(fn, uchar) \ - MATH2_BASIC(fn, uintl) \ - MATH2_BASIC(fn, intl) \ - MATH2_BASIC(fn, ushort) \ - MATH2_BASIC(fn, short) \ - __device__ double ___##fn(double a, double b) \ - { \ - return fn(a, b); \ - } \ - -MATH2(pow) - -__device__ cfloat ___pow(cfloat a, float b) -{ - float R = cuCabsf(a); - float Theta = atan2(a.y, a.x); - float R_b = powf(R, b); - float Theta_b = Theta * b; - cfloat res = {R_b * cosf(Theta_b), R_b * sinf(Theta_b)}; - return res; -} - -__device__ cdouble ___pow(cdouble a, float b) -{ - float R = cuCabs(a); - float Theta = atan2(a.y, a.x); - float R_b = pow(R, b); - float Theta_b = Theta * b; - cdouble res = {R_b * cos(Theta_b), R_b * sin(Theta_b)}; - return res; -} diff --git a/src/backend/cuda/JIT/hyper.cu b/src/backend/cuda/JIT/hyper.cu deleted file mode 100644 index 6673fb1f14..0000000000 --- a/src/backend/cuda/JIT/hyper.cu +++ /dev/null @@ -1,41 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define MATH_BASIC(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return fn##f((float)a); \ - } \ - - -#define MATH(fn) \ - MATH_BASIC(fn, float) \ - MATH_BASIC(fn, int) \ - MATH_BASIC(fn, uint) \ - MATH_BASIC(fn, char) \ - MATH_BASIC(fn, uchar) \ - MATH_BASIC(fn, uintl) \ - MATH_BASIC(fn, intl) \ - MATH_BASIC(fn, ushort) \ - MATH_BASIC(fn, short) \ - __device__ double ___##fn(double a) \ - { \ - return fn(a); \ - } \ - - -MATH(sinh) -MATH(cosh) -MATH(tanh) - -MATH(asinh) -MATH(acosh) -MATH(atanh) diff --git a/src/backend/cuda/JIT/logic.cu b/src/backend/cuda/JIT/logic.cu deleted file mode 100644 index 6072c3c447..0000000000 --- a/src/backend/cuda/JIT/logic.cu +++ /dev/null @@ -1,109 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. 
- * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define LOGIC_BASIC(fn, op, T) \ - __device__ bool ___##fn(T a, T b) \ - { \ - return a op b; \ - } \ - - -#define LOGIC(fn, op) \ - LOGIC_BASIC(fn, op, float) \ - LOGIC_BASIC(fn, op, double) \ - LOGIC_BASIC(fn, op, int) \ - LOGIC_BASIC(fn, op, uint) \ - LOGIC_BASIC(fn, op, char) \ - LOGIC_BASIC(fn, op, uchar) \ - LOGIC_BASIC(fn, op, intl) \ - LOGIC_BASIC(fn, op, uintl) \ - LOGIC_BASIC(fn, op, short) \ - LOGIC_BASIC(fn, op, ushort) \ - \ - __device__ bool ___##fn(cfloat a, cfloat b) \ - { \ - return cabs2(a) op cabs2(b); \ - } \ - \ - __device__ bool ___##fn(cdouble a, cdouble b) \ - { \ - return cabs2(a) op cabs2(b); \ - } \ - -LOGIC(lt, <) -LOGIC(gt, >) -LOGIC(le, <=) -LOGIC(ge, >=) -LOGIC(and, &&) -LOGIC(or, ||) - -#define LOGIC_EQ(fn, op, op2) \ - LOGIC_BASIC(fn, op, float) \ - LOGIC_BASIC(fn, op, double) \ - LOGIC_BASIC(fn, op, int) \ - LOGIC_BASIC(fn, op, uint) \ - LOGIC_BASIC(fn, op, char) \ - LOGIC_BASIC(fn, op, uchar) \ - LOGIC_BASIC(fn, op, intl) \ - LOGIC_BASIC(fn, op, uintl) \ - LOGIC_BASIC(fn, op, short) \ - LOGIC_BASIC(fn, op, ushort) \ - \ - __device__ bool ___##fn(cfloat a, cfloat b) \ - { \ - return (a.x op b.x) op2 (a.y op b.y); \ - } \ - \ - __device__ bool ___##fn(cdouble a, cdouble b) \ - { \ - return (a.x op b.x) op2 (a.y op b.y); \ - } \ - -LOGIC_EQ(eq, ==, &&) -LOGIC_EQ(neq, !=, ||) - -#define NOT_FN(T) \ - __device__ bool ___not(T in) { return !in; } \ - -NOT_FN(float) -NOT_FN(double) -NOT_FN(int) -NOT_FN(uint) -NOT_FN(char) -NOT_FN(uchar) -NOT_FN(intl) -NOT_FN(uintl) -NOT_FN(short) -NOT_FN(ushort) - -#define BIT_FN(T) \ - __device__ T ___bitand (T lhs, T rhs) { return lhs & rhs; } \ - __device__ T ___bitor (T lhs, T rhs) { return lhs | rhs; } \ - __device__ T ___bitxor (T lhs, T rhs) { return lhs ^ rhs; } \ - __device__ T ___bitshiftl(T lhs, T rhs) { return lhs << rhs; } \ - __device__ T ___bitshiftr(T lhs, T rhs) { return lhs >> rhs; } \ - -BIT_FN(int) -BIT_FN(char) -BIT_FN(intl) -BIT_FN(uchar) -BIT_FN(uint) -BIT_FN(uintl) -BIT_FN(short) -BIT_FN(ushort) - -__device__ char ___isNaN(float in) { return isnan(in); } -__device__ char ___isINF(float in) { return isinf(in); } -__device__ char ___iszero(float in) { return (in == 0); } - -__device__ char ___isNaN(double in) { return isnan(in); } -__device__ char ___isINF(double in) { return isinf(in); } -__device__ char ___iszero(double in) { return (in == 0); } diff --git a/src/backend/cuda/JIT/numeric.cu b/src/backend/cuda/JIT/numeric.cu deleted file mode 100644 index 2bcb15a112..0000000000 --- a/src/backend/cuda/JIT/numeric.cu +++ /dev/null @@ -1,191 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. 
- * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -template __device__ T sign(T a) { return signbit(a); } - -#define MATH_BASIC(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return fn(a); \ - } \ - - -#define MATH_NOOP(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return a; \ - } \ - - -#define MATH_CAST(fn, T, Tc) \ - __device__ T ___##fn(T a) \ - { \ - return (T)fn((Tc)a); \ - } \ - -MATH_BASIC(floor, float) -MATH_BASIC(floor, double) -MATH_NOOP(floor, int) -MATH_NOOP(floor, uint) -MATH_NOOP(floor, char) -MATH_NOOP(floor, uchar) -MATH_NOOP(floor, uintl) -MATH_NOOP(floor, intl) -MATH_NOOP(floor, ushort) -MATH_NOOP(floor, short) - -MATH_BASIC(ceil, float) -MATH_BASIC(ceil, double) -MATH_NOOP(ceil, int) -MATH_NOOP(ceil, uint) -MATH_NOOP(ceil, char) -MATH_NOOP(ceil, uchar) -MATH_NOOP(ceil, uintl) -MATH_NOOP(ceil, intl) -MATH_NOOP(ceil, ushort) -MATH_NOOP(ceil, short) - -MATH_BASIC(round, float) -MATH_BASIC(round, double) -MATH_NOOP(round, int) -MATH_NOOP(round, uint) -MATH_NOOP(round, char) -MATH_NOOP(round, uchar) -MATH_NOOP(round, uintl) -MATH_NOOP(round, intl) -MATH_NOOP(round, ushort) -MATH_NOOP(round, short) - -MATH_BASIC(trunc, float) -MATH_BASIC(trunc, double) -MATH_NOOP(trunc, int) -MATH_NOOP(trunc, uint) -MATH_NOOP(trunc, char) -MATH_NOOP(trunc, uchar) -MATH_NOOP(trunc, uintl) -MATH_NOOP(trunc, intl) -MATH_NOOP(trunc, ushort) -MATH_NOOP(trunc, short) - -MATH_BASIC(sign, float) -MATH_BASIC(sign, double) -MATH_NOOP(sign, int) -MATH_NOOP(sign, uint) -MATH_NOOP(sign, char) -MATH_NOOP(sign, uchar) -MATH_NOOP(sign, uintl) -MATH_NOOP(sign, intl) -MATH_NOOP(sign, ushort) -MATH_NOOP(sign, short) - -MATH_BASIC(abs, float) -MATH_BASIC(abs, double) -MATH_BASIC(abs, int) -MATH_CAST(abs, char, int) -MATH_NOOP(abs, uint) -MATH_NOOP(abs, uchar) -MATH_NOOP(abs, uintl) -MATH_NOOP(abs, intl) -MATH_NOOP(abs, ushort) -MATH_NOOP(abs, short) - -MATH_BASIC(tgamma, float) -MATH_BASIC(tgamma, double) -MATH_CAST(tgamma, int , float) -MATH_CAST(tgamma, uint , float) -MATH_CAST(tgamma, char , float) -MATH_CAST(tgamma, uchar , float) -MATH_CAST(tgamma, uintl , float) -MATH_CAST(tgamma, intl , float) -MATH_CAST(tgamma, ushort, float) -MATH_CAST(tgamma, short , float) - -MATH_BASIC(lgamma, float) -MATH_BASIC(lgamma, double) -MATH_CAST(lgamma, int , float) -MATH_CAST(lgamma, uint , float) -MATH_CAST(lgamma, char , float) -MATH_CAST(lgamma, uchar , float) -MATH_CAST(lgamma, uintl , float) -MATH_CAST(lgamma, intl , float) -MATH_CAST(lgamma, ushort, float) -MATH_CAST(lgamma, short , float) - -MATH_NOOP(noop, float) -MATH_NOOP(noop, double) -MATH_NOOP(noop, cfloat) -MATH_NOOP(noop, cdouble) -MATH_NOOP(noop, int) -MATH_NOOP(noop, uint) -MATH_NOOP(noop, char) -MATH_NOOP(noop, uchar) -MATH_NOOP(noop, uintl) -MATH_NOOP(noop, intl) -MATH_NOOP(noop, ushort) -MATH_NOOP(noop, short) - -__device__ float ___abs(cfloat a) { return cuCabsf(a); } -__device__ double ___abs(cdouble a) { return cuCabs(a); } - -template __device__ T rem(T a, T b) { return a % b; } -__device__ float rem(float a, float b) { return remainderf(a, b); } -__device__ double rem(double a, double b) { return remainder(a, b); } - -template __device__ T mod(T a, T b) { return a % b; } -__device__ float mod(float a, float b) { return fmodf(a, b); } -__device__ double mod(double a, double b) { return fmod(a, b); } - -#define MATH2_BASIC(fn, T) \ - __device__ T ___##fn(T a, T b) \ - { \ - return fn(a, b); \ - } \ - -#define 
MATH2(fn) \ - MATH2_BASIC(fn, float) \ - MATH2_BASIC(fn, int) \ - MATH2_BASIC(fn, uint) \ - MATH2_BASIC(fn, intl) \ - MATH2_BASIC(fn, uintl) \ - MATH2_BASIC(fn, char) \ - MATH2_BASIC(fn, uchar) \ - MATH2_BASIC(fn, short) \ - MATH2_BASIC(fn, ushort) \ - __device__ double ___##fn(double a, double b) \ - { \ - return fn(a, b); \ - } \ - -MATH2(min) -MATH2(max) -MATH2(mod) -MATH2(rem) - -__device__ float ___hypot(float a, float b) -{ - return hypot(a, b); -} - -__device__ double ___hypot(double a, double b) -{ - return hypot(a, b); -} - -#define COMPARE_CPLX(fn, op, T) \ - __device__ T ___##fn(T a, T b) \ - { \ - return cabs2(a) op cabs2(b) ? a : b; \ - } \ - -COMPARE_CPLX(min, <, cfloat) -COMPARE_CPLX(min, <, cdouble) -COMPARE_CPLX(max, >, cfloat) -COMPARE_CPLX(max, >, cdouble) diff --git a/src/backend/cuda/JIT/trig.cu b/src/backend/cuda/JIT/trig.cu deleted file mode 100644 index 372bd4d026..0000000000 --- a/src/backend/cuda/JIT/trig.cu +++ /dev/null @@ -1,62 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define MATH_BASIC(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return fn##f((float)a); \ - } \ - - -#define MATH(fn) \ - MATH_BASIC(fn, float) \ - MATH_BASIC(fn, int) \ - MATH_BASIC(fn, uint) \ - MATH_BASIC(fn, char) \ - MATH_BASIC(fn, uchar) \ - MATH_BASIC(fn, uintl) \ - MATH_BASIC(fn, intl) \ - MATH_BASIC(fn, ushort) \ - MATH_BASIC(fn, short) \ - __device__ double ___##fn(double a) \ - { \ - return fn(a); \ - } \ - - -MATH(sin) -MATH(cos) -MATH(tan) - -MATH(asin) -MATH(acos) -MATH(atan) - -#define ATAN2(T) \ - __device__ T ___atan2(T x, T y) \ - { \ - return atan2((float)x, (float)y); \ - } \ - -ATAN2(float) -ATAN2(int) -ATAN2(uint) -ATAN2(char) -ATAN2(uchar) -ATAN2(uintl) -ATAN2(intl) -ATAN2(ushort) -ATAN2(short) - -__device__ double ___atan2(double x, double y) -{ - return atan2(x, y); -} diff --git a/src/backend/cuda/arith.hpp b/src/backend/cuda/arith.hpp index 5a39fcdf1c..87117e90bb 100644 --- a/src/backend/cuda/arith.hpp +++ b/src/backend/cuda/arith.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include namespace cuda diff --git a/src/backend/cuda/binary.hpp b/src/backend/cuda/binary.hpp index bb81c19ffa..d5659eb7e6 100644 --- a/src/backend/cuda/binary.hpp +++ b/src/backend/cuda/binary.hpp @@ -17,220 +17,158 @@ namespace cuda { - -template -struct BinOp -{ - std::string name; - int call_type; - BinOp() : - name("noop"), - call_type(0) - {} -}; - -#define BINARY(fn) \ - template \ - struct BinOp \ - { \ - std::string name; \ - int call_type; \ - BinOp() : \ - name(cuMangledName("___"#fn)), \ - call_type(0) \ - {} \ + template + struct BinOp + { + const char *name() + { + return "__invalid"; + } }; -#if defined(USE_LIBDEVICE) -#define NVVM_ARITH_OP(T, fn, fname) \ - template<> \ - struct BinOp \ +#define BINARY_TYPE_1(fn) \ + template \ + struct BinOp \ { \ - std::string name; \ - int call_type; \ - BinOp() : \ - name(fname), \ - call_type(1) \ - {} \ + const char *name() \ + { \ + return "__"#fn; \ + } \ }; \ - -#define NVVM_COMPARE_OP(T, fn, fname) \ - template<> \ - struct BinOp \ + \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "__c"#fn"f"; \ + } \ + }; \ + \ + template \ + struct BinOp \ { \ - std::string name; \ - 
int call_type; \ - BinOp() : \ - name(fname), \ - call_type(2) \ - {} \ + const char *name() \ + { \ + return "__c"#fn; \ + } \ }; \ -#define NVVM_BINARY_FUNC(T, fn, fname) \ - template<> \ - struct BinOp \ + +BINARY_TYPE_1(eq) +BINARY_TYPE_1(neq) +BINARY_TYPE_1(lt) +BINARY_TYPE_1(le) +BINARY_TYPE_1(gt) +BINARY_TYPE_1(ge) +BINARY_TYPE_1(add) +BINARY_TYPE_1(sub) +BINARY_TYPE_1(mul) +BINARY_TYPE_1(div) +BINARY_TYPE_1(and) +BINARY_TYPE_1(or) +BINARY_TYPE_1(bitand) +BINARY_TYPE_1(bitor) +BINARY_TYPE_1(bitxor) +BINARY_TYPE_1(bitshiftl) +BINARY_TYPE_1(bitshiftr) + +#undef BINARY_TYPE_1 + +#define BINARY_TYPE_2(fn) \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "__"#fn; \ + } \ + }; \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "f"#fn; \ + } \ + }; \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "f"#fn; \ + } \ + }; \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "__c"#fn"f"; \ + } \ + }; \ + \ + template \ + struct BinOp \ { \ - std::string name; \ - int call_type; \ - BinOp() : \ - name("@__nv_"#fname), \ - call_type(0) \ - {} \ + const char *name() \ + { \ + return "__c"#fn; \ + } \ }; \ -#else -#define NVVM_ARITH_OP(T, fn, fname) // No specialization -#define NVVM_COMPARE_OP(T, fn, fname) // No specialization -#define NVVM_BINARY_FUNC(T, fn, fname) // No specialization +BINARY_TYPE_2(min) +BINARY_TYPE_2(max) +BINARY_TYPE_2(pow) +BINARY_TYPE_2(rem) +BINARY_TYPE_2(mod) + +template +struct BinOp +{ + const char *name() + { + return "__cplx2f"; + } +}; + +template +struct BinOp +{ + const char *name() + { + return "__cplx2"; + } +}; + +template +struct BinOp +{ + const char *name() + { + return "noop"; + } +}; + +template +struct BinOp +{ + const char *name() + { + return "atan2"; + } +}; -#endif - -#define NVVM_ARITH_OP_INT(fn, fname) \ - NVVM_ARITH_OP(int, fn, fname) \ - NVVM_ARITH_OP(short, fn, fname) \ - NVVM_ARITH_OP(intl, fn, fname) \ - -#define NVVM_ARITH_OP_UINT(fn, fname) \ - NVVM_ARITH_OP(uint, fn, fname) \ - NVVM_ARITH_OP(ushort, fn, fname) \ - NVVM_ARITH_OP(uintl, fn, fname) \ - -#define NVVM_ARITH_OP_FLOAT(fn, fname) \ - NVVM_ARITH_OP(float, fn, fname) \ - NVVM_ARITH_OP(double, fn, fname) \ - -#define NVVM_ARITH_OP_CPLX(fn, fname) \ - NVVM_ARITH_OP(cfloat, fn, fname) \ - NVVM_ARITH_OP(cdouble, fn, fname) \ - -#define NVVM_COMPARE_OP_INT(fn, fname) \ - NVVM_COMPARE_OP(int, fn, fname) \ - NVVM_COMPARE_OP(short, fn, fname) \ - NVVM_COMPARE_OP(intl, fn, fname) \ - -#define NVVM_COMPARE_OP_UINT(fn, fname) \ - NVVM_COMPARE_OP(uint, fn, fname) \ - NVVM_COMPARE_OP(ushort, fn, fname) \ - NVVM_COMPARE_OP(uintl, fn, fname) \ - -#define NVVM_COMPARE_OP_FLOAT(fn, fname) \ - NVVM_COMPARE_OP(float, fn, fname) \ - NVVM_COMPARE_OP(double, fn, fname) \ - -BINARY(add) -NVVM_ARITH_OP_INT(add, "add") -NVVM_ARITH_OP_UINT(add, "add") -NVVM_ARITH_OP_FLOAT(add, "fadd") -NVVM_ARITH_OP_CPLX(add, "fadd") - -BINARY(sub) -NVVM_ARITH_OP_INT(sub, "sub") -NVVM_ARITH_OP_UINT(sub, "sub") -NVVM_ARITH_OP_FLOAT(sub, "fsub") -NVVM_ARITH_OP_CPLX(sub, "fsub") - -BINARY(mul) -NVVM_ARITH_OP_INT(mul, "mul") -NVVM_ARITH_OP_UINT(mul, "mul") -NVVM_ARITH_OP_FLOAT(mul, "fmul") - -BINARY(div) -NVVM_ARITH_OP_INT(div, "sdiv") -NVVM_ARITH_OP_UINT(div, "udiv") -NVVM_ARITH_OP_FLOAT(div, "fdiv") - -BINARY(bitand) -NVVM_ARITH_OP_INT(bitand, "and") -NVVM_ARITH_OP_UINT(bitand, "and") - -BINARY(bitor) -NVVM_ARITH_OP_INT(bitor, "or") -NVVM_ARITH_OP_UINT(bitor, "or") - -BINARY(bitxor) -NVVM_ARITH_OP_INT(bitxor, "xor") 
-NVVM_ARITH_OP_UINT(bitxor, "xor") - -BINARY(bitshiftl) -NVVM_ARITH_OP_INT(bitshiftl, "shl") -NVVM_ARITH_OP_UINT(bitshiftl, "shl") - -BINARY(bitshiftr) -NVVM_ARITH_OP_INT(bitshiftr, "lshr") -NVVM_ARITH_OP_UINT(bitshiftr, "lshr") - - -BINARY(and) -BINARY(or) - -BINARY(lt) -NVVM_COMPARE_OP_INT(lt, "icmp slt") -NVVM_COMPARE_OP_UINT(lt, "icmp ult") -NVVM_COMPARE_OP_FLOAT(lt, "fcmp olt") - -BINARY(gt) -NVVM_COMPARE_OP_INT(gt, "icmp sgt") -NVVM_COMPARE_OP_UINT(gt, "icmp ugt") -NVVM_COMPARE_OP_FLOAT(gt, "fcmp ogt") - -BINARY(le) -NVVM_COMPARE_OP_INT(le, "icmp sle") -NVVM_COMPARE_OP_UINT(le, "icmp ule") -NVVM_COMPARE_OP_FLOAT(le, "fcmp ole") - -BINARY(ge) -NVVM_COMPARE_OP_INT(ge, "icmp sge") -NVVM_COMPARE_OP_UINT(ge, "icmp uge") -NVVM_COMPARE_OP_FLOAT(ge, "fcmp oge") - -BINARY(eq) -NVVM_COMPARE_OP_INT(eq, "icmp eq") -NVVM_COMPARE_OP_UINT(eq, "icmp eq") -NVVM_COMPARE_OP_FLOAT(eq, "fcmp oeq") - -BINARY(neq) -NVVM_COMPARE_OP_INT(neq, "icmp ne") -NVVM_COMPARE_OP_UINT(neq, "icmp ne") -NVVM_COMPARE_OP_FLOAT(neq, "fcmp one") - -BINARY(max) -NVVM_BINARY_FUNC(float, max, fmaxf) -NVVM_BINARY_FUNC(double, max, fmax) -NVVM_BINARY_FUNC(int, max, max) -NVVM_BINARY_FUNC(uint, max, umax) -NVVM_BINARY_FUNC(intl, max, llmax) -NVVM_BINARY_FUNC(uintl, max, ullmax) - -BINARY(min) -NVVM_BINARY_FUNC(float, min, fminf) -NVVM_BINARY_FUNC(double, min, fmin) -NVVM_BINARY_FUNC(int, min, min) -NVVM_BINARY_FUNC(uint, min, umin) -NVVM_BINARY_FUNC(intl, min, llmin) -NVVM_BINARY_FUNC(uintl, min, ullmin) - -BINARY(pow) -NVVM_BINARY_FUNC(float, pow, powf) -NVVM_BINARY_FUNC(double, pow, pow) - -BINARY(mod) -NVVM_BINARY_FUNC(float, mod, fmodf) -NVVM_BINARY_FUNC(double, mod, fmod) - -BINARY(rem) -NVVM_BINARY_FUNC(float, rem, remainderf) -NVVM_BINARY_FUNC(double, rem, remainder) - -BINARY(atan2) -NVVM_BINARY_FUNC(float, atan2, atan2f) -NVVM_BINARY_FUNC(double, atan2, atan2) - -BINARY(hypot) -NVVM_BINARY_FUNC(float, hypot, hypotf) -NVVM_BINARY_FUNC(double, hypot, hypot) - -#undef BINARY +template +struct BinOp +{ + const char *name() + { + return "hypot"; + } +}; template Array createBinaryNode(const Array &lhs, const Array &rhs, const af::dim4 &odims) @@ -239,16 +177,14 @@ Array createBinaryNode(const Array &lhs, const Array &rhs, const af: JIT::Node_ptr lhs_node = lhs.getNode(); JIT::Node_ptr rhs_node = rhs.getNode(); - - JIT::BinaryNode *node = new JIT::BinaryNode(irname(), - afShortName(), - bop.name, + JIT::BinaryNode *node = new JIT::BinaryNode(getFullName(), + shortname(true), + bop.name(), lhs_node, - rhs_node, - (int)(op), - bop.call_type); + rhs_node, (int)(op)); - return createNodeArray(odims, JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(odims, JIT::Node_ptr( + reinterpret_cast(node))); } } diff --git a/src/backend/cuda/cast.hpp b/src/backend/cuda/cast.hpp index 906620036e..9f854c6e06 100644 --- a/src/backend/cuda/cast.hpp +++ b/src/backend/cuda/cast.hpp @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include namespace cuda @@ -23,18 +23,86 @@ namespace cuda template struct CastOp { - std::string func; - CastOp() { - std::string tmp = std::string("___mk") + afShortName(); - func = cuMangledName(tmp.c_str()); + const char *name() + { + return ""; + } +}; + +#define CAST_FN(TYPE) \ + template \ + struct CastOp \ + { \ + const char *name() \ + { \ + return "("#TYPE")"; \ + } \ + }; + +CAST_FN(int) +CAST_FN(unsigned int) +CAST_FN(unsigned char) +CAST_FN(unsigned short) +CAST_FN(short) +CAST_FN(float) +CAST_FN(double) + +#define CAST_CFN(TYPE) \ + template \ + struct CastOp \ + { \ + 
const char *name() \ + { \ + return "__convert_"#TYPE; \ + } \ + }; + + +CAST_CFN(cfloat) +CAST_CFN(cdouble) +CAST_CFN(char) + + +template<> +struct CastOp +{ + const char *name() + { + return "__convert_z2c"; + } +}; + + +template<> +struct CastOp +{ + const char *name() + { + return "__convert_c2z"; + } +}; + +template<> +struct CastOp +{ + const char *name() + { + return "__convert_c2c"; } +}; + - const std::string name() +template<> +struct CastOp +{ + const char *name() { - return func; + return "__convert_z2z"; } }; +#undef CAST_FN +#undef CAST_CFN template struct CastWrapper @@ -43,8 +111,8 @@ struct CastWrapper { CastOp cop; JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), cop.name(), in_node, af_cast_t); return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); diff --git a/src/backend/cuda/complex.hpp b/src/backend/cuda/complex.hpp index 6082a5e194..67ba0ebe4e 100644 --- a/src/backend/cuda/complex.hpp +++ b/src/backend/cuda/complex.hpp @@ -6,66 +6,28 @@ * The complete license agreement can be obtained at: * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ + #include #include #include -#include -#include +#include #include namespace cuda { - template static const std::string cplx_name() { return cuMangledName("___noop"); } - template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } - template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } - - template static const std::string real_name() { return cuMangledName("___noop"); } - template<> STATIC_ const std::string real_name() { return cuMangledName("___real"); } - template<> STATIC_ const std::string real_name() { return cuMangledName("___real"); } - - template static const std::string imag_name() { return cuMangledName("___noop"); } - template<> STATIC_ const std::string imag_name() { return cuMangledName("___imag"); } - template<> STATIC_ const std::string imag_name() { return cuMangledName("___imag"); } - - template static const std::string abs_name() { return cuMangledName("___noop"); } -#if defined(USE_LIBDEVICE) - template<> STATIC_ const std::string abs_name() { return "@__nv_fabsf"; } - template<> STATIC_ const std::string abs_name() { return "@__nv_fabs" ; } -#else - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } -#endif - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - - template static const std::string conj_name() { return cuMangledName("___noop"); } - template<> STATIC_ const std::string conj_name() { return cuMangledName("___conj"); } - template<> STATIC_ const std::string conj_name() { return cuMangledName("___conj"); } - template Array cplx(const Array &lhs, const Array &rhs, const af::dim4 &odims) { - JIT::Node_ptr lhs_node = lhs.getNode(); - JIT::Node_ptr rhs_node = rhs.getNode(); - - JIT::BinaryNode *node = new JIT::BinaryNode(irname(), - afShortName(), - cplx_name(), - lhs_node, - rhs_node, - (int)(af_cplx2_t), - 0); - - return createNodeArray(odims, JIT::Node_ptr(reinterpret_cast(node))); + return createBinaryNode(lhs, rhs, odims); } template Array real(const Array &in) { JIT::Node_ptr in_node = in.getNode(); - 
JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), - real_name(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), + "__creal", in_node, af_real_t); return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); @@ -75,32 +37,40 @@ namespace cuda Array imag(const Array &in) { JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), - imag_name(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), + "__cimag", in_node, af_imag_t); return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); } + template static const char *abs_name() { return "fabs"; } + template<> STATIC_ const char *abs_name() { return "__cabsf"; } + template<> STATIC_ const char *abs_name() { return "__cabs"; } + template Array abs(const Array &in) { JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), abs_name(), in_node, af_abs_t); return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); } + template static const char *conj_name() { return "__noop"; } + template<> STATIC_ const char *conj_name() { return "__cconjf"; } + template<> STATIC_ const char *conj_name() { return "__cconj"; } + template Array conj(const Array &in) { JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), conj_name(), in_node, af_conj_t); diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp index a7870aa7bd..5c655b2247 100644 --- a/src/backend/cuda/jit.cpp +++ b/src/backend/cuda/jit.cpp @@ -12,35 +12,10 @@ #include #include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__LIBDEVICE_COMPUTE_20) -#include -#endif - -#if defined(__LIBDEVICE_COMPUTE_30) -#include -#endif - -#if defined(__LIBDEVICE_COMPUTE_35) -#include -#endif - -#if defined(__LIBDEVICE_COMPUTE_50) -#include -#endif - #include #include #include #include -#include #include #include @@ -48,6 +23,8 @@ #include #include #include +#include +#include namespace cuda { @@ -55,8 +32,6 @@ namespace cuda using JIT::Node; using JIT::Node_ids; using JIT::Node_map_t; -using JIT::str_map_iter; -using JIT::str_map_t; using std::hash; using std::lock_guard; @@ -67,12 +42,6 @@ using std::stringstream; using std::unique_ptr; using std::vector; -const char *layout64 = "target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"\n\n\n"; -const char *layout32 = "target datalayout = \"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"\n\n\n"; - -const char *triple64 = "target triple = \"nvptx64-unknown-cuda\"\n\n"; -const char *triple32 = "target triple = \"nvptx-unknown-cuda\"\n\n"; - static string getFuncName(const vector &output_nodes, const vector &full_nodes, const vector &full_ids, @@ -94,7 +63,7 @@ static string getFuncName(const vector &output_nodes, hash hash_fn; - hashName << "@KER"; + hashName << "KER"; hashName << hash_fn(funcName.str()); return hashName.str(); } @@ -105,404 +74,207 @@ static string getKernelString(const string funcName, const vector &output_ids, bool is_linear) { - static const char *defineVoid = "define void "; - static const char 
*generalDimParams = "\n" - "i32 %ostr0, i32 %ostr1, i32 %ostr2, i32 %ostr3,\n" - "i32 %odim0, i32 %odim1, i32 %odim2, i32 %odim3,\n" - "i32 %blkx, i32 %blky, i32 %ndims"; - - static const char *linearDimParams = "\n" - "i32 %nelem, i32 %blkx, i32 %blky"; - - const char *dimParams = is_linear ? linearDimParams : generalDimParams; - - static const char *blockStart = "\n{\n\n" - "entry:\n\n"; - static const char *blockEnd = "\n\n" - "ret void\n" - "\n\n}\n"; - - static const char *idAlias = "\n" - "%tidx = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n" - "%bdmx = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()\n" - "%bidx = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()\n" - "%bidy = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()\n" - "%gdmx = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()\n" - "\n\n"; - static const char *earlyExit = "\n" - "end:\n\n" - "ret void\n"; - static const char *core = "\n" - "core:\n\n"; - static const char *generalIndex = "\n" - "%tidy = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()\n" - "%bdmy = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()\n" - "%blk_x = alloca i32, align 4\n" - "%blk_y = alloca i32, align 4\n" - "%id_3 = alloca i32, align 4\n" - "%id_2 = alloca i32, align 4\n" - "store i32 %bidx, i32* %blk_x, align 4\n" - "store i32 %bidy, i32* %blk_y, align 4\n" - "store i32 0, i32* %id_2, align 4\n" - "store i32 0, i32* %id_3, align 4\n" - "%two = alloca i32, align 4\n" - "store i32 2, i32* %two, align 4\n" - "%twoval = load i32* %two, align 4\n" - "%is34 = icmp sgt i32 %ndims, %twoval\n" - "br i1 %is34, label %do34, label %do2\n" - "\ndo34:\n" - "%id2t = sdiv i32 %bidx, %blkx\n" - "store i32 %id2t, i32* %id_2, align 4\n" - "%id2m = mul i32 %id2t, %blkx\n" - "%blk_xx = sub i32 %bidx, %id2m\n" - "store i32 %blk_xx, i32* %blk_x, align 4\n" - "%three = alloca i32, align 4\n" - "store i32 3, i32* %three, align 4\n" - "%threeval = load i32* %three, align 4\n" - "%is4 = icmp sgt i32 %ndims, %threeval\n" - "br i1 %is4, label %do4, label %do2\n" - "\ndo4:\n" - "%id3t = sdiv i32 %bidy, %blky\n" - "store i32 %id3t, i32* %id_3, align 4\n" - "%id3m = mul i32 %id3t, %blky\n" - "%blk_yy = sub i32 %bidy, %id3m\n" - "store i32 %blk_yy, i32* %blk_y, align 4\n" - "br label %do2\n" - "\ndo2:\n" - "%id2 = load i32* %id_2, align 4\n" - "%id3 = load i32* %id_3, align 4\n" - "%tmp_x = load i32* %blk_x, align 4\n" - "%id0m = mul i32 %tmp_x, %bdmx\n" - "%id0 = add i32 %tidx, %id0m\n" - "%tmp_y = load i32* %blk_y, align 4\n" - "%id1m = mul i32 %tmp_y, %bdmy\n" - "%id1 = add i32 %tidy, %id1m\n" - "\n\n" - "%off3o = mul i32 %id3, %ostr3\n" - "%off2o = mul i32 %id2, %ostr2\n" - "%off1o = mul i32 %id1, %ostr1\n" - "%off23o = add i32 %off3o, %off2o\n" - "%off123o = add i32 %off23o, %off1o\n" - "%idxa = add i32 %off123o, %id0\n" - "%idx = sext i32 %idxa to i64\n" - "\n\n" - "%cmp3 = icmp slt i32 %id3, %odim3\n" - "%cmp2 = icmp slt i32 %id2, %odim2\n" - "%cmp1 = icmp slt i32 %id1, %odim1\n" - "%cmp0 = icmp slt i32 %id0, %odim0\n" - "br i1 %cmp3, label %check2, label %end\n" - "\ncheck2:\n" - "br i1 %cmp2, label %check1, label %end\n" - "\ncheck1:\n" - "br i1 %cmp1, label %check0, label %end\n" - "\ncheck0:\n" - "br i1 %cmp0, label %core, label %end\n"; + const char *includeFileStr = jit_cuh; + + const char paramTStr[] ="" + "template\n" + "struct Param\n" + "{\n" + " T *ptr;\n" + " dim_t dims[4];\n" + " dim_t strides[4];\n" + "};\n"; + + std::string typedefStr = "typedef unsigned int uint;\n"; + typedefStr += "typedef "; + typedefStr += getFullName(); + typedefStr += " dim_t;\n"; + + // Common CUDA code + // This 
part of the code does not change with the kernel. + + static const char *kernelVoid = "extern \"C\" __global__ void\n"; + static const char *dimParams = "uint blocks_x, uint blocks_y, uint num_odims"; + static const char *blockStart = "{\n\n"; + static const char *blockEnd = "\n\n}"; static const char *linearIndex = "\n" - "%boff = mul i32 %bidy, %gdmx\n" - "%bid = add i32 %boff, %bidx\n" - "%goff = mul i32 %bid , %bdmx\n" - "%gid = add i32 %goff ,%tidx\n" - "%idx = sext i32 %gid to i64\n" - "%cmp0 = icmp slt i32 %gid, %nelem\n" - "br i1 %cmp0, label %core, label %end\n"; - - static const char *functionLoad = "\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone\n" - "\n"; + "uint blockId = blockIdx.y * gridDim.x + blockIdx.x;\n" + "uint threadId = threadIdx.x;\n" + "int idx = blockId * blockDim.x * blockDim.y + threadId;\n" + "if (idx >= outref.dims[3] * outref.strides[3]) return;\n"; + + static const char *generalIndex = "\n" + "uint id0 = 0, id1 = 0, id2 = 0, id3 = 0;\n" + "if (num_odims > 2) {\n" + "id2 = blockIdx.x / blocks_x;\n" + "id0 = blockIdx.x - id2 * blocks_x;\n" + "id0 = threadIdx.x + id0 * blockDim.x;\n" + "if (num_odims > 3) {\n" + "id3 = blockIdx.y / blocks_y;\n" + "id1 = blockIdx.y - id3 * blocks_y;\n" + "id1 = threadIdx.y + id1 * blockDim.y;\n" + "} else {\n" + "id1 = threadIdx.y + blockDim.y * blockIdx.y;\n" + "}\n" + " } else {\n" + "id3 = 0;\n" + "id2 = 0;\n" + "id1 = threadIdx.y + blockDim.y * blockIdx.y;\n" + "id0 = threadIdx.x + blockDim.x * blockIdx.x;\n" + "}\n" + "bool cond = \n" + "id0 < outref.dims[0] && \n" + "id1 < outref.dims[1] && \n" + "id2 < outref.dims[2] && \n" + "id3 < outref.dims[3];\n\n" + "if (!cond) return;\n\n" + "int idx = " + "outref.strides[3] * id3 + outref.strides[2] * id2 + " + "outref.strides[1] * id1 + id0;\n\n"; - stringstream kerStream; - stringstream inAnnStream; - stringstream outAnnStream; stringstream inParamStream; stringstream outParamStream; - stringstream funcBodyStream; - stringstream offsetsStream; stringstream outWriteStream; - str_map_t declStrs; - - vector types_output(output_ids.size()); - for (int i = 0; i < (int)output_ids.size(); i++) { - types_output[i] = full_nodes[output_ids[i]]->getTypeStr(); - } + stringstream offsetsStream; + stringstream opsStream; + stringstream outrefstream; for (int i = 0; i < (int)full_nodes.size(); i++) { const auto &node = full_nodes[i]; const auto &ids_curr = full_ids[i]; - // Generate input parameters, needs only current id - node->genParams(inParamStream, inAnnStream, ids_curr.id, is_linear); - // Generate input offsets, needs only current id + // Generate input parameters, only needs current id + node->genParams(inParamStream, ids_curr.id, is_linear); + // Generate input offsets, only needs current id node->genOffsets(offsetsStream, ids_curr.id, is_linear); - // Generate the core function body, needs children id as well - node->genFuncs(funcBodyStream, declStrs, ids_curr, is_linear); + // Generate the core function body, needs children ids as well + node->genFuncs(opsStream, ids_curr); } + outrefstream << "Param<" << full_nodes[output_ids[0]]->getTypeStr() 
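// Editor's illustration (hypothetical, not emitted verbatim by the patch): for a
// JIT tree such as sin(in0) + in1, the streams above stitch together into a
// kernel of roughly this shape:
//
//   extern "C" __global__ void <funcName>(Param<float> in0, Param<float> in1,
//                                         Param<float> out3,
//                                         uint blocks_x, uint blocks_y,
//                                         uint num_odims)
//   {
//       Param<float> outref = out3;
//       /* linearIndex or generalIndex computes idx, or returns early */
//       int idx0 = ..., idx1 = ...;          // offsetsStream
//       float val0 = in0.ptr[idx0];          // opsStream (exact shape assumed)
//       float val1 = sin(val0);
//       float val2 = in1.ptr[idx1];
//       float val3 = __add(val1, val2);
//       out3.ptr[idx] = val3;                // outWriteStream
//   }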
+ << "> outref = out" << output_ids[0] << ";\n"; + for (int i = 0; i < (int)output_ids.size(); i++) { int id = output_ids[i]; - string outTypeStr = types_output[i]; - // Generate output parameters - outParamStream << outTypeStr << "* %out" << id << ",\n"; - - // Generate instruction to write output - outWriteStream << "%outIdx" << id - << "= getelementptr inbounds " - << outTypeStr - << "* %out" << id - << ", i64 %idx\n"; - outWriteStream << "store " - << outTypeStr - << " %val" << id << ", " - << outTypeStr - << "* %outIdx" << id << "\n"; - - // Generate output annotation string - outAnnStream << outTypeStr << "*,\n"; - } - - if (sizeof(void *) == 8) { - kerStream << layout64; - kerStream << triple64; - } else { - kerStream << layout32; - kerStream << triple32; + outParamStream << "Param<" << full_nodes[id]->getTypeStr() << "> out" << id << ", \n"; + // Generate code to write the output + outWriteStream << "out" << id << ".ptr[idx] = " << "val" << id << ";\n"; } - const char *index = is_linear ? linearIndex : generalIndex; - - kerStream << defineVoid - << funcName - << " (\n" - << inParamStream.str() - << outParamStream.str() - << dimParams - << " )\n" - << blockStart - << idAlias - << index - << earlyExit - << core - << offsetsStream.str() - << funcBodyStream.str() - << outWriteStream.str() - << blockEnd; - - for(str_map_iter iterator = declStrs.begin(); - iterator != declStrs.end(); iterator++) { - kerStream << iterator->first << "\n"; - } - kerStream << functionLoad; - - kerStream << "!nvvm.annotations = !{!1}\n" - "!1 = metadata !{void (\n" - << inAnnStream.str() - << outAnnStream.str(); - + // Put various blocks into a single stream + stringstream kerStream; + kerStream << typedefStr; + kerStream << paramTStr; + kerStream << includeFileStr << "\n\n"; + kerStream << kernelVoid; + kerStream << funcName; + kerStream << "(\n"; + kerStream << inParamStream.str(); + kerStream << outParamStream.str(); + kerStream << dimParams; + kerStream << ")\n"; + kerStream << blockStart; + kerStream << outrefstream.str(); if (is_linear) { - kerStream << "i32, i32, i32\n"; + kerStream << linearIndex; } else { - kerStream << "i32, i32, i32, i32,\n" - "i32, i32, i32, i32,\n" - "i32, i32, i32\n"; + kerStream << generalIndex; } - - kerStream << ")* " << funcName << ",\n " - << "metadata !\"kernel\", i32 1}\n"; + kerStream << offsetsStream.str(); + kerStream << opsStream.str(); + kerStream << outWriteStream.str(); + kerStream << blockEnd; return kerStream.str(); } -#define NVVM_CHECK(fn, msg) do { \ - nvvmResult res = fn; \ - if (res == NVVM_SUCCESS) break; \ - char nvvm_err_msg[1024]; \ - snprintf(nvvm_err_msg, \ - sizeof(nvvm_err_msg), \ - "NVVM Error (%d): %s\n", \ - (int)(res), msg); \ - AF_ERROR(nvvm_err_msg, \ - AF_ERR_INTERNAL); \ - \ - } while(0) - -#if defined(USE_LIBDEVICE) -void compute_to_libdevice_table(const char **buffer, size_t *bc_buffer_len, int compute) -{ -// These macros create a fallback compute if in case the specific libdevice -// compute is not found -// 50 -> 30 -> 20 -> Not Found -// 35 -> 30 -> 20 -> Not Found -// 30 -> 20 -> Not Found -// 20 -> Not Found -#if defined(__LIBDEVICE_COMPUTE_20) - #define COMPUTE_20_STR compute_20_bc - #define COMPUTE_20_LEN compute_20_bc_len -#else - #define COMPUTE_20_STR NULL - #define COMPUTE_20_LEN 0 -#endif - -#if defined(__LIBDEVICE_COMPUTE_30) - #define COMPUTE_30_STR compute_30_bc - #define COMPUTE_30_LEN compute_30_bc_len -#else // Fallback - #define COMPUTE_30_STR COMPUTE_20_STR - #define COMPUTE_30_LEN COMPUTE_20_LEN -#endif - -#if 
defined(__LIBDEVICE_COMPUTE_35) - #define COMPUTE_35_STR compute_35_bc - #define COMPUTE_35_LEN compute_35_bc_len -#else // Fallback - #define COMPUTE_35_STR COMPUTE_30_STR - #define COMPUTE_35_LEN COMPUTE_30_LEN -#endif - -#if defined(__LIBDEVICE_COMPUTE_50) - #define COMPUTE_50_STR compute_50_bc - #define COMPUTE_50_LEN compute_50_bc_len -#else // Fallback - #define COMPUTE_50_STR COMPUTE_30_STR - #define COMPUTE_50_LEN COMPUTE_30_LEN -#endif - - // Source: http://docs.nvidia.com/cuda/libdevice-users-guide/basic-usage.html#version-selection - if(compute >= 20 && compute < 30) { - *buffer = COMPUTE_20_STR; - *bc_buffer_len = COMPUTE_20_LEN; - } else if (compute == 30) { - *buffer = COMPUTE_30_STR; - *bc_buffer_len = COMPUTE_30_LEN; - } else if (compute >= 31 && compute < 35) { - *buffer = COMPUTE_20_STR; - *bc_buffer_len = COMPUTE_20_LEN; - } else if (compute >= 35 && compute <= 37) { - *buffer = COMPUTE_35_STR; - *bc_buffer_len = COMPUTE_35_LEN; - } else if (compute > 37 && compute < 50) { - *buffer = COMPUTE_30_STR; - *bc_buffer_len = COMPUTE_30_LEN; - } else if (compute >= 50 && compute <= 53) { - *buffer = COMPUTE_50_STR; - *bc_buffer_len = COMPUTE_50_LEN; - } else if (compute > 53) { - *buffer = COMPUTE_30_STR; - *bc_buffer_len = COMPUTE_30_LEN; - } else { - *buffer = COMPUTE_30_STR; - *bc_buffer_len = COMPUTE_30_LEN; - } -} -#endif - -static unique_ptr irToPtx(string IR, size_t *ptx_size) -{ - nvvmProgram prog; - - NVVM_CHECK(nvvmCreateProgram(&prog), "Failed to create program"); - -#if defined(USE_LIBDEVICE) - // Get compute version of device - cudaDeviceProp devProp = getDeviceProp(getActiveDeviceId()); - int compute = devProp.major * 10 + devProp.minor; - const char *bc_buffer = NULL; - size_t bc_buffer_len = 0; - compute_to_libdevice_table(&bc_buffer, &bc_buffer_len, compute); - if(bc_buffer) - NVVM_CHECK(nvvmAddModuleToProgram(prog, bc_buffer, bc_buffer_len, "libdevice kernels"), - "Failed to add libdevice"); - else - NVVM_CHECK(nvvmAddModuleToProgram(prog, IR.c_str(), IR.size(), "generated kernel"), - "Failed to add module"); -#endif - - NVVM_CHECK(nvvmAddModuleToProgram(prog, IR.c_str(), IR.size(), "generated kernel"), - "Failed to add module"); - - //FIXME: Use proper compute - const char *options = NULL; - const int noptions = 0; - -//#ifdef NDEBUG -#if 0 - NVVM_CHECK(nvvmCompileProgram(prog, noptions, &options), "Failed to compile program"); -#else - nvvmResult comp_res = nvvmCompileProgram(prog, noptions, &options); - if (comp_res != NVVM_SUCCESS) { - size_t log_size = 0; - nvvmGetProgramLogSize(prog, &log_size); - printf("%ld, %zu\n", IR.size(), log_size); - unique_ptr log(new char[log_size]); - nvvmGetProgramLog(prog, log.get()); - printf("LOG:\n%s\n%s", log.get(), IR.c_str()); - NVVM_CHECK(comp_res, "Failed to compile program"); - } -#endif - - NVVM_CHECK(nvvmGetCompiledResultSize(prog, ptx_size), "Can not get ptx size"); - - unique_ptr ptx{new char[*ptx_size]}; - NVVM_CHECK(nvvmGetCompiledResult(prog, ptx.get()), "Can not get ptx from NVVM IR"); - NVVM_CHECK(nvvmDestroyProgram(&prog), "Failed to destroy program"); - return ptx; -} - typedef struct { CUmodule prog; CUfunction ker; } kc_entry_t; - -const size_t size = 1024; -char linkInfo[size]; -char linkError[size]; - -#ifndef NDEBUG #define CU_CHECK(fn) do { \ CUresult res = fn; \ if (res == CUDA_SUCCESS) break; \ char cu_err_msg[1024]; \ snprintf(cu_err_msg, \ sizeof(cu_err_msg), \ - "CU Error (%d)\n%s\n", \ - (int)(res), linkError); \ + "CU Error (%d)\n", \ + (int)(res)); \ AF_ERROR(cu_err_msg, \ AF_ERR_INTERNAL); 
\ } while(0) -#else -#define CU_CHECK(fn) do { \ + +#ifndef NDEBUG +#define CU_LINK_CHECK(fn) do { \ CUresult res = fn; \ if (res == CUDA_SUCCESS) break; \ char cu_err_msg[1024]; \ snprintf(cu_err_msg, \ sizeof(cu_err_msg), \ - "CU Error (%d)\n", \ - (int)(res)); \ + "CU Error (%d)\n%s\n", \ + (int)(res), linkError); \ AF_ERROR(cu_err_msg, \ AF_ERR_INTERNAL); \ } while(0) +#else +#define CU_LINK_CHECK(fn) CU_CHECK(fn) +#endif + +#ifndef NDEBUG +#define NVRTC_CHECK(fn) do { \ + nvrtcResult res = fn; \ + if (res == NVRTC_SUCCESS) break; \ + size_t logSize; \ + nvrtcGetProgramLogSize(prog, &logSize); \ + unique_ptr log(new char[logSize]); \ + char *logptr = log.get(); \ + nvrtcGetProgramLog(prog, logptr); \ + logptr[logSize] = '\x0'; \ + printf("%s\n", logptr); \ + AF_ERROR("NVRTC ERROR", \ + AF_ERR_INTERNAL); \ + } while(0) +#else +#define NVRTC_CHECK(fn) do { \ + nvrtcResult res = fn; \ + if (res == NVRTC_SUCCESS) break; \ + char nvrtc_err_msg[1024]; \ + snprintf(nvrtc_err_msg, \ + sizeof(nvrtc_err_msg), \ + "NVRTC Error(%d): %s\n", \ + res, nvrtcGetErrorString(res)); \ + AF_ERROR(nvrtc_err_msg, \ + AF_ERR_INTERNAL); \ + } while(0) #endif +unique_ptr compileToPTX(const char *ker_name, string jit_ker, size_t *ptx_size) +{ + nvrtcProgram prog; + NVRTC_CHECK(nvrtcCreateProgram(&prog, jit_ker.c_str(), ker_name, 0, NULL, NULL)); + NVRTC_CHECK(nvrtcCompileProgram(prog, 0, NULL)); + NVRTC_CHECK(nvrtcGetPTXSize(prog, ptx_size)); + unique_ptr ptx(new char[*ptx_size]); + NVRTC_CHECK(nvrtcGetPTX(prog, ptx.get())); + return ptx; +} + static kc_entry_t compileKernel(const char *ker_name, string jit_ker) { lock_guard lock(getDriverApiMutex(getActiveDeviceId())); + const size_t linkLogSize = 1024; + char linkInfo[linkLogSize] = {0}; + char linkError[linkLogSize] = {0}; + size_t ptx_size; - unique_ptr ptx(irToPtx(jit_ker, &ptx_size)); + auto ptx = compileToPTX(ker_name, jit_ker, &ptx_size); CUlinkState linkState; - - linkInfo[0] = 0; - linkError[0] = 0; - CUjit_option linkOptions[] = { CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, @@ -513,49 +285,25 @@ static kc_entry_t compileKernel(const char *ker_name, string jit_ker) void *linkOptionValues[] = { linkInfo, - reinterpret_cast(1024), + reinterpret_cast(linkLogSize), linkError, - reinterpret_cast(1024), + reinterpret_cast(linkLogSize), reinterpret_cast(1) }; - CU_CHECK(cuLinkCreate(5, linkOptions, linkOptionValues, &linkState)); - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)ptx.get(), - ptx_size, ker_name, 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)arith_ptx, - arith_ptx_len, "arith", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)cast_ptx, - cast_ptx_len, "cast", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)exp_ptx, - exp_ptx_len, "exp", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)hyper_ptx, - hyper_ptx_len, "hyper", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)logic_ptx, - logic_ptx_len, "logic", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)numeric_ptx, - numeric_ptx_len, "numeric", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)trig_ptx, - trig_ptx_len, "trig", 0, NULL, NULL)); + CU_LINK_CHECK(cuLinkCreate(5, linkOptions, linkOptionValues, &linkState)); + CU_LINK_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)ptx.get(), + ptx_size, ker_name, 0, NULL, NULL)); void *cubin; size_t 
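// Editor's note: the compile path after this patch is
//   CUDA C++ source --nvrtc--> PTX --cuLinkCreate/AddData/Complete--> cubin
//   --cuModuleLoadDataEx--> CUmodule --cuModuleGetFunction--> CUfunction.
// Because jit.cuh is now compiled from source along with each kernel, the old
// step of linking the pre-built arith/cast/exp/hyper/logic/numeric/trig PTX
// modules into every program (the cuLinkAddData calls deleted above) goes
// away, and the kernel is looked up by its plain extern "C" name (ker_name
// rather than ker_name + 1, as the change just below shows).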
cubinSize; CUmodule module; CUfunction kernel; - - CU_CHECK(cuLinkComplete(linkState, &cubin, &cubinSize)); + CU_LINK_CHECK(cuLinkComplete(linkState, &cubin, &cubinSize)); CU_CHECK(cuModuleLoadDataEx(&module, cubin, 0, 0, 0)); - CU_CHECK(cuModuleGetFunction(&kernel, module, ker_name + 1)); - + CU_CHECK(cuModuleGetFunction(&kernel, module, ker_name)); kc_entry_t entry = {module, kernel}; - return entry; } @@ -659,40 +407,12 @@ void evalNodes(vector >&outputs, vector output_nodes) } for (int i = 0; i < num_outputs; i++) { - args.push_back(&outputs[i].ptr); - } - - // DO NOT PUT THESE IN A SCOPE. - // The pointers are used later. - // Scoping them results in undefined behavior. - - int strides[] = {(int)outputs[0].strides[0], - (int)outputs[0].strides[1], - (int)outputs[0].strides[2], - (int)outputs[0].strides[3]}; - - int dims[] = {(int)outputs[0].dims[0], - (int)outputs[0].dims[1], - (int)outputs[0].dims[2], - (int)outputs[0].dims[3]}; - - if (is_linear) { - int nelem = 1; - for (int i = 0; i < 4; i++) { - nelem *= outputs[0].dims[i]; - } - args.push_back((void *)&nelem); - } else { - for (int i = 0; i < 4; i++) args.push_back((void *)(strides + i)); - for (int i = 0; i < 4; i++) args.push_back((void *)(dims + i)); + args.push_back((void *)&outputs[i]); } args.push_back((void *)&blocks_x_); args.push_back((void *)&blocks_y_); - - if (!is_linear) { - args.push_back((void *)&num_odims); - } + args.push_back((void *)&num_odims); lock_guard lock(getDriverApiMutex(getActiveDeviceId())); CU_CHECK(cuLaunchKernel(ker, diff --git a/src/backend/cuda/kernel/jit.cuh b/src/backend/cuda/kernel/jit.cuh new file mode 100644 index 0000000000..0868d1f3df --- /dev/null +++ b/src/backend/cuda/kernel/jit.cuh @@ -0,0 +1,205 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +typedef float2 cuFloatComplex; +typedef cuFloatComplex cfloat; + +typedef double2 cuDoubleComplex; +typedef cuDoubleComplex cdouble; + +// ---------------------------------------------- +// REAL NUMBER OPERATIONS +// ---------------------------------------------- +#define sign(in) signbit((in)) +#define __noop(a) (a) +#define __add(lhs, rhs) (lhs) + (rhs) +#define __sub(lhs, rhs) (lhs) - (rhs) +#define __mul(lhs, rhs) (lhs) * (rhs) +#define __div(lhs, rhs) (lhs) / (rhs) +#define __and(lhs, rhs) (lhs) && (rhs) +#define __or(lhs, rhs) (lhs) || (rhs) + +#define __lt(lhs, rhs) (lhs) < (rhs) +#define __gt(lhs, rhs) (lhs) > (rhs) +#define __le(lhs, rhs) (lhs) <= (rhs) +#define __ge(lhs, rhs) (lhs) >= (rhs) +#define __eq(lhs, rhs) (lhs) == (rhs) +#define __neq(lhs, rhs) (lhs) != (rhs) + +#define __conj(in) (in) +#define __real(in) (in) +#define __imag(in) (0) +#define __abs(in) abs(in) +#define __sigmoid(in) (1.0/(1 + exp(-(in)))) + +#define __bitor(lhs, rhs) ((lhs) | (rhs)) +#define __bitand(lhs, rhs) ((lhs) & (rhs)) +#define __bitxor(lhs, rhs) ((lhs) ^ (rhs)) +#define __bitshiftl(lhs, rhs) ((lhs) << (rhs)) +#define __bitshiftr(lhs, rhs) ((lhs) >> (rhs)) + +#define __min(lhs, rhs) ((lhs) < (rhs)) ? (lhs) : (rhs) +#define __max(lhs, rhs) ((lhs) > (rhs)) ? 
(lhs) : (rhs) +#define __rem(lhs, rhs) ((lhs) % (rhs)) +#define __mod(lhs, rhs) ((lhs) % (rhs)) +#define __pow(lhs, rhs) fpow((float)lhs, (float)rhs) + +#define __convert_char(val) (char)((val) != 0) +#define fpow(lhs, rhs) pow((lhs), (rhs)) +#define frem(lhs, rhs) remainder((lhs), (rhs)) +#define iszero(a) ((a) == 0) + +// ---------------------------------------------- +// COMPLEX FLOAT OPERATIONS +// ---------------------------------------------- + +#define __crealf(in) ((in).x) +#define __cimagf(in) ((in).y) +#define __cabsf(in) hypotf(in.x, in.y) + +__device__ cfloat __cplx2f(float x, float y) +{ + cfloat res = {x, y}; + return res; +} + +__device__ cfloat __cconjf(cfloat in) +{ + cfloat res = {in.x, -in.y}; + return res; +} + +__device__ cfloat __caddf(cfloat lhs, cfloat rhs) +{ + cfloat res = {lhs.x + rhs.x, lhs.y + rhs.y}; + return res; +} + +__device__ cfloat __csubf(cfloat lhs, cfloat rhs) +{ + cfloat res = {lhs.x - rhs.x, lhs.y - rhs.y}; + return res; +} + +__device__ cfloat __cmulf(cfloat lhs, cfloat rhs) +{ + cfloat out; + out.x = lhs.x * rhs.x - lhs.y * rhs.y; + out.y = lhs.x * rhs.y + lhs.y * rhs.x; + return out; +} + +__device__ cfloat __cdivf(cfloat lhs, cfloat rhs) +{ + // Normalize by absolute value and multiply + float rhs_abs = __cabsf(rhs); + float inv_rhs_abs = 1.0f / rhs_abs; + float rhs_x = inv_rhs_abs * rhs.x; + float rhs_y = inv_rhs_abs * rhs.y; + cfloat out = {lhs.x * rhs_x + lhs.y * rhs_y, + lhs.y * rhs_x - lhs.x * rhs_y}; + out.x *= inv_rhs_abs; + out.y *= inv_rhs_abs; + return out; +} + +__device__ cfloat __cminf(cfloat lhs, cfloat rhs) +{ + return __cabsf(lhs) < __cabsf(rhs) ? lhs : rhs; +} + +__device__ cfloat __cmaxf(cfloat lhs, cfloat rhs) +{ + return __cabsf(lhs) > __cabsf(rhs) ? lhs : rhs; +} +#define __candf(lhs, rhs) __cabsf(lhs) && __cabsf(rhs) +#define __corf(lhs, rhs) __cabsf(lhs) || __cabsf(rhs) +#define __ceqf(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y)) +#define __cneqf(lhs, rhs) !__ceqf((lhs), (rhs)) +#define __cltf(lhs, rhs) (__cabsf(lhs) < __cabsf(rhs)) +#define __clef(lhs, rhs) (__cabsf(lhs) <= __cabsf(rhs)) +#define __cgtf(lhs, rhs) (__cabsf(lhs) > __cabsf(rhs)) +#define __cgef(lhs, rhs) (__cabsf(lhs) >= __cabsf(rhs)) +#define __convert_cfloat(real) __cplx2f(real, 0) +#define __convert_c2c(in) (in) +#define __convert_z2c(in) __cplx2f((float)in.x, (float)in.y) + +// ---------------------------------------------- +// COMPLEX DOUBLE OPERATIONS +// ---------------------------------------------- +#define __creal(in) ((in).x) +#define __cimag(in) ((in).y) +#define __cabs(in) hypot(in.x, in.y) + +__device__ cdouble __cplx2(double x, double y) +{ + cdouble res = {x, y}; + return res; +} + +__device__ cdouble __cconj(cdouble in) +{ + cdouble res = {in.x, -in.y}; + return res; +} + +__device__ cdouble __cadd(cdouble lhs, cdouble rhs) +{ + cdouble res = {lhs.x + rhs.x, lhs.y + rhs.y}; + return res; +} + +__device__ cdouble __csub(cdouble lhs, cdouble rhs) +{ + cdouble res = {lhs.x - rhs.x, lhs.y - rhs.y}; + return res; +} + +__device__ cdouble __cmul(cdouble lhs, cdouble rhs) +{ + cdouble out; + out.x = lhs.x * rhs.x - lhs.y * rhs.y; + out.y = lhs.x * rhs.y + lhs.y * rhs.x; + return out; +} + +__device__ cdouble __cdiv(cdouble lhs, cdouble rhs) +{ + // Normalize by absolute value and multiply + double rhs_abs = __cabs(rhs); + double inv_rhs_abs = 1.0 / rhs_abs; + double rhs_x = inv_rhs_abs * rhs.x; + double rhs_y = inv_rhs_abs * rhs.y; + cdouble out = {lhs.x * rhs_x + lhs.y * rhs_y, + lhs.y * rhs_x - lhs.x * rhs_y}; + out.x *= inv_rhs_abs; 
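// Editor's note on the normalize-then-multiply scheme used above: the textbook
// form (lhs * conj(rhs)) / (rhs.x*rhs.x + rhs.y*rhs.y) overflows once |rhs|
// exceeds roughly 1.3e154 for doubles, even when the quotient itself is
// representable. Scaling rhs by 1/|rhs| first (with hypot, which itself avoids
// overflow) keeps every intermediate on the order of |lhs|. For example, with
// lhs = rhs = 1e200 + 0i:
//   naive:      den = 1e400 -> inf, num -> inf, result = nan
//   normalized: rhs/|rhs| = 1 + 0i, out = 1e200 * 1e-200 = 1, as expected.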
+ out.y *= inv_rhs_abs; + return out; +} + +__device__ cdouble __cmin(cdouble lhs, cdouble rhs) +{ + return __cabs(lhs) < __cabs(rhs) ? lhs : rhs; +} + +__device__ cdouble __cmax(cdouble lhs, cdouble rhs) +{ + return __cabs(lhs) > __cabs(rhs) ? lhs : rhs; +} +#define __cand(lhs, rhs) __cabs(lhs) && __cabs(rhs) +#define __cor(lhs, rhs) __cabs(lhs) || __cabs(rhs) +#define __ceq(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y)) +#define __cneq(lhs, rhs) !__ceq((lhs), (rhs)) +#define __clt(lhs, rhs) (__cabs(lhs) < __cabs(rhs)) +#define __cle(lhs, rhs) (__cabs(lhs) <= __cabs(rhs)) +#define __cgt(lhs, rhs) (__cabs(lhs) > __cabs(rhs)) +#define __cge(lhs, rhs) (__cabs(lhs) >= __cabs(rhs)) +#define __convert_cdouble(real) __cplx2(real, 0) +#define __convert_z2z(in) (in) +#define __convert_c2z(in) __cplx2((double)in.x, (double)in.y) diff --git a/src/backend/cuda/logic.hpp b/src/backend/cuda/logic.hpp index 8261a02d73..2c047ba8f8 100644 --- a/src/backend/cuda/logic.hpp +++ b/src/backend/cuda/logic.hpp @@ -7,11 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#include #include #include #include -#include #include +#include namespace cuda { diff --git a/src/backend/cuda/types.cpp b/src/backend/cuda/types.cpp index 8c29c00b45..b69eb661e7 100644 --- a/src/backend/cuda/types.cpp +++ b/src/backend/cuda/types.cpp @@ -10,91 +10,38 @@ #include #include "types.hpp" #include +#include namespace cuda { - template const char *cuShortName() { return "q"; } - template<> const char *cuShortName() { return "f"; } - template<> const char *cuShortName() { return "d"; } - template<> const char *cuShortName() { return "6float2"; } - template<> const char *cuShortName() { return "7double2"; } - template<> const char *cuShortName() { return "i"; } - template<> const char *cuShortName() { return "j"; } - template<> const char *cuShortName() { return "c"; } - template<> const char *cuShortName() { return "h"; } - template<> const char *cuShortName() { return "x"; } - template<> const char *cuShortName() { return "y"; } - template<> const char *cuShortName() { return "s"; } - template<> const char *cuShortName() { return "t"; } - template const char *afShortName(bool caps) { return caps ? "Q" : "q"; } - template<> const char *afShortName(bool caps) { return caps ? "S" : "s"; } - template<> const char *afShortName(bool caps) { return caps ? "D" : "d"; } - template<> const char *afShortName(bool caps) { return caps ? "C" : "c"; } - template<> const char *afShortName(bool caps) { return caps ? "Z" : "z"; } - template<> const char *afShortName(bool caps) { return caps ? "I" : "i"; } - template<> const char *afShortName(bool caps) { return caps ? "U" : "u"; } - template<> const char *afShortName(bool caps) { return caps ? "J" : "j"; } - template<> const char *afShortName(bool caps) { return caps ? "V" : "v"; } - template<> const char *afShortName(bool caps) { return caps ? "X" : "x"; } - template<> const char *afShortName(bool caps) { return caps ? "Y" : "y"; } - template<> const char *afShortName(bool caps) { return caps ? "P" : "P"; } - template<> const char *afShortName(bool caps) { return caps ? 
"Q" : "Q"; } - - template const char *irname() { return "i32"; } - template<> const char *irname() { return "float"; } - template<> const char *irname() { return "double"; } - template<> const char *irname() { return "<2 x float>"; } - template<> const char *irname() { return "<2 x double>"; } - template<> const char *irname() { return "i32"; } - template<> const char *irname() { return "i32"; } - template<> const char *irname() { return "i64"; } - template<> const char *irname() { return "i64"; } - template<> const char *irname() { return "i8"; } - template<> const char *irname() { return "i8"; } - template<> const char *irname() { return "i16"; } - template<> const char *irname() { return "i16"; } - - template - static inline std::string toString(T val) - { - std::stringstream s; - s << val; - return s.str(); - } - - template - const std::string cuMangledName(const char *fn) - { - std::string cname(cuShortName()); - std::string fname(fn); - size_t flen = fname.size(); - - std::string res = std::string("@_Z") + toString(flen) + fname + cname; - if (binary) { - if (cname.size() > 1) { - res = res + "S_"; - } else { - res = res + cname; - } - } - return res; - } - -#define INSTANTIATE(T) \ - template const std::string cuMangledName(const char *fn); \ - template const std::string cuMangledName(const char *fn); \ + template const char *shortname(bool caps) { return caps ? "Q" : "q"; } + template<> const char *shortname(bool caps) { return caps ? "S" : "s"; } + template<> const char *shortname(bool caps) { return caps ? "D" : "d"; } + template<> const char *shortname(bool caps) { return caps ? "C" : "c"; } + template<> const char *shortname(bool caps) { return caps ? "Z" : "z"; } + template<> const char *shortname(bool caps) { return caps ? "I" : "i"; } + template<> const char *shortname(bool caps) { return caps ? "U" : "u"; } + template<> const char *shortname(bool caps) { return caps ? "J" : "j"; } + template<> const char *shortname(bool caps) { return caps ? "V" : "v"; } + template<> const char *shortname(bool caps) { return caps ? "X" : "x"; } + template<> const char *shortname(bool caps) { return caps ? "Y" : "y"; } + template<> const char *shortname(bool caps) { return caps ? "P" : "P"; } + template<> const char *shortname(bool caps) { return caps ? 
"Q" : "Q"; } + +#define INSTANTIATE(T) \ + template<> const char *getFullName() { return #T; } \ INSTANTIATE(float) INSTANTIATE(double) INSTANTIATE(cfloat) INSTANTIATE(cdouble) INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) + INSTANTIATE(unsigned char) INSTANTIATE(short) - INSTANTIATE(ushort) + INSTANTIATE(unsigned short) + INSTANTIATE(int) + INSTANTIATE(unsigned int) + INSTANTIATE(unsigned long long) + INSTANTIATE(long long) } diff --git a/src/backend/cuda/types.hpp b/src/backend/cuda/types.hpp index 08aab5f374..b12200bdd8 100644 --- a/src/backend/cuda/types.hpp +++ b/src/backend/cuda/types.hpp @@ -10,6 +10,7 @@ #pragma once #include #include +#include namespace cuda { @@ -23,7 +24,6 @@ template struct is_complex { static const bool value = fals template<> struct is_complex { static const bool value = true; }; template<> struct is_complex { static const bool value = true; }; -template const std::string cuMangledName(const char *fn); -template const char *afShortName(bool caps = true); -template const char *irname(); +template const char *shortname(bool caps = true); +template const char *getFullName(); } diff --git a/src/backend/cuda/unary.hpp b/src/backend/cuda/unary.hpp index 291f805193..13b9690cef 100644 --- a/src/backend/cuda/unary.hpp +++ b/src/backend/cuda/unary.hpp @@ -10,176 +10,90 @@ #include #include #include -#include #include namespace cuda { +template +static const char *unaryName() { return "__noop"; } + +#define UNARY_DECL(OP, FNAME) \ + template<> STATIC_ \ + const char *unaryName() \ + { \ + return FNAME; \ + } \ + +#define UNARY_FN(OP) UNARY_DECL(OP, #OP) + +UNARY_FN(sin) +UNARY_FN(cos) +UNARY_FN(tan) + +UNARY_FN(asin) +UNARY_FN(acos) +UNARY_FN(atan) + +UNARY_FN(sinh) +UNARY_FN(cosh) +UNARY_FN(tanh) + +UNARY_FN(asinh) +UNARY_FN(acosh) +UNARY_FN(atanh) + +UNARY_FN(exp) +UNARY_DECL(sigmoid, "__sigmoid") +UNARY_FN(expm1) +UNARY_FN(erf) +UNARY_FN(erfc) + +UNARY_FN(tgamma) +UNARY_FN(lgamma) + +UNARY_FN(log) +UNARY_FN(log1p) +UNARY_FN(log10) +UNARY_FN(log2) + +UNARY_FN(sqrt) +UNARY_FN(cbrt) + +UNARY_FN(trunc) +UNARY_FN(round) +UNARY_FN(sign) +UNARY_FN(ceil) +UNARY_FN(floor) + +UNARY_FN(isinf) +UNARY_FN(isnan) +UNARY_FN(iszero) + template -struct UnOp +Array unaryOp(const Array &in) { - const char *name() - { - return "noop"; - } -}; - -#define UNARY_FN(fn) \ - template \ - struct UnOp \ - { \ - std::string res; \ - bool is_check; \ - UnOp() : \ - res(cuMangledName("___"#fn)), \ - is_check(false) \ - { \ - } \ - const std::string name() \ - { \ - return res; \ - } \ - }; \ - -#define UNARY_FN_NAME(op, fn) \ - template \ - struct UnOp \ - { \ - std::string res; \ - bool is_check; \ - UnOp() : \ - res(cuMangledName("___"#fn)), \ - is_check(false) \ - { \ - } \ - const std::string name() \ - { \ - return res; \ - } \ - }; \ - -#if defined(USE_LIBDEVICE) -#define NVVM_SPECIALIZE_TYPE(T, fn, fname) \ - template<> \ - struct UnOp \ - { \ - std::string res; \ - bool is_check; \ - UnOp() : \ - res("@__nv_"#fname), \ - is_check(false) \ - { \ - } \ - const std::string name() \ - { \ - return res; \ - } \ - }; \ - -#define NVVM_SPECIALIZE_CHECK(T, fn, fname) \ - template<> \ - struct UnOp \ - { \ - std::string res; \ - bool is_check; \ - UnOp() : \ - res("@__nv_"#fname), \ - is_check(true) \ - { \ - } \ - const std::string name() \ - { \ - return res; \ - } \ - }; \ - -#else -#define NVVM_SPECIALIZE_TYPE(T, fn, fname) // no specialization -#define NVVM_SPECIALIZE_CHECK(T, fn, fname) // no specialization -#endif - 
-#define NVVM_SPECIALIZE_FLOATING_NAME(fn, fname) \ - UNARY_FN(fn) \ - NVVM_SPECIALIZE_TYPE(float, fn, fname##f) \ - NVVM_SPECIALIZE_TYPE(double, fn, fname) \ - - -#define NVVM_SPECIALIZE_FLOATING(fn) \ - NVVM_SPECIALIZE_FLOATING_NAME(fn, fn) - -NVVM_SPECIALIZE_FLOATING(sin) -NVVM_SPECIALIZE_FLOATING(cos) -NVVM_SPECIALIZE_FLOATING(tan) -NVVM_SPECIALIZE_FLOATING(asin) -NVVM_SPECIALIZE_FLOATING(acos) -NVVM_SPECIALIZE_FLOATING(atan) -NVVM_SPECIALIZE_FLOATING(sinh) -NVVM_SPECIALIZE_FLOATING(cosh) -NVVM_SPECIALIZE_FLOATING(tanh) -NVVM_SPECIALIZE_FLOATING(asinh) -NVVM_SPECIALIZE_FLOATING(acosh) -NVVM_SPECIALIZE_FLOATING(atanh) -NVVM_SPECIALIZE_FLOATING(exp) -NVVM_SPECIALIZE_FLOATING(expm1) -NVVM_SPECIALIZE_FLOATING(erf) -NVVM_SPECIALIZE_FLOATING(erfc) -NVVM_SPECIALIZE_FLOATING(tgamma) -NVVM_SPECIALIZE_FLOATING(lgamma) -NVVM_SPECIALIZE_FLOATING(log) -NVVM_SPECIALIZE_FLOATING(log1p) -NVVM_SPECIALIZE_FLOATING(log10) -NVVM_SPECIALIZE_FLOATING(log2) -NVVM_SPECIALIZE_FLOATING(sqrt) -NVVM_SPECIALIZE_FLOATING(cbrt) -NVVM_SPECIALIZE_FLOATING(round) -NVVM_SPECIALIZE_FLOATING(trunc) -NVVM_SPECIALIZE_FLOATING(ceil) -NVVM_SPECIALIZE_FLOATING(floor) - -UNARY_FN(sign ) -NVVM_SPECIALIZE_CHECK(float , sign, signbitf) -NVVM_SPECIALIZE_CHECK(double, sign, signbitd) - -UNARY_FN_NAME(isnan, isNaN) -NVVM_SPECIALIZE_CHECK(float , isnan, isnanf) -NVVM_SPECIALIZE_CHECK(double, isnan, isnand) - -UNARY_FN_NAME(isinf, isINF) -NVVM_SPECIALIZE_CHECK(float , isinf, isinff) -NVVM_SPECIALIZE_CHECK(double, isinf, isinfd) - -UNARY_FN_NAME(iszero, iszero) -UNARY_FN(sigmoid) - -#undef UNARY_FN - - template - Array unaryOp(const Array &in) - { - - UnOp uop; - - JIT::Node_ptr in_node = in.getNode(); - - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), - uop.name(), - in_node, op, uop.is_check); - - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); - } - - template - Array checkOp(const Array &in) - { - UnOp uop; - - JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), - uop.name(), - in_node, op, uop.is_check); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); - } + JIT::Node_ptr in_node = in.getNode(); + + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), + unaryName(), + in_node, op); + + return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); +} + +template +Array checkOp(const Array &in) +{ + JIT::Node_ptr in_node = in.getNode(); + + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), + unaryName(), + in_node, op); + + return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); +} + } From 225a3e045c519e0325a6af44db5909bea34ae8d0 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 19 Jun 2017 11:16:29 -0700 Subject: [PATCH 2/4] Fixing underflow/overflow issues with complex math in opencl jit --- src/backend/opencl/kernel/jit.cl | 70 ++++++++++++++++---------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/src/backend/opencl/kernel/jit.cl b/src/backend/opencl/kernel/jit.cl index 3092449418..183e52be81 100644 --- a/src/backend/opencl/kernel/jit.cl +++ b/src/backend/opencl/kernel/jit.cl @@ -27,17 +27,14 @@ #define __real(in) (in) #define __imag(in) (0) #define __abs(in) abs(in) -#define __abs2(in) (in) * (in) #define __crealf(in) ((in).x) #define __cimagf(in) ((in).y) -#define __cabsf2(in) ((in).x * (in).x + (in).y * (in).y) -#define __cabsf(in) sqrt(__cabsf2(in)) +#define __cabsf(in) hypot((in).x, 
(in).y) #define __creal(in) ((in).x) #define __cimag(in) ((in).y) -#define __cabs2(in) ((in).x * (in).x + (in).y * (in).y) -#define __cabs(in) sqrt(__cabs2(in)) +#define __cabs(in) hypot((in).x, (in).y) #define __sigmoid(in) (1.0/(1 + exp(-(in)))) float2 __cconjf(float2 in) @@ -69,35 +66,37 @@ float2 __cmulf(float2 lhs, float2 rhs) // FIXME: overflow / underflow issues float2 __cdivf(float2 lhs, float2 rhs) { - float2 out; - float den = (rhs.x * rhs.x + rhs.y * rhs.y); - float2 num = __cmulf(lhs, __cconjf(rhs)); - - out.x = num.x / den; - out.y = num.y / den; - + // Normalize by absolute value and multiply + float rhs_abs = __cabsf(rhs); + float inv_rhs_abs = 1.0f / rhs_abs; + float rhs_x = inv_rhs_abs * rhs.x; + float rhs_y = inv_rhs_abs * rhs.y; + float2 out = {lhs.x * rhs_x + lhs.y * rhs_y, + lhs.y * rhs_x - lhs.x * rhs_y}; + out.x *= inv_rhs_abs; + out.y *= inv_rhs_abs; return out; } -#define __candf(lhs, rhs) __cabsf2(lhs) && __cabsf2(rhs) -#define __cand(lhs, rhs) __cabs2(lhs) && __cabs2(rhs) +#define __candf(lhs, rhs) __cabsf(lhs) && __cabsf(rhs) +#define __cand(lhs, rhs) __cabs(lhs) && __cabs(rhs) -#define __corf(lhs, rhs) __cabsf2(lhs) || __cabsf2(rhs) -#define __cor(lhs, rhs) __cabs2(lhs) || __cabs2(rhs) +#define __corf(lhs, rhs) __cabsf(lhs) || __cabsf(rhs) +#define __cor(lhs, rhs) __cabs(lhs) || __cabs(rhs) #define __ceqf(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y)) #define __cneqf(lhs, rhs) !__ceqf((lhs), (rhs)) -#define __cltf(lhs, rhs) (__cabsf2(lhs) < __cabsf2(rhs)) -#define __clef(lhs, rhs) (__cabsf2(lhs) <= __cabsf2(rhs)) -#define __cgtf(lhs, rhs) (__cabsf2(lhs) > __cabsf2(rhs)) -#define __cgef(lhs, rhs) (__cabsf2(lhs) >= __cabsf2(rhs)) +#define __cltf(lhs, rhs) (__cabsf(lhs) < __cabsf(rhs)) +#define __clef(lhs, rhs) (__cabsf(lhs) <= __cabsf(rhs)) +#define __cgtf(lhs, rhs) (__cabsf(lhs) > __cabsf(rhs)) +#define __cgef(lhs, rhs) (__cabsf(lhs) >= __cabsf(rhs)) #define __ceq(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y)) #define __cneq(lhs, rhs) !__ceq((lhs), (rhs)) -#define __clt(lhs, rhs) (__cabs2(lhs) < __cabs2(rhs)) -#define __cle(lhs, rhs) (__cabs2(lhs) <= __cabs2(rhs)) -#define __cgt(lhs, rhs) (__cabs2(lhs) > __cabs2(rhs)) -#define __cge(lhs, rhs) (__cabs2(lhs) >= __cabs2(rhs)) +#define __clt(lhs, rhs) (__cabs(lhs) < __cabs(rhs)) +#define __cle(lhs, rhs) (__cabs(lhs) <= __cabs(rhs)) +#define __cgt(lhs, rhs) (__cabs(lhs) > __cabs(rhs)) +#define __cge(lhs, rhs) (__cabs(lhs) >= __cabs(rhs)) #define __bitor(lhs, rhs) ((lhs) | (rhs)) #define __bitand(lhs, rhs) ((lhs) & (rhs)) @@ -113,12 +112,12 @@ float2 __cdivf(float2 lhs, float2 rhs) float2 __cminf(float2 lhs, float2 rhs) { - return __abs2(lhs) < __abs2(rhs) ? lhs : rhs; + return __cabsf(lhs) < __cabsf(rhs) ? lhs : rhs; } float2 __cmaxf(float2 lhs, float2 rhs) { - return __abs2(lhs) > __abs2(rhs) ? lhs : rhs; + return __cabsf(lhs) > __cabsf(rhs) ? 
lhs : rhs; } float2 __cplx2f(float lhs, float rhs) @@ -175,23 +174,26 @@ double2 __cmul(double2 lhs, double2 rhs) double2 __cdiv(double2 lhs, double2 rhs) { - double2 out; - double den = (rhs.x * rhs.x + rhs.y * rhs.y); - double2 num = __cmul(lhs, __cconj(rhs)); - - out.x = num.x / den; - out.y = num.y / den; + // Normalize by absolute value and multiply + double rhs_abs = __cabs(rhs); + double inv_rhs_abs = 1.0 / rhs_abs; + double rhs_x = inv_rhs_abs * rhs.x; + double rhs_y = inv_rhs_abs * rhs.y; + double2 out = {lhs.x * rhs_x + lhs.y * rhs_y, + lhs.y * rhs_x - lhs.x * rhs_y}; + out.x *= inv_rhs_abs; + out.y *= inv_rhs_abs; return out; } double2 __cmin(double2 lhs, double2 rhs) { - return __abs2(lhs) < __abs2(rhs) ? lhs : rhs; + return __cabs(lhs) < __cabs(rhs) ? lhs : rhs; } double2 __cmax(double2 lhs, double2 rhs) { - return __abs2(lhs) > __abs2(rhs) ? lhs : rhs; + return __cabs(lhs) > __cabs(rhs) ? lhs : rhs; } double2 __cplx2(double lhs, double rhs) From 81d74af7c30758a2815cec248652c55c9533abe4 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 20 Jun 2017 05:17:28 -0700 Subject: [PATCH 3/4] Style changes for JIT in CUDA and OpenCL backends --- src/backend/cuda/Array.cpp | 4 +- src/backend/cuda/JIT/BufferNode.hpp | 8 +- src/backend/cuda/JIT/Node.hpp | 2 +- src/backend/cuda/binary.hpp | 5 +- src/backend/cuda/cast.hpp | 6 +- src/backend/cuda/complex.hpp | 8 +- src/backend/cuda/jit.cpp | 146 +++++++++++++------------- src/backend/cuda/kernel/jit.cuh | 2 +- src/backend/cuda/scalar.hpp | 3 +- src/backend/cuda/types.cpp | 2 +- src/backend/cuda/types.hpp | 2 +- src/backend/cuda/unary.hpp | 4 +- src/backend/opencl/Array.cpp | 4 +- src/backend/opencl/JIT/BufferNode.hpp | 9 +- src/backend/opencl/binary.hpp | 3 +- src/backend/opencl/cast.hpp | 6 +- src/backend/opencl/complex.hpp | 8 +- src/backend/opencl/jit.cpp | 75 ++++++------- src/backend/opencl/kernel/jit.cl | 2 +- src/backend/opencl/scalar.hpp | 3 +- src/backend/opencl/types.cpp | 2 +- src/backend/opencl/types.hpp | 2 +- src/backend/opencl/unary.hpp | 4 +- 23 files changed, 146 insertions(+), 164 deletions(-) diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index da98060331..cec91d5c04 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -33,8 +33,8 @@ namespace cuda template Node_ptr bufferNodePtr() { - return Node_ptr(reinterpret_cast(new BufferNode(getFullName(), - shortname(true)))); + return Node_ptr(new BufferNode(getFullName(), + shortname(true))); } template diff --git a/src/backend/cuda/JIT/BufferNode.hpp b/src/backend/cuda/JIT/BufferNode.hpp index 6c4c2fb0a2..4c14f6c2d5 100644 --- a/src/backend/cuda/JIT/BufferNode.hpp +++ b/src/backend/cuda/JIT/BufferNode.hpp @@ -26,8 +26,8 @@ namespace JIT std::shared_ptr m_data; Param m_param; unsigned m_bytes; - bool m_linear_buffer; std::once_flag m_set_data_flag; + bool m_linear_buffer; public: @@ -39,10 +39,6 @@ namespace JIT bool isBuffer() { return true; } - ~BufferNode() - { - } - void setData(Param param, std::shared_ptr data, const unsigned bytes, bool is_linear) { std::call_once(m_set_data_flag, [this, param, data, bytes, is_linear]() { @@ -94,7 +90,7 @@ namespace JIT if (is_linear) { kerStream << idx_str << " = idx;\n"; } else { - std::string info_str = std::string("in") + std::to_string(id);; + std::string info_str = std::string("in") + std::to_string(id); kerStream << idx_str << " = " << "(id3 < " << info_str << ".dims[3]) * " << info_str << ".strides[3] * id3 + " diff --git a/src/backend/cuda/JIT/Node.hpp 
b/src/backend/cuda/JIT/Node.hpp index feae91ca98..995d4c7b1d 100644 --- a/src/backend/cuda/JIT/Node.hpp +++ b/src/backend/cuda/JIT/Node.hpp @@ -65,7 +65,7 @@ namespace JIT } } - virtual void genKerName(std::stringstream &kerStream, Node_ids ids) {} + virtual void genKerName (std::stringstream &kerStream, Node_ids ids) {} virtual void genParams (std::stringstream &kerStream, int id, bool is_linear) {} virtual void genOffsets (std::stringstream &kerStream, int id, bool is_linear) {} virtual void genFuncs (std::stringstream &kerStream, Node_ids) {} diff --git a/src/backend/cuda/binary.hpp b/src/backend/cuda/binary.hpp index d5659eb7e6..58d6254453 100644 --- a/src/backend/cuda/binary.hpp +++ b/src/backend/cuda/binary.hpp @@ -108,7 +108,6 @@ BINARY_TYPE_1(bitshiftr) return "__c"#fn"f"; \ } \ }; \ - \ template \ struct BinOp \ { \ @@ -118,7 +117,6 @@ BINARY_TYPE_1(bitshiftr) } \ }; \ - BINARY_TYPE_2(min) BINARY_TYPE_2(max) BINARY_TYPE_2(pow) @@ -183,8 +181,7 @@ Array createBinaryNode(const Array &lhs, const Array &rhs, const af: lhs_node, rhs_node, (int)(op)); - return createNodeArray(odims, JIT::Node_ptr( - reinterpret_cast(node))); + return createNodeArray(odims, JIT::Node_ptr(node)); } } diff --git a/src/backend/cuda/cast.hpp b/src/backend/cuda/cast.hpp index 9f854c6e06..5b1ffe3a96 100644 --- a/src/backend/cuda/cast.hpp +++ b/src/backend/cuda/cast.hpp @@ -57,12 +57,10 @@ CAST_FN(double) } \ }; - CAST_CFN(cfloat) CAST_CFN(cdouble) CAST_CFN(char) - template<> struct CastOp { @@ -72,7 +70,6 @@ struct CastOp } }; - template<> struct CastOp { @@ -91,7 +88,6 @@ struct CastOp } }; - template<> struct CastOp { @@ -115,7 +111,7 @@ struct CastWrapper shortname(true), cop.name(), in_node, af_cast_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } }; diff --git a/src/backend/cuda/complex.hpp b/src/backend/cuda/complex.hpp index 67ba0ebe4e..578d982087 100644 --- a/src/backend/cuda/complex.hpp +++ b/src/backend/cuda/complex.hpp @@ -30,7 +30,7 @@ namespace cuda "__creal", in_node, af_real_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template @@ -42,7 +42,7 @@ namespace cuda "__cimag", in_node, af_imag_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template static const char *abs_name() { return "fabs"; } @@ -58,7 +58,7 @@ namespace cuda abs_name(), in_node, af_abs_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template static const char *conj_name() { return "__noop"; } @@ -74,6 +74,6 @@ namespace cuda conj_name(), in_node, af_conj_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } } diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp index 5c655b2247..6f703765ab 100644 --- a/src/backend/cuda/jit.cpp +++ b/src/backend/cuda/jit.cpp @@ -7,24 +7,25 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include +#include +#include + #include #include #include - #include -#include -#include #include +#include +#include + #include #include #include #include #include #include -#include -#include namespace cuda { @@ -77,14 +78,14 @@ static string getKernelString(const string funcName, const 
char *includeFileStr = jit_cuh; - const char paramTStr[] ="" - "template\n" - "struct Param\n" - "{\n" - " T *ptr;\n" - " dim_t dims[4];\n" - " dim_t strides[4];\n" - "};\n"; + const char paramTStr[] =R"JIT( + template + struct Param + { + T *ptr; + dim_t dims[4]; + dim_t strides[4]; + };)JIT"; std::string typedefStr = "typedef unsigned int uint;\n"; typedefStr += "typedef "; @@ -94,46 +95,48 @@ static string getKernelString(const string funcName, // Common CUDA code // This part of the code does not change with the kernel. - static const char *kernelVoid = "extern \"C\" __global__ void\n"; + static const char *kernelVoid = "extern \"C\" __global__ void\n"; static const char *dimParams = "uint blocks_x, uint blocks_y, uint num_odims"; static const char *blockStart = "{\n\n"; static const char *blockEnd = "\n\n}"; - static const char *linearIndex = "\n" - "uint blockId = blockIdx.y * gridDim.x + blockIdx.x;\n" - "uint threadId = threadIdx.x;\n" - "int idx = blockId * blockDim.x * blockDim.y + threadId;\n" - "if (idx >= outref.dims[3] * outref.strides[3]) return;\n"; - - static const char *generalIndex = "\n" - "uint id0 = 0, id1 = 0, id2 = 0, id3 = 0;\n" - "if (num_odims > 2) {\n" - "id2 = blockIdx.x / blocks_x;\n" - "id0 = blockIdx.x - id2 * blocks_x;\n" - "id0 = threadIdx.x + id0 * blockDim.x;\n" - "if (num_odims > 3) {\n" - "id3 = blockIdx.y / blocks_y;\n" - "id1 = blockIdx.y - id3 * blocks_y;\n" - "id1 = threadIdx.y + id1 * blockDim.y;\n" - "} else {\n" - "id1 = threadIdx.y + blockDim.y * blockIdx.y;\n" - "}\n" - " } else {\n" - "id3 = 0;\n" - "id2 = 0;\n" - "id1 = threadIdx.y + blockDim.y * blockIdx.y;\n" - "id0 = threadIdx.x + blockDim.x * blockIdx.x;\n" - "}\n" - "bool cond = \n" - "id0 < outref.dims[0] && \n" - "id1 < outref.dims[1] && \n" - "id2 < outref.dims[2] && \n" - "id3 < outref.dims[3];\n\n" - "if (!cond) return;\n\n" - "int idx = " - "outref.strides[3] * id3 + outref.strides[2] * id2 + " - "outref.strides[1] * id1 + id0;\n\n"; - + static const char *linearIndex = R"JIT( + uint blockId = blockIdx.y * gridDim.x + blockIdx.x; + uint threadId = threadIdx.x; + int idx = blockId * blockDim.x * blockDim.y + threadId; + if (idx >= outref.dims[3] * outref.strides[3]) return; + )JIT"; + + static const char *generalIndex = R"JIT( + uint id0 = 0, id1 = 0, id2 = 0, id3 = 0; + if (num_odims > 2) { + id2 = blockIdx.x / blocks_x; + id0 = blockIdx.x - id2 * blocks_x; + id0 = threadIdx.x + id0 * blockDim.x; + if (num_odims > 3) { + id3 = blockIdx.y / blocks_y; + id1 = blockIdx.y - id3 * blocks_y; + id1 = threadIdx.y + id1 * blockDim.y; + } else { + id1 = threadIdx.y + blockDim.y * blockIdx.y; + } + } else { + id3 = 0; + id2 = 0; + id1 = threadIdx.y + blockDim.y * blockIdx.y; + id0 = threadIdx.x + blockDim.x * blockIdx.x; + } + + bool cond = id0 < outref.dims[0] && + id1 < outref.dims[1] && + id2 < outref.dims[2] && + id3 < outref.dims[3]; + if (!cond) return; + + int idx = outref.strides[3] * id3 + + outref.strides[2] * id2 + + outref.strides[1] * id1 + id0; + )JIT"; stringstream inParamStream; stringstream outParamStream; @@ -161,7 +164,7 @@ static string getKernelString(const string funcName, // Generate output parameters outParamStream << "Param<" << full_nodes[id]->getTypeStr() << "> out" << id << ", \n"; // Generate code to write the output - outWriteStream << "out" << id << ".ptr[idx] = " << "val" << id << ";\n"; + outWriteStream << "out" << id << ".ptr[idx] = val" << id << ";\n"; } // Put various blocks into a single stream @@ -225,18 +228,18 @@ typedef struct { #endif #ifndef NDEBUG 
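// Editor's aside on the R"JIT(...)JIT" rewrite above: a raw string literal
// keeps newlines and quotes verbatim, so the kernel snippets read as real code
// rather than chains of "...\n" fragments. A minimal equivalence sketch:
//   const char *a = "if (!cond) return;\n";
//   const char *b = R"JIT(if (!cond) return;
//   )JIT";
// Both denote the same text, except that the raw string also preserves the
// source indentation, which is why the generated kernels now carry harmless
// leading whitespace.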
-#define NVRTC_CHECK(fn) do { \ - nvrtcResult res = fn; \ - if (res == NVRTC_SUCCESS) break; \ - size_t logSize; \ - nvrtcGetProgramLogSize(prog, &logSize); \ - unique_ptr log(new char[logSize]); \ - char *logptr = log.get(); \ - nvrtcGetProgramLog(prog, logptr); \ - logptr[logSize] = '\x0'; \ - printf("%s\n", logptr); \ - AF_ERROR("NVRTC ERROR", \ - AF_ERR_INTERNAL); \ +#define NVRTC_CHECK(fn) do { \ + nvrtcResult res = fn; \ + if (res == NVRTC_SUCCESS) break; \ + size_t logSize; \ + nvrtcGetProgramLogSize(prog, &logSize); \ + unique_ptr log(new char[logSize +1]); \ + char *logptr = log.get(); \ + nvrtcGetProgramLog(prog, logptr); \ + logptr[logSize] = '\x0'; \ + printf("%s\n", logptr); \ + AF_ERROR("NVRTC ERROR", \ + AF_ERR_INTERNAL); \ } while(0) #else #define NVRTC_CHECK(fn) do { \ @@ -252,14 +255,16 @@ typedef struct { } while(0) #endif -unique_ptr compileToPTX(const char *ker_name, string jit_ker, size_t *ptx_size) +std::vector compileToPTX(const char *ker_name, string jit_ker) { nvrtcProgram prog; + size_t ptx_size; + std::vector ptx; NVRTC_CHECK(nvrtcCreateProgram(&prog, jit_ker.c_str(), ker_name, 0, NULL, NULL)); NVRTC_CHECK(nvrtcCompileProgram(prog, 0, NULL)); - NVRTC_CHECK(nvrtcGetPTXSize(prog, ptx_size)); - unique_ptr ptx(new char[*ptx_size]); - NVRTC_CHECK(nvrtcGetPTX(prog, ptx.get())); + NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size)); + ptx.resize(ptx_size); + NVRTC_CHECK(nvrtcGetPTX(prog, ptx.data())); return ptx; } @@ -271,8 +276,7 @@ static kc_entry_t compileKernel(const char *ker_name, string jit_ker) char linkInfo[linkLogSize] = {0}; char linkError[linkLogSize] = {0}; - size_t ptx_size; - auto ptx = compileToPTX(ker_name, jit_ker, &ptx_size); + auto ptx = compileToPTX(ker_name, jit_ker); CUlinkState linkState; CUjit_option linkOptions[] = { @@ -292,8 +296,8 @@ static kc_entry_t compileKernel(const char *ker_name, string jit_ker) }; CU_LINK_CHECK(cuLinkCreate(5, linkOptions, linkOptionValues, &linkState)); - CU_LINK_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)ptx.get(), - ptx_size, ker_name, 0, NULL, NULL)); + CU_LINK_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)ptx.data(), + ptx.size(), ker_name, 0, NULL, NULL)); void *cubin; size_t cubinSize; diff --git a/src/backend/cuda/kernel/jit.cuh b/src/backend/cuda/kernel/jit.cuh index 0868d1f3df..830a9e58c4 100644 --- a/src/backend/cuda/kernel/jit.cuh +++ b/src/backend/cuda/kernel/jit.cuh @@ -177,7 +177,7 @@ __device__ cdouble __cdiv(cdouble lhs, cdouble rhs) double rhs_x = inv_rhs_abs * rhs.x; double rhs_y = inv_rhs_abs * rhs.y; cdouble out = {lhs.x * rhs_x + lhs.y * rhs_y, - lhs.y * rhs_x - lhs.x * rhs_y}; + lhs.y * rhs_x - lhs.x * rhs_y}; out.x *= inv_rhs_abs; out.y *= inv_rhs_abs; return out; diff --git a/src/backend/cuda/scalar.hpp b/src/backend/cuda/scalar.hpp index b2bd1606cc..46fca748a8 100644 --- a/src/backend/cuda/scalar.hpp +++ b/src/backend/cuda/scalar.hpp @@ -18,8 +18,7 @@ namespace cuda template Array createScalarNode(const dim4 &size, const T val) { - JIT::ScalarNode *node = new JIT::ScalarNode(val); - return createNodeArray(size, JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(size, JIT::Node_ptr(new JIT::ScalarNode(val))); } } diff --git a/src/backend/cuda/types.cpp b/src/backend/cuda/types.cpp index b69eb661e7..9d85037ba6 100644 --- a/src/backend/cuda/types.cpp +++ b/src/backend/cuda/types.cpp @@ -15,7 +15,7 @@ namespace cuda { - template const char *shortname(bool caps) { return caps ? "Q" : "q"; } + template const char *shortname(bool caps) { return caps ? 
"Q" : "q"; } template<> const char *shortname(bool caps) { return caps ? "S" : "s"; } template<> const char *shortname(bool caps) { return caps ? "D" : "d"; } template<> const char *shortname(bool caps) { return caps ? "C" : "c"; } diff --git a/src/backend/cuda/types.hpp b/src/backend/cuda/types.hpp index b12200bdd8..3376e84bb5 100644 --- a/src/backend/cuda/types.hpp +++ b/src/backend/cuda/types.hpp @@ -24,6 +24,6 @@ template struct is_complex { static const bool value = fals template<> struct is_complex { static const bool value = true; }; template<> struct is_complex { static const bool value = true; }; -template const char *shortname(bool caps = true); +template const char *shortname(bool caps = true); template const char *getFullName(); } diff --git a/src/backend/cuda/unary.hpp b/src/backend/cuda/unary.hpp index 13b9690cef..ed3d81944a 100644 --- a/src/backend/cuda/unary.hpp +++ b/src/backend/cuda/unary.hpp @@ -80,7 +80,7 @@ Array unaryOp(const Array &in) unaryName(), in_node, op); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template @@ -93,7 +93,7 @@ Array checkOp(const Array &in) unaryName(), in_node, op); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } } diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index 1ab28a8e31..ea061e21e3 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -30,8 +30,8 @@ namespace opencl template Node_ptr bufferNodePtr() { - return Node_ptr(reinterpret_cast(new BufferNode(dtype_traits::getName(), - shortname(true)))); + return Node_ptr(new BufferNode(dtype_traits::getName(), + shortname(true))); } template diff --git a/src/backend/opencl/JIT/BufferNode.hpp b/src/backend/opencl/JIT/BufferNode.hpp index 92a2c32697..c55206b472 100644 --- a/src/backend/opencl/JIT/BufferNode.hpp +++ b/src/backend/opencl/JIT/BufferNode.hpp @@ -25,8 +25,8 @@ namespace JIT std::shared_ptr m_data; KParam m_info; unsigned m_bytes; - bool m_linear_buffer; std::once_flag m_set_data_flag; + bool m_linear_buffer; public: @@ -38,10 +38,6 @@ namespace JIT bool isBuffer() { return true; } - ~BufferNode() - { - } - void setData(KParam info, std::shared_ptr data, const unsigned bytes, bool is_linear) { std::call_once(m_set_data_flag, [this, info, data, bytes, is_linear]() { @@ -92,7 +88,7 @@ namespace JIT void genOffsets(std::stringstream &kerStream, int id, bool is_linear) { std::string idx_str = std::string("int idx") + std::to_string(id); - std::string info_str = std::string("iInfo") + std::to_string(id);; + std::string info_str = std::string("iInfo") + std::to_string(id); if (!is_linear) { kerStream << idx_str << " = " @@ -122,7 +118,6 @@ namespace JIT len++; buf_count++; bytes += m_bytes; - return; } }; diff --git a/src/backend/opencl/binary.hpp b/src/backend/opencl/binary.hpp index 11493a5966..9a653a8302 100644 --- a/src/backend/opencl/binary.hpp +++ b/src/backend/opencl/binary.hpp @@ -183,8 +183,7 @@ Array createBinaryNode(const Array &lhs, const Array &rhs, const af: lhs_node, rhs_node, (int)(op)); - return createNodeArray(odims, JIT::Node_ptr( - reinterpret_cast(node))); + return createNodeArray(odims, JIT::Node_ptr(node)); } } diff --git a/src/backend/opencl/cast.hpp b/src/backend/opencl/cast.hpp index bddbd5ad34..3df062053f 100644 --- a/src/backend/opencl/cast.hpp +++ b/src/backend/opencl/cast.hpp @@ -55,12 +55,10 @@ CAST_FN(double) } \ }; - 
CAST_CFN(cfloat) CAST_CFN(cdouble) CAST_CFN(char) - template<> struct CastOp { @@ -70,7 +68,6 @@ struct CastOp } }; - template<> struct CastOp { @@ -89,7 +86,6 @@ struct CastOp } }; - template<> struct CastOp { @@ -113,7 +109,7 @@ struct CastWrapper shortname(true), cop.name(), in_node, af_cast_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } }; diff --git a/src/backend/opencl/complex.hpp b/src/backend/opencl/complex.hpp index 0838370c3c..e72850675d 100644 --- a/src/backend/opencl/complex.hpp +++ b/src/backend/opencl/complex.hpp @@ -30,7 +30,7 @@ namespace opencl "__creal", in_node, af_real_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template @@ -42,7 +42,7 @@ namespace opencl "__cimag", in_node, af_imag_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template static const char *abs_name() { return "fabs"; } @@ -58,7 +58,7 @@ namespace opencl abs_name(), in_node, af_abs_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template static const char *conj_name() { return "__noop"; } @@ -74,6 +74,6 @@ namespace opencl conj_name(), in_node, af_conj_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } } diff --git a/src/backend/opencl/jit.cpp b/src/backend/opencl/jit.cpp index 5442eba7f4..7f2c1f4644 100644 --- a/src/backend/opencl/jit.cpp +++ b/src/backend/opencl/jit.cpp @@ -80,46 +80,47 @@ static string getKernelString(const string funcName, // Common OpenCL code // This part of the code does not change with the kernel. 
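// Editor's sketch of why the reinterpret_casts above could simply be dropped
// (assuming the JIT node classes derive directly from JIT::Node, as the code
// implies): a derived-class pointer converts to its base implicitly and safely,
//   struct Node { virtual ~Node() {} };
//   struct UnaryNode : Node { /* ... */ };
//   std::shared_ptr<Node> p(new UnaryNode());  // implicit upcast, no cast
// whereas reinterpret_cast would silently produce a wrong pointer if the
// conversion ever needed an address adjustment (e.g. multiple inheritance).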
- static const char *kernelVoid = "__kernel void\n"; + static const char *kernelVoid = "__kernel void\n"; static const char *dimParams = "KParam oInfo, uint groups_0, uint groups_1, uint num_odims"; static const char *blockStart = "{\n\n"; static const char *blockEnd = "\n\n}"; - static const char *linearIndex = "\n" - "uint groupId = get_group_id(1) * get_num_groups(0) + get_group_id(0);\n" - "uint threadId = get_local_id(0);\n" - "int idx = groupId * get_local_size(0) * get_local_size(1) + threadId;\n" - "if (idx >= oInfo.dims[3] * oInfo.strides[3]) return;\n"; - - static const char *generalIndex = "\n" - "uint id0 = 0, id1 = 0, id2 = 0, id3 = 0;\n" - "if (num_odims > 2) {\n" - "id2 = get_group_id(0) / groups_0;\n" - "id0 = get_group_id(0) - id2 * groups_0;\n" - "id0 = get_local_id(0) + id0 * get_local_size(0);\n" - "if (num_odims > 3) {\n" - "id3 = get_group_id(1) / groups_1;\n" - "id1 = get_group_id(1) - id3 * groups_1;\n" - "id1 = get_local_id(1) + id1 * get_local_size(1);\n" - "} else {\n" - "id1 = get_global_id(1);\n" - "}\n" - " } else {\n" - "id3 = 0;\n" - "id2 = 0;\n" - "id1 = get_global_id(1);\n" - "id0 = get_global_id(0);\n" - "}\n" - "bool cond = \n" - "id0 < oInfo.dims[0] && \n" - "id1 < oInfo.dims[1] && \n" - "id2 < oInfo.dims[2] && \n" - "id3 < oInfo.dims[3];\n\n" - "if (!cond) return;\n\n" - "int idx = " - "oInfo.strides[3] * id3 + oInfo.strides[2] * id2 + " - "oInfo.strides[1] * id1 + id0 + oInfo.offset;\n\n"; - + static const char *linearIndex = R"JIT( + uint groupId = get_group_id(1) * get_num_groups(0) + get_group_id(0); + uint threadId = get_local_id(0); + int idx = groupId * get_local_size(0) * get_local_size(1) + threadId; + if (idx >= oInfo.dims[3] * oInfo.strides[3]) return; + )JIT"; + + static const char *generalIndex = R"JIT( + uint id0 = 0, id1 = 0, id2 = 0, id3 = 0; + if (num_odims > 2) { + id2 = get_group_id(0) / groups_0; + id0 = get_group_id(0) - id2 * groups_0; + id0 = get_local_id(0) + id0 * get_local_size(0); + if (num_odims > 3) { + id3 = get_group_id(1) / groups_1; + id1 = get_group_id(1) - id3 * groups_1; + id1 = get_local_id(1) + id1 * get_local_size(1); + } else { + id1 = get_global_id(1); + } + } else { + id3 = 0; + id2 = 0; + id1 = get_global_id(1); + id0 = get_global_id(0); + } + bool cond = id0 < oInfo.dims[0] && + id1 < oInfo.dims[1] && + id2 < oInfo.dims[2] && + id3 < oInfo.dims[3]; + if (!cond) return; + int idx = oInfo.strides[3] * id3 + + oInfo.strides[2] * id2 + + oInfo.strides[1] * id1 + + id0 + oInfo.offset; + )JIT"; stringstream inParamStream; stringstream outParamStream; @@ -143,7 +144,7 @@ static string getKernelString(const string funcName, // Generate output parameters outParamStream << "__global " << full_nodes[id]->getTypeStr() << " *out" << id << ", \n"; // Generate code to write the output - outWriteStream << "out" << id << "[idx] = " << "val" << id << ";\n"; + outWriteStream << "out" << id << "[idx] = val" << id << ";\n"; } // Put various blocks into a single stream diff --git a/src/backend/opencl/kernel/jit.cl b/src/backend/opencl/kernel/jit.cl index 183e52be81..3e797ac050 100644 --- a/src/backend/opencl/kernel/jit.cl +++ b/src/backend/opencl/kernel/jit.cl @@ -180,7 +180,7 @@ double2 __cdiv(double2 lhs, double2 rhs) double rhs_x = inv_rhs_abs * rhs.x; double rhs_y = inv_rhs_abs * rhs.y; double2 out = {lhs.x * rhs_x + lhs.y * rhs_y, - lhs.y * rhs_x - lhs.x * rhs_y}; + lhs.y * rhs_x - lhs.x * rhs_y}; out.x *= inv_rhs_abs; out.y *= inv_rhs_abs; return out; diff --git a/src/backend/opencl/scalar.hpp b/src/backend/opencl/scalar.hpp 
index b6abf47ff9..fbd96b3ecc 100644 --- a/src/backend/opencl/scalar.hpp +++ b/src/backend/opencl/scalar.hpp @@ -18,8 +18,7 @@ namespace opencl template Array createScalarNode(const dim4 &size, const T val) { - JIT::ScalarNode *node = new JIT::ScalarNode(val); - return createNodeArray(size, JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(size, JIT::Node_ptr(new JIT::ScalarNode(val))); } } diff --git a/src/backend/opencl/types.cpp b/src/backend/opencl/types.cpp index 6581b047db..13744e444b 100644 --- a/src/backend/opencl/types.cpp +++ b/src/backend/opencl/types.cpp @@ -13,7 +13,7 @@ namespace opencl { - template const char *shortname(bool caps) { return caps ? "X" : "x"; } + template const char *shortname(bool caps) { return caps ? "X" : "x"; } template<> const char *shortname(bool caps) { return caps ? "S" : "s"; } template<> const char *shortname(bool caps) { return caps ? "D" : "d"; } diff --git a/src/backend/opencl/types.hpp b/src/backend/opencl/types.hpp index 4490f2bb36..277ba2c07e 100644 --- a/src/backend/opencl/types.hpp +++ b/src/backend/opencl/types.hpp @@ -33,7 +33,7 @@ template struct is_complex { static const bool value = fals template<> struct is_complex { static const bool value = true; }; template<> struct is_complex { static const bool value = true; }; -template const char *shortname(bool caps=false); +template const char *shortname(bool caps=false); template struct ToNumStr diff --git a/src/backend/opencl/unary.hpp b/src/backend/opencl/unary.hpp index 1e363d7dcb..66f775da5d 100644 --- a/src/backend/opencl/unary.hpp +++ b/src/backend/opencl/unary.hpp @@ -80,7 +80,7 @@ Array unaryOp(const Array &in) unaryName(), in_node, op); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template @@ -93,7 +93,7 @@ Array checkOp(const Array &in) unaryName(), in_node, op); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } } From bd5667a664cc6eb917715aa2c9638b8151f32e1a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 20 Jun 2017 11:39:20 -0700 Subject: [PATCH 4/4] Use std::string instead of char * for storing text of the jit.cuh file --- src/backend/cuda/jit.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp index 6f703765ab..e8d960e02c 100644 --- a/src/backend/cuda/jit.cpp +++ b/src/backend/cuda/jit.cpp @@ -76,9 +76,9 @@ static string getKernelString(const string funcName, bool is_linear) { - const char *includeFileStr = jit_cuh; + const std::string includeFileStr(jit_cuh, jit_cuh_len); - const char paramTStr[] =R"JIT( + const std::string paramTStr = R"JIT( template struct Param {
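// Editor's note on the patch above: constructing includeFileStr from an
// explicit (pointer, length) pair means the embedded jit.cuh blob no longer
// has to be NUL-terminated. A minimal sketch, assuming the declarations that
// CL_KERNEL_TO_H generates look like:
//   extern const char jit_cuh[];
//   extern const unsigned jit_cuh_len;
//   const std::string includeFileStr(jit_cuh, jit_cuh_len);  // copies exactly
//                                                            // jit_cuh_len bytes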