Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/backend/cuda/Array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ namespace cuda
// Creates a Node_ptr that owns a newly allocated BufferNode<T>.
// NOTE(review): this is a diff view with the +/- markers stripped, so the
// removed body (irname/afShortName with a reinterpret_cast to Node *) and the
// added body (getFullName/shortname(true), no cast) both appear below.
template<typename T>
Node_ptr bufferNodePtr()
{
Node_ptr node(reinterpret_cast<Node *>(new BufferNode<T>(irname<T>(), afShortName<T>())));
return node;
return Node_ptr(new BufferNode<T>(getFullName<T>(),
shortname<T>(true)));
}

template<typename T>
Expand Down
128 changes: 28 additions & 100 deletions src/backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,6 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
FIND_PACKAGE(CUDA 7.0 REQUIRED)

INCLUDE(CLKernelToH)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should rename this module (CLKernelToH) at some point.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not doing it now :(

INCLUDE(FindNVVM)

OPTION(USE_LIBDEVICE "Use libdevice for CUDA JIT" ON)
SET(CUDA_LIBDEVICE_DIR "${CUDA_NVVM_HOME}/libdevice" CACHE PATH "Path where libdevice compute files are located" FORCE)

MARK_AS_ADVANCED(
CUDA_BUILD_CUBIN
Expand Down Expand Up @@ -168,14 +164,10 @@ FILE(GLOB jit_sources
FILE(GLOB kernel_headers
"kernel/*.hpp")

FILE(GLOB ptx_sources
"JIT/*.cu")

LIST(SORT cuda_headers)
LIST(SORT cuda_sources)
LIST(SORT jit_sources)
LIST(SORT kernel_headers)
LIST(SORT ptx_sources)

SOURCE_GROUP(backend\\cuda\\Headers FILES ${cuda_headers})
SOURCE_GROUP(backend\\cuda\\Sources FILES ${cuda_sources})
Expand Down Expand Up @@ -219,10 +211,23 @@ FILE(GLOB cpp_sources

LIST(SORT cpp_sources)

# Output directory name for the header generated from the JIT kernel source.
SET(jit_kernel_headers
"kernel_headers")

# Pass kernel/jit.cuh through CL_KERNEL_TO_H: output goes under
# ${jit_kernel_headers} with an "hpp" extension, in namespace "cuda".
# The created targets are returned in jit_kernel_targets.
# NOTE(review): presumably this embeds the file's text in a header so it can be
# compiled at runtime -- confirm against the CLKernelToH module.
FILE(GLOB jit_src "kernel/jit.cuh")
CL_KERNEL_TO_H(
SOURCES ${jit_src}
VARNAME jit_files
EXTENSION "hpp"
OUTPUT_DIR ${jit_kernel_headers}
TARGETS jit_kernel_targets
NAMESPACE "cuda"
)


SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources})

INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/kernel/thrust_sort_by_key/CMakeLists.txt")

INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_by_key/CMakeLists.txt")

LIST(LENGTH COMPUTE_VERSIONS COMPUTE_COUNT)
Expand All @@ -242,89 +247,8 @@ SET(OLD_CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS})
IF(${CUDA_VERSION_MAJOR} GREATER 7) # CUDA 8 or newer
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --keep-device-functions")
ENDIF()
CUDA_COMPILE_PTX(ptx_files ${ptx_sources})
SET(CUDA_NVCC_FLAGS ${OLD_CUDA_NVCC_FLAGS})

set(cuda_ptx "")
foreach(ptx_src_file ${ptx_sources})

get_filename_component(_name "${ptx_src_file}" NAME_WE)

# CUDA_COMPILE_PTX from CMake 3.7 has new features that require this change
# TODO Fix this with a more complete solution
IF(CMAKE_VERSION VERSION_LESS 3.7) # Before 3.7
SET(NAME_APPEND "")
ELSE(CMAKE_VERSION VERSION_LESS 3.7) # 3.7 and newer
SET(NAME_APPEND "_1")
ENDIF(CMAKE_VERSION VERSION_LESS 3.7)

set(_gen_file_name
"${PROJECT_BINARY_DIR}/src/backend/cuda/cuda_compile_ptx${NAME_APPEND}_generated_${_name}.cu.ptx")
set(_out_file_name
"${PROJECT_BINARY_DIR}/src/backend/cuda/${_name}.ptx")

ADD_CUSTOM_COMMAND(
OUTPUT "${_out_file_name}"
DEPENDS "${_gen_file_name}"
COMMAND ${CMAKE_COMMAND} -E copy "${_gen_file_name}" "${_out_file_name}")

list(APPEND cuda_ptx "${_out_file_name}")
endforeach()

SET( ptx_headers
"ptx_headers")

CL_KERNEL_TO_H(
SOURCES ${cuda_ptx}
VARNAME kernel_files
EXTENSION "hpp"
OUTPUT_DIR ${ptx_headers}
TARGETS ptx_targets
NAMESPACE "cuda"
NULLTERM TRUE
)

SET(libdevice_bc "")
IF (USE_LIBDEVICE)
SET(libdevice_computes "")
LIST(APPEND libdevice_computes "20" "30" "35" "50")
FOREACH(libdevice_compute ${libdevice_computes})
SET(_libdevice_bc_file "${CUDA_LIBDEVICE_DIR}/libdevice.compute_${libdevice_compute}.10.bc")
SET(_libdevice_bc_copy "${PROJECT_BINARY_DIR}/src/backend/cuda/compute_${libdevice_compute}.bc")
IF (EXISTS ${_libdevice_bc_file})
ADD_CUSTOM_COMMAND(
OUTPUT "${_libdevice_bc_copy}"
DEPENDS "${_libdevice_bc_file}"
COMMAND ${CMAKE_COMMAND} -E copy "${_libdevice_bc_file}" "${_libdevice_bc_copy}")
LIST(APPEND libdevice_bc ${_libdevice_bc_copy})
ADD_DEFINITIONS(-D"__LIBDEVICE_COMPUTE_${libdevice_compute}")
ENDIF()
ENDFOREACH()
ENDIF()

LIST(LENGTH libdevice_bc libdevice_bc_len)

IF (${libdevice_bc_len} GREATER 0)

SET(libdevice_headers
"libdevice_headers")

CL_KERNEL_TO_H(
SOURCES ${libdevice_bc}
VARNAME libdevice_files
EXTENSION "hpp"
OUTPUT_DIR ${libdevice_headers}
TARGETS libdevice_targets
NAMESPACE "cuda"
BINARY TRUE
)

MESSAGE(STATUS "LIBDEVICE found.")
ADD_DEFINITIONS(-DUSE_LIBDEVICE)
ELSE()
MESSAGE(STATUS "LIBDEVICE not found on system. CUDA JIT may be slower")
ENDIF()

IF("${APPLE}")
ADD_DEFINITIONS(-D__STRICT_ANSI__)
ELSE()
Expand Down Expand Up @@ -407,10 +331,11 @@ ENDIF(NOT CUDA_CUDA_LIBRARY)

SET(CUDA_ADD_LIBRARY_OPTIONS "")
IF(UNIX)
# These flags enable C++11 and disable invalid offsetof warning
SET(CUDA_ADD_LIBRARY_OPTIONS "-std=c++11 -Xcudafe \"--diag_suppress=1427\"")
# These flags enable C++11 and disable invalid offsetof warning
SET(CUDA_ADD_LIBRARY_OPTIONS "-std=c++11 -Xcudafe \"--diag_suppress=1427\"")
ENDIF(UNIX)


MY_CUDA_ADD_LIBRARY(afcuda SHARED
${cuda_headers}
${cuda_sources}
Expand All @@ -425,23 +350,26 @@ MY_CUDA_ADD_LIBRARY(afcuda SHARED
${scan_by_key_sources}
OPTIONS ${CUDA_GENERATE_CODE} ${CUDA_ADD_LIBRARY_OPTIONS})

ADD_DEPENDENCIES(afcuda ${ptx_targets})

IF (${libdevice_bc_len} GREATER 0)
ADD_DEPENDENCIES(afcuda ${libdevice_targets})
ENDIF()
# Locate the NVRTC library and store its path in CUDA_nvrtc_LIBRARY.
# The search is restricted to CUDA_TOOLKIT_ROOT_DIR (with the lib64, lib/x64
# and lib suffixes); NO_DEFAULT_PATH disables the default search locations.
FIND_LIBRARY (
CUDA_nvrtc_LIBRARY
NAMES "nvrtc"
PATHS ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES "lib64" "lib/x64" "lib"
DOC "CUDA NVRTC Library"
NO_DEFAULT_PATH
)

TARGET_LINK_LIBRARIES(afcuda
PRIVATE ${CUDA_CUBLAS_LIBRARIES}
TARGET_LINK_LIBRARIES(afcuda PRIVATE ${CUDA_CUBLAS_LIBRARIES}
PRIVATE ${CUDA_LIBRARIES}
PRIVATE ${FreeImage_LIBS}
PRIVATE ${CUDA_CUFFT_LIBRARIES}
PRIVATE ${CUDA_cusparse_LIBRARY}
PRIVATE ${CUDA_cusolver_LIBRARY}
PRIVATE ${CUDA_nvvm_LIBRARY}
PRIVATE ${CUDA_nvrtc_LIBRARY}
PRIVATE ${CUDA_CUDA_LIBRARY}
)

ADD_DEPENDENCIES(afcuda ${jit_kernel_targets})
LIST(LENGTH GRAPHICS_DEPENDENCIES GRAPHICS_DEPENDENCIES_LEN)
IF(${GRAPHICS_DEPENDENCIES_LEN} GREATER 0)
ADD_DEPENDENCIES(afcuda ${GRAPHICS_DEPENDENCIES})
Expand Down
64 changes: 15 additions & 49 deletions src/backend/cuda/JIT/BinaryNode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,68 +20,34 @@ namespace JIT
// JIT node with two children (lhs, rhs) that writes a two-operand operation
// into the generated kernel text.
// NOTE(review): this is a diff view with the +/- markers stripped. Removed and
// added lines appear together, so members, constructor parameters and method
// bodies below are shown in both their old and new form.
class BinaryNode : public Node
{
private:
// m_op_str: operation text written by genFuncs.
// m_op: numeric operation id written into the kernel name by genKerName.
// m_call_type: selects the emission form in the older genFuncs body.
const std::string m_op_str;
const int m_op;
const int m_call_type;
std::string m_op_str;
int m_op;

public:
// Builds the node from its two children. The height passed to Node is one
// more than the larger of the two children's getHeight() values.
BinaryNode(const char *out_type_str, const char *name_str,
const std::string &op_str,
Node_ptr lhs, Node_ptr rhs, int op, int call_type)
const char *op_str,
Node_ptr lhs, Node_ptr rhs, int op)
: Node(out_type_str, name_str, std::max(lhs->getHeight(), rhs->getHeight()) + 1, {lhs, rhs}),
m_op_str(op_str),
m_op(op),
m_call_type(call_type)
m_op(op)
{
}

// Appends this node's contribution to the kernel name: "_" followed by
// zero-padded fields for m_op, the two child ids and this node's id.
// The older lines use width-2 hex fields; the newer lines use width-3 decimal.
void genKerName(std::stringstream &kerStream, Node_ids ids)
{
// Make the hex representation of enum part of the Kernel name
kerStream << "_" << std::setw(2) << std::setfill('0') << std::hex << m_op;
kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.child_ids[0];
kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.child_ids[1];
kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec;
// Make the dec representation of enum part of the Kernel name
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These aren't templated so it would be better if they were implemented in the cpp file.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll do another pass about reorganizing the JIT nodes at a later point (including trying to use the same code for CUDA and OpenCL JIT). Can we leave this be for now?

kerStream << "_" << std::setw(3) << std::setfill('0') << std::dec << m_op;
kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.child_ids[0];
kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.child_ids[1];
kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec;
}

// Writes the statement that defines this node's value from its two children's
// values. Two signatures are shown: the older one also takes declStrs and
// is_linear, the newer one takes only the stream and the ids.
void genFuncs(std::stringstream &kerStream, str_map_t &declStrs, Node_ids ids, bool is_linear)
void genFuncs(std::stringstream &kerStream, Node_ids ids)
{
// Older body: emits "%val<id> = ..." text, choosing between a declared
// function call, a plain two-operand instruction, and a comparison whose
// i1 result is zero-extended to i8, based on m_call_type.
if (m_call_type == 0) {
std::stringstream declStream;
declStream << "declare " << m_type_str << " " << m_op_str
<< "(" << m_children[0]->getTypeStr() << " , "
<< m_children[1]->getTypeStr() << ")\n";
declStrs[declStream.str()] = true;

kerStream << "%val" << ids.id << " = call "
<< m_type_str << " "
<< m_op_str << "("
<< m_children[0]->getTypeStr() << " "
<< "%val" << ids.child_ids[0] << ", "
<< m_children[1]->getTypeStr() << " "
<< "%val" << ids.child_ids[1] << ")\n";

} else {
if (m_call_type == 1) {
// arithmetic operations
kerStream << "%val" << ids.id << " = "
<< m_op_str << " "
<< m_type_str << " "
<< "%val" << ids.child_ids[0] << ", "
<< "%val" << ids.child_ids[1] << "\n";
} else {
// logical operators
kerStream << "%tmp" << ids.id << " = "
<< m_op_str << " "
<< m_children[0]->getTypeStr() << " "
<< "%val" << ids.child_ids[0] << ", "
<< "%val" << ids.child_ids[1] << "\n";

kerStream << "%val" << ids.id << " = "
<< "zext i1 %tmp" << ids.id << " to i8\n";

}
}
// Newer body: emits "<type> val<id> = <op>(val<lhs>, val<rhs>);".
kerStream << m_type_str << " val" << ids.id << " = "
<< m_op_str << "(val" << ids.child_ids[0]
<< ", val" << ids.child_ids[1] << ");"
<< "\n";
}
};

Expand Down
Loading