From 8d6f98123b0746a08459166d2f76747ed091effc Mon Sep 17 00:00:00 2001
From: Pavan Yalamanchili
Date: Mon, 19 Jun 2017 11:15:53 -0700
Subject: [PATCH 1/4] Convert CUDA JIT to use nvrtc instead of nvvm

---
 src/backend/cuda/Array.cpp          |   4 +-
 src/backend/cuda/CMakeLists.txt     | 128 ++----
 src/backend/cuda/JIT/BinaryNode.hpp |  64 +--
 src/backend/cuda/JIT/BufferNode.hpp | 136 ++-----
 src/backend/cuda/JIT/Node.hpp       |  26 +-
 src/backend/cuda/JIT/ScalarNode.hpp |  28 +-
 src/backend/cuda/JIT/UnaryNode.hpp  |  54 +--
 src/backend/cuda/JIT/arith.cu       |  44 ---
 src/backend/cuda/JIT/cast.cu        | 104 -----
 src/backend/cuda/JIT/exp.cu         | 100 -----
 src/backend/cuda/JIT/hyper.cu       |  41 --
 src/backend/cuda/JIT/logic.cu       | 109 -----
 src/backend/cuda/JIT/numeric.cu     | 191 ---------
 src/backend/cuda/JIT/trig.cu        |  62 ---
 src/backend/cuda/arith.hpp          |   1 -
 src/backend/cuda/binary.hpp         | 356 +++++++----------
 src/backend/cuda/cast.hpp           |  86 +++-
 src/backend/cuda/complex.hpp        |  72 +---
 src/backend/cuda/jit.cpp            | 590 ++++++++--------------------
 src/backend/cuda/kernel/jit.cuh     | 205 ++++++++++
 src/backend/cuda/logic.hpp          |   3 +-
 src/backend/cuda/types.cpp          |  99 ++---
 src/backend/cuda/types.hpp          |   6 +-
 src/backend/cuda/unary.hpp          | 244 ++++-------
 24 files changed, 837 insertions(+), 1916 deletions(-)
 delete mode 100644 src/backend/cuda/JIT/arith.cu
 delete mode 100644 src/backend/cuda/JIT/cast.cu
 delete mode 100644 src/backend/cuda/JIT/exp.cu
 delete mode 100644 src/backend/cuda/JIT/hyper.cu
 delete mode 100644 src/backend/cuda/JIT/logic.cu
 delete mode 100644 src/backend/cuda/JIT/numeric.cu
 delete mode 100644 src/backend/cuda/JIT/trig.cu
 create mode 100644 src/backend/cuda/kernel/jit.cuh

diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp
index 495f35807e..da98060331 100644
--- a/src/backend/cuda/Array.cpp
+++ b/src/backend/cuda/Array.cpp
@@ -33,8 +33,8 @@ namespace cuda
 template<typename T>
 Node_ptr bufferNodePtr()
 {
-    Node_ptr node(reinterpret_cast<Node *>(new BufferNode<T>(irname<T>(), afShortName<T>())));
-    return node;
+    return Node_ptr(reinterpret_cast<Node *>(new BufferNode<T>(getFullName<T>(),
+                                                               shortname<T>(true))));
 }
 
 template<typename T>
diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
index eb4b27b44b..e48dda4446 100644
--- a/src/backend/cuda/CMakeLists.txt
+++ b/src/backend/cuda/CMakeLists.txt
@@ -3,10 +3,6 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 FIND_PACKAGE(CUDA 7.0 REQUIRED)
 
 INCLUDE(CLKernelToH)
-INCLUDE(FindNVVM)
-
-OPTION(USE_LIBDEVICE "Use libdevice for CUDA JIT" ON)
-SET(CUDA_LIBDEVICE_DIR "${CUDA_NVVM_HOME}/libdevice" CACHE PATH "Path where libdevice compute files are located" FORCE)
 
 MARK_AS_ADVANCED(
     CUDA_BUILD_CUBIN
@@ -168,14 +164,10 @@ FILE(GLOB jit_sources
 FILE(GLOB kernel_headers
     "kernel/*.hpp")
 
-FILE(GLOB ptx_sources
-    "JIT/*.cu")
-
 LIST(SORT cuda_headers)
 LIST(SORT cuda_sources)
 LIST(SORT jit_sources)
 LIST(SORT kernel_headers)
-LIST(SORT ptx_sources)
 
 SOURCE_GROUP(backend\\cuda\\Headers FILES ${cuda_headers})
 SOURCE_GROUP(backend\\cuda\\Sources FILES ${cuda_sources})
@@ -219,10 +211,23 @@ FILE(GLOB cpp_sources
 
 LIST(SORT cpp_sources)
 
+SET(jit_kernel_headers
+    "kernel_headers")
+
+FILE(GLOB jit_src "kernel/jit.cuh")
+CL_KERNEL_TO_H(
+    SOURCES ${jit_src}
+    VARNAME jit_files
+    EXTENSION "hpp"
+    OUTPUT_DIR ${jit_kernel_headers}
+    TARGETS jit_kernel_targets
+    NAMESPACE "cuda"
+    )
+
+
 SOURCE_GROUP(api\\cpp\\Sources FILES ${cpp_sources})
 
 INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/kernel/thrust_sort_by_key/CMakeLists.txt")
-
 INCLUDE("${CMAKE_CURRENT_SOURCE_DIR}/kernel/scan_by_key/CMakeLists.txt")
 
 LIST(LENGTH COMPUTE_VERSIONS COMPUTE_COUNT)
@@ 
-242,89 +247,8 @@ SET(OLD_CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}) IF(${CUDA_VERSION_MAJOR} GREATER 7) # CUDA 8 or newer SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --keep-device-functions") ENDIF() -CUDA_COMPILE_PTX(ptx_files ${ptx_sources}) SET(CUDA_NVCC_FLAGS ${OLD_CUDA_NVCC_FLAGS}) -set(cuda_ptx "") -foreach(ptx_src_file ${ptx_sources}) - - get_filename_component(_name "${ptx_src_file}" NAME_WE) - - # CUDA_COMPILE_PTX from CMake 3.7 has new features that require this change - # TODO Fix this with a more complete solution - IF(CMAKE_VERSION VERSION_LESS 3.7) # Before 3.7 - SET(NAME_APPEND "") - ELSE(CMAKE_VERSION VERSION_LESS 3.7) # 3.7 and newer - SET(NAME_APPEND "_1") - ENDIF(CMAKE_VERSION VERSION_LESS 3.7) - - set(_gen_file_name - "${PROJECT_BINARY_DIR}/src/backend/cuda/cuda_compile_ptx${NAME_APPEND}_generated_${_name}.cu.ptx") - set(_out_file_name - "${PROJECT_BINARY_DIR}/src/backend/cuda/${_name}.ptx") - - ADD_CUSTOM_COMMAND( - OUTPUT "${_out_file_name}" - DEPENDS "${_gen_file_name}" - COMMAND ${CMAKE_COMMAND} -E copy "${_gen_file_name}" "${_out_file_name}") - - list(APPEND cuda_ptx "${_out_file_name}") -endforeach() - -SET( ptx_headers - "ptx_headers") - -CL_KERNEL_TO_H( - SOURCES ${cuda_ptx} - VARNAME kernel_files - EXTENSION "hpp" - OUTPUT_DIR ${ptx_headers} - TARGETS ptx_targets - NAMESPACE "cuda" - NULLTERM TRUE - ) - -SET(libdevice_bc "") -IF (USE_LIBDEVICE) - SET(libdevice_computes "") - LIST(APPEND libdevice_computes "20" "30" "35" "50") - FOREACH(libdevice_compute ${libdevice_computes}) - SET(_libdevice_bc_file "${CUDA_LIBDEVICE_DIR}/libdevice.compute_${libdevice_compute}.10.bc") - SET(_libdevice_bc_copy "${PROJECT_BINARY_DIR}/src/backend/cuda/compute_${libdevice_compute}.bc") - IF (EXISTS ${_libdevice_bc_file}) - ADD_CUSTOM_COMMAND( - OUTPUT "${_libdevice_bc_copy}" - DEPENDS "${_libdevice_bc_file}" - COMMAND ${CMAKE_COMMAND} -E copy "${_libdevice_bc_file}" "${_libdevice_bc_copy}") - LIST(APPEND libdevice_bc ${_libdevice_bc_copy}) - ADD_DEFINITIONS(-D"__LIBDEVICE_COMPUTE_${libdevice_compute}") - ENDIF() - ENDFOREACH() -ENDIF() - -LIST(LENGTH libdevice_bc libdevice_bc_len) - -IF (${libdevice_bc_len} GREATER 0) - - SET(libdevice_headers - "libdevice_headers") - - CL_KERNEL_TO_H( - SOURCES ${libdevice_bc} - VARNAME libdevice_files - EXTENSION "hpp" - OUTPUT_DIR ${libdevice_headers} - TARGETS libdevice_targets - NAMESPACE "cuda" - BINARY TRUE - ) - - MESSAGE(STATUS "LIBDEVICE found.") - ADD_DEFINITIONS(-DUSE_LIBDEVICE) -ELSE() - MESSAGE(STATUS "LIBDEVICE not found on system. 
CUDA JIT may be slower") -ENDIF() - IF("${APPLE}") ADD_DEFINITIONS(-D__STRICT_ANSI__) ELSE() @@ -407,10 +331,11 @@ ENDIF(NOT CUDA_CUDA_LIBRARY) SET(CUDA_ADD_LIBRARY_OPTIONS "") IF(UNIX) - # These flags enable C++11 and disable invalid offsetof warning - SET(CUDA_ADD_LIBRARY_OPTIONS "-std=c++11 -Xcudafe \"--diag_suppress=1427\"") + # These flags enable C++11 and disable invalid offsetof warning + SET(CUDA_ADD_LIBRARY_OPTIONS "-std=c++11 -Xcudafe \"--diag_suppress=1427\"") ENDIF(UNIX) + MY_CUDA_ADD_LIBRARY(afcuda SHARED ${cuda_headers} ${cuda_sources} @@ -425,23 +350,26 @@ MY_CUDA_ADD_LIBRARY(afcuda SHARED ${scan_by_key_sources} OPTIONS ${CUDA_GENERATE_CODE} ${CUDA_ADD_LIBRARY_OPTIONS}) -ADD_DEPENDENCIES(afcuda ${ptx_targets}) - -IF (${libdevice_bc_len} GREATER 0) - ADD_DEPENDENCIES(afcuda ${libdevice_targets}) -ENDIF() +FIND_LIBRARY ( + CUDA_nvrtc_LIBRARY + NAMES "nvrtc" + PATHS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES "lib64" "lib/x64" "lib" + DOC "CUDA NVRTC Library" + NO_DEFAULT_PATH + ) -TARGET_LINK_LIBRARIES(afcuda - PRIVATE ${CUDA_CUBLAS_LIBRARIES} +TARGET_LINK_LIBRARIES(afcuda PRIVATE ${CUDA_CUBLAS_LIBRARIES} PRIVATE ${CUDA_LIBRARIES} PRIVATE ${FreeImage_LIBS} PRIVATE ${CUDA_CUFFT_LIBRARIES} PRIVATE ${CUDA_cusparse_LIBRARY} PRIVATE ${CUDA_cusolver_LIBRARY} - PRIVATE ${CUDA_nvvm_LIBRARY} + PRIVATE ${CUDA_nvrtc_LIBRARY} PRIVATE ${CUDA_CUDA_LIBRARY} ) +ADD_DEPENDENCIES(afcuda ${jit_kernel_targets}) LIST(LENGTH GRAPHICS_DEPENDENCIES GRAPHICS_DEPENDENCIES_LEN) IF(${GRAPHICS_DEPENDENCIES_LEN} GREATER 0) ADD_DEPENDENCIES(afcuda ${GRAPHICS_DEPENDENCIES}) diff --git a/src/backend/cuda/JIT/BinaryNode.hpp b/src/backend/cuda/JIT/BinaryNode.hpp index 115c892119..8edb88098d 100644 --- a/src/backend/cuda/JIT/BinaryNode.hpp +++ b/src/backend/cuda/JIT/BinaryNode.hpp @@ -20,68 +20,34 @@ namespace JIT class BinaryNode : public Node { private: - const std::string m_op_str; - const int m_op; - const int m_call_type; + std::string m_op_str; + int m_op; public: BinaryNode(const char *out_type_str, const char *name_str, - const std::string &op_str, - Node_ptr lhs, Node_ptr rhs, int op, int call_type) + const char *op_str, + Node_ptr lhs, Node_ptr rhs, int op) : Node(out_type_str, name_str, std::max(lhs->getHeight(), rhs->getHeight()) + 1, {lhs, rhs}), m_op_str(op_str), - m_op(op), - m_call_type(call_type) + m_op(op) { } void genKerName(std::stringstream &kerStream, Node_ids ids) { - // Make the hex representation of enum part of the Kernel name - kerStream << "_" << std::setw(2) << std::setfill('0') << std::hex << m_op; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.child_ids[0]; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.child_ids[1]; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec; + // Make the dec representation of enum part of the Kernel name + kerStream << "_" << std::setw(3) << std::setfill('0') << std::dec << m_op; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.child_ids[0]; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.child_ids[1]; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec; } - void genFuncs(std::stringstream &kerStream, str_map_t &declStrs, Node_ids ids, bool is_linear) + void genFuncs(std::stringstream &kerStream, Node_ids ids) { - if (m_call_type == 0) { - std::stringstream declStream; - declStream << "declare " << m_type_str << " " << m_op_str - << "(" << m_children[0]->getTypeStr() << " , " - << m_children[1]->getTypeStr() << 
")\n"; - declStrs[declStream.str()] = true; - - kerStream << "%val" << ids.id << " = call " - << m_type_str << " " - << m_op_str << "(" - << m_children[0]->getTypeStr() << " " - << "%val" << ids.child_ids[0] << ", " - << m_children[1]->getTypeStr() << " " - << "%val" << ids.child_ids[1] << ")\n"; - - } else { - if (m_call_type == 1) { - // arithmetic operations - kerStream << "%val" << ids.id << " = " - << m_op_str << " " - << m_type_str << " " - << "%val" << ids.child_ids[0] << ", " - << "%val" << ids.child_ids[1] << "\n"; - } else { - // logical operators - kerStream << "%tmp" << ids.id << " = " - << m_op_str << " " - << m_children[0]->getTypeStr() << " " - << "%val" << ids.child_ids[0] << ", " - << "%val" << ids.child_ids[1] << "\n"; - - kerStream << "%val" << ids.id << " = " - << "zext i1 %tmp" << ids.id << " to i8\n"; - - } - } + kerStream << m_type_str << " val" << ids.id << " = " + << m_op_str << "(val" << ids.child_ids[0] + << ", val" << ids.child_ids[1] << ");" + << "\n"; } }; diff --git a/src/backend/cuda/JIT/BufferNode.hpp b/src/backend/cuda/JIT/BufferNode.hpp index c6a03d2899..6c4c2fb0a2 100644 --- a/src/backend/cuda/JIT/BufferNode.hpp +++ b/src/backend/cuda/JIT/BufferNode.hpp @@ -10,7 +10,6 @@ #pragma once #include "Node.hpp" #include -#include #include namespace cuda @@ -19,25 +18,17 @@ namespace cuda namespace JIT { - template - static inline std::string toString(T val) - { - std::stringstream s; - s << val; - return s.str(); - } template class BufferNode : public Node { private: - // Keep the shared pointer for reference counting std::shared_ptr m_data; Param m_param; unsigned m_bytes; + bool m_linear_buffer; std::once_flag m_set_data_flag; - bool m_linear_buffer; public: BufferNode(const char *type_str, @@ -46,6 +37,12 @@ namespace JIT { } + bool isBuffer() { return true; } + + ~BufferNode() + { + } + void setData(Param param, std::shared_ptr data, const unsigned bytes, bool is_linear) { std::call_once(m_set_data_flag, [this, param, data, bytes, is_linear]() { @@ -65,90 +62,58 @@ namespace JIT return m_linear_buffer && same_dims; } - bool isBuffer() { return true; } - void genKerName(std::stringstream &kerStream, Node_ids ids) { kerStream << "_" << m_name_str; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec; } - void genParams(std::stringstream &kerStream, - std::stringstream &annStream, - int id, - bool is_linear) + void genParams(std::stringstream &kerStream, int id, bool is_linear) { - kerStream << m_type_str << "* %in" << id << ",\n"; - annStream << m_type_str << "*,\n"; - - if (!is_linear) { - kerStream << "i32 %dim0" << id << "," - << "i32 %dim1" << id << "," - << "i32 %dim2" << id << "," - << "i32 %dim3" << id << "," - << "\n" - << "i32 %str1" << id << "," - << "i32 %str2" << id << "," - << "i32 %str3" << id << "," - << "\n"; + if (is_linear) { + kerStream << m_type_str << " *in" << id << "_ptr,\n"; + } else { + kerStream << "Param<" << m_type_str << "> in" << id + << ",\n"; + } + } - annStream << "i32, i32, i32, i32,\n" - << "i32, i32, i32,\n"; + void setArgs(std::vector &args, bool is_linear) + { + if (is_linear) { + args.push_back((void *)&m_param.ptr); + } else { + args.push_back((void *)&m_param); } } void genOffsets(std::stringstream &kerStream, int id, bool is_linear) { - if (!is_linear) { - kerStream << "%b3" << id << " = icmp slt i32 %id3, %dim3" << id << "\n"; - kerStream << "%b2" << id << " = icmp slt i32 %id2, %dim2" << id << "\n"; - 
kerStream << "%b1" << id << " = icmp slt i32 %id1, %dim1" << id << "\n"; - kerStream << "%b0" << id << " = icmp slt i32 %id0, %dim0" << id << "\n"; - - kerStream << "%c3" << id << " = zext i1 %b3" << id << " to i32\n"; - kerStream << "%c2" << id << " = zext i1 %b2" << id << " to i32\n"; - kerStream << "%c1" << id << " = zext i1 %b1" << id << " to i32\n"; - kerStream << "%c0" << id << " = zext i1 %b0" << id << " to i32\n"; - - kerStream << "%d3" << id << " = mul i32 %c3" << id << ", %id3\n"; - kerStream << "%d2" << id << " = mul i32 %c2" << id << ", %id2\n"; - kerStream << "%d1" << id << " = mul i32 %c1" << id << ", %id1\n"; - kerStream << "%d0" << id << " = mul i32 %c0" << id << ", %id0\n"; - - kerStream << "%off3i" << id << " = mul i32 %d3" << id - << ", %str3" << id << "\n"; - - kerStream << "%off2i" << id << " = mul i32 %d2" << id - << ", %str2" << id << "\n"; - - kerStream << "%off1i" << id << " = mul i32 %d1" << id - << ", %str1" << id << "\n"; - - kerStream << "%off23i" << id << " = add i32 %off2i" - << id << ", %off3i" << id << "\n"; - - kerStream << "%off123i" << id << " = add i32 %off23i" - << id << ", %off1i" << id << "\n"; - - kerStream << "%idxa" << id << " = add i32 %off123i" - << id << ", %d0" << id << "\n"; - - kerStream << "%idx" << id << " = sext i32 %idxa" << id <<" to i64\n\n"; + std::string idx_str = std::string("int idx") + std::to_string(id); + + if (is_linear) { + kerStream << idx_str << " = idx;\n"; + } else { + std::string info_str = std::string("in") + std::to_string(id);; + kerStream << idx_str << " = " + << "(id3 < " << info_str << ".dims[3]) * " + << info_str << ".strides[3] * id3 + " + << "(id2 < " << info_str << ".dims[2]) * " + << info_str << ".strides[2] * id2 + " + << "(id1 < " << info_str << ".dims[1]) * " + << info_str << ".strides[1] * id1 + " + << "(id0 < " << info_str << ".dims[0]) * " + << "id0;" + << "\n"; + kerStream << m_type_str << " *in" << id << "_ptr = in" << id << ".ptr;\n"; } } - void genFuncs(std::stringstream &kerStream, str_map_t &declStrs, Node_ids ids, bool is_linear) + void genFuncs(std::stringstream &kerStream, Node_ids ids) { - kerStream << "%inIdx" << ids.id << " = " - << "getelementptr inbounds " << m_type_str << "* %in" << ids.id - << ", i64 %idx"; - - if (!is_linear) kerStream << ids.id; - kerStream << "\n"; - - kerStream << "%val" << ids.id << " = " << "load " - << m_type_str << "* %inIdx" << ids.id << "\n\n"; - + kerStream << m_type_str << " val" << ids.id << " = " + << "in" << ids.id << "_ptr[idx" << ids.id << "];" + << "\n"; } void getInfo(unsigned &len, unsigned &buf_count, unsigned &bytes) @@ -158,21 +123,6 @@ namespace JIT bytes += m_bytes; return; } - - void setArgs(std::vector &args, bool is_linear) - { - args.push_back((void *)&(m_param.ptr)); - - if (!is_linear) { - args.push_back((void *)&m_param.dims[0]); - args.push_back((void *)&m_param.dims[1]); - args.push_back((void *)&m_param.dims[2]); - args.push_back((void *)&m_param.dims[3]); - args.push_back((void *)&m_param.strides[1]); - args.push_back((void *)&m_param.strides[2]); - args.push_back((void *)&m_param.strides[3]); - } - } }; } diff --git a/src/backend/cuda/JIT/Node.hpp b/src/backend/cuda/JIT/Node.hpp index 035692550d..feae91ca98 100644 --- a/src/backend/cuda/JIT/Node.hpp +++ b/src/backend/cuda/JIT/Node.hpp @@ -8,19 +8,22 @@ ********************************************************/ #pragma once +#include #include - -#include -#include #include #include +#include +#include namespace cuda { namespace JIT { + class Node; + using std::shared_ptr; + typedef 
shared_ptr Node_ptr; typedef struct { @@ -28,9 +31,6 @@ namespace JIT std::vector child_ids; } Node_ids; - typedef std::unordered_map str_map_t; - typedef str_map_t::iterator str_map_iter; - typedef std::shared_ptr Node_ptr; typedef std::unordered_map Node_map_t; typedef Node_map_t::iterator Node_map_iter; @@ -66,16 +66,11 @@ namespace JIT } virtual void genKerName(std::stringstream &kerStream, Node_ids ids) {} - virtual void genParams (std::stringstream &kerStream, - std::stringstream &annStream, - int id, bool is_linear) {} + virtual void genParams (std::stringstream &kerStream, int id, bool is_linear) {} virtual void genOffsets (std::stringstream &kerStream, int id, bool is_linear) {} - virtual void genFuncs (std::stringstream &kerStream, str_map_t &declStrs, - Node_ids id, bool is_linear) - {} + virtual void genFuncs (std::stringstream &kerStream, Node_ids) {} - virtual void setArgs(std::vector &args, bool is_linear) {} - virtual bool isLinear(dim_t dims[4]) { return true; } + virtual void setArgs (std::vector &args, bool is_linear) { } virtual void getInfo(unsigned &len, unsigned &buf_count, unsigned &bytes) { @@ -83,9 +78,8 @@ namespace JIT } virtual bool isBuffer() { return false; } - + virtual bool isLinear(dim_t dims[4]) { return true; } std::string getTypeStr() { return m_type_str; } - int getHeight() { return m_height; } std::string getNameStr() { return m_name_str; } diff --git a/src/backend/cuda/JIT/ScalarNode.hpp b/src/backend/cuda/JIT/ScalarNode.hpp index 264b7be61e..aae0496ec3 100644 --- a/src/backend/cuda/JIT/ScalarNode.hpp +++ b/src/backend/cuda/JIT/ScalarNode.hpp @@ -8,9 +8,9 @@ ********************************************************/ #pragma once -#include #include "Node.hpp" #include +#include #include namespace cuda @@ -18,15 +18,17 @@ namespace cuda namespace JIT { - template + + template class ScalarNode : public Node { private: - T m_val; + const T m_val; + public: ScalarNode(T val) - : Node(irname(), afShortName(false), 0, {}), + : Node(getFullName(), shortname(false), 0, {}), m_val(val) { } @@ -34,23 +36,27 @@ namespace JIT void genKerName(std::stringstream &kerStream, Node_ids ids) { kerStream << "_" << m_name_str; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec; } - void genParams(std::stringstream &kerStream, - std::stringstream &annStream, - int id, - bool is_linear) + void genParams(std::stringstream &kerStream, int id, bool is_linear) { - kerStream << m_type_str << " %val" << id << ", " << std::endl; - annStream << m_type_str << ",\n"; + kerStream << m_type_str << " scalar" << id << ", " << "\n"; } void setArgs(std::vector &args, bool is_linear) { args.push_back((void *)&m_val); } + + void genFuncs(std::stringstream &kerStream, Node_ids ids) + { + kerStream << m_type_str << " val" << ids.id << " = " + << "scalar" << ids.id << ";" + << "\n"; + } }; + } } diff --git a/src/backend/cuda/JIT/UnaryNode.hpp b/src/backend/cuda/JIT/UnaryNode.hpp index ac334c14b8..72e148289f 100644 --- a/src/backend/cuda/JIT/UnaryNode.hpp +++ b/src/backend/cuda/JIT/UnaryNode.hpp @@ -22,62 +22,30 @@ namespace JIT private: const std::string m_op_str; const int m_op; - const bool m_is_check; public: UnaryNode(const char *out_type_str, const char *name_str, - const std::string &op_str, - Node_ptr child, int op, bool is_check=false) + const char *op_str, + Node_ptr child, int op) : Node(out_type_str, name_str, child->getHeight() + 1, {child}), m_op_str(op_str), - 
m_op(op), - m_is_check(is_check) + m_op(op) { } void genKerName(std::stringstream &kerStream, Node_ids ids) { - // Make the hex representation of enum part of the Kernel name - kerStream << "_" << std::setw(2) << std::setfill('0') << std::hex << m_op; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.child_ids[0]; - kerStream << std::setw(2) << std::setfill('0') << std::hex << ids.id << std::dec; + // Make the dec representation of enum part of the Kernel name + kerStream << "_" << std::setw(3) << std::setfill('0') << std::dec << m_op; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.child_ids[0]; + kerStream << std::setw(3) << std::setfill('0') << std::dec << ids.id << std::dec; } - void genFuncs(std::stringstream &kerStream, str_map_t &declStrs, Node_ids ids, bool is_linear) + void genFuncs(std::stringstream &kerStream, Node_ids ids) { - std::stringstream declStream; - - if (m_is_check) { - declStream << "declare " << "i32 " << m_op_str - << "(" << m_children[0]->getTypeStr() << ")\n"; - } else { - declStream << "declare " << m_type_str << " " << m_op_str - << "(" << m_children[0]->getTypeStr() << ")\n"; - } - - declStrs[declStream.str()] = true; - - if (m_is_check) { - kerStream << "%tmp" << ids.id << " = call i32 " - << m_op_str << "(" - << m_children[0]->getTypeStr() << " " - << "%val" << ids.child_ids[0] << ")\n"; - - if (m_type_str[0] == 'i') { - kerStream << "%val" << ids.id << " = " - << "trunc i32 %tmp" << ids.id << " to " << m_type_str << "\n"; - } else { - kerStream << "%val" << ids.id << " = " - << "sitofp i32 %tmp" << ids.id << " to " << m_type_str << "\n"; - } - - } else { - kerStream << "%val" << ids.id << " = call " - << m_type_str << " " - << m_op_str << "(" - << m_children[0]->getTypeStr() << " " - << "%val" << ids.child_ids[0] << ")\n"; - } + kerStream << m_type_str << " val" << ids.id << " = " + << m_op_str << "(val" << ids.child_ids[0] << ");" + << "\n"; } }; diff --git a/src/backend/cuda/JIT/arith.cu b/src/backend/cuda/JIT/arith.cu deleted file mode 100644 index adfa9e9068..0000000000 --- a/src/backend/cuda/JIT/arith.cu +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define ARITH_BASIC(fn, op, T) \ - __device__ T ___##fn(T a, T b) \ - { \ - return a op b; \ - } \ - - -#define ARITH(fn, op) \ - ARITH_BASIC(fn, op, float) \ - ARITH_BASIC(fn, op, double) \ - ARITH_BASIC(fn, op, int) \ - ARITH_BASIC(fn, op, uint) \ - ARITH_BASIC(fn, op, char) \ - ARITH_BASIC(fn, op, uchar) \ - ARITH_BASIC(fn, op, intl) \ - ARITH_BASIC(fn, op, uintl) \ - ARITH_BASIC(fn, op, short) \ - ARITH_BASIC(fn, op, ushort) \ - \ - __device__ cfloat ___##fn(cfloat a, cfloat b) \ - { \ - return cuC##fn##f(a, b); \ - } \ - \ - __device__ cdouble ___##fn(cdouble a, cdouble b) \ - { \ - return cuC##fn(a, b); \ - } \ - -ARITH(add, +) -ARITH(sub, -) -ARITH(mul, *) -ARITH(div, /) diff --git a/src/backend/cuda/JIT/cast.cu b/src/backend/cuda/JIT/cast.cu deleted file mode 100644 index 8905955145..0000000000 --- a/src/backend/cuda/JIT/cast.cu +++ /dev/null @@ -1,104 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. 
- * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define CAST_BASIC(FN, To, Ti) __device__ To FN(Ti in) { return (To) in; } - -#define CAST_BASIC_BOOL(FN, To, Ti) __device__ To FN(Ti in) { return (To)(in != 0); } - -#define CAST(T, X) \ - CAST_BASIC(___mk##X, T, float) \ - CAST_BASIC(___mk##X, T, double) \ - CAST_BASIC(___mk##X, T, int) \ - CAST_BASIC(___mk##X, T, uint) \ - CAST_BASIC(___mk##X, T, char) \ - CAST_BASIC(___mk##X, T, uchar) \ - CAST_BASIC(___mk##X, T, intl) \ - CAST_BASIC(___mk##X, T, uintl) \ - CAST_BASIC(___mk##X, T, short) \ - CAST_BASIC(___mk##X, T, ushort) \ - -CAST(float , S) -CAST(double, D) -CAST(int , I) -CAST(intl , X) -CAST(short , P) -CAST(uint , U) -CAST(uchar , V) -CAST(uintl , Y) -CAST(ushort, Q) - -CAST_BASIC_BOOL(___mkJ, char, float) -CAST_BASIC_BOOL(___mkJ, char, double) -CAST_BASIC_BOOL(___mkJ, char, int) -CAST_BASIC_BOOL(___mkJ, char, uint) -CAST_BASIC_BOOL(___mkJ, char, char) -CAST_BASIC_BOOL(___mkJ, char, uchar) -CAST_BASIC_BOOL(___mkJ, char, intl) -CAST_BASIC_BOOL(___mkJ, char, uintl) -CAST_BASIC_BOOL(___mkJ, char, short) -CAST_BASIC_BOOL(___mkJ, char, ushort) - -#define CPLX_BASIC(FN, To, Tr, Ti) \ - __device__ To FN(Ti in) \ - { \ - To out = {(Tr)in, 0}; \ - return out; \ - } \ - -#define CPLX_CAST(T, Tr, X) \ - CPLX_BASIC(___mk##X, T, Tr, float) \ - CPLX_BASIC(___mk##X, T, Tr, double) \ - CPLX_BASIC(___mk##X, T, Tr, int) \ - CPLX_BASIC(___mk##X, T, Tr, uint) \ - CPLX_BASIC(___mk##X, T, Tr, char) \ - CPLX_BASIC(___mk##X, T, Tr, uchar) \ - CPLX_BASIC(___mk##X, T, Tr, uintl) \ - CPLX_BASIC(___mk##X, T, Tr, intl) \ - CPLX_BASIC(___mk##X, T, Tr, ushort) \ - CPLX_BASIC(___mk##X, T, Tr, short) \ - -CPLX_CAST(cfloat, float, C) -CPLX_CAST(cdouble, double, Z) - -__device__ cfloat ___mkC(cfloat C) -{ - return C; -} - -__device__ cfloat ___mkC(cdouble C) -{ - cfloat res = {C.x, C.y}; - return res; -} - -__device__ cdouble ___mkZ(cdouble C) -{ - return C; -} - -__device__ cdouble ___mkZ(cfloat C) -{ - cdouble res = {C.x, C.y}; - return res; -} - -__device__ float ___real(cfloat in) { return in.x; } -__device__ double ___real(cdouble in) { return in.x; } - - -__device__ float ___imag(cfloat in) { return in.y; } -__device__ double ___imag(cdouble in) { return in.y; } - -__device__ cfloat ___cplx(float l, float r) { cfloat out = {l, r}; return out; } -__device__ cdouble ___cplx(double l, double r) { cdouble out = {l, r}; return out; } - -__device__ cfloat ___conj(cfloat in) { return cuConjf(in); } -__device__ cdouble ___conj(cdouble in) { return cuConj (in); } diff --git a/src/backend/cuda/JIT/exp.cu b/src/backend/cuda/JIT/exp.cu deleted file mode 100644 index 3f110b4328..0000000000 --- a/src/backend/cuda/JIT/exp.cu +++ /dev/null @@ -1,100 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. 
- * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -__device__ double sigmoid(double in) -{ - return (1.0) / (1 + exp(-in)); -} - -__device__ float sigmoidf(float in) -{ - return (1.0) / (1 + expf(-in)); -} - -#define MATH_BASIC(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return fn##f((float)a); \ - } \ - - -#define MATH(fn) \ - MATH_BASIC(fn, float) \ - MATH_BASIC(fn, int) \ - MATH_BASIC(fn, uint) \ - MATH_BASIC(fn, char) \ - MATH_BASIC(fn, uchar) \ - MATH_BASIC(fn, uintl) \ - MATH_BASIC(fn, intl) \ - MATH_BASIC(fn, ushort) \ - MATH_BASIC(fn, short) \ - __device__ double ___##fn(double a) \ - { \ - return fn(a); \ - } \ - - -MATH(exp) -MATH(expm1) -MATH(erf) -MATH(erfc) -MATH(sigmoid) - -MATH(log) -MATH(log10) -MATH(log1p) -MATH(log2) - -MATH(sqrt) -MATH(cbrt) - -#define MATH2_BASIC(fn, T) \ - __device__ T ___##fn(T a, T b) \ - { \ - return fn##f((float)a, (float)b); \ - } \ - -#define MATH2(fn) \ - MATH2_BASIC(fn, float) \ - MATH2_BASIC(fn, int) \ - MATH2_BASIC(fn, uint) \ - MATH2_BASIC(fn, char) \ - MATH2_BASIC(fn, uchar) \ - MATH2_BASIC(fn, uintl) \ - MATH2_BASIC(fn, intl) \ - MATH2_BASIC(fn, ushort) \ - MATH2_BASIC(fn, short) \ - __device__ double ___##fn(double a, double b) \ - { \ - return fn(a, b); \ - } \ - -MATH2(pow) - -__device__ cfloat ___pow(cfloat a, float b) -{ - float R = cuCabsf(a); - float Theta = atan2(a.y, a.x); - float R_b = powf(R, b); - float Theta_b = Theta * b; - cfloat res = {R_b * cosf(Theta_b), R_b * sinf(Theta_b)}; - return res; -} - -__device__ cdouble ___pow(cdouble a, float b) -{ - float R = cuCabs(a); - float Theta = atan2(a.y, a.x); - float R_b = pow(R, b); - float Theta_b = Theta * b; - cdouble res = {R_b * cos(Theta_b), R_b * sin(Theta_b)}; - return res; -} diff --git a/src/backend/cuda/JIT/hyper.cu b/src/backend/cuda/JIT/hyper.cu deleted file mode 100644 index 6673fb1f14..0000000000 --- a/src/backend/cuda/JIT/hyper.cu +++ /dev/null @@ -1,41 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define MATH_BASIC(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return fn##f((float)a); \ - } \ - - -#define MATH(fn) \ - MATH_BASIC(fn, float) \ - MATH_BASIC(fn, int) \ - MATH_BASIC(fn, uint) \ - MATH_BASIC(fn, char) \ - MATH_BASIC(fn, uchar) \ - MATH_BASIC(fn, uintl) \ - MATH_BASIC(fn, intl) \ - MATH_BASIC(fn, ushort) \ - MATH_BASIC(fn, short) \ - __device__ double ___##fn(double a) \ - { \ - return fn(a); \ - } \ - - -MATH(sinh) -MATH(cosh) -MATH(tanh) - -MATH(asinh) -MATH(acosh) -MATH(atanh) diff --git a/src/backend/cuda/JIT/logic.cu b/src/backend/cuda/JIT/logic.cu deleted file mode 100644 index 6072c3c447..0000000000 --- a/src/backend/cuda/JIT/logic.cu +++ /dev/null @@ -1,109 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. 
- * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define LOGIC_BASIC(fn, op, T) \ - __device__ bool ___##fn(T a, T b) \ - { \ - return a op b; \ - } \ - - -#define LOGIC(fn, op) \ - LOGIC_BASIC(fn, op, float) \ - LOGIC_BASIC(fn, op, double) \ - LOGIC_BASIC(fn, op, int) \ - LOGIC_BASIC(fn, op, uint) \ - LOGIC_BASIC(fn, op, char) \ - LOGIC_BASIC(fn, op, uchar) \ - LOGIC_BASIC(fn, op, intl) \ - LOGIC_BASIC(fn, op, uintl) \ - LOGIC_BASIC(fn, op, short) \ - LOGIC_BASIC(fn, op, ushort) \ - \ - __device__ bool ___##fn(cfloat a, cfloat b) \ - { \ - return cabs2(a) op cabs2(b); \ - } \ - \ - __device__ bool ___##fn(cdouble a, cdouble b) \ - { \ - return cabs2(a) op cabs2(b); \ - } \ - -LOGIC(lt, <) -LOGIC(gt, >) -LOGIC(le, <=) -LOGIC(ge, >=) -LOGIC(and, &&) -LOGIC(or, ||) - -#define LOGIC_EQ(fn, op, op2) \ - LOGIC_BASIC(fn, op, float) \ - LOGIC_BASIC(fn, op, double) \ - LOGIC_BASIC(fn, op, int) \ - LOGIC_BASIC(fn, op, uint) \ - LOGIC_BASIC(fn, op, char) \ - LOGIC_BASIC(fn, op, uchar) \ - LOGIC_BASIC(fn, op, intl) \ - LOGIC_BASIC(fn, op, uintl) \ - LOGIC_BASIC(fn, op, short) \ - LOGIC_BASIC(fn, op, ushort) \ - \ - __device__ bool ___##fn(cfloat a, cfloat b) \ - { \ - return (a.x op b.x) op2 (a.y op b.y); \ - } \ - \ - __device__ bool ___##fn(cdouble a, cdouble b) \ - { \ - return (a.x op b.x) op2 (a.y op b.y); \ - } \ - -LOGIC_EQ(eq, ==, &&) -LOGIC_EQ(neq, !=, ||) - -#define NOT_FN(T) \ - __device__ bool ___not(T in) { return !in; } \ - -NOT_FN(float) -NOT_FN(double) -NOT_FN(int) -NOT_FN(uint) -NOT_FN(char) -NOT_FN(uchar) -NOT_FN(intl) -NOT_FN(uintl) -NOT_FN(short) -NOT_FN(ushort) - -#define BIT_FN(T) \ - __device__ T ___bitand (T lhs, T rhs) { return lhs & rhs; } \ - __device__ T ___bitor (T lhs, T rhs) { return lhs | rhs; } \ - __device__ T ___bitxor (T lhs, T rhs) { return lhs ^ rhs; } \ - __device__ T ___bitshiftl(T lhs, T rhs) { return lhs << rhs; } \ - __device__ T ___bitshiftr(T lhs, T rhs) { return lhs >> rhs; } \ - -BIT_FN(int) -BIT_FN(char) -BIT_FN(intl) -BIT_FN(uchar) -BIT_FN(uint) -BIT_FN(uintl) -BIT_FN(short) -BIT_FN(ushort) - -__device__ char ___isNaN(float in) { return isnan(in); } -__device__ char ___isINF(float in) { return isinf(in); } -__device__ char ___iszero(float in) { return (in == 0); } - -__device__ char ___isNaN(double in) { return isnan(in); } -__device__ char ___isINF(double in) { return isinf(in); } -__device__ char ___iszero(double in) { return (in == 0); } diff --git a/src/backend/cuda/JIT/numeric.cu b/src/backend/cuda/JIT/numeric.cu deleted file mode 100644 index 2bcb15a112..0000000000 --- a/src/backend/cuda/JIT/numeric.cu +++ /dev/null @@ -1,191 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. 
- * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -template __device__ T sign(T a) { return signbit(a); } - -#define MATH_BASIC(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return fn(a); \ - } \ - - -#define MATH_NOOP(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return a; \ - } \ - - -#define MATH_CAST(fn, T, Tc) \ - __device__ T ___##fn(T a) \ - { \ - return (T)fn((Tc)a); \ - } \ - -MATH_BASIC(floor, float) -MATH_BASIC(floor, double) -MATH_NOOP(floor, int) -MATH_NOOP(floor, uint) -MATH_NOOP(floor, char) -MATH_NOOP(floor, uchar) -MATH_NOOP(floor, uintl) -MATH_NOOP(floor, intl) -MATH_NOOP(floor, ushort) -MATH_NOOP(floor, short) - -MATH_BASIC(ceil, float) -MATH_BASIC(ceil, double) -MATH_NOOP(ceil, int) -MATH_NOOP(ceil, uint) -MATH_NOOP(ceil, char) -MATH_NOOP(ceil, uchar) -MATH_NOOP(ceil, uintl) -MATH_NOOP(ceil, intl) -MATH_NOOP(ceil, ushort) -MATH_NOOP(ceil, short) - -MATH_BASIC(round, float) -MATH_BASIC(round, double) -MATH_NOOP(round, int) -MATH_NOOP(round, uint) -MATH_NOOP(round, char) -MATH_NOOP(round, uchar) -MATH_NOOP(round, uintl) -MATH_NOOP(round, intl) -MATH_NOOP(round, ushort) -MATH_NOOP(round, short) - -MATH_BASIC(trunc, float) -MATH_BASIC(trunc, double) -MATH_NOOP(trunc, int) -MATH_NOOP(trunc, uint) -MATH_NOOP(trunc, char) -MATH_NOOP(trunc, uchar) -MATH_NOOP(trunc, uintl) -MATH_NOOP(trunc, intl) -MATH_NOOP(trunc, ushort) -MATH_NOOP(trunc, short) - -MATH_BASIC(sign, float) -MATH_BASIC(sign, double) -MATH_NOOP(sign, int) -MATH_NOOP(sign, uint) -MATH_NOOP(sign, char) -MATH_NOOP(sign, uchar) -MATH_NOOP(sign, uintl) -MATH_NOOP(sign, intl) -MATH_NOOP(sign, ushort) -MATH_NOOP(sign, short) - -MATH_BASIC(abs, float) -MATH_BASIC(abs, double) -MATH_BASIC(abs, int) -MATH_CAST(abs, char, int) -MATH_NOOP(abs, uint) -MATH_NOOP(abs, uchar) -MATH_NOOP(abs, uintl) -MATH_NOOP(abs, intl) -MATH_NOOP(abs, ushort) -MATH_NOOP(abs, short) - -MATH_BASIC(tgamma, float) -MATH_BASIC(tgamma, double) -MATH_CAST(tgamma, int , float) -MATH_CAST(tgamma, uint , float) -MATH_CAST(tgamma, char , float) -MATH_CAST(tgamma, uchar , float) -MATH_CAST(tgamma, uintl , float) -MATH_CAST(tgamma, intl , float) -MATH_CAST(tgamma, ushort, float) -MATH_CAST(tgamma, short , float) - -MATH_BASIC(lgamma, float) -MATH_BASIC(lgamma, double) -MATH_CAST(lgamma, int , float) -MATH_CAST(lgamma, uint , float) -MATH_CAST(lgamma, char , float) -MATH_CAST(lgamma, uchar , float) -MATH_CAST(lgamma, uintl , float) -MATH_CAST(lgamma, intl , float) -MATH_CAST(lgamma, ushort, float) -MATH_CAST(lgamma, short , float) - -MATH_NOOP(noop, float) -MATH_NOOP(noop, double) -MATH_NOOP(noop, cfloat) -MATH_NOOP(noop, cdouble) -MATH_NOOP(noop, int) -MATH_NOOP(noop, uint) -MATH_NOOP(noop, char) -MATH_NOOP(noop, uchar) -MATH_NOOP(noop, uintl) -MATH_NOOP(noop, intl) -MATH_NOOP(noop, ushort) -MATH_NOOP(noop, short) - -__device__ float ___abs(cfloat a) { return cuCabsf(a); } -__device__ double ___abs(cdouble a) { return cuCabs(a); } - -template __device__ T rem(T a, T b) { return a % b; } -__device__ float rem(float a, float b) { return remainderf(a, b); } -__device__ double rem(double a, double b) { return remainder(a, b); } - -template __device__ T mod(T a, T b) { return a % b; } -__device__ float mod(float a, float b) { return fmodf(a, b); } -__device__ double mod(double a, double b) { return fmod(a, b); } - -#define MATH2_BASIC(fn, T) \ - __device__ T ___##fn(T a, T b) \ - { \ - return fn(a, b); \ - } \ - -#define 
MATH2(fn) \ - MATH2_BASIC(fn, float) \ - MATH2_BASIC(fn, int) \ - MATH2_BASIC(fn, uint) \ - MATH2_BASIC(fn, intl) \ - MATH2_BASIC(fn, uintl) \ - MATH2_BASIC(fn, char) \ - MATH2_BASIC(fn, uchar) \ - MATH2_BASIC(fn, short) \ - MATH2_BASIC(fn, ushort) \ - __device__ double ___##fn(double a, double b) \ - { \ - return fn(a, b); \ - } \ - -MATH2(min) -MATH2(max) -MATH2(mod) -MATH2(rem) - -__device__ float ___hypot(float a, float b) -{ - return hypot(a, b); -} - -__device__ double ___hypot(double a, double b) -{ - return hypot(a, b); -} - -#define COMPARE_CPLX(fn, op, T) \ - __device__ T ___##fn(T a, T b) \ - { \ - return cabs2(a) op cabs2(b) ? a : b; \ - } \ - -COMPARE_CPLX(min, <, cfloat) -COMPARE_CPLX(min, <, cdouble) -COMPARE_CPLX(max, >, cfloat) -COMPARE_CPLX(max, >, cdouble) diff --git a/src/backend/cuda/JIT/trig.cu b/src/backend/cuda/JIT/trig.cu deleted file mode 100644 index 372bd4d026..0000000000 --- a/src/backend/cuda/JIT/trig.cu +++ /dev/null @@ -1,62 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include "types.h" - -#define MATH_BASIC(fn, T) \ - __device__ T ___##fn(T a) \ - { \ - return fn##f((float)a); \ - } \ - - -#define MATH(fn) \ - MATH_BASIC(fn, float) \ - MATH_BASIC(fn, int) \ - MATH_BASIC(fn, uint) \ - MATH_BASIC(fn, char) \ - MATH_BASIC(fn, uchar) \ - MATH_BASIC(fn, uintl) \ - MATH_BASIC(fn, intl) \ - MATH_BASIC(fn, ushort) \ - MATH_BASIC(fn, short) \ - __device__ double ___##fn(double a) \ - { \ - return fn(a); \ - } \ - - -MATH(sin) -MATH(cos) -MATH(tan) - -MATH(asin) -MATH(acos) -MATH(atan) - -#define ATAN2(T) \ - __device__ T ___atan2(T x, T y) \ - { \ - return atan2((float)x, (float)y); \ - } \ - -ATAN2(float) -ATAN2(int) -ATAN2(uint) -ATAN2(char) -ATAN2(uchar) -ATAN2(uintl) -ATAN2(intl) -ATAN2(ushort) -ATAN2(short) - -__device__ double ___atan2(double x, double y) -{ - return atan2(x, y); -} diff --git a/src/backend/cuda/arith.hpp b/src/backend/cuda/arith.hpp index 5a39fcdf1c..87117e90bb 100644 --- a/src/backend/cuda/arith.hpp +++ b/src/backend/cuda/arith.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include namespace cuda diff --git a/src/backend/cuda/binary.hpp b/src/backend/cuda/binary.hpp index bb81c19ffa..d5659eb7e6 100644 --- a/src/backend/cuda/binary.hpp +++ b/src/backend/cuda/binary.hpp @@ -17,220 +17,158 @@ namespace cuda { - -template -struct BinOp -{ - std::string name; - int call_type; - BinOp() : - name("noop"), - call_type(0) - {} -}; - -#define BINARY(fn) \ - template \ - struct BinOp \ - { \ - std::string name; \ - int call_type; \ - BinOp() : \ - name(cuMangledName("___"#fn)), \ - call_type(0) \ - {} \ + template + struct BinOp + { + const char *name() + { + return "__invalid"; + } }; -#if defined(USE_LIBDEVICE) -#define NVVM_ARITH_OP(T, fn, fname) \ - template<> \ - struct BinOp \ +#define BINARY_TYPE_1(fn) \ + template \ + struct BinOp \ { \ - std::string name; \ - int call_type; \ - BinOp() : \ - name(fname), \ - call_type(1) \ - {} \ + const char *name() \ + { \ + return "__"#fn; \ + } \ }; \ - -#define NVVM_COMPARE_OP(T, fn, fname) \ - template<> \ - struct BinOp \ + \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "__c"#fn"f"; \ + } \ + }; \ + \ + template \ + struct BinOp \ { \ - std::string name; \ - 
int call_type; \ - BinOp() : \ - name(fname), \ - call_type(2) \ - {} \ + const char *name() \ + { \ + return "__c"#fn; \ + } \ }; \ -#define NVVM_BINARY_FUNC(T, fn, fname) \ - template<> \ - struct BinOp \ + +BINARY_TYPE_1(eq) +BINARY_TYPE_1(neq) +BINARY_TYPE_1(lt) +BINARY_TYPE_1(le) +BINARY_TYPE_1(gt) +BINARY_TYPE_1(ge) +BINARY_TYPE_1(add) +BINARY_TYPE_1(sub) +BINARY_TYPE_1(mul) +BINARY_TYPE_1(div) +BINARY_TYPE_1(and) +BINARY_TYPE_1(or) +BINARY_TYPE_1(bitand) +BINARY_TYPE_1(bitor) +BINARY_TYPE_1(bitxor) +BINARY_TYPE_1(bitshiftl) +BINARY_TYPE_1(bitshiftr) + +#undef BINARY_TYPE_1 + +#define BINARY_TYPE_2(fn) \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "__"#fn; \ + } \ + }; \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "f"#fn; \ + } \ + }; \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "f"#fn; \ + } \ + }; \ + template \ + struct BinOp \ + { \ + const char *name() \ + { \ + return "__c"#fn"f"; \ + } \ + }; \ + \ + template \ + struct BinOp \ { \ - std::string name; \ - int call_type; \ - BinOp() : \ - name("@__nv_"#fname), \ - call_type(0) \ - {} \ + const char *name() \ + { \ + return "__c"#fn; \ + } \ }; \ -#else -#define NVVM_ARITH_OP(T, fn, fname) // No specialization -#define NVVM_COMPARE_OP(T, fn, fname) // No specialization -#define NVVM_BINARY_FUNC(T, fn, fname) // No specialization +BINARY_TYPE_2(min) +BINARY_TYPE_2(max) +BINARY_TYPE_2(pow) +BINARY_TYPE_2(rem) +BINARY_TYPE_2(mod) + +template +struct BinOp +{ + const char *name() + { + return "__cplx2f"; + } +}; + +template +struct BinOp +{ + const char *name() + { + return "__cplx2"; + } +}; + +template +struct BinOp +{ + const char *name() + { + return "noop"; + } +}; + +template +struct BinOp +{ + const char *name() + { + return "atan2"; + } +}; -#endif - -#define NVVM_ARITH_OP_INT(fn, fname) \ - NVVM_ARITH_OP(int, fn, fname) \ - NVVM_ARITH_OP(short, fn, fname) \ - NVVM_ARITH_OP(intl, fn, fname) \ - -#define NVVM_ARITH_OP_UINT(fn, fname) \ - NVVM_ARITH_OP(uint, fn, fname) \ - NVVM_ARITH_OP(ushort, fn, fname) \ - NVVM_ARITH_OP(uintl, fn, fname) \ - -#define NVVM_ARITH_OP_FLOAT(fn, fname) \ - NVVM_ARITH_OP(float, fn, fname) \ - NVVM_ARITH_OP(double, fn, fname) \ - -#define NVVM_ARITH_OP_CPLX(fn, fname) \ - NVVM_ARITH_OP(cfloat, fn, fname) \ - NVVM_ARITH_OP(cdouble, fn, fname) \ - -#define NVVM_COMPARE_OP_INT(fn, fname) \ - NVVM_COMPARE_OP(int, fn, fname) \ - NVVM_COMPARE_OP(short, fn, fname) \ - NVVM_COMPARE_OP(intl, fn, fname) \ - -#define NVVM_COMPARE_OP_UINT(fn, fname) \ - NVVM_COMPARE_OP(uint, fn, fname) \ - NVVM_COMPARE_OP(ushort, fn, fname) \ - NVVM_COMPARE_OP(uintl, fn, fname) \ - -#define NVVM_COMPARE_OP_FLOAT(fn, fname) \ - NVVM_COMPARE_OP(float, fn, fname) \ - NVVM_COMPARE_OP(double, fn, fname) \ - -BINARY(add) -NVVM_ARITH_OP_INT(add, "add") -NVVM_ARITH_OP_UINT(add, "add") -NVVM_ARITH_OP_FLOAT(add, "fadd") -NVVM_ARITH_OP_CPLX(add, "fadd") - -BINARY(sub) -NVVM_ARITH_OP_INT(sub, "sub") -NVVM_ARITH_OP_UINT(sub, "sub") -NVVM_ARITH_OP_FLOAT(sub, "fsub") -NVVM_ARITH_OP_CPLX(sub, "fsub") - -BINARY(mul) -NVVM_ARITH_OP_INT(mul, "mul") -NVVM_ARITH_OP_UINT(mul, "mul") -NVVM_ARITH_OP_FLOAT(mul, "fmul") - -BINARY(div) -NVVM_ARITH_OP_INT(div, "sdiv") -NVVM_ARITH_OP_UINT(div, "udiv") -NVVM_ARITH_OP_FLOAT(div, "fdiv") - -BINARY(bitand) -NVVM_ARITH_OP_INT(bitand, "and") -NVVM_ARITH_OP_UINT(bitand, "and") - -BINARY(bitor) -NVVM_ARITH_OP_INT(bitor, "or") -NVVM_ARITH_OP_UINT(bitor, "or") - -BINARY(bitxor) -NVVM_ARITH_OP_INT(bitxor, "xor") 
-NVVM_ARITH_OP_UINT(bitxor, "xor") - -BINARY(bitshiftl) -NVVM_ARITH_OP_INT(bitshiftl, "shl") -NVVM_ARITH_OP_UINT(bitshiftl, "shl") - -BINARY(bitshiftr) -NVVM_ARITH_OP_INT(bitshiftr, "lshr") -NVVM_ARITH_OP_UINT(bitshiftr, "lshr") - - -BINARY(and) -BINARY(or) - -BINARY(lt) -NVVM_COMPARE_OP_INT(lt, "icmp slt") -NVVM_COMPARE_OP_UINT(lt, "icmp ult") -NVVM_COMPARE_OP_FLOAT(lt, "fcmp olt") - -BINARY(gt) -NVVM_COMPARE_OP_INT(gt, "icmp sgt") -NVVM_COMPARE_OP_UINT(gt, "icmp ugt") -NVVM_COMPARE_OP_FLOAT(gt, "fcmp ogt") - -BINARY(le) -NVVM_COMPARE_OP_INT(le, "icmp sle") -NVVM_COMPARE_OP_UINT(le, "icmp ule") -NVVM_COMPARE_OP_FLOAT(le, "fcmp ole") - -BINARY(ge) -NVVM_COMPARE_OP_INT(ge, "icmp sge") -NVVM_COMPARE_OP_UINT(ge, "icmp uge") -NVVM_COMPARE_OP_FLOAT(ge, "fcmp oge") - -BINARY(eq) -NVVM_COMPARE_OP_INT(eq, "icmp eq") -NVVM_COMPARE_OP_UINT(eq, "icmp eq") -NVVM_COMPARE_OP_FLOAT(eq, "fcmp oeq") - -BINARY(neq) -NVVM_COMPARE_OP_INT(neq, "icmp ne") -NVVM_COMPARE_OP_UINT(neq, "icmp ne") -NVVM_COMPARE_OP_FLOAT(neq, "fcmp one") - -BINARY(max) -NVVM_BINARY_FUNC(float, max, fmaxf) -NVVM_BINARY_FUNC(double, max, fmax) -NVVM_BINARY_FUNC(int, max, max) -NVVM_BINARY_FUNC(uint, max, umax) -NVVM_BINARY_FUNC(intl, max, llmax) -NVVM_BINARY_FUNC(uintl, max, ullmax) - -BINARY(min) -NVVM_BINARY_FUNC(float, min, fminf) -NVVM_BINARY_FUNC(double, min, fmin) -NVVM_BINARY_FUNC(int, min, min) -NVVM_BINARY_FUNC(uint, min, umin) -NVVM_BINARY_FUNC(intl, min, llmin) -NVVM_BINARY_FUNC(uintl, min, ullmin) - -BINARY(pow) -NVVM_BINARY_FUNC(float, pow, powf) -NVVM_BINARY_FUNC(double, pow, pow) - -BINARY(mod) -NVVM_BINARY_FUNC(float, mod, fmodf) -NVVM_BINARY_FUNC(double, mod, fmod) - -BINARY(rem) -NVVM_BINARY_FUNC(float, rem, remainderf) -NVVM_BINARY_FUNC(double, rem, remainder) - -BINARY(atan2) -NVVM_BINARY_FUNC(float, atan2, atan2f) -NVVM_BINARY_FUNC(double, atan2, atan2) - -BINARY(hypot) -NVVM_BINARY_FUNC(float, hypot, hypotf) -NVVM_BINARY_FUNC(double, hypot, hypot) - -#undef BINARY +template +struct BinOp +{ + const char *name() + { + return "hypot"; + } +}; template Array createBinaryNode(const Array &lhs, const Array &rhs, const af::dim4 &odims) @@ -239,16 +177,14 @@ Array createBinaryNode(const Array &lhs, const Array &rhs, const af: JIT::Node_ptr lhs_node = lhs.getNode(); JIT::Node_ptr rhs_node = rhs.getNode(); - - JIT::BinaryNode *node = new JIT::BinaryNode(irname(), - afShortName(), - bop.name, + JIT::BinaryNode *node = new JIT::BinaryNode(getFullName(), + shortname(true), + bop.name(), lhs_node, - rhs_node, - (int)(op), - bop.call_type); + rhs_node, (int)(op)); - return createNodeArray(odims, JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(odims, JIT::Node_ptr( + reinterpret_cast(node))); } } diff --git a/src/backend/cuda/cast.hpp b/src/backend/cuda/cast.hpp index 906620036e..9f854c6e06 100644 --- a/src/backend/cuda/cast.hpp +++ b/src/backend/cuda/cast.hpp @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include namespace cuda @@ -23,18 +23,86 @@ namespace cuda template struct CastOp { - std::string func; - CastOp() { - std::string tmp = std::string("___mk") + afShortName(); - func = cuMangledName(tmp.c_str()); + const char *name() + { + return ""; + } +}; + +#define CAST_FN(TYPE) \ + template \ + struct CastOp \ + { \ + const char *name() \ + { \ + return "("#TYPE")"; \ + } \ + }; + +CAST_FN(int) +CAST_FN(unsigned int) +CAST_FN(unsigned char) +CAST_FN(unsigned short) +CAST_FN(short) +CAST_FN(float) +CAST_FN(double) + +#define CAST_CFN(TYPE) \ + template \ + struct CastOp \ + { \ + 
const char *name() \ + { \ + return "__convert_"#TYPE; \ + } \ + }; + + +CAST_CFN(cfloat) +CAST_CFN(cdouble) +CAST_CFN(char) + + +template<> +struct CastOp +{ + const char *name() + { + return "__convert_z2c"; + } +}; + + +template<> +struct CastOp +{ + const char *name() + { + return "__convert_c2z"; + } +}; + +template<> +struct CastOp +{ + const char *name() + { + return "__convert_c2c"; } +}; + - const std::string name() +template<> +struct CastOp +{ + const char *name() { - return func; + return "__convert_z2z"; } }; +#undef CAST_FN +#undef CAST_CFN template struct CastWrapper @@ -43,8 +111,8 @@ struct CastWrapper { CastOp cop; JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), cop.name(), in_node, af_cast_t); return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); diff --git a/src/backend/cuda/complex.hpp b/src/backend/cuda/complex.hpp index 6082a5e194..67ba0ebe4e 100644 --- a/src/backend/cuda/complex.hpp +++ b/src/backend/cuda/complex.hpp @@ -6,66 +6,28 @@ * The complete license agreement can be obtained at: * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ + #include #include #include -#include -#include +#include #include namespace cuda { - template static const std::string cplx_name() { return cuMangledName("___noop"); } - template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } - template<> STATIC_ const std::string cplx_name() { return cuMangledName("___cplx"); } - - template static const std::string real_name() { return cuMangledName("___noop"); } - template<> STATIC_ const std::string real_name() { return cuMangledName("___real"); } - template<> STATIC_ const std::string real_name() { return cuMangledName("___real"); } - - template static const std::string imag_name() { return cuMangledName("___noop"); } - template<> STATIC_ const std::string imag_name() { return cuMangledName("___imag"); } - template<> STATIC_ const std::string imag_name() { return cuMangledName("___imag"); } - - template static const std::string abs_name() { return cuMangledName("___noop"); } -#if defined(USE_LIBDEVICE) - template<> STATIC_ const std::string abs_name() { return "@__nv_fabsf"; } - template<> STATIC_ const std::string abs_name() { return "@__nv_fabs" ; } -#else - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } -#endif - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - template<> STATIC_ const std::string abs_name() { return cuMangledName("___abs"); } - - template static const std::string conj_name() { return cuMangledName("___noop"); } - template<> STATIC_ const std::string conj_name() { return cuMangledName("___conj"); } - template<> STATIC_ const std::string conj_name() { return cuMangledName("___conj"); } - template Array cplx(const Array &lhs, const Array &rhs, const af::dim4 &odims) { - JIT::Node_ptr lhs_node = lhs.getNode(); - JIT::Node_ptr rhs_node = rhs.getNode(); - - JIT::BinaryNode *node = new JIT::BinaryNode(irname(), - afShortName(), - cplx_name(), - lhs_node, - rhs_node, - (int)(af_cplx2_t), - 0); - - return createNodeArray(odims, JIT::Node_ptr(reinterpret_cast(node))); + return createBinaryNode(lhs, rhs, odims); } template Array real(const Array &in) { JIT::Node_ptr in_node = in.getNode(); - 
JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), - real_name(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), + "__creal", in_node, af_real_t); return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); @@ -75,32 +37,40 @@ namespace cuda Array imag(const Array &in) { JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), - imag_name(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), + "__cimag", in_node, af_imag_t); return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); } + template static const char *abs_name() { return "fabs"; } + template<> STATIC_ const char *abs_name() { return "__cabsf"; } + template<> STATIC_ const char *abs_name() { return "__cabs"; } + template Array abs(const Array &in) { JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), abs_name(), in_node, af_abs_t); return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); } + template static const char *conj_name() { return "__noop"; } + template<> STATIC_ const char *conj_name() { return "__cconjf"; } + template<> STATIC_ const char *conj_name() { return "__cconj"; } + template Array conj(const Array &in) { JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), conj_name(), in_node, af_conj_t); diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp index a7870aa7bd..5c655b2247 100644 --- a/src/backend/cuda/jit.cpp +++ b/src/backend/cuda/jit.cpp @@ -12,35 +12,10 @@ #include #include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__LIBDEVICE_COMPUTE_20) -#include -#endif - -#if defined(__LIBDEVICE_COMPUTE_30) -#include -#endif - -#if defined(__LIBDEVICE_COMPUTE_35) -#include -#endif - -#if defined(__LIBDEVICE_COMPUTE_50) -#include -#endif - #include #include #include #include -#include #include #include @@ -48,6 +23,8 @@ #include #include #include +#include +#include namespace cuda { @@ -55,8 +32,6 @@ namespace cuda using JIT::Node; using JIT::Node_ids; using JIT::Node_map_t; -using JIT::str_map_iter; -using JIT::str_map_t; using std::hash; using std::lock_guard; @@ -67,12 +42,6 @@ using std::stringstream; using std::unique_ptr; using std::vector; -const char *layout64 = "target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"\n\n\n"; -const char *layout32 = "target datalayout = \"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\"\n\n\n"; - -const char *triple64 = "target triple = \"nvptx64-unknown-cuda\"\n\n"; -const char *triple32 = "target triple = \"nvptx-unknown-cuda\"\n\n"; - static string getFuncName(const vector &output_nodes, const vector &full_nodes, const vector &full_ids, @@ -94,7 +63,7 @@ static string getFuncName(const vector &output_nodes, hash hash_fn; - hashName << "@KER"; + hashName << "KER"; hashName << hash_fn(funcName.str()); return hashName.str(); } @@ -105,404 +74,207 @@ static string getKernelString(const string funcName, const vector &output_ids, bool is_linear) { - static const char *defineVoid = "define void "; - static const char 
*generalDimParams = "\n" - "i32 %ostr0, i32 %ostr1, i32 %ostr2, i32 %ostr3,\n" - "i32 %odim0, i32 %odim1, i32 %odim2, i32 %odim3,\n" - "i32 %blkx, i32 %blky, i32 %ndims"; - - static const char *linearDimParams = "\n" - "i32 %nelem, i32 %blkx, i32 %blky"; - - const char *dimParams = is_linear ? linearDimParams : generalDimParams; - - static const char *blockStart = "\n{\n\n" - "entry:\n\n"; - static const char *blockEnd = "\n\n" - "ret void\n" - "\n\n}\n"; - - static const char *idAlias = "\n" - "%tidx = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()\n" - "%bdmx = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()\n" - "%bidx = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()\n" - "%bidy = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()\n" - "%gdmx = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()\n" - "\n\n"; - static const char *earlyExit = "\n" - "end:\n\n" - "ret void\n"; - static const char *core = "\n" - "core:\n\n"; - static const char *generalIndex = "\n" - "%tidy = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()\n" - "%bdmy = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()\n" - "%blk_x = alloca i32, align 4\n" - "%blk_y = alloca i32, align 4\n" - "%id_3 = alloca i32, align 4\n" - "%id_2 = alloca i32, align 4\n" - "store i32 %bidx, i32* %blk_x, align 4\n" - "store i32 %bidy, i32* %blk_y, align 4\n" - "store i32 0, i32* %id_2, align 4\n" - "store i32 0, i32* %id_3, align 4\n" - "%two = alloca i32, align 4\n" - "store i32 2, i32* %two, align 4\n" - "%twoval = load i32* %two, align 4\n" - "%is34 = icmp sgt i32 %ndims, %twoval\n" - "br i1 %is34, label %do34, label %do2\n" - "\ndo34:\n" - "%id2t = sdiv i32 %bidx, %blkx\n" - "store i32 %id2t, i32* %id_2, align 4\n" - "%id2m = mul i32 %id2t, %blkx\n" - "%blk_xx = sub i32 %bidx, %id2m\n" - "store i32 %blk_xx, i32* %blk_x, align 4\n" - "%three = alloca i32, align 4\n" - "store i32 3, i32* %three, align 4\n" - "%threeval = load i32* %three, align 4\n" - "%is4 = icmp sgt i32 %ndims, %threeval\n" - "br i1 %is4, label %do4, label %do2\n" - "\ndo4:\n" - "%id3t = sdiv i32 %bidy, %blky\n" - "store i32 %id3t, i32* %id_3, align 4\n" - "%id3m = mul i32 %id3t, %blky\n" - "%blk_yy = sub i32 %bidy, %id3m\n" - "store i32 %blk_yy, i32* %blk_y, align 4\n" - "br label %do2\n" - "\ndo2:\n" - "%id2 = load i32* %id_2, align 4\n" - "%id3 = load i32* %id_3, align 4\n" - "%tmp_x = load i32* %blk_x, align 4\n" - "%id0m = mul i32 %tmp_x, %bdmx\n" - "%id0 = add i32 %tidx, %id0m\n" - "%tmp_y = load i32* %blk_y, align 4\n" - "%id1m = mul i32 %tmp_y, %bdmy\n" - "%id1 = add i32 %tidy, %id1m\n" - "\n\n" - "%off3o = mul i32 %id3, %ostr3\n" - "%off2o = mul i32 %id2, %ostr2\n" - "%off1o = mul i32 %id1, %ostr1\n" - "%off23o = add i32 %off3o, %off2o\n" - "%off123o = add i32 %off23o, %off1o\n" - "%idxa = add i32 %off123o, %id0\n" - "%idx = sext i32 %idxa to i64\n" - "\n\n" - "%cmp3 = icmp slt i32 %id3, %odim3\n" - "%cmp2 = icmp slt i32 %id2, %odim2\n" - "%cmp1 = icmp slt i32 %id1, %odim1\n" - "%cmp0 = icmp slt i32 %id0, %odim0\n" - "br i1 %cmp3, label %check2, label %end\n" - "\ncheck2:\n" - "br i1 %cmp2, label %check1, label %end\n" - "\ncheck1:\n" - "br i1 %cmp1, label %check0, label %end\n" - "\ncheck0:\n" - "br i1 %cmp0, label %core, label %end\n"; + const char *includeFileStr = jit_cuh; + + const char paramTStr[] ="" + "template\n" + "struct Param\n" + "{\n" + " T *ptr;\n" + " dim_t dims[4];\n" + " dim_t strides[4];\n" + "};\n"; + + std::string typedefStr = "typedef unsigned int uint;\n"; + typedefStr += "typedef "; + typedefStr += getFullName(); + typedefStr += " dim_t;\n"; + + // Common CUDA code + // This 
part of the code does not change with the kernel. + + static const char *kernelVoid = "extern \"C\" __global__ void\n"; + static const char *dimParams = "uint blocks_x, uint blocks_y, uint num_odims"; + static const char *blockStart = "{\n\n"; + static const char *blockEnd = "\n\n}"; static const char *linearIndex = "\n" - "%boff = mul i32 %bidy, %gdmx\n" - "%bid = add i32 %boff, %bidx\n" - "%goff = mul i32 %bid , %bdmx\n" - "%gid = add i32 %goff ,%tidx\n" - "%idx = sext i32 %gid to i64\n" - "%cmp0 = icmp slt i32 %gid, %nelem\n" - "br i1 %cmp0, label %core, label %end\n"; - - static const char *functionLoad = "\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone\n" - "declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone\n" - "\n"; + "uint blockId = blockIdx.y * gridDim.x + blockIdx.x;\n" + "uint threadId = threadIdx.x;\n" + "int idx = blockId * blockDim.x * blockDim.y + threadId;\n" + "if (idx >= outref.dims[3] * outref.strides[3]) return;\n"; + + static const char *generalIndex = "\n" + "uint id0 = 0, id1 = 0, id2 = 0, id3 = 0;\n" + "if (num_odims > 2) {\n" + "id2 = blockIdx.x / blocks_x;\n" + "id0 = blockIdx.x - id2 * blocks_x;\n" + "id0 = threadIdx.x + id0 * blockDim.x;\n" + "if (num_odims > 3) {\n" + "id3 = blockIdx.y / blocks_y;\n" + "id1 = blockIdx.y - id3 * blocks_y;\n" + "id1 = threadIdx.y + id1 * blockDim.y;\n" + "} else {\n" + "id1 = threadIdx.y + blockDim.y * blockIdx.y;\n" + "}\n" + " } else {\n" + "id3 = 0;\n" + "id2 = 0;\n" + "id1 = threadIdx.y + blockDim.y * blockIdx.y;\n" + "id0 = threadIdx.x + blockDim.x * blockIdx.x;\n" + "}\n" + "bool cond = \n" + "id0 < outref.dims[0] && \n" + "id1 < outref.dims[1] && \n" + "id2 < outref.dims[2] && \n" + "id3 < outref.dims[3];\n\n" + "if (!cond) return;\n\n" + "int idx = " + "outref.strides[3] * id3 + outref.strides[2] * id2 + " + "outref.strides[1] * id1 + id0;\n\n"; - stringstream kerStream; - stringstream inAnnStream; - stringstream outAnnStream; stringstream inParamStream; stringstream outParamStream; - stringstream funcBodyStream; - stringstream offsetsStream; stringstream outWriteStream; - str_map_t declStrs; - - vector types_output(output_ids.size()); - for (int i = 0; i < (int)output_ids.size(); i++) { - types_output[i] = full_nodes[output_ids[i]]->getTypeStr(); - } + stringstream offsetsStream; + stringstream opsStream; + stringstream outrefstream; for (int i = 0; i < (int)full_nodes.size(); i++) { const auto &node = full_nodes[i]; const auto &ids_curr = full_ids[i]; - // Generate input parameters, needs only current id - node->genParams(inParamStream, inAnnStream, ids_curr.id, is_linear); - // Generate input offsets, needs only current id + // Generate input parameters, only needs current id + node->genParams(inParamStream, ids_curr.id, is_linear); + // Generate input offsets, only needs current id node->genOffsets(offsetsStream, ids_curr.id, is_linear); - // Generate the core function body, needs children id as well - node->genFuncs(funcBodyStream, declStrs, ids_curr, is_linear); + // Generate the core function body, needs children ids as well + node->genFuncs(opsStream, ids_curr); } + outrefstream << "Param<" << full_nodes[output_ids[0]]->getTypeStr() 
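// Editor's illustration (hypothetical, not emitted verbatim by the patch): for a
// JIT tree such as sin(in0) + in1, the streams above stitch together into a
// kernel of roughly this shape:
//
//   extern "C" __global__ void <funcName>(Param<float> in0, Param<float> in1,
//                                         Param<float> out3,
//                                         uint blocks_x, uint blocks_y,
//                                         uint num_odims)
//   {
//       Param<float> outref = out3;
//       /* linearIndex or generalIndex computes idx, or returns early */
//       int idx0 = ..., idx1 = ...;          // offsetsStream
//       float val0 = in0.ptr[idx0];          // opsStream (exact shape assumed)
//       float val1 = sin(val0);
//       float val2 = in1.ptr[idx1];
//       float val3 = __add(val1, val2);
//       out3.ptr[idx] = val3;                // outWriteStream
//   }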
+ << "> outref = out" << output_ids[0] << ";\n"; + for (int i = 0; i < (int)output_ids.size(); i++) { int id = output_ids[i]; - string outTypeStr = types_output[i]; - // Generate output parameters - outParamStream << outTypeStr << "* %out" << id << ",\n"; - - // Generate instruction to write output - outWriteStream << "%outIdx" << id - << "= getelementptr inbounds " - << outTypeStr - << "* %out" << id - << ", i64 %idx\n"; - outWriteStream << "store " - << outTypeStr - << " %val" << id << ", " - << outTypeStr - << "* %outIdx" << id << "\n"; - - // Generate output annotation string - outAnnStream << outTypeStr << "*,\n"; - } - - if (sizeof(void *) == 8) { - kerStream << layout64; - kerStream << triple64; - } else { - kerStream << layout32; - kerStream << triple32; + outParamStream << "Param<" << full_nodes[id]->getTypeStr() << "> out" << id << ", \n"; + // Generate code to write the output + outWriteStream << "out" << id << ".ptr[idx] = " << "val" << id << ";\n"; } - const char *index = is_linear ? linearIndex : generalIndex; - - kerStream << defineVoid - << funcName - << " (\n" - << inParamStream.str() - << outParamStream.str() - << dimParams - << " )\n" - << blockStart - << idAlias - << index - << earlyExit - << core - << offsetsStream.str() - << funcBodyStream.str() - << outWriteStream.str() - << blockEnd; - - for(str_map_iter iterator = declStrs.begin(); - iterator != declStrs.end(); iterator++) { - kerStream << iterator->first << "\n"; - } - kerStream << functionLoad; - - kerStream << "!nvvm.annotations = !{!1}\n" - "!1 = metadata !{void (\n" - << inAnnStream.str() - << outAnnStream.str(); - + // Put various blocks into a single stream + stringstream kerStream; + kerStream << typedefStr; + kerStream << paramTStr; + kerStream << includeFileStr << "\n\n"; + kerStream << kernelVoid; + kerStream << funcName; + kerStream << "(\n"; + kerStream << inParamStream.str(); + kerStream << outParamStream.str(); + kerStream << dimParams; + kerStream << ")\n"; + kerStream << blockStart; + kerStream << outrefstream.str(); if (is_linear) { - kerStream << "i32, i32, i32\n"; + kerStream << linearIndex; } else { - kerStream << "i32, i32, i32, i32,\n" - "i32, i32, i32, i32,\n" - "i32, i32, i32\n"; + kerStream << generalIndex; } - - kerStream << ")* " << funcName << ",\n " - << "metadata !\"kernel\", i32 1}\n"; + kerStream << offsetsStream.str(); + kerStream << opsStream.str(); + kerStream << outWriteStream.str(); + kerStream << blockEnd; return kerStream.str(); } -#define NVVM_CHECK(fn, msg) do { \ - nvvmResult res = fn; \ - if (res == NVVM_SUCCESS) break; \ - char nvvm_err_msg[1024]; \ - snprintf(nvvm_err_msg, \ - sizeof(nvvm_err_msg), \ - "NVVM Error (%d): %s\n", \ - (int)(res), msg); \ - AF_ERROR(nvvm_err_msg, \ - AF_ERR_INTERNAL); \ - \ - } while(0) - -#if defined(USE_LIBDEVICE) -void compute_to_libdevice_table(const char **buffer, size_t *bc_buffer_len, int compute) -{ -// These macros create a fallback compute if in case the specific libdevice -// compute is not found -// 50 -> 30 -> 20 -> Not Found -// 35 -> 30 -> 20 -> Not Found -// 30 -> 20 -> Not Found -// 20 -> Not Found -#if defined(__LIBDEVICE_COMPUTE_20) - #define COMPUTE_20_STR compute_20_bc - #define COMPUTE_20_LEN compute_20_bc_len -#else - #define COMPUTE_20_STR NULL - #define COMPUTE_20_LEN 0 -#endif - -#if defined(__LIBDEVICE_COMPUTE_30) - #define COMPUTE_30_STR compute_30_bc - #define COMPUTE_30_LEN compute_30_bc_len -#else // Fallback - #define COMPUTE_30_STR COMPUTE_20_STR - #define COMPUTE_30_LEN COMPUTE_20_LEN -#endif - -#if 
defined(__LIBDEVICE_COMPUTE_35) - #define COMPUTE_35_STR compute_35_bc - #define COMPUTE_35_LEN compute_35_bc_len -#else // Fallback - #define COMPUTE_35_STR COMPUTE_30_STR - #define COMPUTE_35_LEN COMPUTE_30_LEN -#endif - -#if defined(__LIBDEVICE_COMPUTE_50) - #define COMPUTE_50_STR compute_50_bc - #define COMPUTE_50_LEN compute_50_bc_len -#else // Fallback - #define COMPUTE_50_STR COMPUTE_30_STR - #define COMPUTE_50_LEN COMPUTE_30_LEN -#endif - - // Source: http://docs.nvidia.com/cuda/libdevice-users-guide/basic-usage.html#version-selection - if(compute >= 20 && compute < 30) { - *buffer = COMPUTE_20_STR; - *bc_buffer_len = COMPUTE_20_LEN; - } else if (compute == 30) { - *buffer = COMPUTE_30_STR; - *bc_buffer_len = COMPUTE_30_LEN; - } else if (compute >= 31 && compute < 35) { - *buffer = COMPUTE_20_STR; - *bc_buffer_len = COMPUTE_20_LEN; - } else if (compute >= 35 && compute <= 37) { - *buffer = COMPUTE_35_STR; - *bc_buffer_len = COMPUTE_35_LEN; - } else if (compute > 37 && compute < 50) { - *buffer = COMPUTE_30_STR; - *bc_buffer_len = COMPUTE_30_LEN; - } else if (compute >= 50 && compute <= 53) { - *buffer = COMPUTE_50_STR; - *bc_buffer_len = COMPUTE_50_LEN; - } else if (compute > 53) { - *buffer = COMPUTE_30_STR; - *bc_buffer_len = COMPUTE_30_LEN; - } else { - *buffer = COMPUTE_30_STR; - *bc_buffer_len = COMPUTE_30_LEN; - } -} -#endif - -static unique_ptr irToPtx(string IR, size_t *ptx_size) -{ - nvvmProgram prog; - - NVVM_CHECK(nvvmCreateProgram(&prog), "Failed to create program"); - -#if defined(USE_LIBDEVICE) - // Get compute version of device - cudaDeviceProp devProp = getDeviceProp(getActiveDeviceId()); - int compute = devProp.major * 10 + devProp.minor; - const char *bc_buffer = NULL; - size_t bc_buffer_len = 0; - compute_to_libdevice_table(&bc_buffer, &bc_buffer_len, compute); - if(bc_buffer) - NVVM_CHECK(nvvmAddModuleToProgram(prog, bc_buffer, bc_buffer_len, "libdevice kernels"), - "Failed to add libdevice"); - else - NVVM_CHECK(nvvmAddModuleToProgram(prog, IR.c_str(), IR.size(), "generated kernel"), - "Failed to add module"); -#endif - - NVVM_CHECK(nvvmAddModuleToProgram(prog, IR.c_str(), IR.size(), "generated kernel"), - "Failed to add module"); - - //FIXME: Use proper compute - const char *options = NULL; - const int noptions = 0; - -//#ifdef NDEBUG -#if 0 - NVVM_CHECK(nvvmCompileProgram(prog, noptions, &options), "Failed to compile program"); -#else - nvvmResult comp_res = nvvmCompileProgram(prog, noptions, &options); - if (comp_res != NVVM_SUCCESS) { - size_t log_size = 0; - nvvmGetProgramLogSize(prog, &log_size); - printf("%ld, %zu\n", IR.size(), log_size); - unique_ptr log(new char[log_size]); - nvvmGetProgramLog(prog, log.get()); - printf("LOG:\n%s\n%s", log.get(), IR.c_str()); - NVVM_CHECK(comp_res, "Failed to compile program"); - } -#endif - - NVVM_CHECK(nvvmGetCompiledResultSize(prog, ptx_size), "Can not get ptx size"); - - unique_ptr ptx{new char[*ptx_size]}; - NVVM_CHECK(nvvmGetCompiledResult(prog, ptx.get()), "Can not get ptx from NVVM IR"); - NVVM_CHECK(nvvmDestroyProgram(&prog), "Failed to destroy program"); - return ptx; -} - typedef struct { CUmodule prog; CUfunction ker; } kc_entry_t; - -const size_t size = 1024; -char linkInfo[size]; -char linkError[size]; - -#ifndef NDEBUG #define CU_CHECK(fn) do { \ CUresult res = fn; \ if (res == CUDA_SUCCESS) break; \ char cu_err_msg[1024]; \ snprintf(cu_err_msg, \ sizeof(cu_err_msg), \ - "CU Error (%d)\n%s\n", \ - (int)(res), linkError); \ + "CU Error (%d)\n", \ + (int)(res)); \ AF_ERROR(cu_err_msg, \ AF_ERR_INTERNAL); 
\ } while(0) -#else -#define CU_CHECK(fn) do { \ + +#ifndef NDEBUG +#define CU_LINK_CHECK(fn) do { \ CUresult res = fn; \ if (res == CUDA_SUCCESS) break; \ char cu_err_msg[1024]; \ snprintf(cu_err_msg, \ sizeof(cu_err_msg), \ - "CU Error (%d)\n", \ - (int)(res)); \ + "CU Error (%d)\n%s\n", \ + (int)(res), linkError); \ AF_ERROR(cu_err_msg, \ AF_ERR_INTERNAL); \ } while(0) +#else +#define CU_LINK_CHECK(fn) CU_CHECK(fn) +#endif + +#ifndef NDEBUG +#define NVRTC_CHECK(fn) do { \ + nvrtcResult res = fn; \ + if (res == NVRTC_SUCCESS) break; \ + size_t logSize; \ + nvrtcGetProgramLogSize(prog, &logSize); \ + unique_ptr log(new char[logSize]); \ + char *logptr = log.get(); \ + nvrtcGetProgramLog(prog, logptr); \ + logptr[logSize] = '\x0'; \ + printf("%s\n", logptr); \ + AF_ERROR("NVRTC ERROR", \ + AF_ERR_INTERNAL); \ + } while(0) +#else +#define NVRTC_CHECK(fn) do { \ + nvrtcResult res = fn; \ + if (res == NVRTC_SUCCESS) break; \ + char nvrtc_err_msg[1024]; \ + snprintf(nvrtc_err_msg, \ + sizeof(nvrtc_err_msg), \ + "NVRTC Error(%d): %s\n", \ + res, nvrtcGetErrorString(res)); \ + AF_ERROR(nvrtc_err_msg, \ + AF_ERR_INTERNAL); \ + } while(0) #endif +unique_ptr compileToPTX(const char *ker_name, string jit_ker, size_t *ptx_size) +{ + nvrtcProgram prog; + NVRTC_CHECK(nvrtcCreateProgram(&prog, jit_ker.c_str(), ker_name, 0, NULL, NULL)); + NVRTC_CHECK(nvrtcCompileProgram(prog, 0, NULL)); + NVRTC_CHECK(nvrtcGetPTXSize(prog, ptx_size)); + unique_ptr ptx(new char[*ptx_size]); + NVRTC_CHECK(nvrtcGetPTX(prog, ptx.get())); + return ptx; +} + static kc_entry_t compileKernel(const char *ker_name, string jit_ker) { lock_guard lock(getDriverApiMutex(getActiveDeviceId())); + const size_t linkLogSize = 1024; + char linkInfo[linkLogSize] = {0}; + char linkError[linkLogSize] = {0}; + size_t ptx_size; - unique_ptr ptx(irToPtx(jit_ker, &ptx_size)); + auto ptx = compileToPTX(ker_name, jit_ker, &ptx_size); CUlinkState linkState; - - linkInfo[0] = 0; - linkError[0] = 0; - CUjit_option linkOptions[] = { CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, @@ -513,49 +285,25 @@ static kc_entry_t compileKernel(const char *ker_name, string jit_ker) void *linkOptionValues[] = { linkInfo, - reinterpret_cast(1024), + reinterpret_cast(linkLogSize), linkError, - reinterpret_cast(1024), + reinterpret_cast(linkLogSize), reinterpret_cast(1) }; - CU_CHECK(cuLinkCreate(5, linkOptions, linkOptionValues, &linkState)); - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)ptx.get(), - ptx_size, ker_name, 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)arith_ptx, - arith_ptx_len, "arith", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)cast_ptx, - cast_ptx_len, "cast", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)exp_ptx, - exp_ptx_len, "exp", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)hyper_ptx, - hyper_ptx_len, "hyper", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)logic_ptx, - logic_ptx_len, "logic", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)numeric_ptx, - numeric_ptx_len, "numeric", 0, NULL, NULL)); - - CU_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)trig_ptx, - trig_ptx_len, "trig", 0, NULL, NULL)); + CU_LINK_CHECK(cuLinkCreate(5, linkOptions, linkOptionValues, &linkState)); + CU_LINK_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)ptx.get(), + ptx_size, ker_name, 0, NULL, NULL)); void *cubin; size_t 
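// Editor's note: the compile path after this patch is
//   CUDA C++ source --nvrtc--> PTX --cuLinkCreate/AddData/Complete--> cubin
//   --cuModuleLoadDataEx--> CUmodule --cuModuleGetFunction--> CUfunction.
// Because jit.cuh is now compiled from source along with each kernel, the old
// step of linking the pre-built arith/cast/exp/hyper/logic/numeric/trig PTX
// modules into every program (the cuLinkAddData calls deleted above) goes
// away, and the kernel is looked up by its plain extern "C" name (ker_name
// rather than ker_name + 1, as the change just below shows).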
cubinSize; CUmodule module; CUfunction kernel; - - CU_CHECK(cuLinkComplete(linkState, &cubin, &cubinSize)); + CU_LINK_CHECK(cuLinkComplete(linkState, &cubin, &cubinSize)); CU_CHECK(cuModuleLoadDataEx(&module, cubin, 0, 0, 0)); - CU_CHECK(cuModuleGetFunction(&kernel, module, ker_name + 1)); - + CU_CHECK(cuModuleGetFunction(&kernel, module, ker_name)); kc_entry_t entry = {module, kernel}; - return entry; } @@ -659,40 +407,12 @@ void evalNodes(vector >&outputs, vector output_nodes) } for (int i = 0; i < num_outputs; i++) { - args.push_back(&outputs[i].ptr); - } - - // DO NOT PUT THESE IN A SCOPE. - // The pointers are used later. - // Scoping them results in undefined behavior. - - int strides[] = {(int)outputs[0].strides[0], - (int)outputs[0].strides[1], - (int)outputs[0].strides[2], - (int)outputs[0].strides[3]}; - - int dims[] = {(int)outputs[0].dims[0], - (int)outputs[0].dims[1], - (int)outputs[0].dims[2], - (int)outputs[0].dims[3]}; - - if (is_linear) { - int nelem = 1; - for (int i = 0; i < 4; i++) { - nelem *= outputs[0].dims[i]; - } - args.push_back((void *)&nelem); - } else { - for (int i = 0; i < 4; i++) args.push_back((void *)(strides + i)); - for (int i = 0; i < 4; i++) args.push_back((void *)(dims + i)); + args.push_back((void *)&outputs[i]); } args.push_back((void *)&blocks_x_); args.push_back((void *)&blocks_y_); - - if (!is_linear) { - args.push_back((void *)&num_odims); - } + args.push_back((void *)&num_odims); lock_guard lock(getDriverApiMutex(getActiveDeviceId())); CU_CHECK(cuLaunchKernel(ker, diff --git a/src/backend/cuda/kernel/jit.cuh b/src/backend/cuda/kernel/jit.cuh new file mode 100644 index 0000000000..0868d1f3df --- /dev/null +++ b/src/backend/cuda/kernel/jit.cuh @@ -0,0 +1,205 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +typedef float2 cuFloatComplex; +typedef cuFloatComplex cfloat; + +typedef double2 cuDoubleComplex; +typedef cuDoubleComplex cdouble; + +// ---------------------------------------------- +// REAL NUMBER OPERATIONS +// ---------------------------------------------- +#define sign(in) signbit((in)) +#define __noop(a) (a) +#define __add(lhs, rhs) (lhs) + (rhs) +#define __sub(lhs, rhs) (lhs) - (rhs) +#define __mul(lhs, rhs) (lhs) * (rhs) +#define __div(lhs, rhs) (lhs) / (rhs) +#define __and(lhs, rhs) (lhs) && (rhs) +#define __or(lhs, rhs) (lhs) || (rhs) + +#define __lt(lhs, rhs) (lhs) < (rhs) +#define __gt(lhs, rhs) (lhs) > (rhs) +#define __le(lhs, rhs) (lhs) <= (rhs) +#define __ge(lhs, rhs) (lhs) >= (rhs) +#define __eq(lhs, rhs) (lhs) == (rhs) +#define __neq(lhs, rhs) (lhs) != (rhs) + +#define __conj(in) (in) +#define __real(in) (in) +#define __imag(in) (0) +#define __abs(in) abs(in) +#define __sigmoid(in) (1.0/(1 + exp(-(in)))) + +#define __bitor(lhs, rhs) ((lhs) | (rhs)) +#define __bitand(lhs, rhs) ((lhs) & (rhs)) +#define __bitxor(lhs, rhs) ((lhs) ^ (rhs)) +#define __bitshiftl(lhs, rhs) ((lhs) << (rhs)) +#define __bitshiftr(lhs, rhs) ((lhs) >> (rhs)) + +#define __min(lhs, rhs) ((lhs) < (rhs)) ? (lhs) : (rhs) +#define __max(lhs, rhs) ((lhs) > (rhs)) ? 
(lhs) : (rhs) +#define __rem(lhs, rhs) ((lhs) % (rhs)) +#define __mod(lhs, rhs) ((lhs) % (rhs)) +#define __pow(lhs, rhs) fpow((float)lhs, (float)rhs) + +#define __convert_char(val) (char)((val) != 0) +#define fpow(lhs, rhs) pow((lhs), (rhs)) +#define frem(lhs, rhs) remainder((lhs), (rhs)) +#define iszero(a) ((a) == 0) + +// ---------------------------------------------- +// COMPLEX FLOAT OPERATIONS +// ---------------------------------------------- + +#define __crealf(in) ((in).x) +#define __cimagf(in) ((in).y) +#define __cabsf(in) hypotf(in.x, in.y) + +__device__ cfloat __cplx2f(float x, float y) +{ + cfloat res = {x, y}; + return res; +} + +__device__ cfloat __cconjf(cfloat in) +{ + cfloat res = {in.x, -in.y}; + return res; +} + +__device__ cfloat __caddf(cfloat lhs, cfloat rhs) +{ + cfloat res = {lhs.x + rhs.x, lhs.y + rhs.y}; + return res; +} + +__device__ cfloat __csubf(cfloat lhs, cfloat rhs) +{ + cfloat res = {lhs.x - rhs.x, lhs.y - rhs.y}; + return res; +} + +__device__ cfloat __cmulf(cfloat lhs, cfloat rhs) +{ + cfloat out; + out.x = lhs.x * rhs.x - lhs.y * rhs.y; + out.y = lhs.x * rhs.y + lhs.y * rhs.x; + return out; +} + +__device__ cfloat __cdivf(cfloat lhs, cfloat rhs) +{ + // Normalize by absolute value and multiply + float rhs_abs = __cabsf(rhs); + float inv_rhs_abs = 1.0f / rhs_abs; + float rhs_x = inv_rhs_abs * rhs.x; + float rhs_y = inv_rhs_abs * rhs.y; + cfloat out = {lhs.x * rhs_x + lhs.y * rhs_y, + lhs.y * rhs_x - lhs.x * rhs_y}; + out.x *= inv_rhs_abs; + out.y *= inv_rhs_abs; + return out; +} + +__device__ cfloat __cminf(cfloat lhs, cfloat rhs) +{ + return __cabsf(lhs) < __cabsf(rhs) ? lhs : rhs; +} + +__device__ cfloat __cmaxf(cfloat lhs, cfloat rhs) +{ + return __cabsf(lhs) > __cabsf(rhs) ? lhs : rhs; +} +#define __candf(lhs, rhs) __cabsf(lhs) && __cabsf(rhs) +#define __corf(lhs, rhs) __cabsf(lhs) || __cabsf(rhs) +#define __ceqf(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y)) +#define __cneqf(lhs, rhs) !__ceqf((lhs), (rhs)) +#define __cltf(lhs, rhs) (__cabsf(lhs) < __cabsf(rhs)) +#define __clef(lhs, rhs) (__cabsf(lhs) <= __cabsf(rhs)) +#define __cgtf(lhs, rhs) (__cabsf(lhs) > __cabsf(rhs)) +#define __cgef(lhs, rhs) (__cabsf(lhs) >= __cabsf(rhs)) +#define __convert_cfloat(real) __cplx2f(real, 0) +#define __convert_c2c(in) (in) +#define __convert_z2c(in) __cplx2f((float)in.x, (float)in.y) + +// ---------------------------------------------- +// COMPLEX DOUBLE OPERATIONS +// ---------------------------------------------- +#define __creal(in) ((in).x) +#define __cimag(in) ((in).y) +#define __cabs(in) hypot(in.x, in.y) + +__device__ cdouble __cplx2(double x, double y) +{ + cdouble res = {x, y}; + return res; +} + +__device__ cdouble __cconj(cdouble in) +{ + cdouble res = {in.x, -in.y}; + return res; +} + +__device__ cdouble __cadd(cdouble lhs, cdouble rhs) +{ + cdouble res = {lhs.x + rhs.x, lhs.y + rhs.y}; + return res; +} + +__device__ cdouble __csub(cdouble lhs, cdouble rhs) +{ + cdouble res = {lhs.x - rhs.x, lhs.y - rhs.y}; + return res; +} + +__device__ cdouble __cmul(cdouble lhs, cdouble rhs) +{ + cdouble out; + out.x = lhs.x * rhs.x - lhs.y * rhs.y; + out.y = lhs.x * rhs.y + lhs.y * rhs.x; + return out; +} + +__device__ cdouble __cdiv(cdouble lhs, cdouble rhs) +{ + // Normalize by absolute value and multiply + double rhs_abs = __cabs(rhs); + double inv_rhs_abs = 1.0 / rhs_abs; + double rhs_x = inv_rhs_abs * rhs.x; + double rhs_y = inv_rhs_abs * rhs.y; + cdouble out = {lhs.x * rhs_x + lhs.y * rhs_y, + lhs.y * rhs_x - lhs.x * rhs_y}; + out.x *= inv_rhs_abs; 
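// Editor's note on the normalize-then-multiply scheme used above: the textbook
// form (lhs * conj(rhs)) / (rhs.x*rhs.x + rhs.y*rhs.y) overflows once |rhs|
// exceeds roughly 1.3e154 for doubles, even when the quotient itself is
// representable. Scaling rhs by 1/|rhs| first (with hypot, which itself avoids
// overflow) keeps every intermediate on the order of |lhs|. For example, with
// lhs = rhs = 1e200 + 0i:
//   naive:      den = 1e400 -> inf, num -> inf, result = nan
//   normalized: rhs/|rhs| = 1 + 0i, out = 1e200 * 1e-200 = 1, as expected.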
+ out.y *= inv_rhs_abs; + return out; +} + +__device__ cdouble __cmin(cdouble lhs, cdouble rhs) +{ + return __cabs(lhs) < __cabs(rhs) ? lhs : rhs; +} + +__device__ cdouble __cmax(cdouble lhs, cdouble rhs) +{ + return __cabs(lhs) > __cabs(rhs) ? lhs : rhs; +} +#define __cand(lhs, rhs) __cabs(lhs) && __cabs(rhs) +#define __cor(lhs, rhs) __cabs(lhs) || __cabs(rhs) +#define __ceq(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y)) +#define __cneq(lhs, rhs) !__ceq((lhs), (rhs)) +#define __clt(lhs, rhs) (__cabs(lhs) < __cabs(rhs)) +#define __cle(lhs, rhs) (__cabs(lhs) <= __cabs(rhs)) +#define __cgt(lhs, rhs) (__cabs(lhs) > __cabs(rhs)) +#define __cge(lhs, rhs) (__cabs(lhs) >= __cabs(rhs)) +#define __convert_cdouble(real) __cplx2(real, 0) +#define __convert_z2z(in) (in) +#define __convert_c2z(in) __cplx2((double)in.x, (double)in.y) diff --git a/src/backend/cuda/logic.hpp b/src/backend/cuda/logic.hpp index 8261a02d73..2c047ba8f8 100644 --- a/src/backend/cuda/logic.hpp +++ b/src/backend/cuda/logic.hpp @@ -7,11 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#include #include #include #include -#include #include +#include namespace cuda { diff --git a/src/backend/cuda/types.cpp b/src/backend/cuda/types.cpp index 8c29c00b45..b69eb661e7 100644 --- a/src/backend/cuda/types.cpp +++ b/src/backend/cuda/types.cpp @@ -10,91 +10,38 @@ #include #include "types.hpp" #include +#include namespace cuda { - template const char *cuShortName() { return "q"; } - template<> const char *cuShortName() { return "f"; } - template<> const char *cuShortName() { return "d"; } - template<> const char *cuShortName() { return "6float2"; } - template<> const char *cuShortName() { return "7double2"; } - template<> const char *cuShortName() { return "i"; } - template<> const char *cuShortName() { return "j"; } - template<> const char *cuShortName() { return "c"; } - template<> const char *cuShortName() { return "h"; } - template<> const char *cuShortName() { return "x"; } - template<> const char *cuShortName() { return "y"; } - template<> const char *cuShortName() { return "s"; } - template<> const char *cuShortName() { return "t"; } - template const char *afShortName(bool caps) { return caps ? "Q" : "q"; } - template<> const char *afShortName(bool caps) { return caps ? "S" : "s"; } - template<> const char *afShortName(bool caps) { return caps ? "D" : "d"; } - template<> const char *afShortName(bool caps) { return caps ? "C" : "c"; } - template<> const char *afShortName(bool caps) { return caps ? "Z" : "z"; } - template<> const char *afShortName(bool caps) { return caps ? "I" : "i"; } - template<> const char *afShortName(bool caps) { return caps ? "U" : "u"; } - template<> const char *afShortName(bool caps) { return caps ? "J" : "j"; } - template<> const char *afShortName(bool caps) { return caps ? "V" : "v"; } - template<> const char *afShortName(bool caps) { return caps ? "X" : "x"; } - template<> const char *afShortName(bool caps) { return caps ? "Y" : "y"; } - template<> const char *afShortName(bool caps) { return caps ? "P" : "P"; } - template<> const char *afShortName(bool caps) { return caps ? 
"Q" : "Q"; } - - template const char *irname() { return "i32"; } - template<> const char *irname() { return "float"; } - template<> const char *irname() { return "double"; } - template<> const char *irname() { return "<2 x float>"; } - template<> const char *irname() { return "<2 x double>"; } - template<> const char *irname() { return "i32"; } - template<> const char *irname() { return "i32"; } - template<> const char *irname() { return "i64"; } - template<> const char *irname() { return "i64"; } - template<> const char *irname() { return "i8"; } - template<> const char *irname() { return "i8"; } - template<> const char *irname() { return "i16"; } - template<> const char *irname() { return "i16"; } - - template - static inline std::string toString(T val) - { - std::stringstream s; - s << val; - return s.str(); - } - - template - const std::string cuMangledName(const char *fn) - { - std::string cname(cuShortName()); - std::string fname(fn); - size_t flen = fname.size(); - - std::string res = std::string("@_Z") + toString(flen) + fname + cname; - if (binary) { - if (cname.size() > 1) { - res = res + "S_"; - } else { - res = res + cname; - } - } - return res; - } - -#define INSTANTIATE(T) \ - template const std::string cuMangledName(const char *fn); \ - template const std::string cuMangledName(const char *fn); \ + template const char *shortname(bool caps) { return caps ? "Q" : "q"; } + template<> const char *shortname(bool caps) { return caps ? "S" : "s"; } + template<> const char *shortname(bool caps) { return caps ? "D" : "d"; } + template<> const char *shortname(bool caps) { return caps ? "C" : "c"; } + template<> const char *shortname(bool caps) { return caps ? "Z" : "z"; } + template<> const char *shortname(bool caps) { return caps ? "I" : "i"; } + template<> const char *shortname(bool caps) { return caps ? "U" : "u"; } + template<> const char *shortname(bool caps) { return caps ? "J" : "j"; } + template<> const char *shortname(bool caps) { return caps ? "V" : "v"; } + template<> const char *shortname(bool caps) { return caps ? "X" : "x"; } + template<> const char *shortname(bool caps) { return caps ? "Y" : "y"; } + template<> const char *shortname(bool caps) { return caps ? "P" : "P"; } + template<> const char *shortname(bool caps) { return caps ? 
"Q" : "Q"; } + +#define INSTANTIATE(T) \ + template<> const char *getFullName() { return #T; } \ INSTANTIATE(float) INSTANTIATE(double) INSTANTIATE(cfloat) INSTANTIATE(cdouble) INSTANTIATE(char) - INSTANTIATE(uchar) - INSTANTIATE(int) - INSTANTIATE(uint) - INSTANTIATE(intl) - INSTANTIATE(uintl) + INSTANTIATE(unsigned char) INSTANTIATE(short) - INSTANTIATE(ushort) + INSTANTIATE(unsigned short) + INSTANTIATE(int) + INSTANTIATE(unsigned int) + INSTANTIATE(unsigned long long) + INSTANTIATE(long long) } diff --git a/src/backend/cuda/types.hpp b/src/backend/cuda/types.hpp index 08aab5f374..b12200bdd8 100644 --- a/src/backend/cuda/types.hpp +++ b/src/backend/cuda/types.hpp @@ -10,6 +10,7 @@ #pragma once #include #include +#include namespace cuda { @@ -23,7 +24,6 @@ template struct is_complex { static const bool value = fals template<> struct is_complex { static const bool value = true; }; template<> struct is_complex { static const bool value = true; }; -template const std::string cuMangledName(const char *fn); -template const char *afShortName(bool caps = true); -template const char *irname(); +template const char *shortname(bool caps = true); +template const char *getFullName(); } diff --git a/src/backend/cuda/unary.hpp b/src/backend/cuda/unary.hpp index 291f805193..13b9690cef 100644 --- a/src/backend/cuda/unary.hpp +++ b/src/backend/cuda/unary.hpp @@ -10,176 +10,90 @@ #include #include #include -#include #include namespace cuda { +template +static const char *unaryName() { return "__noop"; } + +#define UNARY_DECL(OP, FNAME) \ + template<> STATIC_ \ + const char *unaryName() \ + { \ + return FNAME; \ + } \ + +#define UNARY_FN(OP) UNARY_DECL(OP, #OP) + +UNARY_FN(sin) +UNARY_FN(cos) +UNARY_FN(tan) + +UNARY_FN(asin) +UNARY_FN(acos) +UNARY_FN(atan) + +UNARY_FN(sinh) +UNARY_FN(cosh) +UNARY_FN(tanh) + +UNARY_FN(asinh) +UNARY_FN(acosh) +UNARY_FN(atanh) + +UNARY_FN(exp) +UNARY_DECL(sigmoid, "__sigmoid") +UNARY_FN(expm1) +UNARY_FN(erf) +UNARY_FN(erfc) + +UNARY_FN(tgamma) +UNARY_FN(lgamma) + +UNARY_FN(log) +UNARY_FN(log1p) +UNARY_FN(log10) +UNARY_FN(log2) + +UNARY_FN(sqrt) +UNARY_FN(cbrt) + +UNARY_FN(trunc) +UNARY_FN(round) +UNARY_FN(sign) +UNARY_FN(ceil) +UNARY_FN(floor) + +UNARY_FN(isinf) +UNARY_FN(isnan) +UNARY_FN(iszero) + template -struct UnOp +Array unaryOp(const Array &in) { - const char *name() - { - return "noop"; - } -}; - -#define UNARY_FN(fn) \ - template \ - struct UnOp \ - { \ - std::string res; \ - bool is_check; \ - UnOp() : \ - res(cuMangledName("___"#fn)), \ - is_check(false) \ - { \ - } \ - const std::string name() \ - { \ - return res; \ - } \ - }; \ - -#define UNARY_FN_NAME(op, fn) \ - template \ - struct UnOp \ - { \ - std::string res; \ - bool is_check; \ - UnOp() : \ - res(cuMangledName("___"#fn)), \ - is_check(false) \ - { \ - } \ - const std::string name() \ - { \ - return res; \ - } \ - }; \ - -#if defined(USE_LIBDEVICE) -#define NVVM_SPECIALIZE_TYPE(T, fn, fname) \ - template<> \ - struct UnOp \ - { \ - std::string res; \ - bool is_check; \ - UnOp() : \ - res("@__nv_"#fname), \ - is_check(false) \ - { \ - } \ - const std::string name() \ - { \ - return res; \ - } \ - }; \ - -#define NVVM_SPECIALIZE_CHECK(T, fn, fname) \ - template<> \ - struct UnOp \ - { \ - std::string res; \ - bool is_check; \ - UnOp() : \ - res("@__nv_"#fname), \ - is_check(true) \ - { \ - } \ - const std::string name() \ - { \ - return res; \ - } \ - }; \ - -#else -#define NVVM_SPECIALIZE_TYPE(T, fn, fname) // no specialization -#define NVVM_SPECIALIZE_CHECK(T, fn, fname) // no specialization -#endif - 
-#define NVVM_SPECIALIZE_FLOATING_NAME(fn, fname) \ - UNARY_FN(fn) \ - NVVM_SPECIALIZE_TYPE(float, fn, fname##f) \ - NVVM_SPECIALIZE_TYPE(double, fn, fname) \ - - -#define NVVM_SPECIALIZE_FLOATING(fn) \ - NVVM_SPECIALIZE_FLOATING_NAME(fn, fn) - -NVVM_SPECIALIZE_FLOATING(sin) -NVVM_SPECIALIZE_FLOATING(cos) -NVVM_SPECIALIZE_FLOATING(tan) -NVVM_SPECIALIZE_FLOATING(asin) -NVVM_SPECIALIZE_FLOATING(acos) -NVVM_SPECIALIZE_FLOATING(atan) -NVVM_SPECIALIZE_FLOATING(sinh) -NVVM_SPECIALIZE_FLOATING(cosh) -NVVM_SPECIALIZE_FLOATING(tanh) -NVVM_SPECIALIZE_FLOATING(asinh) -NVVM_SPECIALIZE_FLOATING(acosh) -NVVM_SPECIALIZE_FLOATING(atanh) -NVVM_SPECIALIZE_FLOATING(exp) -NVVM_SPECIALIZE_FLOATING(expm1) -NVVM_SPECIALIZE_FLOATING(erf) -NVVM_SPECIALIZE_FLOATING(erfc) -NVVM_SPECIALIZE_FLOATING(tgamma) -NVVM_SPECIALIZE_FLOATING(lgamma) -NVVM_SPECIALIZE_FLOATING(log) -NVVM_SPECIALIZE_FLOATING(log1p) -NVVM_SPECIALIZE_FLOATING(log10) -NVVM_SPECIALIZE_FLOATING(log2) -NVVM_SPECIALIZE_FLOATING(sqrt) -NVVM_SPECIALIZE_FLOATING(cbrt) -NVVM_SPECIALIZE_FLOATING(round) -NVVM_SPECIALIZE_FLOATING(trunc) -NVVM_SPECIALIZE_FLOATING(ceil) -NVVM_SPECIALIZE_FLOATING(floor) - -UNARY_FN(sign ) -NVVM_SPECIALIZE_CHECK(float , sign, signbitf) -NVVM_SPECIALIZE_CHECK(double, sign, signbitd) - -UNARY_FN_NAME(isnan, isNaN) -NVVM_SPECIALIZE_CHECK(float , isnan, isnanf) -NVVM_SPECIALIZE_CHECK(double, isnan, isnand) - -UNARY_FN_NAME(isinf, isINF) -NVVM_SPECIALIZE_CHECK(float , isinf, isinff) -NVVM_SPECIALIZE_CHECK(double, isinf, isinfd) - -UNARY_FN_NAME(iszero, iszero) -UNARY_FN(sigmoid) - -#undef UNARY_FN - - template - Array unaryOp(const Array &in) - { - - UnOp uop; - - JIT::Node_ptr in_node = in.getNode(); - - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), - uop.name(), - in_node, op, uop.is_check); - - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); - } - - template - Array checkOp(const Array &in) - { - UnOp uop; - - JIT::Node_ptr in_node = in.getNode(); - JIT::UnaryNode *node = new JIT::UnaryNode(irname(), - afShortName(), - uop.name(), - in_node, op, uop.is_check); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); - } + JIT::Node_ptr in_node = in.getNode(); + + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), + unaryName(), + in_node, op); + + return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); +} + +template +Array checkOp(const Array &in) +{ + JIT::Node_ptr in_node = in.getNode(); + + JIT::UnaryNode *node = new JIT::UnaryNode(getFullName(), + shortname(true), + unaryName(), + in_node, op); + + return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); +} + } From 225a3e045c519e0325a6af44db5909bea34ae8d0 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Mon, 19 Jun 2017 11:16:29 -0700 Subject: [PATCH 2/4] Fixing underflow/overflow issues with complex math in opencl jit --- src/backend/opencl/kernel/jit.cl | 70 ++++++++++++++++---------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/src/backend/opencl/kernel/jit.cl b/src/backend/opencl/kernel/jit.cl index 3092449418..183e52be81 100644 --- a/src/backend/opencl/kernel/jit.cl +++ b/src/backend/opencl/kernel/jit.cl @@ -27,17 +27,14 @@ #define __real(in) (in) #define __imag(in) (0) #define __abs(in) abs(in) -#define __abs2(in) (in) * (in) #define __crealf(in) ((in).x) #define __cimagf(in) ((in).y) -#define __cabsf2(in) ((in).x * (in).x + (in).y * (in).y) -#define __cabsf(in) sqrt(__cabsf2(in)) +#define __cabsf(in) hypot((in).x, 
(in).y) #define __creal(in) ((in).x) #define __cimag(in) ((in).y) -#define __cabs2(in) ((in).x * (in).x + (in).y * (in).y) -#define __cabs(in) sqrt(__cabs2(in)) +#define __cabs(in) hypot((in).x, (in).y) #define __sigmoid(in) (1.0/(1 + exp(-(in)))) float2 __cconjf(float2 in) @@ -69,35 +66,37 @@ float2 __cmulf(float2 lhs, float2 rhs) // FIXME: overflow / underflow issues float2 __cdivf(float2 lhs, float2 rhs) { - float2 out; - float den = (rhs.x * rhs.x + rhs.y * rhs.y); - float2 num = __cmulf(lhs, __cconjf(rhs)); - - out.x = num.x / den; - out.y = num.y / den; - + // Normalize by absolute value and multiply + float rhs_abs = __cabsf(rhs); + float inv_rhs_abs = 1.0f / rhs_abs; + float rhs_x = inv_rhs_abs * rhs.x; + float rhs_y = inv_rhs_abs * rhs.y; + float2 out = {lhs.x * rhs_x + lhs.y * rhs_y, + lhs.y * rhs_x - lhs.x * rhs_y}; + out.x *= inv_rhs_abs; + out.y *= inv_rhs_abs; return out; } -#define __candf(lhs, rhs) __cabsf2(lhs) && __cabsf2(rhs) -#define __cand(lhs, rhs) __cabs2(lhs) && __cabs2(rhs) +#define __candf(lhs, rhs) __cabsf(lhs) && __cabsf(rhs) +#define __cand(lhs, rhs) __cabs(lhs) && __cabs(rhs) -#define __corf(lhs, rhs) __cabsf2(lhs) || __cabsf2(rhs) -#define __cor(lhs, rhs) __cabs2(lhs) || __cabs2(rhs) +#define __corf(lhs, rhs) __cabsf(lhs) || __cabsf(rhs) +#define __cor(lhs, rhs) __cabs(lhs) || __cabs(rhs) #define __ceqf(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y)) #define __cneqf(lhs, rhs) !__ceqf((lhs), (rhs)) -#define __cltf(lhs, rhs) (__cabsf2(lhs) < __cabsf2(rhs)) -#define __clef(lhs, rhs) (__cabsf2(lhs) <= __cabsf2(rhs)) -#define __cgtf(lhs, rhs) (__cabsf2(lhs) > __cabsf2(rhs)) -#define __cgef(lhs, rhs) (__cabsf2(lhs) >= __cabsf2(rhs)) +#define __cltf(lhs, rhs) (__cabsf(lhs) < __cabsf(rhs)) +#define __clef(lhs, rhs) (__cabsf(lhs) <= __cabsf(rhs)) +#define __cgtf(lhs, rhs) (__cabsf(lhs) > __cabsf(rhs)) +#define __cgef(lhs, rhs) (__cabsf(lhs) >= __cabsf(rhs)) #define __ceq(lhs, rhs) (((lhs).x == (rhs).x) && ((lhs).y == (rhs).y)) #define __cneq(lhs, rhs) !__ceq((lhs), (rhs)) -#define __clt(lhs, rhs) (__cabs2(lhs) < __cabs2(rhs)) -#define __cle(lhs, rhs) (__cabs2(lhs) <= __cabs2(rhs)) -#define __cgt(lhs, rhs) (__cabs2(lhs) > __cabs2(rhs)) -#define __cge(lhs, rhs) (__cabs2(lhs) >= __cabs2(rhs)) +#define __clt(lhs, rhs) (__cabs(lhs) < __cabs(rhs)) +#define __cle(lhs, rhs) (__cabs(lhs) <= __cabs(rhs)) +#define __cgt(lhs, rhs) (__cabs(lhs) > __cabs(rhs)) +#define __cge(lhs, rhs) (__cabs(lhs) >= __cabs(rhs)) #define __bitor(lhs, rhs) ((lhs) | (rhs)) #define __bitand(lhs, rhs) ((lhs) & (rhs)) @@ -113,12 +112,12 @@ float2 __cdivf(float2 lhs, float2 rhs) float2 __cminf(float2 lhs, float2 rhs) { - return __abs2(lhs) < __abs2(rhs) ? lhs : rhs; + return __cabsf(lhs) < __cabsf(rhs) ? lhs : rhs; } float2 __cmaxf(float2 lhs, float2 rhs) { - return __abs2(lhs) > __abs2(rhs) ? lhs : rhs; + return __cabsf(lhs) > __cabsf(rhs) ? 
lhs : rhs; } float2 __cplx2f(float lhs, float rhs) @@ -175,23 +174,26 @@ double2 __cmul(double2 lhs, double2 rhs) double2 __cdiv(double2 lhs, double2 rhs) { - double2 out; - double den = (rhs.x * rhs.x + rhs.y * rhs.y); - double2 num = __cmul(lhs, __cconj(rhs)); - - out.x = num.x / den; - out.y = num.y / den; + // Normalize by absolute value and multiply + double rhs_abs = __cabs(rhs); + double inv_rhs_abs = 1.0 / rhs_abs; + double rhs_x = inv_rhs_abs * rhs.x; + double rhs_y = inv_rhs_abs * rhs.y; + double2 out = {lhs.x * rhs_x + lhs.y * rhs_y, + lhs.y * rhs_x - lhs.x * rhs_y}; + out.x *= inv_rhs_abs; + out.y *= inv_rhs_abs; return out; } double2 __cmin(double2 lhs, double2 rhs) { - return __abs2(lhs) < __abs2(rhs) ? lhs : rhs; + return __cabs(lhs) < __cabs(rhs) ? lhs : rhs; } double2 __cmax(double2 lhs, double2 rhs) { - return __abs2(lhs) > __abs2(rhs) ? lhs : rhs; + return __cabs(lhs) > __cabs(rhs) ? lhs : rhs; } double2 __cplx2(double lhs, double rhs) From 81d74af7c30758a2815cec248652c55c9533abe4 Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 20 Jun 2017 05:17:28 -0700 Subject: [PATCH 3/4] Style changes for JIT in CUDA and OpenCL backends --- src/backend/cuda/Array.cpp | 4 +- src/backend/cuda/JIT/BufferNode.hpp | 8 +- src/backend/cuda/JIT/Node.hpp | 2 +- src/backend/cuda/binary.hpp | 5 +- src/backend/cuda/cast.hpp | 6 +- src/backend/cuda/complex.hpp | 8 +- src/backend/cuda/jit.cpp | 146 +++++++++++++------------- src/backend/cuda/kernel/jit.cuh | 2 +- src/backend/cuda/scalar.hpp | 3 +- src/backend/cuda/types.cpp | 2 +- src/backend/cuda/types.hpp | 2 +- src/backend/cuda/unary.hpp | 4 +- src/backend/opencl/Array.cpp | 4 +- src/backend/opencl/JIT/BufferNode.hpp | 9 +- src/backend/opencl/binary.hpp | 3 +- src/backend/opencl/cast.hpp | 6 +- src/backend/opencl/complex.hpp | 8 +- src/backend/opencl/jit.cpp | 75 ++++++------- src/backend/opencl/kernel/jit.cl | 2 +- src/backend/opencl/scalar.hpp | 3 +- src/backend/opencl/types.cpp | 2 +- src/backend/opencl/types.hpp | 2 +- src/backend/opencl/unary.hpp | 4 +- 23 files changed, 146 insertions(+), 164 deletions(-) diff --git a/src/backend/cuda/Array.cpp b/src/backend/cuda/Array.cpp index da98060331..cec91d5c04 100644 --- a/src/backend/cuda/Array.cpp +++ b/src/backend/cuda/Array.cpp @@ -33,8 +33,8 @@ namespace cuda template Node_ptr bufferNodePtr() { - return Node_ptr(reinterpret_cast(new BufferNode(getFullName(), - shortname(true)))); + return Node_ptr(new BufferNode(getFullName(), + shortname(true))); } template diff --git a/src/backend/cuda/JIT/BufferNode.hpp b/src/backend/cuda/JIT/BufferNode.hpp index 6c4c2fb0a2..4c14f6c2d5 100644 --- a/src/backend/cuda/JIT/BufferNode.hpp +++ b/src/backend/cuda/JIT/BufferNode.hpp @@ -26,8 +26,8 @@ namespace JIT std::shared_ptr m_data; Param m_param; unsigned m_bytes; - bool m_linear_buffer; std::once_flag m_set_data_flag; + bool m_linear_buffer; public: @@ -39,10 +39,6 @@ namespace JIT bool isBuffer() { return true; } - ~BufferNode() - { - } - void setData(Param param, std::shared_ptr data, const unsigned bytes, bool is_linear) { std::call_once(m_set_data_flag, [this, param, data, bytes, is_linear]() { @@ -94,7 +90,7 @@ namespace JIT if (is_linear) { kerStream << idx_str << " = idx;\n"; } else { - std::string info_str = std::string("in") + std::to_string(id);; + std::string info_str = std::string("in") + std::to_string(id); kerStream << idx_str << " = " << "(id3 < " << info_str << ".dims[3]) * " << info_str << ".strides[3] * id3 + " diff --git a/src/backend/cuda/JIT/Node.hpp 
b/src/backend/cuda/JIT/Node.hpp index feae91ca98..995d4c7b1d 100644 --- a/src/backend/cuda/JIT/Node.hpp +++ b/src/backend/cuda/JIT/Node.hpp @@ -65,7 +65,7 @@ namespace JIT } } - virtual void genKerName(std::stringstream &kerStream, Node_ids ids) {} + virtual void genKerName (std::stringstream &kerStream, Node_ids ids) {} virtual void genParams (std::stringstream &kerStream, int id, bool is_linear) {} virtual void genOffsets (std::stringstream &kerStream, int id, bool is_linear) {} virtual void genFuncs (std::stringstream &kerStream, Node_ids) {} diff --git a/src/backend/cuda/binary.hpp b/src/backend/cuda/binary.hpp index d5659eb7e6..58d6254453 100644 --- a/src/backend/cuda/binary.hpp +++ b/src/backend/cuda/binary.hpp @@ -108,7 +108,6 @@ BINARY_TYPE_1(bitshiftr) return "__c"#fn"f"; \ } \ }; \ - \ template \ struct BinOp \ { \ @@ -118,7 +117,6 @@ BINARY_TYPE_1(bitshiftr) } \ }; \ - BINARY_TYPE_2(min) BINARY_TYPE_2(max) BINARY_TYPE_2(pow) @@ -183,8 +181,7 @@ Array createBinaryNode(const Array &lhs, const Array &rhs, const af: lhs_node, rhs_node, (int)(op)); - return createNodeArray(odims, JIT::Node_ptr( - reinterpret_cast(node))); + return createNodeArray(odims, JIT::Node_ptr(node)); } } diff --git a/src/backend/cuda/cast.hpp b/src/backend/cuda/cast.hpp index 9f854c6e06..5b1ffe3a96 100644 --- a/src/backend/cuda/cast.hpp +++ b/src/backend/cuda/cast.hpp @@ -57,12 +57,10 @@ CAST_FN(double) } \ }; - CAST_CFN(cfloat) CAST_CFN(cdouble) CAST_CFN(char) - template<> struct CastOp { @@ -72,7 +70,6 @@ struct CastOp } }; - template<> struct CastOp { @@ -91,7 +88,6 @@ struct CastOp } }; - template<> struct CastOp { @@ -115,7 +111,7 @@ struct CastWrapper shortname(true), cop.name(), in_node, af_cast_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } }; diff --git a/src/backend/cuda/complex.hpp b/src/backend/cuda/complex.hpp index 67ba0ebe4e..578d982087 100644 --- a/src/backend/cuda/complex.hpp +++ b/src/backend/cuda/complex.hpp @@ -30,7 +30,7 @@ namespace cuda "__creal", in_node, af_real_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template @@ -42,7 +42,7 @@ namespace cuda "__cimag", in_node, af_imag_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template static const char *abs_name() { return "fabs"; } @@ -58,7 +58,7 @@ namespace cuda abs_name(), in_node, af_abs_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template static const char *conj_name() { return "__noop"; } @@ -74,6 +74,6 @@ namespace cuda conj_name(), in_node, af_conj_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } } diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp index 5c655b2247..6f703765ab 100644 --- a/src/backend/cuda/jit.cpp +++ b/src/backend/cuda/jit.cpp @@ -7,24 +7,25 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include +#include +#include + #include #include #include - #include -#include -#include #include +#include +#include + #include #include #include #include #include #include -#include -#include namespace cuda { @@ -77,14 +78,14 @@ static string getKernelString(const string funcName, const 
char *includeFileStr = jit_cuh; - const char paramTStr[] ="" - "template\n" - "struct Param\n" - "{\n" - " T *ptr;\n" - " dim_t dims[4];\n" - " dim_t strides[4];\n" - "};\n"; + const char paramTStr[] =R"JIT( + template + struct Param + { + T *ptr; + dim_t dims[4]; + dim_t strides[4]; + };)JIT"; std::string typedefStr = "typedef unsigned int uint;\n"; typedefStr += "typedef "; @@ -94,46 +95,48 @@ static string getKernelString(const string funcName, // Common CUDA code // This part of the code does not change with the kernel. - static const char *kernelVoid = "extern \"C\" __global__ void\n"; + static const char *kernelVoid = "extern \"C\" __global__ void\n"; static const char *dimParams = "uint blocks_x, uint blocks_y, uint num_odims"; static const char *blockStart = "{\n\n"; static const char *blockEnd = "\n\n}"; - static const char *linearIndex = "\n" - "uint blockId = blockIdx.y * gridDim.x + blockIdx.x;\n" - "uint threadId = threadIdx.x;\n" - "int idx = blockId * blockDim.x * blockDim.y + threadId;\n" - "if (idx >= outref.dims[3] * outref.strides[3]) return;\n"; - - static const char *generalIndex = "\n" - "uint id0 = 0, id1 = 0, id2 = 0, id3 = 0;\n" - "if (num_odims > 2) {\n" - "id2 = blockIdx.x / blocks_x;\n" - "id0 = blockIdx.x - id2 * blocks_x;\n" - "id0 = threadIdx.x + id0 * blockDim.x;\n" - "if (num_odims > 3) {\n" - "id3 = blockIdx.y / blocks_y;\n" - "id1 = blockIdx.y - id3 * blocks_y;\n" - "id1 = threadIdx.y + id1 * blockDim.y;\n" - "} else {\n" - "id1 = threadIdx.y + blockDim.y * blockIdx.y;\n" - "}\n" - " } else {\n" - "id3 = 0;\n" - "id2 = 0;\n" - "id1 = threadIdx.y + blockDim.y * blockIdx.y;\n" - "id0 = threadIdx.x + blockDim.x * blockIdx.x;\n" - "}\n" - "bool cond = \n" - "id0 < outref.dims[0] && \n" - "id1 < outref.dims[1] && \n" - "id2 < outref.dims[2] && \n" - "id3 < outref.dims[3];\n\n" - "if (!cond) return;\n\n" - "int idx = " - "outref.strides[3] * id3 + outref.strides[2] * id2 + " - "outref.strides[1] * id1 + id0;\n\n"; - + static const char *linearIndex = R"JIT( + uint blockId = blockIdx.y * gridDim.x + blockIdx.x; + uint threadId = threadIdx.x; + int idx = blockId * blockDim.x * blockDim.y + threadId; + if (idx >= outref.dims[3] * outref.strides[3]) return; + )JIT"; + + static const char *generalIndex = R"JIT( + uint id0 = 0, id1 = 0, id2 = 0, id3 = 0; + if (num_odims > 2) { + id2 = blockIdx.x / blocks_x; + id0 = blockIdx.x - id2 * blocks_x; + id0 = threadIdx.x + id0 * blockDim.x; + if (num_odims > 3) { + id3 = blockIdx.y / blocks_y; + id1 = blockIdx.y - id3 * blocks_y; + id1 = threadIdx.y + id1 * blockDim.y; + } else { + id1 = threadIdx.y + blockDim.y * blockIdx.y; + } + } else { + id3 = 0; + id2 = 0; + id1 = threadIdx.y + blockDim.y * blockIdx.y; + id0 = threadIdx.x + blockDim.x * blockIdx.x; + } + + bool cond = id0 < outref.dims[0] && + id1 < outref.dims[1] && + id2 < outref.dims[2] && + id3 < outref.dims[3]; + if (!cond) return; + + int idx = outref.strides[3] * id3 + + outref.strides[2] * id2 + + outref.strides[1] * id1 + id0; + )JIT"; stringstream inParamStream; stringstream outParamStream; @@ -161,7 +164,7 @@ static string getKernelString(const string funcName, // Generate output parameters outParamStream << "Param<" << full_nodes[id]->getTypeStr() << "> out" << id << ", \n"; // Generate code to write the output - outWriteStream << "out" << id << ".ptr[idx] = " << "val" << id << ";\n"; + outWriteStream << "out" << id << ".ptr[idx] = val" << id << ";\n"; } // Put various blocks into a single stream @@ -225,18 +228,18 @@ typedef struct { #endif #ifndef NDEBUG 
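// Editor's aside on the R"JIT(...)JIT" rewrite above: a raw string literal
// keeps newlines and quotes verbatim, so the kernel snippets read as real code
// rather than chains of "...\n" fragments. A minimal equivalence sketch:
//   const char *a = "if (!cond) return;\n";
//   const char *b = R"JIT(if (!cond) return;
//   )JIT";
// Both denote the same text, except that the raw string also preserves the
// source indentation, which is why the generated kernels now carry harmless
// leading whitespace.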
-#define NVRTC_CHECK(fn) do { \ - nvrtcResult res = fn; \ - if (res == NVRTC_SUCCESS) break; \ - size_t logSize; \ - nvrtcGetProgramLogSize(prog, &logSize); \ - unique_ptr log(new char[logSize]); \ - char *logptr = log.get(); \ - nvrtcGetProgramLog(prog, logptr); \ - logptr[logSize] = '\x0'; \ - printf("%s\n", logptr); \ - AF_ERROR("NVRTC ERROR", \ - AF_ERR_INTERNAL); \ +#define NVRTC_CHECK(fn) do { \ + nvrtcResult res = fn; \ + if (res == NVRTC_SUCCESS) break; \ + size_t logSize; \ + nvrtcGetProgramLogSize(prog, &logSize); \ + unique_ptr log(new char[logSize +1]); \ + char *logptr = log.get(); \ + nvrtcGetProgramLog(prog, logptr); \ + logptr[logSize] = '\x0'; \ + printf("%s\n", logptr); \ + AF_ERROR("NVRTC ERROR", \ + AF_ERR_INTERNAL); \ } while(0) #else #define NVRTC_CHECK(fn) do { \ @@ -252,14 +255,16 @@ typedef struct { } while(0) #endif -unique_ptr compileToPTX(const char *ker_name, string jit_ker, size_t *ptx_size) +std::vector compileToPTX(const char *ker_name, string jit_ker) { nvrtcProgram prog; + size_t ptx_size; + std::vector ptx; NVRTC_CHECK(nvrtcCreateProgram(&prog, jit_ker.c_str(), ker_name, 0, NULL, NULL)); NVRTC_CHECK(nvrtcCompileProgram(prog, 0, NULL)); - NVRTC_CHECK(nvrtcGetPTXSize(prog, ptx_size)); - unique_ptr ptx(new char[*ptx_size]); - NVRTC_CHECK(nvrtcGetPTX(prog, ptx.get())); + NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size)); + ptx.resize(ptx_size); + NVRTC_CHECK(nvrtcGetPTX(prog, ptx.data())); return ptx; } @@ -271,8 +276,7 @@ static kc_entry_t compileKernel(const char *ker_name, string jit_ker) char linkInfo[linkLogSize] = {0}; char linkError[linkLogSize] = {0}; - size_t ptx_size; - auto ptx = compileToPTX(ker_name, jit_ker, &ptx_size); + auto ptx = compileToPTX(ker_name, jit_ker); CUlinkState linkState; CUjit_option linkOptions[] = { @@ -292,8 +296,8 @@ static kc_entry_t compileKernel(const char *ker_name, string jit_ker) }; CU_LINK_CHECK(cuLinkCreate(5, linkOptions, linkOptionValues, &linkState)); - CU_LINK_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)ptx.get(), - ptx_size, ker_name, 0, NULL, NULL)); + CU_LINK_CHECK(cuLinkAddData(linkState, CU_JIT_INPUT_PTX, (void*)ptx.data(), + ptx.size(), ker_name, 0, NULL, NULL)); void *cubin; size_t cubinSize; diff --git a/src/backend/cuda/kernel/jit.cuh b/src/backend/cuda/kernel/jit.cuh index 0868d1f3df..830a9e58c4 100644 --- a/src/backend/cuda/kernel/jit.cuh +++ b/src/backend/cuda/kernel/jit.cuh @@ -177,7 +177,7 @@ __device__ cdouble __cdiv(cdouble lhs, cdouble rhs) double rhs_x = inv_rhs_abs * rhs.x; double rhs_y = inv_rhs_abs * rhs.y; cdouble out = {lhs.x * rhs_x + lhs.y * rhs_y, - lhs.y * rhs_x - lhs.x * rhs_y}; + lhs.y * rhs_x - lhs.x * rhs_y}; out.x *= inv_rhs_abs; out.y *= inv_rhs_abs; return out; diff --git a/src/backend/cuda/scalar.hpp b/src/backend/cuda/scalar.hpp index b2bd1606cc..46fca748a8 100644 --- a/src/backend/cuda/scalar.hpp +++ b/src/backend/cuda/scalar.hpp @@ -18,8 +18,7 @@ namespace cuda template Array createScalarNode(const dim4 &size, const T val) { - JIT::ScalarNode *node = new JIT::ScalarNode(val); - return createNodeArray(size, JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(size, JIT::Node_ptr(new JIT::ScalarNode(val))); } } diff --git a/src/backend/cuda/types.cpp b/src/backend/cuda/types.cpp index b69eb661e7..9d85037ba6 100644 --- a/src/backend/cuda/types.cpp +++ b/src/backend/cuda/types.cpp @@ -15,7 +15,7 @@ namespace cuda { - template const char *shortname(bool caps) { return caps ? "Q" : "q"; } + template const char *shortname(bool caps) { return caps ? 
"Q" : "q"; } template<> const char *shortname(bool caps) { return caps ? "S" : "s"; } template<> const char *shortname(bool caps) { return caps ? "D" : "d"; } template<> const char *shortname(bool caps) { return caps ? "C" : "c"; } diff --git a/src/backend/cuda/types.hpp b/src/backend/cuda/types.hpp index b12200bdd8..3376e84bb5 100644 --- a/src/backend/cuda/types.hpp +++ b/src/backend/cuda/types.hpp @@ -24,6 +24,6 @@ template struct is_complex { static const bool value = fals template<> struct is_complex { static const bool value = true; }; template<> struct is_complex { static const bool value = true; }; -template const char *shortname(bool caps = true); +template const char *shortname(bool caps = true); template const char *getFullName(); } diff --git a/src/backend/cuda/unary.hpp b/src/backend/cuda/unary.hpp index 13b9690cef..ed3d81944a 100644 --- a/src/backend/cuda/unary.hpp +++ b/src/backend/cuda/unary.hpp @@ -80,7 +80,7 @@ Array unaryOp(const Array &in) unaryName(), in_node, op); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template @@ -93,7 +93,7 @@ Array checkOp(const Array &in) unaryName(), in_node, op); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } } diff --git a/src/backend/opencl/Array.cpp b/src/backend/opencl/Array.cpp index 1ab28a8e31..ea061e21e3 100644 --- a/src/backend/opencl/Array.cpp +++ b/src/backend/opencl/Array.cpp @@ -30,8 +30,8 @@ namespace opencl template Node_ptr bufferNodePtr() { - return Node_ptr(reinterpret_cast(new BufferNode(dtype_traits::getName(), - shortname(true)))); + return Node_ptr(new BufferNode(dtype_traits::getName(), + shortname(true))); } template diff --git a/src/backend/opencl/JIT/BufferNode.hpp b/src/backend/opencl/JIT/BufferNode.hpp index 92a2c32697..c55206b472 100644 --- a/src/backend/opencl/JIT/BufferNode.hpp +++ b/src/backend/opencl/JIT/BufferNode.hpp @@ -25,8 +25,8 @@ namespace JIT std::shared_ptr m_data; KParam m_info; unsigned m_bytes; - bool m_linear_buffer; std::once_flag m_set_data_flag; + bool m_linear_buffer; public: @@ -38,10 +38,6 @@ namespace JIT bool isBuffer() { return true; } - ~BufferNode() - { - } - void setData(KParam info, std::shared_ptr data, const unsigned bytes, bool is_linear) { std::call_once(m_set_data_flag, [this, info, data, bytes, is_linear]() { @@ -92,7 +88,7 @@ namespace JIT void genOffsets(std::stringstream &kerStream, int id, bool is_linear) { std::string idx_str = std::string("int idx") + std::to_string(id); - std::string info_str = std::string("iInfo") + std::to_string(id);; + std::string info_str = std::string("iInfo") + std::to_string(id); if (!is_linear) { kerStream << idx_str << " = " @@ -122,7 +118,6 @@ namespace JIT len++; buf_count++; bytes += m_bytes; - return; } }; diff --git a/src/backend/opencl/binary.hpp b/src/backend/opencl/binary.hpp index 11493a5966..9a653a8302 100644 --- a/src/backend/opencl/binary.hpp +++ b/src/backend/opencl/binary.hpp @@ -183,8 +183,7 @@ Array createBinaryNode(const Array &lhs, const Array &rhs, const af: lhs_node, rhs_node, (int)(op)); - return createNodeArray(odims, JIT::Node_ptr( - reinterpret_cast(node))); + return createNodeArray(odims, JIT::Node_ptr(node)); } } diff --git a/src/backend/opencl/cast.hpp b/src/backend/opencl/cast.hpp index bddbd5ad34..3df062053f 100644 --- a/src/backend/opencl/cast.hpp +++ b/src/backend/opencl/cast.hpp @@ -55,12 +55,10 @@ CAST_FN(double) } \ }; - 
CAST_CFN(cfloat) CAST_CFN(cdouble) CAST_CFN(char) - template<> struct CastOp { @@ -70,7 +68,6 @@ struct CastOp } }; - template<> struct CastOp { @@ -89,7 +86,6 @@ struct CastOp } }; - template<> struct CastOp { @@ -113,7 +109,7 @@ struct CastWrapper shortname(true), cop.name(), in_node, af_cast_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } }; diff --git a/src/backend/opencl/complex.hpp b/src/backend/opencl/complex.hpp index 0838370c3c..e72850675d 100644 --- a/src/backend/opencl/complex.hpp +++ b/src/backend/opencl/complex.hpp @@ -30,7 +30,7 @@ namespace opencl "__creal", in_node, af_real_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template @@ -42,7 +42,7 @@ namespace opencl "__cimag", in_node, af_imag_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template static const char *abs_name() { return "fabs"; } @@ -58,7 +58,7 @@ namespace opencl abs_name(), in_node, af_abs_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template static const char *conj_name() { return "__noop"; } @@ -74,6 +74,6 @@ namespace opencl conj_name(), in_node, af_conj_t); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } } diff --git a/src/backend/opencl/jit.cpp b/src/backend/opencl/jit.cpp index 5442eba7f4..7f2c1f4644 100644 --- a/src/backend/opencl/jit.cpp +++ b/src/backend/opencl/jit.cpp @@ -80,46 +80,47 @@ static string getKernelString(const string funcName, // Common OpenCL code // This part of the code does not change with the kernel. 
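// Editor's sketch of why the reinterpret_casts above could simply be dropped
// (assuming the JIT node classes derive directly from JIT::Node, as the code
// implies): a derived-class pointer converts to its base implicitly and safely,
//   struct Node { virtual ~Node() {} };
//   struct UnaryNode : Node { /* ... */ };
//   std::shared_ptr<Node> p(new UnaryNode());  // implicit upcast, no cast
// whereas reinterpret_cast would silently produce a wrong pointer if the
// conversion ever needed an address adjustment (e.g. multiple inheritance).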
- static const char *kernelVoid = "__kernel void\n"; + static const char *kernelVoid = "__kernel void\n"; static const char *dimParams = "KParam oInfo, uint groups_0, uint groups_1, uint num_odims"; static const char *blockStart = "{\n\n"; static const char *blockEnd = "\n\n}"; - static const char *linearIndex = "\n" - "uint groupId = get_group_id(1) * get_num_groups(0) + get_group_id(0);\n" - "uint threadId = get_local_id(0);\n" - "int idx = groupId * get_local_size(0) * get_local_size(1) + threadId;\n" - "if (idx >= oInfo.dims[3] * oInfo.strides[3]) return;\n"; - - static const char *generalIndex = "\n" - "uint id0 = 0, id1 = 0, id2 = 0, id3 = 0;\n" - "if (num_odims > 2) {\n" - "id2 = get_group_id(0) / groups_0;\n" - "id0 = get_group_id(0) - id2 * groups_0;\n" - "id0 = get_local_id(0) + id0 * get_local_size(0);\n" - "if (num_odims > 3) {\n" - "id3 = get_group_id(1) / groups_1;\n" - "id1 = get_group_id(1) - id3 * groups_1;\n" - "id1 = get_local_id(1) + id1 * get_local_size(1);\n" - "} else {\n" - "id1 = get_global_id(1);\n" - "}\n" - " } else {\n" - "id3 = 0;\n" - "id2 = 0;\n" - "id1 = get_global_id(1);\n" - "id0 = get_global_id(0);\n" - "}\n" - "bool cond = \n" - "id0 < oInfo.dims[0] && \n" - "id1 < oInfo.dims[1] && \n" - "id2 < oInfo.dims[2] && \n" - "id3 < oInfo.dims[3];\n\n" - "if (!cond) return;\n\n" - "int idx = " - "oInfo.strides[3] * id3 + oInfo.strides[2] * id2 + " - "oInfo.strides[1] * id1 + id0 + oInfo.offset;\n\n"; - + static const char *linearIndex = R"JIT( + uint groupId = get_group_id(1) * get_num_groups(0) + get_group_id(0); + uint threadId = get_local_id(0); + int idx = groupId * get_local_size(0) * get_local_size(1) + threadId; + if (idx >= oInfo.dims[3] * oInfo.strides[3]) return; + )JIT"; + + static const char *generalIndex = R"JIT( + uint id0 = 0, id1 = 0, id2 = 0, id3 = 0; + if (num_odims > 2) { + id2 = get_group_id(0) / groups_0; + id0 = get_group_id(0) - id2 * groups_0; + id0 = get_local_id(0) + id0 * get_local_size(0); + if (num_odims > 3) { + id3 = get_group_id(1) / groups_1; + id1 = get_group_id(1) - id3 * groups_1; + id1 = get_local_id(1) + id1 * get_local_size(1); + } else { + id1 = get_global_id(1); + } + } else { + id3 = 0; + id2 = 0; + id1 = get_global_id(1); + id0 = get_global_id(0); + } + bool cond = id0 < oInfo.dims[0] && + id1 < oInfo.dims[1] && + id2 < oInfo.dims[2] && + id3 < oInfo.dims[3]; + if (!cond) return; + int idx = oInfo.strides[3] * id3 + + oInfo.strides[2] * id2 + + oInfo.strides[1] * id1 + + id0 + oInfo.offset; + )JIT"; stringstream inParamStream; stringstream outParamStream; @@ -143,7 +144,7 @@ static string getKernelString(const string funcName, // Generate output parameters outParamStream << "__global " << full_nodes[id]->getTypeStr() << " *out" << id << ", \n"; // Generate code to write the output - outWriteStream << "out" << id << "[idx] = " << "val" << id << ";\n"; + outWriteStream << "out" << id << "[idx] = val" << id << ";\n"; } // Put various blocks into a single stream diff --git a/src/backend/opencl/kernel/jit.cl b/src/backend/opencl/kernel/jit.cl index 183e52be81..3e797ac050 100644 --- a/src/backend/opencl/kernel/jit.cl +++ b/src/backend/opencl/kernel/jit.cl @@ -180,7 +180,7 @@ double2 __cdiv(double2 lhs, double2 rhs) double rhs_x = inv_rhs_abs * rhs.x; double rhs_y = inv_rhs_abs * rhs.y; double2 out = {lhs.x * rhs_x + lhs.y * rhs_y, - lhs.y * rhs_x - lhs.x * rhs_y}; + lhs.y * rhs_x - lhs.x * rhs_y}; out.x *= inv_rhs_abs; out.y *= inv_rhs_abs; return out; diff --git a/src/backend/opencl/scalar.hpp b/src/backend/opencl/scalar.hpp 
index b6abf47ff9..fbd96b3ecc 100644 --- a/src/backend/opencl/scalar.hpp +++ b/src/backend/opencl/scalar.hpp @@ -18,8 +18,7 @@ namespace opencl template Array createScalarNode(const dim4 &size, const T val) { - JIT::ScalarNode *node = new JIT::ScalarNode(val); - return createNodeArray(size, JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(size, JIT::Node_ptr(new JIT::ScalarNode(val))); } } diff --git a/src/backend/opencl/types.cpp b/src/backend/opencl/types.cpp index 6581b047db..13744e444b 100644 --- a/src/backend/opencl/types.cpp +++ b/src/backend/opencl/types.cpp @@ -13,7 +13,7 @@ namespace opencl { - template const char *shortname(bool caps) { return caps ? "X" : "x"; } + template const char *shortname(bool caps) { return caps ? "X" : "x"; } template<> const char *shortname(bool caps) { return caps ? "S" : "s"; } template<> const char *shortname(bool caps) { return caps ? "D" : "d"; } diff --git a/src/backend/opencl/types.hpp b/src/backend/opencl/types.hpp index 4490f2bb36..277ba2c07e 100644 --- a/src/backend/opencl/types.hpp +++ b/src/backend/opencl/types.hpp @@ -33,7 +33,7 @@ template struct is_complex { static const bool value = fals template<> struct is_complex { static const bool value = true; }; template<> struct is_complex { static const bool value = true; }; -template const char *shortname(bool caps=false); +template const char *shortname(bool caps=false); template struct ToNumStr diff --git a/src/backend/opencl/unary.hpp b/src/backend/opencl/unary.hpp index 1e363d7dcb..66f775da5d 100644 --- a/src/backend/opencl/unary.hpp +++ b/src/backend/opencl/unary.hpp @@ -80,7 +80,7 @@ Array unaryOp(const Array &in) unaryName(), in_node, op); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } template @@ -93,7 +93,7 @@ Array checkOp(const Array &in) unaryName(), in_node, op); - return createNodeArray(in.dims(), JIT::Node_ptr(reinterpret_cast(node))); + return createNodeArray(in.dims(), JIT::Node_ptr(node)); } } From bd5667a664cc6eb917715aa2c9638b8151f32e1a Mon Sep 17 00:00:00 2001 From: Pavan Yalamanchili Date: Tue, 20 Jun 2017 11:39:20 -0700 Subject: [PATCH 4/4] Use std::string instead of char * for storing text of the jit.cuh file --- src/backend/cuda/jit.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/cuda/jit.cpp b/src/backend/cuda/jit.cpp index 6f703765ab..e8d960e02c 100644 --- a/src/backend/cuda/jit.cpp +++ b/src/backend/cuda/jit.cpp @@ -76,9 +76,9 @@ static string getKernelString(const string funcName, bool is_linear) { - const char *includeFileStr = jit_cuh; + const std::string includeFileStr(jit_cuh, jit_cuh_len); - const char paramTStr[] =R"JIT( + const std::string paramTStr = R"JIT( template struct Param {
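// Editor's note on the patch above: constructing includeFileStr from an
// explicit (pointer, length) pair means the embedded jit.cuh blob no longer
// has to be NUL-terminated. A minimal sketch, assuming the declarations that
// CL_KERNEL_TO_H generates look like:
//   extern const char jit_cuh[];
//   extern const unsigned jit_cuh_len;
//   const std::string includeFileStr(jit_cuh, jit_cuh_len);  // copies exactly
//                                                            // jit_cuh_len bytes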