From 98e023d7a10e301f793c04246c80bcc1c1bcfd1b Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 30 Mar 2016 15:22:00 -0400 Subject: [PATCH 01/28] Updated boost.compute to latest develop commit --- CMakeModules/build_boost_compute.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeModules/build_boost_compute.cmake b/CMakeModules/build_boost_compute.cmake index 03c20435a8..eca4e32771 100644 --- a/CMakeModules/build_boost_compute.cmake +++ b/CMakeModules/build_boost_compute.cmake @@ -1,9 +1,9 @@ # If using a commit, remove the v prefix to VER in URL. # If using a tag, don't use v in VER # This is because of how github handles it's release tar balls -SET(VER 0.5) -SET(URL https://github.com/boostorg/compute/archive/v${VER}.tar.gz) -SET(MD5 69a52598ac539d3b7f6005a3dd2b6f58) +SET(VER 523d8e974559977fab006190e9d40eb2e4f87bd0) +SET(URL https://github.com/boostorg/compute/archive/${VER}.tar.gz) +SET(MD5 bbce9e2730e449db5c8f88eae160ea12) SET(thirdPartyDir "${CMAKE_BINARY_DIR}/third_party") SET(srcDir "${thirdPartyDir}/compute-${VER}") From 8026cdb82984d610037667a25f4460f191115ef7 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 30 Mar 2016 15:22:39 -0400 Subject: [PATCH 02/28] Use stable sort for thrust and boost compute --- src/backend/cuda/kernel/sort_by_key.hpp | 4 ++-- src/backend/cuda/kernel/sort_index.hpp | 4 ++-- src/backend/opencl/kernel/sort_by_key.hpp | 8 ++++---- src/backend/opencl/kernel/sort_index.hpp | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp index 42a3256a1c..e06d9202b6 100644 --- a/src/backend/cuda/kernel/sort_by_key.hpp +++ b/src/backend/cuda/kernel/sort_by_key.hpp @@ -44,9 +44,9 @@ namespace cuda int ovalOffset = ovalWZ + y * oval.strides[1]; if(isAscending) { - THRUST_SELECT(thrust::sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset); + THRUST_SELECT(thrust::stable_sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset); } else { - THRUST_SELECT(thrust::sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset, thrust::greater()); + THRUST_SELECT(thrust::stable_sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset, thrust::greater()); } } } diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp index 9d29914f23..8762f28a4a 100644 --- a/src/backend/cuda/kernel/sort_index.hpp +++ b/src/backend/cuda/kernel/sort_index.hpp @@ -42,11 +42,11 @@ namespace cuda THRUST_SELECT(thrust::sequence, idx_ptr + idxOffset, idx_ptr + idxOffset + idx.dims[0]); if(isAscending) { - THRUST_SELECT(thrust::sort_by_key, + THRUST_SELECT(thrust::stable_sort_by_key, val_ptr + valOffset, val_ptr + valOffset + val.dims[0], idx_ptr + idxOffset); } else { - THRUST_SELECT(thrust::sort_by_key, + THRUST_SELECT(thrust::stable_sort_by_key, val_ptr + valOffset, val_ptr + valOffset + val.dims[0], idx_ptr + idxOffset, thrust::greater()); } diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp index 0cb9cb042d..555e39cdf1 100644 --- a/src/backend/opencl/kernel/sort_by_key.hpp +++ b/src/backend/opencl/kernel/sort_by_key.hpp @@ -20,7 +20,7 @@ #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #include -#include +#include #include #include @@ -72,10 +72,10 @@ namespace opencl compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(okey_buf, okeyOffset + okey.info.dims[0]); compute::buffer_iterator< type_t > vals = compute::make_buffer_iterator< type_t >(oval_buf, ovalOffset); if(isAscending) { - compute::sort_by_key(start, end, vals, c_queue); + compute::stable_sort_by_key(start, end, vals, c_queue); } else { - compute::sort_by_key(start, end, vals, - compute::greater< type_t >(), c_queue); + compute::stable_sort_by_key(start, end, vals, + compute::greater< type_t >(), c_queue); } } } diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp index 3a8ab1401e..4926dc34c3 100644 --- a/src/backend/opencl/kernel/sort_index.hpp +++ b/src/backend/opencl/kernel/sort_index.hpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include @@ -73,12 +73,12 @@ namespace opencl compute::iota(idx_begin, idx_begin + val.info.dims[0], 0, c_queue); if(isAscending) { - compute::sort_by_key( + compute::stable_sort_by_key( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), idx_begin, compute::less< type_t >(), c_queue); } else { - compute::sort_by_key( + compute::stable_sort_by_key( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), idx_begin, compute::greater< type_t >(), c_queue); From 09129b00a072c5ada07cb3321b16cf582f0f0d32 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 30 Mar 2016 16:49:21 -0400 Subject: [PATCH 03/28] Allow complex types as value type in sort_by_key This allows complex sorting based on a key, which can be the abs, real, imag etc of the value array. So the user can choose which metric they want to use. --- src/api/c/sort.cpp | 16 +++++++++------ src/backend/cpu/sort_by_key.cpp | 2 ++ src/backend/cuda/sort_by_key_impl.hpp | 24 ++++++++++++----------- src/backend/opencl/kernel/sort_by_key.hpp | 19 +++++++++++++++++- src/backend/opencl/sort_by_key/impl.hpp | 22 +++++++++++---------- 5 files changed, 55 insertions(+), 28 deletions(-) diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp index 1de63c5052..66ffce9eb1 100644 --- a/src/api/c/sort.cpp +++ b/src/api/c/sort.cpp @@ -150,6 +150,8 @@ void sort_by_key_tmplt(af_array *okey, af_array *oval, const af_array ikey, cons switch(vtype) { case f32: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; case f64: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; + case c32: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; + case c64: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; case s32: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; case u32: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; case s16: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; @@ -169,20 +171,22 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values, const unsigned dim, const bool isAscending) { try { - ArrayInfo info = getInfo(keys); - af_dtype type = info.getType(); + ArrayInfo kinfo = getInfo(keys); + af_dtype ktype = kinfo.getType(); ArrayInfo vinfo = getInfo(values); - DIM_ASSERT(3, info.elements() > 0); - DIM_ASSERT(4, info.dims() == vinfo.dims()); + DIM_ASSERT(3, kinfo.elements() > 0); + DIM_ASSERT(4, kinfo.dims() == vinfo.dims()); // Only Dim 0 supported ARG_ASSERT(5, dim == 0); + TYPE_ASSERT(kinfo.isReal()); + af_array oKey; af_array oVal; - switch(type) { + switch(ktype) { case f32: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; case f64: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; case s32: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; @@ -193,7 +197,7 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values, case u64: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; case u8: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; case b8: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; - default: TYPE_ERROR(1, type); + default: TYPE_ERROR(1, ktype); } std::swap(*out_keys , oKey); std::swap(*out_values , oVal); diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index 5a99257033..46ced4b9ef 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -46,6 +46,8 @@ void sort_by_key(Array &okey, Array &oval, #define INSTANTIATE1(Tk) \ INSTANTIATE(Tk, float) \ INSTANTIATE(Tk, double) \ + INSTANTIATE(Tk, cfloat) \ + INSTANTIATE(Tk, cdouble) \ INSTANTIATE(Tk, int) \ INSTANTIATE(Tk, uint) \ INSTANTIATE(Tk, char) \ diff --git a/src/backend/cuda/sort_by_key_impl.hpp b/src/backend/cuda/sort_by_key_impl.hpp index d01ace404e..217b17dc8a 100644 --- a/src/backend/cuda/sort_by_key_impl.hpp +++ b/src/backend/cuda/sort_by_key_impl.hpp @@ -35,15 +35,17 @@ namespace cuda sort_by_key(Array &okey, Array &oval, \ const Array &ikey, const Array &ival, const uint dim); \ -#define INSTANTIATE1(Tk, dr) \ - INSTANTIATE(Tk, float, dr) \ - INSTANTIATE(Tk, double, dr) \ - INSTANTIATE(Tk, int, dr) \ - INSTANTIATE(Tk, uint, dr) \ - INSTANTIATE(Tk, short, dr) \ - INSTANTIATE(Tk, ushort, dr) \ - INSTANTIATE(Tk, char, dr) \ - INSTANTIATE(Tk, uchar, dr) \ - INSTANTIATE(Tk, intl, dr) \ - INSTANTIATE(Tk, uintl, dr) +#define INSTANTIATE1(Tk , dr) \ + INSTANTIATE(Tk, float , dr) \ + INSTANTIATE(Tk, double , dr) \ + INSTANTIATE(Tk, cfloat , dr) \ + INSTANTIATE(Tk, cdouble, dr) \ + INSTANTIATE(Tk, int , dr) \ + INSTANTIATE(Tk, uint , dr) \ + INSTANTIATE(Tk, short , dr) \ + INSTANTIATE(Tk, ushort , dr) \ + INSTANTIATE(Tk, char , dr) \ + INSTANTIATE(Tk, uchar , dr) \ + INSTANTIATE(Tk, intl , dr) \ + INSTANTIATE(Tk, uintl , dr) } diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp index 555e39cdf1..e3306e6e68 100644 --- a/src/backend/opencl/kernel/sort_by_key.hpp +++ b/src/backend/opencl/kernel/sort_by_key.hpp @@ -40,9 +40,26 @@ namespace opencl { using std::conditional; using std::is_same; + + // If type is cdouble, return std::complex, else return T + template + using ztype_t = typename conditional::value, + std::complex, T + >::type; + + // If type is cfloat, return std::complex, else return ztype_t + template + using ctype_t = typename conditional::value, + std::complex, ztype_t + >::type; + + // If type is intl, return cl_long, else return ctype_t template - using ltype_t = typename conditional::value, cl_long, T>::type; + using ltype_t = typename conditional::value, + cl_long, ctype_t + >::type; + // If type is uintl, return cl_ulong, else return ltype_t template using type_t = typename conditional::value, cl_ulong, ltype_t diff --git a/src/backend/opencl/sort_by_key/impl.hpp b/src/backend/opencl/sort_by_key/impl.hpp index 49d184113f..68c5ce70ae 100644 --- a/src/backend/opencl/sort_by_key/impl.hpp +++ b/src/backend/opencl/sort_by_key/impl.hpp @@ -43,15 +43,17 @@ namespace opencl #define INSTANTIATE1(Tk, isAscending) \ - INSTANTIATE(Tk, float , isAscending) \ - INSTANTIATE(Tk, double, isAscending) \ - INSTANTIATE(Tk, int , isAscending) \ - INSTANTIATE(Tk, uint , isAscending) \ - INSTANTIATE(Tk, char , isAscending) \ - INSTANTIATE(Tk, uchar , isAscending) \ - INSTANTIATE(Tk, short , isAscending) \ - INSTANTIATE(Tk, ushort, isAscending) \ - INSTANTIATE(Tk, intl , isAscending) \ - INSTANTIATE(Tk, uintl , isAscending) \ + INSTANTIATE(Tk, float , isAscending) \ + INSTANTIATE(Tk, double , isAscending) \ + INSTANTIATE(Tk, cfloat , isAscending) \ + INSTANTIATE(Tk, cdouble, isAscending) \ + INSTANTIATE(Tk, int , isAscending) \ + INSTANTIATE(Tk, uint , isAscending) \ + INSTANTIATE(Tk, char , isAscending) \ + INSTANTIATE(Tk, uchar , isAscending) \ + INSTANTIATE(Tk, short , isAscending) \ + INSTANTIATE(Tk, ushort , isAscending) \ + INSTANTIATE(Tk, intl , isAscending) \ + INSTANTIATE(Tk, uintl , isAscending) \ } From cda059fcb22a22446210e0c9d120353821102382 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Mar 2016 17:28:02 -0400 Subject: [PATCH 04/28] Improvements to sort * Sort now allows all dimensions * Sort if much faster by using batched mode. This takes up more memory though. * Enabled large tests for sort * Added tests for sorting on dim1 and dim2 --- src/api/c/sort.cpp | 2 - src/backend/cpu/kernel/sort.hpp | 2 +- src/backend/cpu/sort.cpp | 48 +++++++++++- src/backend/cuda/kernel/sort.hpp | 77 ++++++++++++++++++-- src/backend/cuda/kernel/sort_by_key.hpp | 14 ++-- src/backend/cuda/sort.cu | 9 ++- src/backend/opencl/kernel/sort.hpp | 89 ++++++++++++++++++++--- src/backend/opencl/kernel/sort_by_key.hpp | 28 +------ src/backend/opencl/kernel/sort_helper.hpp | 46 ++++++++++++ src/backend/opencl/kernel/sort_index.hpp | 11 +-- src/backend/opencl/sort.cpp | 8 +- test/sort.cpp | 77 ++++++++++++++++++-- 12 files changed, 336 insertions(+), 75 deletions(-) create mode 100644 src/backend/opencl/kernel/sort_helper.hpp diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp index 66ffce9eb1..e3f3ae35da 100644 --- a/src/api/c/sort.cpp +++ b/src/api/c/sort.cpp @@ -42,8 +42,6 @@ af_err af_sort(af_array *out, const af_array in, const unsigned dim, const bool af_dtype type = info.getType(); DIM_ASSERT(1, info.elements() > 0); - // Only Dim 0 supported - ARG_ASSERT(2, dim == 0); af_array val; diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp index 292c6383dc..e0ae62c932 100644 --- a/src/backend/cpu/kernel/sort.hpp +++ b/src/backend/cpu/kernel/sort.hpp @@ -23,7 +23,7 @@ namespace kernel // Based off of http://stackoverflow.com/a/12399290 template -void sort0(Array val) +void sort0Iterative(Array val) { // initialize original index locations T *val_ptr = val.get(); diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index bc6396b258..c3c5286fbf 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -15,11 +15,54 @@ #include #include #include +#include +#include +#include +#include #include namespace cpu { +template +void sortBatched(Array& val) +{ + af::dim4 inDims = val.dims(); + + // Sort dimension + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + Array key = iota(seqDims, tileDims); + + Array *resKey = initArray(); + Array *resVal = initArray(); + + val.modDims(inDims.elements()); + key.modDims(inDims.elements()); + + sort_by_key(*resVal, *resKey, val, key, 0); + + // Needs to be ascending (true) in order to maintain the indices properly + sort_by_key(key, val, *resKey, *resVal, 0); + val.eval(); + + val.modDims(inDims); +} + +template +void sort0(Array& val) +{ + int higherDims = val.elements() / val.dims()[0]; + // TODO Make a better heurisitic + if(higherDims > 10) + sortBatched(val); + else + getQueue().enqueue(kernel::sort0Iterative, val); +} + template Array sort(const Array &in, const unsigned dim) { @@ -27,7 +70,10 @@ Array sort(const Array &in, const unsigned dim) Array out = copyArray(in); switch(dim) { - case 0: getQueue().enqueue(kernel::sort0, out); break; + case 0: sort0(out); break; + case 1: sortBatched(out); break; + case 2: sortBatched(out); break; + case 3: sortBatched(out); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } return out; diff --git a/src/backend/cuda/kernel/sort.hpp b/src/backend/cuda/kernel/sort.hpp index b23e308633..21d9122d64 100644 --- a/src/backend/cuda/kernel/sort.hpp +++ b/src/backend/cuda/kernel/sort.hpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -19,15 +21,11 @@ namespace cuda { namespace kernel { - // Kernel Launch Config Values - static const unsigned TX = 32; - static const unsigned TY = 8; - /////////////////////////////////////////////////////////////////////////// // Wrapper functions /////////////////////////////////////////////////////////////////////////// template - void sort0(Param val) + void sort0Iterative(Param val) { thrust::device_ptr val_ptr = thrust::device_pointer_cast(val.ptr); @@ -49,5 +47,74 @@ namespace cuda } POST_LAUNCH_CHECK(); } + + template + void sortBatched(Param pVal) + { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pVal.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + dim4 keydims = inDims; + uint* key = memAlloc(keydims.elements()); + Param pKey; + pKey.ptr = key; + pKey.strides[0] = 1; + pKey.dims[0] = keydims[0]; + for(int i = 1; i < 4; i++) { + pKey.dims[i] = keydims[i]; + pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1]; + } + kernel::iota(pKey, seqDims, tileDims); + + // Flat + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + pKey.dims[0] = inDims.elements(); + pKey.strides[0] = 1; + pVal.dims[0] = inDims.elements(); + pVal.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pKey.dims[i] = 1; + pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1]; + pVal.dims[i] = 1; + pVal.strides[i] = pVal.strides[i - 1] * pVal.dims[i - 1]; + } + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + kernel::sort0_by_key(pVal, pKey); + + // Needs to be ascending (true) in order to maintain the indices properly + kernel::sort0_by_key(pKey, pVal); + + // No need of doing moddims here because the original Array + // dimensions have not been changed + //val.modDims(inDims); + + // Not really necessary + // CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId()))); + memFree(key); + } + + template + void sort0(Param val) + { + int higherDims = val.dims[1] * val.dims[2] * val.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 10) + sortBatched(val); + else + kernel::sort0Iterative(val); + } } } diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp index e06d9202b6..bfaa79a311 100644 --- a/src/backend/cuda/kernel/sort_by_key.hpp +++ b/src/backend/cuda/kernel/sort_by_key.hpp @@ -19,10 +19,6 @@ namespace cuda { namespace kernel { - // Kernel Launch Config Values - static const unsigned TX = 32; - static const unsigned TY = 8; - /////////////////////////////////////////////////////////////////////////// // Wrapper functions /////////////////////////////////////////////////////////////////////////// @@ -44,9 +40,15 @@ namespace cuda int ovalOffset = ovalWZ + y * oval.strides[1]; if(isAscending) { - THRUST_SELECT(thrust::stable_sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset); + THRUST_SELECT(thrust::stable_sort_by_key, + okey_ptr + okeyOffset, + okey_ptr + okeyOffset + okey.dims[0], + oval_ptr + ovalOffset); } else { - THRUST_SELECT(thrust::stable_sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset, thrust::greater()); + THRUST_SELECT(thrust::stable_sort_by_key, + okey_ptr + okeyOffset, + okey_ptr + okeyOffset + okey.dims[0], + oval_ptr + ovalOffset, thrust::greater()); } } } diff --git a/src/backend/cuda/sort.cu b/src/backend/cuda/sort.cu index 6d14c0309f..4ae3b759fb 100644 --- a/src/backend/cuda/sort.cu +++ b/src/backend/cuda/sort.cu @@ -22,10 +22,11 @@ namespace cuda { Array out = copyArray(in); switch(dim) { - - case 0: kernel::sort0(out); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + case 0: kernel::sort0(out); break; + case 1: kernel::sortBatched(out); break; + case 2: kernel::sortBatched(out); break; + case 3: kernel::sortBatched(out); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } return out; } diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp index 013d8c53a9..7b7799ca89 100644 --- a/src/backend/opencl/kernel/sort.hpp +++ b/src/backend/opencl/kernel/sort.hpp @@ -15,6 +15,9 @@ #include #include #include +#include +#include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" @@ -38,18 +41,8 @@ namespace opencl { namespace kernel { - using std::conditional; - using std::is_same; - template - using ltype_t = typename conditional::value, cl_long, T>::type; - - template - using type_t = typename conditional::value, - cl_ulong, ltype_t - >::type; - template - void sort0(Param val) + void sort0Iterative(Param val) { try { compute::command_queue c_queue(getQueue()()); @@ -85,6 +78,80 @@ namespace opencl throw; } } + + template + void sortBatched(Param pVal) + { + try{ + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pVal.info.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + dim4 keydims = inDims; + cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(uint)); + Param pKey; + pKey.data = key; + pKey.info.offset = 0; + pKey.info.dims[0] = keydims[0]; + pKey.info.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pKey.info.dims[i] = keydims[i]; + pKey.info.strides[i] = pKey.info.strides[i - 1] * pKey.info.dims[i - 1]; + } + kernel::iota(pKey, seqDims, tileDims); + + // Flat + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + pKey.info.dims[0] = inDims.elements(); + pKey.info.strides[0] = 1; + pVal.info.dims[0] = inDims.elements(); + pVal.info.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pKey.info.dims[i] = 1; + pKey.info.strides[i] = pKey.info.strides[i - 1] * pKey.info.dims[i - 1]; + pVal.info.dims[i] = 1; + pVal.info.strides[i] = pVal.info.strides[i - 1] * pVal.info.dims[i - 1]; + } + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + kernel::sort0_by_key(pVal, pKey); + + // Needs to be ascending (true) in order to maintain the indices properly + kernel::sort0_by_key(pKey, pVal); + + // No need of doing moddims here because the original Array + // dimensions have not been changed + //val.modDims(inDims); + + CL_DEBUG_FINISH(getQueue()); + bufferFree(key); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sort0(Param val) + { + int higherDims = val.info.dims[1] * val.info.dims[2] * val.info.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 10) + sortBatched(val); + else + kernel::sort0Iterative(val); + } } } diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp index e3306e6e68..c3807f7a31 100644 --- a/src/backend/opencl/kernel/sort_by_key.hpp +++ b/src/backend/opencl/kernel/sort_by_key.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" @@ -38,33 +39,6 @@ namespace opencl { namespace kernel { - using std::conditional; - using std::is_same; - - // If type is cdouble, return std::complex, else return T - template - using ztype_t = typename conditional::value, - std::complex, T - >::type; - - // If type is cfloat, return std::complex, else return ztype_t - template - using ctype_t = typename conditional::value, - std::complex, ztype_t - >::type; - - // If type is intl, return cl_long, else return ctype_t - template - using ltype_t = typename conditional::value, - cl_long, ctype_t - >::type; - - // If type is uintl, return cl_ulong, else return ltype_t - template - using type_t = typename conditional::value, - cl_ulong, ltype_t - >::type; - template void sort0_by_key(Param okey, Param oval) { diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp new file mode 100644 index 0000000000..07ab0eeb69 --- /dev/null +++ b/src/backend/opencl/kernel/sort_helper.hpp @@ -0,0 +1,46 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include + +namespace opencl +{ + namespace kernel + { + using std::conditional; + using std::is_same; + + // If type is cdouble, return std::complex, else return T + template + using ztype_t = typename conditional::value, + std::complex, T + >::type; + + // If type is cfloat, return std::complex, else return ztype_t + template + using ctype_t = typename conditional::value, + std::complex, ztype_t + >::type; + + // If type is intl, return cl_long, else return ctype_t + template + using ltype_t = typename conditional::value, + cl_long, ctype_t + >::type; + + // If type is uintl, return cl_ulong, else return ltype_t + template + using type_t = typename conditional::value, + cl_ulong, ltype_t + >::type; + + } +} diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp index 4926dc34c3..0fa4847fc1 100644 --- a/src/backend/opencl/kernel/sort_index.hpp +++ b/src/backend/opencl/kernel/sort_index.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" @@ -39,16 +40,6 @@ namespace opencl { namespace kernel { - using std::conditional; - using std::is_same; - template - using ltype_t = typename conditional::value, cl_long, T>::type; - - template - using type_t = typename conditional::value, - cl_ulong, ltype_t - >::type; - template void sort0_index(Param val, Param idx) { diff --git a/src/backend/opencl/sort.cpp b/src/backend/opencl/sort.cpp index 762d815095..0bf2dc04cd 100644 --- a/src/backend/opencl/sort.cpp +++ b/src/backend/opencl/sort.cpp @@ -23,9 +23,11 @@ namespace opencl try { Array out = copyArray(in); switch(dim) { - case 0: kernel::sort0(out); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + case 0: kernel::sort0(out); break; + case 1: kernel::sortBatched(out); break; + case 2: kernel::sortBatched(out); break; + case 3: kernel::sortBatched(out); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } return out; } catch (std::exception &ex) { diff --git a/test/sort.cpp b/test/sort.cpp index 7ec6f5565e..116b136abe 100644 --- a/test/sort.cpp +++ b/test/sort.cpp @@ -107,15 +107,15 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, bool SORT_INIT(SortMedTrue, sort_med1, true, 0); SORT_INIT(SortMedFalse, sort_med1, false, 2); // Takes too much time in current implementation. Enable when everything is parallel - //SORT_INIT(SortMed5True, sort_med, true, 0); - //SORT_INIT(SortMed5False, sort_med, false, 2); - //SORT_INIT(SortLargeTrue, sort_large, true, 0); - //SORT_INIT(SortLargeFalse, sort_large, false, 2); + SORT_INIT(SortMed5True, sort_med, true, 0); + SORT_INIT(SortMed5False, sort_med, false, 2); + SORT_INIT(SortLargeTrue, sort_large, true, 0); + SORT_INIT(SortLargeFalse, sort_large, false, 2); ////////////////////////////////////// CPP //////////////////////////////// // -TEST(Sort, CPP) +TEST(Sort, CPPDim0) { if (noDoubleTests()) return; @@ -147,3 +147,70 @@ TEST(Sort, CPP) delete[] sxData; } +TEST(Sort, CPPDim1) +{ + if (noDoubleTests()) return; + + const bool dir = true; + const unsigned resultIdx0 = 0; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_10x10.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array input(idims, &(in[0].front())); + + af::array input_ = reorder(input, 1, 0, 2, 3); + + af::array output = af::sort(input_, 1, dir); + + size_t nElems = tests[resultIdx0].size(); + + // Get result + float* sxData = new float[tests[resultIdx0].size()]; + output.host((void*)sxData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] sxData; +} + +TEST(Sort, CPPDim2) +{ + if (noDoubleTests()) return; + + const bool dir = false; + const unsigned resultIdx0 = 2; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_med.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array input(idims, &(in[0].front())); + + af::array input_ = reorder(input, 1, 2, 0, 3); + + af::array output = af::sort(input_, 2, dir); + + size_t nElems = tests[resultIdx0].size(); + + // Get result + float* sxData = new float[tests[resultIdx0].size()]; + output.host((void*)sxData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] sxData; +} From e6c9e934a33f5e97a1ff97061cde490d8d5b898e Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Mar 2016 17:29:44 -0400 Subject: [PATCH 05/28] Revert "Updated boost.compute to latest develop commit" This reverts commit 98e023d7a10e301f793c04246c80bcc1c1bcfd1b. --- CMakeModules/build_boost_compute.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeModules/build_boost_compute.cmake b/CMakeModules/build_boost_compute.cmake index eca4e32771..03c20435a8 100644 --- a/CMakeModules/build_boost_compute.cmake +++ b/CMakeModules/build_boost_compute.cmake @@ -1,9 +1,9 @@ # If using a commit, remove the v prefix to VER in URL. # If using a tag, don't use v in VER # This is because of how github handles it's release tar balls -SET(VER 523d8e974559977fab006190e9d40eb2e4f87bd0) -SET(URL https://github.com/boostorg/compute/archive/${VER}.tar.gz) -SET(MD5 bbce9e2730e449db5c8f88eae160ea12) +SET(VER 0.5) +SET(URL https://github.com/boostorg/compute/archive/v${VER}.tar.gz) +SET(MD5 69a52598ac539d3b7f6005a3dd2b6f58) SET(thirdPartyDir "${CMAKE_BINARY_DIR}/third_party") SET(srcDir "${thirdPartyDir}/compute-${VER}") From aaa0a056744d46c4d9027ff9ef25ceed3fe8fe3a Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 31 Mar 2016 17:40:13 -0400 Subject: [PATCH 06/28] Boost.Compute sort/sort_by_key are stable in v0.5. So revert to that --- src/backend/opencl/kernel/sort.hpp | 6 +++--- src/backend/opencl/kernel/sort_by_key.hpp | 8 ++++---- src/backend/opencl/kernel/sort_index.hpp | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp index 7b7799ca89..63f1658208 100644 --- a/src/backend/opencl/kernel/sort.hpp +++ b/src/backend/opencl/kernel/sort.hpp @@ -23,7 +23,7 @@ #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #include -#include +#include #include #include @@ -58,12 +58,12 @@ namespace opencl int valOffset = valWZ + y * val.info.strides[1]; if(isAscending) { - compute::stable_sort( + compute::sort( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), compute::less< type_t >(), c_queue); } else { - compute::stable_sort( + compute::sort( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), compute::greater< type_t >(), c_queue); diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp index c3807f7a31..513ddbfb6d 100644 --- a/src/backend/opencl/kernel/sort_by_key.hpp +++ b/src/backend/opencl/kernel/sort_by_key.hpp @@ -21,7 +21,7 @@ #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #include -#include +#include #include #include @@ -60,12 +60,12 @@ namespace opencl int ovalOffset = ovalWZ + y * oval.info.strides[1]; compute::buffer_iterator< type_t > start= compute::make_buffer_iterator< type_t >(okey_buf, okeyOffset); - compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(okey_buf, okeyOffset + okey.info.dims[0]); + compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(okey_buf, okeyOffset + okey.info.dims[0]); compute::buffer_iterator< type_t > vals = compute::make_buffer_iterator< type_t >(oval_buf, ovalOffset); if(isAscending) { - compute::stable_sort_by_key(start, end, vals, c_queue); + compute::sort_by_key(start, end, vals, c_queue); } else { - compute::stable_sort_by_key(start, end, vals, + compute::sort_by_key(start, end, vals, compute::greater< type_t >(), c_queue); } } diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp index 0fa4847fc1..aae0a94ea6 100644 --- a/src/backend/opencl/kernel/sort_index.hpp +++ b/src/backend/opencl/kernel/sort_index.hpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include @@ -64,12 +64,12 @@ namespace opencl compute::iota(idx_begin, idx_begin + val.info.dims[0], 0, c_queue); if(isAscending) { - compute::stable_sort_by_key( + compute::sort_by_key( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), idx_begin, compute::less< type_t >(), c_queue); } else { - compute::stable_sort_by_key( + compute::sort_by_key( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), idx_begin, compute::greater< type_t >(), c_queue); From 3422d012df209db8a1c563d5937b12b8b8b90235 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Apr 2016 11:45:09 -0400 Subject: [PATCH 07/28] Call modDims when setDataDims is called --- src/api/c/moddims.cpp | 1 - src/backend/cpu/Array.hpp | 1 + src/backend/cuda/Array.hpp | 1 + src/backend/opencl/Array.hpp | 1 + 4 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp index b8f1fafa6c..1d326c0846 100644 --- a/src/api/c/moddims.cpp +++ b/src/api/c/moddims.cpp @@ -31,7 +31,6 @@ Array modDims(const Array& in, const af::dim4 &newDims) Out = copyArray(in); } - Out.modDims(newDims); Out.setDataDims(newDims); return Out; diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 2809a2b80d..cf970d18c7 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -181,6 +181,7 @@ namespace cpu void setDataDims(const dim4 &new_dims) { + modDims(new_dims); data_dims = new_dims; } diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index c2292087aa..1f9512fb8d 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -174,6 +174,7 @@ namespace cuda void setDataDims(const dim4 &new_dims) { + modDims(new_dims); data_dims = new_dims; } diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index ada3b41dc3..f83d5c0120 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -211,6 +211,7 @@ namespace opencl void setDataDims(const dim4 &new_dims) { + modDims(new_dims); data_dims = new_dims; } From 3fea8a2649a0a077dadb2de5fba7174c67c8c70b Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 1 Apr 2016 11:46:49 -0400 Subject: [PATCH 08/28] Fix temp Array T and moddims in CPU batched sort --- src/backend/cpu/sort.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index c3c5286fbf..fbf613f962 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -37,19 +37,19 @@ void sortBatched(Array& val) Array key = iota(seqDims, tileDims); - Array *resKey = initArray(); - Array *resVal = initArray(); + Array resKey = createEmptyArray(dim4()); + Array resVal = createEmptyArray(dim4()); - val.modDims(inDims.elements()); - key.modDims(inDims.elements()); + val.setDataDims(inDims.elements()); + key.setDataDims(inDims.elements()); - sort_by_key(*resVal, *resKey, val, key, 0); + sort_by_key(resVal, resKey, val, key, 0); // Needs to be ascending (true) in order to maintain the indices properly - sort_by_key(key, val, *resKey, *resVal, 0); + sort_by_key(key, val, resKey, resVal, 0); val.eval(); - val.modDims(inDims); + val.setDataDims(inDims); } template From 03ce51f87c7fd3ff69808ff41cb75e33b64edb13 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 4 Apr 2016 15:40:25 -0400 Subject: [PATCH 09/28] Call thrust/compute::sort_by_key instead of kernel::sort_by_key wrapper --- src/backend/cuda/kernel/sort.hpp | 24 +++++++++++++++++++++--- src/backend/opencl/kernel/sort.hpp | 21 ++++++++++++++++++--- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/backend/cuda/kernel/sort.hpp b/src/backend/cuda/kernel/sort.hpp index 21d9122d64..f0095b144d 100644 --- a/src/backend/cuda/kernel/sort.hpp +++ b/src/backend/cuda/kernel/sort.hpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -92,10 +91,29 @@ namespace cuda // Sort indices // sort_by_key(*resVal, *resKey, val, key, 0); - kernel::sort0_by_key(pVal, pKey); + //kernel::sort0_by_key(pVal, pKey); + thrust::device_ptr pVal_ptr = thrust::device_pointer_cast(pVal.ptr); + thrust::device_ptr pKey_ptr = thrust::device_pointer_cast(pKey.ptr); + if(isAscending) { + THRUST_SELECT(thrust::stable_sort_by_key, + pVal_ptr, + pVal_ptr + pVal.dims[0], + pKey_ptr); + } else { + THRUST_SELECT(thrust::stable_sort_by_key, + pVal_ptr, + pVal_ptr + pVal.dims[0], + pKey_ptr, thrust::greater()); + } + POST_LAUNCH_CHECK(); // Needs to be ascending (true) in order to maintain the indices properly - kernel::sort0_by_key(pKey, pVal); + //kernel::sort0_by_key(pKey, pVal); + THRUST_SELECT(thrust::stable_sort_by_key, + pKey_ptr, + pKey_ptr + pVal.dims[0], + pVal_ptr); + POST_LAUNCH_CHECK(); // No need of doing moddims here because the original Array // dimensions have not been changed diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp index 63f1658208..b9a7d39b28 100644 --- a/src/backend/opencl/kernel/sort.hpp +++ b/src/backend/opencl/kernel/sort.hpp @@ -17,13 +17,13 @@ #include #include #include -#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #include #include +#include #include #include @@ -125,10 +125,25 @@ namespace opencl // Sort indices // sort_by_key(*resVal, *resKey, val, key, 0); - kernel::sort0_by_key(pVal, pKey); + //kernel::sort0_by_key(pVal, pKey); + compute::command_queue c_queue(getQueue()()); + + compute::buffer pKey_buf((*pKey.data)()); + compute::buffer pVal_buf((*pVal.data)()); + + compute::buffer_iterator< type_t > val0 = compute::make_buffer_iterator< type_t >(pVal_buf, 0); + compute::buffer_iterator< type_t > valN = compute::make_buffer_iterator< type_t >(pVal_buf,+ pVal.info.dims[0]); + compute::buffer_iterator< type_t > key0 = compute::make_buffer_iterator< type_t >(pKey_buf, 0); + compute::buffer_iterator< type_t > keyN = compute::make_buffer_iterator< type_t >(pKey_buf, pKey.info.dims[0]); + if(isAscending) { + compute::sort_by_key(val0, valN, key0, c_queue); + } else { + compute::sort_by_key(val0, valN, key0, compute::greater< type_t >(), c_queue); + } // Needs to be ascending (true) in order to maintain the indices properly - kernel::sort0_by_key(pKey, pVal); + //kernel::sort0_by_key(pKey, pVal); + compute::sort_by_key(key0, keyN, val0, c_queue); // No need of doing moddims here because the original Array // dimensions have not been changed From 6c47b15657019ad815db966512a096b212ca8d8e Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 4 Apr 2016 19:28:22 -0400 Subject: [PATCH 10/28] PERF Add batched sort to sort_index for CPU and CUDA --- src/api/c/sort.cpp | 2 - src/backend/cpu/kernel/sort_helper.hpp | 56 ++++++++++++ src/backend/cpu/kernel/sort_index.hpp | 117 +++++++++++++++++++----- src/backend/cpu/sort_index.cpp | 13 ++- src/backend/cuda/kernel/harris.hpp | 2 +- src/backend/cuda/kernel/iota.hpp | 6 +- src/backend/cuda/kernel/orb.hpp | 2 +- src/backend/cuda/kernel/sort_helper.hpp | 66 +++++++++++++ src/backend/cuda/kernel/sort_index.hpp | 87 +++++++++++++++++- src/backend/cuda/sort_index.cu | 14 ++- test/sort_index.cpp | 3 +- 11 files changed, 325 insertions(+), 43 deletions(-) create mode 100644 src/backend/cpu/kernel/sort_helper.hpp create mode 100644 src/backend/cuda/kernel/sort_helper.hpp diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp index e3f3ae35da..7f81fbc540 100644 --- a/src/api/c/sort.cpp +++ b/src/api/c/sort.cpp @@ -91,8 +91,6 @@ af_err af_sort_index(af_array *out, af_array *indices, const af_array in, const af_dtype type = info.getType(); DIM_ASSERT(2, info.elements() > 0); - // Only Dim 0 supported - ARG_ASSERT(3, dim == 0); af_array val; af_array idx; diff --git a/src/backend/cpu/kernel/sort_helper.hpp b/src/backend/cpu/kernel/sort_helper.hpp new file mode 100644 index 0000000000..4b8f2f95b0 --- /dev/null +++ b/src/backend/cpu/kernel/sort_helper.hpp @@ -0,0 +1,56 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ + namespace kernel + { + static const int copyPairIter = 4; + + template + using IndexPair = std::pair; + + template + struct IPCompare + { + bool operator()(const IndexPair &lhs, const IndexPair &rhs) + { + // Check stable sort condition + if(isAscending) return (lhs.first < rhs.first); + else return (lhs.first > rhs.first); + } + }; + + template + using KeyIndexPair = std::pair, uint>; + + template + struct KIPCompareV + { + bool operator()(const KeyIndexPair &lhs, const KeyIndexPair &rhs) + { + // Check stable sort condition + if(isAscending) return (lhs.first.first < rhs.first.first); + else return (lhs.first.first > rhs.first.first); + } + }; + + template + struct KIPCompareK + { + bool operator()(const KeyIndexPair &lhs, const KeyIndexPair &rhs) + { + if(isAscending) return (lhs.second < rhs.second); + else return (lhs.second > rhs.second); + } + }; + } +} diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp index b71cc47071..1b86507aae 100644 --- a/src/backend/cpu/kernel/sort_index.hpp +++ b/src/backend/cpu/kernel/sort_index.hpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace cpu { @@ -22,50 +23,118 @@ namespace kernel { template -void sort0_index(Array val, Array idx, const Array in) +void sort0IndexIterative(Array val, Array idx) { // initialize original index locations - uint *idx_ptr = idx.get(); - T *val_ptr = val.get(); - const T *in_ptr = in.get(); - function op = std::greater(); - if(isAscending) { op = std::less(); } + uint *idx_ptr = idx.get(); + T *val_ptr = val.get(); - std::vector seq_vec(idx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); + std::vector > X; + X.reserve(val.dims()[0]); - const T *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < in.dims()[3]; w++) { + for(dim_t w = 0; w < val.dims()[3]; w++) { dim_t valW = w * val.strides()[3]; dim_t idxW = w * idx.strides()[3]; - dim_t inW = w * in.strides()[3]; - for(dim_t z = 0; z < in.dims()[2]; z++) { + for(dim_t z = 0; z < val.dims()[2]; z++) { dim_t valWZ = valW + z * val.strides()[2]; dim_t idxWZ = idxW + z * idx.strides()[2]; - dim_t inWZ = inW + z * in.strides()[2]; - for(dim_t y = 0; y < in.dims()[1]; y++) { - + for(dim_t y = 0; y < val.dims()[1]; y++) { dim_t valOffset = valWZ + y * val.strides()[1]; dim_t idxOffset = idxWZ + y * idx.strides()[1]; - dim_t inOffset = inWZ + y * in.strides()[1]; - uint *ptr = idx_ptr + idxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); + X.clear(); + std::transform(val_ptr + valOffset, val_ptr + valOffset + val.dims()[0], + idx_ptr + idxOffset, + std::back_inserter(X), + [](T v_, uint i_) { return std::make_pair(v_, i_); } + ); + + //comp_ptr = &X.front(); + std::stable_sort(X.begin(), X.end(), IPCompare()); + + for(unsigned it = 0; it < X.size(); it++) { + val_ptr[valOffset + it] = X[it].first; + idx_ptr[idxOffset + it] = X[it].second; + } + } + } + } + + return; +} + +template +void sortIndexBatched(Array val, Array idx) +{ + af::dim4 inDims = val.dims(); + + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; - comp_ptr = in_ptr + inOffset; - std::stable_sort(ptr, ptr + in.dims()[0], comparator); + uint* key = memAlloc(inDims.elements()); + // IOTA + { + af::dim4 dims = inDims; + uint* out = key; + af::dim4 strides(1); + for(int i = 1; i < 4; i++) + strides[i] = strides[i-1] * dims[i-1]; - for (dim_t i = 0; i < val.dims()[0]; ++i){ - val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]]; + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + T valW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + T valZ = valW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + T valY = valZ + (y % seqDims[1]) * seqDims[0]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + out[id] = valY + (x % seqDims[0]); + } } } } } + // initialize original index locations + uint *idx_ptr = idx.get(); + T *val_ptr = val.get(); + + std::vector > X; + X.reserve(val.elements()); + + for(unsigned i = 0; i < val.elements(); i++) { + X.push_back(std::make_pair(std::make_pair(val_ptr[i], idx_ptr[i]), key[i])); + } + + memFree(key); // key is no longer required + + std::stable_sort(X.begin(), X.end(), KIPCompareV()); + + std::stable_sort(X.begin(), X.end(), KIPCompareK()); + + for(unsigned it = 0; it < val.elements(); it++) { + val_ptr[it] = X[it].first.first; + idx_ptr[it] = X[it].first.second; + } + return; } +template +void sort0Index(Array val, Array idx) +{ + int higherDims = val.dims()[1] * val.dims()[2] * val.dims()[3]; + // TODO Make a better heurisitic + if(higherDims > 0) + kernel::sortIndexBatched(val, idx); + else + kernel::sort0IndexIterative(val, idx); +} + } } diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index 77860ede18..883cb24bb3 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include namespace cpu @@ -24,10 +26,15 @@ void sort_index(Array &val, Array &idx, const Array &in, const uint { in.eval(); - val = createEmptyArray(in.dims()); - idx = createEmptyArray(in.dims()); + val = copyArray(in); + idx = range(in.dims(), dim); + idx.eval(); + switch(dim) { - case 0: getQueue().enqueue(kernel::sort0_index, val, idx, in); break; + case 0: getQueue().enqueue(kernel::sort0Index, val, idx); break; + case 1: getQueue().enqueue(kernel::sortIndexBatched, val, idx); break; + case 2: getQueue().enqueue(kernel::sortIndexBatched, val, idx); break; + case 3: getQueue().enqueue(kernel::sortIndexBatched, val, idx); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } } diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp index 44f98d92c1..3cb28b2b2f 100644 --- a/src/backend/cuda/kernel/harris.hpp +++ b/src/backend/cuda/kernel/harris.hpp @@ -339,7 +339,7 @@ void harris(unsigned* corners_out, harris_idx.ptr = memAlloc(sort_elem); // Sort Harris responses - sort0_index(harris_responses, harris_idx); + sort0Index(harris_responses, harris_idx); *x_out = memAlloc(*corners_out); *y_out = memAlloc(*corners_out); diff --git a/src/backend/cuda/kernel/iota.hpp b/src/backend/cuda/kernel/iota.hpp index 2632266c92..fc28c82882 100644 --- a/src/backend/cuda/kernel/iota.hpp +++ b/src/backend/cuda/kernel/iota.hpp @@ -18,8 +18,8 @@ namespace cuda namespace kernel { // Kernel Launch Config Values - static const unsigned TX = 32; - static const unsigned TY = 8; + static const unsigned IOTA_TX = 32; + static const unsigned IOTA_TY = 8; static const unsigned TILEX = 512; static const unsigned TILEY = 32; @@ -71,7 +71,7 @@ namespace cuda template void iota(Param out, const dim4 &sdims, const dim4 &tdims) { - dim3 threads(TX, TY, 1); + dim3 threads(IOTA_TX, IOTA_TY, 1); int blocksPerMatX = divup(out.dims[0], TILEX); int blocksPerMatY = divup(out.dims[1], TILEY); diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp index 89de56065d..8448418f8b 100644 --- a/src/backend/cuda/kernel/orb.hpp +++ b/src/backend/cuda/kernel/orb.hpp @@ -397,7 +397,7 @@ void orb(unsigned* out_feat, harris_idx.ptr = memAlloc(sort_elem); // Sort features according to Harris responses - sort0_index(harris_sorted, harris_idx); + sort0Index(harris_sorted, harris_idx); feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]); diff --git a/src/backend/cuda/kernel/sort_helper.hpp b/src/backend/cuda/kernel/sort_helper.hpp new file mode 100644 index 0000000000..445e9a28f5 --- /dev/null +++ b/src/backend/cuda/kernel/sort_helper.hpp @@ -0,0 +1,66 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include + +// This needs to be in global namespace as it is used by thrust +template +struct IndexPair +{ + T val; + uint idx; +}; + +template +struct IPCompare +{ + __host__ __device__ + bool operator()(const IndexPair &lhs, const IndexPair &rhs) const + { + // Check stable sort condition + if(isAscending) return (lhs.val < rhs.val); + else return (lhs.val > rhs.val); + } +}; + +namespace cuda +{ + namespace kernel + { + static const int copyPairIter = 4; + + template + __global__ + void makeIndexPair(IndexPair *out, const Tk *key, const Tv *val, const int N) + { + int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; + + for(int i = tIdx; i < N; i += blockDim.x) + { + out[i].val = val[i]; + out[i].idx = key[i]; + } + } + + template + __global__ + void splitIndexPair(Tk *key, Tv *val, const IndexPair *out, const int N) + { + int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; + + for(int i = tIdx; i < N; i += blockDim.x) + { + val[i] = out[i].val; + key[i] = out[i].idx; + } + } + } +} diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp index 8762f28a4a..d23c503005 100644 --- a/src/backend/cuda/kernel/sort_index.hpp +++ b/src/backend/cuda/kernel/sort_index.hpp @@ -12,8 +12,11 @@ #include #include #include +#include +#include + #include -#include +#include #include namespace cuda @@ -24,7 +27,7 @@ namespace cuda // Wrapper functions /////////////////////////////////////////////////////////////////////////// template - void sort0_index(Param val, Param idx) + void sort0IndexIterative(Param val, Param idx) { thrust::device_ptr val_ptr = thrust::device_pointer_cast(val.ptr); thrust::device_ptr idx_ptr = thrust::device_pointer_cast(idx.ptr); @@ -40,7 +43,6 @@ namespace cuda int valOffset = valWZ + y * val.strides[1]; int idxOffset = idxWZ + y * idx.strides[1]; - THRUST_SELECT(thrust::sequence, idx_ptr + idxOffset, idx_ptr + idxOffset + idx.dims[0]); if(isAscending) { THRUST_SELECT(thrust::stable_sort_by_key, val_ptr + valOffset, val_ptr + valOffset + val.dims[0], @@ -55,5 +57,84 @@ namespace cuda } POST_LAUNCH_CHECK(); } + + template + void sortIndexBatched(Param pVal, Param pIdx) + { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pVal.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + dim4 keydims = inDims; + uint* key = memAlloc(keydims.elements()); + Param pKey; + pKey.ptr = key; + pKey.strides[0] = 1; + pKey.dims[0] = keydims[0]; + for(int i = 1; i < 4; i++) { + pKey.dims[i] = keydims[i]; + pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1]; + } + cuda::kernel::iota(pKey, seqDims, tileDims); + + // Flat - Not required since inplace and both are continuous + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + + // Make val, idx into a pair + thrust::device_vector > X(inDims.elements()); + IndexPair *Xptr = thrust::raw_pointer_cast(X.data()); + + const int threads = 256; + int blocks = divup(inDims.elements(), threads * copyPairIter); + CUDA_LAUNCH((makeIndexPair), blocks, threads, + Xptr, pIdx.ptr, pVal.ptr, inDims.elements()); + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + THRUST_SELECT(thrust::stable_sort_by_key, + X.begin(), X.end(), + pKey.ptr, + IPCompare()); + POST_LAUNCH_CHECK(); + + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + THRUST_SELECT(thrust::stable_sort_by_key, + pKey.ptr, + pKey.ptr + inDims.elements(), + X.begin()); + POST_LAUNCH_CHECK(); + + CUDA_LAUNCH((splitIndexPair), blocks, threads, + pIdx.ptr, pVal.ptr, Xptr, inDims.elements()); + POST_LAUNCH_CHECK(); + + // No need of doing moddims here because the original Array + // dimensions have not been changed + //val.modDims(inDims); + + memFree(key); + } + + template + void sort0Index(Param val, Param idx) + { + int higherDims = val.dims[1] * val.dims[2] * val.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 5) + sortIndexBatched(val, idx); + else + kernel::sort0IndexIterative(val, idx); + } } } diff --git a/src/backend/cuda/sort_index.cu b/src/backend/cuda/sort_index.cu index 606aab4eb1..270df30128 100644 --- a/src/backend/cuda/sort_index.cu +++ b/src/backend/cuda/sort_index.cu @@ -9,11 +9,13 @@ #include #include -#include #include +#include #include #include #include +#include +#include namespace cuda { @@ -21,10 +23,14 @@ namespace cuda void sort_index(Array &val, Array &idx, const Array &in, const uint dim) { val = copyArray(in); - idx = createEmptyArray(in.dims()); + idx = range(in.dims(), dim); + idx.eval(); + switch(dim) { - case 0: kernel::sort0_index(val, idx); - break; + case 0: kernel::sort0Index(val, idx); break; + case 1: kernel::sortIndexBatched(val, idx); break; + case 2: kernel::sortIndexBatched(val, idx); break; + case 3: kernel::sortIndexBatched(val, idx); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } } diff --git a/test/sort_index.cpp b/test/sort_index.cpp index 6aa240d5a5..2326ed706f 100644 --- a/test/sort_index.cpp +++ b/test/sort_index.cpp @@ -124,12 +124,11 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const //SORT_INIT(SortMed5False, sort_med, false, 2, 3); //SORT_INIT(SortLargeTrue, sort_large, true, 0, 1); //SORT_INIT(SortLargeFalse, sort_large, false, 2, 3); -; //////////////////////////////////// CPP ///////////////////////////////// // -TEST(SortIndex, CPP) +TEST(SortIndex, CPPDim0) { if (noDoubleTests()) return; From 48dcfb0cb5c7d6c94d028f13eafcba7852a67741 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Mon, 18 Apr 2016 18:08:44 -0400 Subject: [PATCH 11/28] Fix ordering of data from sort --- src/backend/cpu/sort.cpp | 16 +++++++++++++++- src/backend/cuda/sort.cu | 14 ++++++++++++++ src/backend/opencl/sort.cpp | 14 ++++++++++++++ test/sort.cpp | 4 ++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index fbf613f962..7fa8769f11 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include namespace cpu @@ -49,7 +50,7 @@ void sortBatched(Array& val) sort_by_key(key, val, resKey, resVal, 0); val.eval(); - val.setDataDims(inDims); + val.setDataDims(inDims); // This is correct only for dim0 } template @@ -76,6 +77,19 @@ Array sort(const Array &in, const unsigned dim) case 3: sortBatched(out); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = out.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = out.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = out.dims()[i - 1]; + } + + out = reorder(out, reorderDims); + } return out; } diff --git a/src/backend/cuda/sort.cu b/src/backend/cuda/sort.cu index 4ae3b759fb..99b42d4196 100644 --- a/src/backend/cuda/sort.cu +++ b/src/backend/cuda/sort.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,19 @@ namespace cuda case 3: kernel::sortBatched(out); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = out.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = out.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = out.dims()[i - 1]; + } + + out = reorder(out, reorderDims); + } return out; } diff --git a/src/backend/opencl/sort.cpp b/src/backend/opencl/sort.cpp index 0bf2dc04cd..c7bd774ecd 100644 --- a/src/backend/opencl/sort.cpp +++ b/src/backend/opencl/sort.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,19 @@ namespace opencl case 3: kernel::sortBatched(out); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = out.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = out.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = out.dims()[i - 1]; + } + + out = reorder(out, reorderDims); + } return out; } catch (std::exception &ex) { AF_ERROR(ex.what(), AF_ERR_INTERNAL); diff --git a/test/sort.cpp b/test/sort.cpp index 116b136abe..977b54b5c7 100644 --- a/test/sort.cpp +++ b/test/sort.cpp @@ -166,6 +166,8 @@ TEST(Sort, CPPDim1) af::array output = af::sort(input_, 1, dir); + output = reorder(output, 1, 0, 2, 3); // Required for checking with test data + size_t nElems = tests[resultIdx0].size(); // Get result @@ -200,6 +202,8 @@ TEST(Sort, CPPDim2) af::array output = af::sort(input_, 2, dir); + output = reorder(output, 2, 0, 1, 3); // Required for checking with test data + size_t nElems = tests[resultIdx0].size(); // Get result From c79b26fa58dfd3d8493a1cea5460fa573cced894 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Tue, 19 Apr 2016 10:46:23 -0400 Subject: [PATCH 12/28] Fix reordering of data for sort_index in cpu and cuda --- src/backend/cpu/sort.cpp | 1 + src/backend/cpu/sort_index.cpp | 18 ++++++++++++++++++ src/backend/cuda/sort.cu | 1 + src/backend/cuda/sort_index.cu | 17 +++++++++++++++++ src/backend/opencl/sort.cpp | 1 + 5 files changed, 38 insertions(+) diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index 7fa8769f11..4a649e0b23 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -88,6 +88,7 @@ Array sort(const Array &in, const unsigned dim) preorderDims[i] = out.dims()[i - 1]; } + out.setDataDims(preorderDims); out = reorder(out, reorderDims); } return out; diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index 883cb24bb3..36ca57b3e8 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include namespace cpu @@ -37,6 +38,23 @@ void sort_index(Array &val, Array &idx, const Array &in, const uint case 3: getQueue().enqueue(kernel::sortIndexBatched, val, idx); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = val.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = val.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = val.dims()[i - 1]; + } + + val.setDataDims(preorderDims); + idx.setDataDims(preorderDims); + + val = reorder(val, reorderDims); + idx = reorder(idx, reorderDims); + } } #define INSTANTIATE(T) \ diff --git a/src/backend/cuda/sort.cu b/src/backend/cuda/sort.cu index 99b42d4196..9b0f4c53af 100644 --- a/src/backend/cuda/sort.cu +++ b/src/backend/cuda/sort.cu @@ -40,6 +40,7 @@ namespace cuda preorderDims[i] = out.dims()[i - 1]; } + out.setDataDims(preorderDims); out = reorder(out, reorderDims); } return out; diff --git a/src/backend/cuda/sort_index.cu b/src/backend/cuda/sort_index.cu index 270df30128..ab54c24a9c 100644 --- a/src/backend/cuda/sort_index.cu +++ b/src/backend/cuda/sort_index.cu @@ -33,6 +33,23 @@ namespace cuda case 3: kernel::sortIndexBatched(val, idx); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = val.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = val.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = val.dims()[i - 1]; + } + + val.setDataDims(preorderDims); + idx.setDataDims(preorderDims); + + val = reorder(val, reorderDims); + idx = reorder(idx, reorderDims); + } } #define INSTANTIATE(T) \ diff --git a/src/backend/opencl/sort.cpp b/src/backend/opencl/sort.cpp index c7bd774ecd..1548f27472 100644 --- a/src/backend/opencl/sort.cpp +++ b/src/backend/opencl/sort.cpp @@ -41,6 +41,7 @@ namespace opencl preorderDims[i] = out.dims()[i - 1]; } + out.setDataDims(preorderDims); out = reorder(out, reorderDims); } return out; From 08613a817564f89d18397f53de83b2633ed9d97b Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 20 Apr 2016 13:22:00 -0400 Subject: [PATCH 13/28] Added sort_by_key batching to CPU and CUDA --- src/api/c/sort.cpp | 2 - src/backend/cpu/kernel/sort_by_key.hpp | 127 +++++++++++++++++------- src/backend/cpu/kernel/sort_helper.hpp | 22 ++-- src/backend/cpu/kernel/sort_index.hpp | 16 +-- src/backend/cpu/sort_by_key.cpp | 32 ++++-- src/backend/cuda/kernel/sort_by_key.hpp | 83 +++++++++++++++- src/backend/cuda/kernel/sort_helper.hpp | 26 ++--- src/backend/cuda/kernel/sort_index.hpp | 37 +++---- src/backend/cuda/sort_by_key_impl.hpp | 27 ++++- 9 files changed, 274 insertions(+), 98 deletions(-) diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp index 7f81fbc540..dd58175936 100644 --- a/src/api/c/sort.cpp +++ b/src/api/c/sort.cpp @@ -174,8 +174,6 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values, DIM_ASSERT(3, kinfo.elements() > 0); DIM_ASSERT(4, kinfo.dims() == vinfo.dims()); - // Only Dim 0 supported - ARG_ASSERT(5, dim == 0); TYPE_ASSERT(kinfo.isReal()); diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp index f9d391dc46..1be4a94d3a 100644 --- a/src/backend/cpu/kernel/sort_by_key.hpp +++ b/src/backend/cpu/kernel/sort_by_key.hpp @@ -16,6 +16,7 @@ #include #include #include +#include namespace cpu { @@ -23,57 +24,40 @@ namespace kernel { template -void sort0_by_key(Array okey, Array oval, Array oidx, - const Array ikey, const Array ival) +void sort0ByKeyIterative(Array okey, Array oval) { - function op = std::greater(); - if(isAscending) { op = std::less(); } - // Get pointers and initialize original index locations - uint *oidx_ptr = oidx.get(); - Tk *okey_ptr = okey.get(); - Tv *oval_ptr = oval.get(); - const Tk *ikey_ptr = ikey.get(); - const Tv *ival_ptr = ival.get(); - - std::vector seq_vec(oidx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); + Tk *okey_ptr = okey.get(); + Tv *oval_ptr = oval.get(); - const Tk *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; + std::vector > X; + X.reserve(okey.dims()[0]); - for(dim_t w = 0; w < ikey.dims()[3]; w++) { + for(dim_t w = 0; w < okey.dims()[3]; w++) { dim_t okeyW = w * okey.strides()[3]; dim_t ovalW = w * oval.strides()[3]; - dim_t oidxW = w * oidx.strides()[3]; - dim_t ikeyW = w * ikey.strides()[3]; - dim_t ivalW = w * ival.strides()[3]; - for(dim_t z = 0; z < ikey.dims()[2]; z++) { + for(dim_t z = 0; z < okey.dims()[2]; z++) { dim_t okeyWZ = okeyW + z * okey.strides()[2]; dim_t ovalWZ = ovalW + z * oval.strides()[2]; - dim_t oidxWZ = oidxW + z * oidx.strides()[2]; - dim_t ikeyWZ = ikeyW + z * ikey.strides()[2]; - dim_t ivalWZ = ivalW + z * ival.strides()[2]; - for(dim_t y = 0; y < ikey.dims()[1]; y++) { + for(dim_t y = 0; y < okey.dims()[1]; y++) { dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; - dim_t oidxOffset = oidxWZ + y * oidx.strides()[1]; - dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1]; - dim_t ivalOffset = ivalWZ + y * ival.strides()[1]; - uint *ptr = oidx_ptr + oidxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); + X.clear(); + std::transform(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims()[0], + oval_ptr + ovalOffset, + std::back_inserter(X), + [](Tk v_, Tv i_) { return std::make_pair(v_, i_); } + ); - comp_ptr = ikey_ptr + ikeyOffset; - std::stable_sort(ptr, ptr + ikey.dims()[0], comparator); + std::stable_sort(X.begin(), X.end(), IPCompare()); - for (dim_t i = 0; i < oval.dims()[0]; ++i){ - uint sortIdx = oidx_ptr[oidxOffset + i]; - okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx]; - oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx]; + for(unsigned it = 0; it < X.size(); it++) { + okey_ptr[okeyOffset + it] = X[it].first; + oval_ptr[ovalOffset + it] = X[it].second; } } } @@ -82,5 +66,78 @@ void sort0_by_key(Array okey, Array oval, Array oidx, return; } +template +void sortByKeyBatched(Array okey, Array oval) +{ + af::dim4 inDims = okey.dims(); + + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + uint* key = memAlloc(inDims.elements()); + // IOTA + { + af::dim4 dims = inDims; + uint* out = key; + af::dim4 strides(1); + for(int i = 1; i < 4; i++) + strides[i] = strides[i-1] * dims[i-1]; + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + uint okeyW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + uint okeyZ = okeyW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + uint okeyY = okeyZ + (y % seqDims[1]) * seqDims[0]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + out[id] = okeyY + (x % seqDims[0]); + } + } + } + } + } + + // initialize original index locations + Tk *okey_ptr = okey.get(); + Tv *oval_ptr = oval.get(); + + std::vector > X; + X.reserve(okey.elements()); + + for(unsigned i = 0; i < okey.elements(); i++) { + X.push_back(std::make_pair(std::make_pair(okey_ptr[i], oval_ptr[i]), key[i])); + } + + memFree(key); // key is no longer required + + std::stable_sort(X.begin(), X.end(), KIPCompareV()); + + std::stable_sort(X.begin(), X.end(), KIPCompareK()); + + for(unsigned it = 0; it < okey.elements(); it++) { + okey_ptr[it] = X[it].first.first; + oval_ptr[it] = X[it].first.second; + } + + return; +} + +template +void sort0ByKey(Array okey, Array oval) +{ + int higherDims = okey.dims()[1] * okey.dims()[2] * okey.dims()[3]; + // TODO Make a better heurisitic + if(higherDims > 0) + kernel::sortByKeyBatched(okey, oval); + else + kernel::sort0ByKeyIterative(okey, oval); +} + } } diff --git a/src/backend/cpu/kernel/sort_helper.hpp b/src/backend/cpu/kernel/sort_helper.hpp index 4b8f2f95b0..ff7da3560b 100644 --- a/src/backend/cpu/kernel/sort_helper.hpp +++ b/src/backend/cpu/kernel/sort_helper.hpp @@ -13,15 +13,13 @@ namespace cpu { namespace kernel { - static const int copyPairIter = 4; + template + using IndexPair = std::pair; - template - using IndexPair = std::pair; - - template + template struct IPCompare { - bool operator()(const IndexPair &lhs, const IndexPair &rhs) + bool operator()(const IndexPair &lhs, const IndexPair &rhs) { // Check stable sort condition if(isAscending) return (lhs.first < rhs.first); @@ -29,13 +27,13 @@ namespace cpu } }; - template - using KeyIndexPair = std::pair, uint>; + template + using KeyIndexPair = std::pair, uint>; - template + template struct KIPCompareV { - bool operator()(const KeyIndexPair &lhs, const KeyIndexPair &rhs) + bool operator()(const KeyIndexPair &lhs, const KeyIndexPair &rhs) { // Check stable sort condition if(isAscending) return (lhs.first.first < rhs.first.first); @@ -43,10 +41,10 @@ namespace cpu } }; - template + template struct KIPCompareK { - bool operator()(const KeyIndexPair &lhs, const KeyIndexPair &rhs) + bool operator()(const KeyIndexPair &lhs, const KeyIndexPair &rhs) { if(isAscending) return (lhs.second < rhs.second); else return (lhs.second > rhs.second); diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp index 1b86507aae..7a23a7df49 100644 --- a/src/backend/cpu/kernel/sort_index.hpp +++ b/src/backend/cpu/kernel/sort_index.hpp @@ -29,7 +29,7 @@ void sort0IndexIterative(Array val, Array idx) uint *idx_ptr = idx.get(); T *val_ptr = val.get(); - std::vector > X; + std::vector > X; X.reserve(val.dims()[0]); for(dim_t w = 0; w < val.dims()[3]; w++) { @@ -50,7 +50,7 @@ void sort0IndexIterative(Array val, Array idx) ); //comp_ptr = &X.front(); - std::stable_sort(X.begin(), X.end(), IPCompare()); + std::stable_sort(X.begin(), X.end(), IPCompare()); for(unsigned it = 0; it < X.size(); it++) { val_ptr[valOffset + it] = X[it].first; @@ -84,13 +84,13 @@ void sortIndexBatched(Array val, Array idx) for(dim_t w = 0; w < dims[3]; w++) { dim_t offW = w * strides[3]; - T valW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; + uint valW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; for(dim_t z = 0; z < dims[2]; z++) { dim_t offWZ = offW + z * strides[2]; - T valZ = valW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; + uint valZ = valW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; for(dim_t y = 0; y < dims[1]; y++) { dim_t offWZY = offWZ + y * strides[1]; - T valY = valZ + (y % seqDims[1]) * seqDims[0]; + uint valY = valZ + (y % seqDims[1]) * seqDims[0]; for(dim_t x = 0; x < dims[0]; x++) { dim_t id = offWZY + x; out[id] = valY + (x % seqDims[0]); @@ -104,7 +104,7 @@ void sortIndexBatched(Array val, Array idx) uint *idx_ptr = idx.get(); T *val_ptr = val.get(); - std::vector > X; + std::vector > X; X.reserve(val.elements()); for(unsigned i = 0; i < val.elements(); i++) { @@ -113,9 +113,9 @@ void sortIndexBatched(Array val, Array idx) memFree(key); // key is no longer required - std::stable_sort(X.begin(), X.end(), KIPCompareV()); + std::stable_sort(X.begin(), X.end(), KIPCompareV()); - std::stable_sort(X.begin(), X.end(), KIPCompareK()); + std::stable_sort(X.begin(), X.end(), KIPCompareK()); for(unsigned it = 0; it < val.elements(); it++) { val_ptr[it] = X[it].first.first; diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index 46ced4b9ef..46b06602b4 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #include namespace cpu @@ -23,16 +26,33 @@ void sort_by_key(Array &okey, Array &oval, ikey.eval(); ival.eval(); - okey = createEmptyArray(ikey.dims()); - oval = createEmptyArray(ival.dims()); - Array oidx = createValueArray(ikey.dims(), 0u); - oidx.eval(); + okey = copyArray(ikey); + oval = copyArray(ival); switch(dim) { - case 0: getQueue().enqueue(kernel::sort0_by_key, - okey, oval, oidx, ikey, ival); break; + case 0: getQueue().enqueue(kernel::sort0ByKey, okey, oval); break; + case 1: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; + case 2: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; + case 3: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); + } } #define INSTANTIATE(Tk, Tv) \ diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp index bfaa79a311..beffa5476e 100644 --- a/src/backend/cuda/kernel/sort_by_key.hpp +++ b/src/backend/cuda/kernel/sort_by_key.hpp @@ -12,7 +12,11 @@ #include #include #include +#include +#include + #include +#include #include namespace cuda @@ -23,7 +27,7 @@ namespace cuda // Wrapper functions /////////////////////////////////////////////////////////////////////////// template - void sort0_by_key(Param okey, Param oval) + void sort0ByKeyIterative(Param okey, Param oval) { thrust::device_ptr okey_ptr = thrust::device_pointer_cast(okey.ptr); thrust::device_ptr oval_ptr = thrust::device_pointer_cast(oval.ptr); @@ -55,5 +59,82 @@ namespace cuda } POST_LAUNCH_CHECK(); } + + template + void sortByKeyBatched(Param pKey, Param pVal) + { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pKey.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + dim4 keydims = inDims; + uint* key = memAlloc(keydims.elements()); + Param pSeq; + pSeq.ptr = key; + pSeq.strides[0] = 1; + pSeq.dims[0] = keydims[0]; + for(int i = 1; i < 4; i++) { + pSeq.dims[i] = keydims[i]; + pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1]; + } + cuda::kernel::iota(pSeq, seqDims, tileDims); + + // Make pkey, pVal into a pair + thrust::device_vector > X(inDims.elements()); + IndexPair *Xptr = thrust::raw_pointer_cast(X.data()); + + const int threads = 256; + int blocks = divup(inDims.elements(), threads * copyPairIter); + CUDA_LAUNCH((makeIndexPair), blocks, threads, + Xptr, pKey.ptr, pVal.ptr, inDims.elements()); + POST_LAUNCH_CHECK(); + + // Sort indices + // Need to convert pSeq to thrust::device_ptr, otherwise thrust + // throws weird errors for all *64 data types (double, intl, uintl etc) + thrust::device_ptr dSeq = thrust::device_pointer_cast(pSeq.ptr); + THRUST_SELECT(thrust::stable_sort_by_key, + X.begin(), X.end(), + dSeq, + IPCompare()); + POST_LAUNCH_CHECK(); + + // Needs to be ascending (true) in order to maintain the indices properly + THRUST_SELECT(thrust::stable_sort_by_key, + dSeq, + dSeq + inDims.elements(), + X.begin()); + POST_LAUNCH_CHECK(); + + CUDA_LAUNCH((splitIndexPair), blocks, threads, + pKey.ptr, pVal.ptr, Xptr, inDims.elements()); + POST_LAUNCH_CHECK(); + + // No need of doing moddims here because the original Array + // dimensions have not been changed + //val.modDims(inDims); + + memFree(key); + } + + template + void sort0ByKey(Param okey, Param oval) + { + int higherDims = okey.dims[1] * okey.dims[2] * okey.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 5) + sortByKeyBatched(okey, oval); + else + kernel::sort0ByKeyIterative(okey, oval); + } } } diff --git a/src/backend/cuda/kernel/sort_helper.hpp b/src/backend/cuda/kernel/sort_helper.hpp index 445e9a28f5..93fb33ac8f 100644 --- a/src/backend/cuda/kernel/sort_helper.hpp +++ b/src/backend/cuda/kernel/sort_helper.hpp @@ -12,22 +12,22 @@ #include // This needs to be in global namespace as it is used by thrust -template +template struct IndexPair { - T val; - uint idx; + Tk first; + Tv second; }; -template +template struct IPCompare { __host__ __device__ - bool operator()(const IndexPair &lhs, const IndexPair &rhs) const + bool operator()(const IndexPair &lhs, const IndexPair &rhs) const { // Check stable sort condition - if(isAscending) return (lhs.val < rhs.val); - else return (lhs.val > rhs.val); + if(isAscending) return (lhs.first < rhs.first); + else return (lhs.first > rhs.first); } }; @@ -39,27 +39,27 @@ namespace cuda template __global__ - void makeIndexPair(IndexPair *out, const Tk *key, const Tv *val, const int N) + void makeIndexPair(IndexPair *out, const Tk *first, const Tv *second, const int N) { int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; for(int i = tIdx; i < N; i += blockDim.x) { - out[i].val = val[i]; - out[i].idx = key[i]; + out[i].first = first[i]; + out[i].second = second[i]; } } template __global__ - void splitIndexPair(Tk *key, Tv *val, const IndexPair *out, const int N) + void splitIndexPair(Tk *first, Tv *second, const IndexPair *out, const int N) { int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; for(int i = tIdx; i < N; i += blockDim.x) { - val[i] = out[i].val; - key[i] = out[i].idx; + first[i] = out[i].first; + second[i] = out[i].second; } } } diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp index d23c503005..40a5d59311 100644 --- a/src/backend/cuda/kernel/sort_index.hpp +++ b/src/backend/cuda/kernel/sort_index.hpp @@ -73,50 +73,51 @@ namespace cuda seqDims[dim] = 1; // Create/call iota - // Array key = iota(seqDims, tileDims); + // Array seq = iota(seqDims, tileDims); dim4 keydims = inDims; uint* key = memAlloc(keydims.elements()); - Param pKey; - pKey.ptr = key; - pKey.strides[0] = 1; - pKey.dims[0] = keydims[0]; + Param pSeq; + pSeq.ptr = key; + pSeq.strides[0] = 1; + pSeq.dims[0] = keydims[0]; for(int i = 1; i < 4; i++) { - pKey.dims[i] = keydims[i]; - pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1]; + pSeq.dims[i] = keydims[i]; + pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1]; } - cuda::kernel::iota(pKey, seqDims, tileDims); + cuda::kernel::iota(pSeq, seqDims, tileDims); // Flat - Not required since inplace and both are continuous //val.modDims(inDims.elements()); //key.modDims(inDims.elements()); // Make val, idx into a pair - thrust::device_vector > X(inDims.elements()); - IndexPair *Xptr = thrust::raw_pointer_cast(X.data()); + thrust::device_vector > X(inDims.elements()); + IndexPair *Xptr = thrust::raw_pointer_cast(X.data()); const int threads = 256; int blocks = divup(inDims.elements(), threads * copyPairIter); - CUDA_LAUNCH((makeIndexPair), blocks, threads, - Xptr, pIdx.ptr, pVal.ptr, inDims.elements()); + CUDA_LAUNCH((makeIndexPair), blocks, threads, + Xptr, pVal.ptr, pIdx.ptr, inDims.elements()); // Sort indices // sort_by_key(*resVal, *resKey, val, key, 0); + thrust::device_ptr dSeq = thrust::device_pointer_cast(pSeq.ptr); THRUST_SELECT(thrust::stable_sort_by_key, X.begin(), X.end(), - pKey.ptr, - IPCompare()); + dSeq, + IPCompare()); POST_LAUNCH_CHECK(); // Needs to be ascending (true) in order to maintain the indices properly //kernel::sort0_by_key(pKey, pVal); THRUST_SELECT(thrust::stable_sort_by_key, - pKey.ptr, - pKey.ptr + inDims.elements(), + dSeq, + dSeq + inDims.elements(), X.begin()); POST_LAUNCH_CHECK(); - CUDA_LAUNCH((splitIndexPair), blocks, threads, - pIdx.ptr, pVal.ptr, Xptr, inDims.elements()); + CUDA_LAUNCH((splitIndexPair), blocks, threads, + pVal.ptr, pIdx.ptr, Xptr, inDims.elements()); POST_LAUNCH_CHECK(); // No need of doing moddims here because the original Array diff --git a/src/backend/cuda/sort_by_key_impl.hpp b/src/backend/cuda/sort_by_key_impl.hpp index 217b17dc8a..8cc86b55db 100644 --- a/src/backend/cuda/sort_by_key_impl.hpp +++ b/src/backend/cuda/sort_by_key_impl.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -23,10 +24,30 @@ namespace cuda { okey = copyArray(ikey); oval = copyArray(ival); + switch(dim) { - case 0: kernel::sort0_by_key(okey, oval); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + case 0: kernel::sort0ByKey(okey, oval); break; + case 1: kernel::sortByKeyBatched(okey, oval); break; + case 2: kernel::sortByKeyBatched(okey, oval); break; + case 3: kernel::sortByKeyBatched(okey, oval); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + } + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); } } From aaa13f6cb514395dd723a1b6e09388f3825ee258 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Wed, 20 Apr 2016 13:50:59 -0400 Subject: [PATCH 14/28] Added tests for sort_index and sort_by_key for higher dimensions --- test/sort_by_key.cpp | 104 +++++++++++++++++++++++++++++++++++++++++-- test/sort_index.cpp | 95 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 3 deletions(-) diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp index ed827c9da5..cbb13b8785 100644 --- a/test/sort_by_key.cpp +++ b/test/sort_by_key.cpp @@ -119,15 +119,15 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(Sort1000False, sort_by_key_1000, false, 2, 3); SORT_INIT(SortMedFalse, sort_by_key_med, false, 2, 3); // Takes too much time in current implementation. Enable when everything is parallel - //SORT_INIT(SortLargeTrue, sort_by_key_large, true, 0, 1); - //SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3); + SORT_INIT(SortLargeTrue, sort_by_key_large, true, 0, 1); + SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3); ////////////////////////////////////// CPP /////////////////////////////// // -TEST(SortByKey, CPP) +TEST(SortByKey, CPPDim0) { if (noDoubleTests()) return; @@ -168,3 +168,101 @@ TEST(SortByKey, CPP) delete[] keyData; delete[] valData; } + +TEST(SortByKey, CPPDim1) +{ + if (noDoubleTests()) return; + + const bool dir = true; + const unsigned resultIdx0 = 0; + const unsigned resultIdx1 = 1; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_by_key_large.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array keys(idims, &(in[0].front())); + af::array vals(idims, &(in[1].front())); + + af::array keys_ = reorder(keys, 1, 0, 2, 3); + af::array vals_ = reorder(vals, 1, 0, 2, 3); + + af::array out_keys, out_vals; + af::sort(out_keys, out_vals, keys_, vals_, 1, dir); + + out_keys = reorder(out_keys, 1, 0, 2, 3); + out_vals = reorder(out_vals, 1, 0, 2, 3); + + size_t nElems = tests[resultIdx0].size(); + // Get result + float* keyData = new float[tests[resultIdx0].size()]; + out_keys.host((void*)keyData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], keyData[elIter]) << "at: " << elIter << std::endl; + } + + float* valData = new float[tests[resultIdx1].size()]; + out_vals.host((void*)valData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx1][elIter], valData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] keyData; + delete[] valData; +} + +TEST(SortByKey, CPPDim2) +{ + if (noDoubleTests()) return; + + const bool dir = false; + const unsigned resultIdx0 = 2; + const unsigned resultIdx1 = 3; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_by_key_large.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array keys(idims, &(in[0].front())); + af::array vals(idims, &(in[1].front())); + + af::array keys_ = reorder(keys, 1, 2, 0, 3); + af::array vals_ = reorder(vals, 1, 2, 0, 3); + + af::array out_keys, out_vals; + af::sort(out_keys, out_vals, keys_, vals_, 2, dir); + + out_keys = reorder(out_keys, 2, 0, 1, 3); + out_vals = reorder(out_vals, 2, 0, 1, 3); + + size_t nElems = tests[resultIdx0].size(); + // Get result + float* keyData = new float[tests[resultIdx0].size()]; + out_keys.host((void*)keyData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], keyData[elIter]) << "at: " << elIter << std::endl; + } + + float* valData = new float[tests[resultIdx1].size()]; + out_vals.host((void*)valData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx1][elIter], valData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] keyData; + delete[] valData; +} diff --git a/test/sort_index.cpp b/test/sort_index.cpp index 2326ed706f..eed85047bf 100644 --- a/test/sort_index.cpp +++ b/test/sort_index.cpp @@ -170,3 +170,98 @@ TEST(SortIndex, CPPDim0) delete[] sxData; delete[] ixData; } + +TEST(SortIndex, CPPDim1) +{ + if (noDoubleTests()) return; + + const bool dir = true; + const unsigned resultIdx0 = 0; + const unsigned resultIdx1 = 1; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_10x10.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array input_(idims, &(in[0].front())); + af::array input = reorder(input_, 1, 0, 2, 3); + + af::array outValues, outIndices; + af::sort(outValues, outIndices, input, 1, dir); + + outValues = reorder(outValues, 1, 0, 2, 3); + outIndices = reorder(outIndices, 1, 0, 2, 3); + + size_t nElems = tests[resultIdx0].size(); + + // Get result + float* sxData = new float[tests[resultIdx0].size()]; + outValues.host((void*)sxData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl; + } + + // Get result + unsigned* ixData = new unsigned[tests[resultIdx1].size()]; + outIndices.host((void*)ixData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] sxData; + delete[] ixData; +} + +TEST(SortIndex, CPPDim2) +{ + if (noDoubleTests()) return; + + const bool dir = false; + const unsigned resultIdx0 = 2; + const unsigned resultIdx1 = 3; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_med.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array input_(idims, &(in[0].front())); + af::array input = reorder(input_, 1, 2, 0, 3); + + af::array outValues, outIndices; + af::sort(outValues, outIndices, input, 2, dir); + + outValues = reorder(outValues, 2, 0, 1, 3); + outIndices = reorder(outIndices, 2, 0, 1, 3); + size_t nElems = tests[resultIdx0].size(); + + // Get result + float* sxData = new float[tests[resultIdx0].size()]; + outValues.host((void*)sxData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl; + } + + // Get result + unsigned* ixData = new unsigned[tests[resultIdx1].size()]; + outIndices.host((void*)ixData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + EXPECT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] sxData; + delete[] ixData; +} From 8dad427391087265bc9df8b3ec6f528b3fd5cc11 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 12:00:24 -0400 Subject: [PATCH 15/28] Add sort index, sort by key batching to OpenCL --- src/backend/opencl/kernel/harris.hpp | 2 +- src/backend/opencl/kernel/iota.hpp | 6 +- src/backend/opencl/kernel/orb.hpp | 2 +- src/backend/opencl/kernel/sort.hpp | 8 +- src/backend/opencl/kernel/sort_by_key.hpp | 143 +++++++++++++++++--- src/backend/opencl/kernel/sort_helper.hpp | 118 ++++++++++++++++ src/backend/opencl/kernel/sort_index.hpp | 123 ++++++++++++++++- src/backend/opencl/kernel/sort_make_pair.cl | 43 ++++++ src/backend/opencl/sort_by_key/impl.hpp | 29 +++- src/backend/opencl/sort_index.cpp | 33 ++++- 10 files changed, 466 insertions(+), 41 deletions(-) create mode 100644 src/backend/opencl/kernel/sort_make_pair.cl diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp index 7fffdee423..3c0e531d6d 100644 --- a/src/backend/opencl/kernel/harris.hpp +++ b/src/backend/opencl/kernel/harris.hpp @@ -287,7 +287,7 @@ void harris(unsigned* corners_out, harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned)); // Sort Harris responses - sort0_index(harris_resp, harris_idx); + sort0Index(harris_resp, harris_idx); x_out.data = bufferAlloc(*corners_out * sizeof(float)); y_out.data = bufferAlloc(*corners_out * sizeof(float)); diff --git a/src/backend/opencl/kernel/iota.hpp b/src/backend/opencl/kernel/iota.hpp index bad486abd2..210b6b202e 100644 --- a/src/backend/opencl/kernel/iota.hpp +++ b/src/backend/opencl/kernel/iota.hpp @@ -31,8 +31,8 @@ namespace opencl namespace kernel { // Kernel Launch Config Values - static const int TX = 32; - static const int TY = 8; + static const int IOTA_TX = 32; + static const int IOTA_TY = 8; static const int TILEX = 512; static const int TILEY = 32; @@ -64,7 +64,7 @@ namespace opencl const int, const int, const int, const int, const int, const int> (*iotaKernels[device]); - NDRange local(TX, TY, 1); + NDRange local(IOTA_TX, IOTA_TY, 1); int blocksPerMatX = divup(out.info.dims[0], TILEX); int blocksPerMatY = divup(out.info.dims[1], TILEY); diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp index 69c1176210..317bb4e3d8 100644 --- a/src/backend/opencl/kernel/orb.hpp +++ b/src/backend/opencl/kernel/orb.hpp @@ -305,7 +305,7 @@ void orb(unsigned* out_feat, d_harris_sorted.data = d_score_harris; d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned)); - sort0_index(d_harris_sorted, d_harris_idx); + sort0Index(d_harris_sorted, d_harris_idx); cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float)); diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp index b9a7d39b28..98ba75977a 100644 --- a/src/backend/opencl/kernel/sort.hpp +++ b/src/backend/opencl/kernel/sort.hpp @@ -131,10 +131,10 @@ namespace opencl compute::buffer pKey_buf((*pKey.data)()); compute::buffer pVal_buf((*pVal.data)()); - compute::buffer_iterator< type_t > val0 = compute::make_buffer_iterator< type_t >(pVal_buf, 0); - compute::buffer_iterator< type_t > valN = compute::make_buffer_iterator< type_t >(pVal_buf,+ pVal.info.dims[0]); - compute::buffer_iterator< type_t > key0 = compute::make_buffer_iterator< type_t >(pKey_buf, 0); - compute::buffer_iterator< type_t > keyN = compute::make_buffer_iterator< type_t >(pKey_buf, pKey.info.dims[0]); + compute::buffer_iterator > val0 = compute::make_buffer_iterator >(pVal_buf, 0); + compute::buffer_iterator > valN = compute::make_buffer_iterator >(pVal_buf,+ pVal.info.dims[0]); + compute::buffer_iterator key0 = compute::make_buffer_iterator(pKey_buf, 0); + compute::buffer_iterator keyN = compute::make_buffer_iterator(pKey_buf, pKey.info.dims[0]); if(isAscending) { compute::sort_by_key(val0, valN, key0, c_queue); } else { diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp index 513ddbfb6d..33a020712e 100644 --- a/src/backend/opencl/kernel/sort_by_key.hpp +++ b/src/backend/opencl/kernel/sort_by_key.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" @@ -24,6 +25,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include namespace compute = boost::compute; @@ -40,33 +47,33 @@ namespace opencl namespace kernel { template - void sort0_by_key(Param okey, Param oval) + void sort0ByKeyIterative(Param pKey, Param pVal) { try { compute::command_queue c_queue(getQueue()()); - compute::buffer okey_buf((*okey.data)()); - compute::buffer oval_buf((*oval.data)()); + compute::buffer pKey_buf((*pKey.data)()); + compute::buffer pVal_buf((*pVal.data)()); - for(int w = 0; w < okey.info.dims[3]; w++) { - int okeyW = w * okey.info.strides[3]; - int ovalW = w * oval.info.strides[3]; - for(int z = 0; z < okey.info.dims[2]; z++) { - int okeyWZ = okeyW + z * okey.info.strides[2]; - int ovalWZ = ovalW + z * oval.info.strides[2]; - for(int y = 0; y < okey.info.dims[1]; y++) { + for(int w = 0; w < pKey.info.dims[3]; w++) { + int pKeyW = w * pKey.info.strides[3]; + int pValW = w * pVal.info.strides[3]; + for(int z = 0; z < pKey.info.dims[2]; z++) { + int pKeyWZ = pKeyW + z * pKey.info.strides[2]; + int pValWZ = pValW + z * pVal.info.strides[2]; + for(int y = 0; y < pKey.info.dims[1]; y++) { - int okeyOffset = okeyWZ + y * okey.info.strides[1]; - int ovalOffset = ovalWZ + y * oval.info.strides[1]; + int pKeyOffset = pKeyWZ + y * pKey.info.strides[1]; + int pValOffset = pValWZ + y * pVal.info.strides[1]; - compute::buffer_iterator< type_t > start= compute::make_buffer_iterator< type_t >(okey_buf, okeyOffset); - compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(okey_buf, okeyOffset + okey.info.dims[0]); - compute::buffer_iterator< type_t > vals = compute::make_buffer_iterator< type_t >(oval_buf, ovalOffset); + compute::buffer_iterator< type_t > start= compute::make_buffer_iterator< type_t >(pKey_buf, pKeyOffset); + compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(pKey_buf, pKeyOffset + pKey.info.dims[0]); + compute::buffer_iterator< type_t > vals = compute::make_buffer_iterator< type_t >(pVal_buf, pValOffset); if(isAscending) { compute::sort_by_key(start, end, vals, c_queue); } else { compute::sort_by_key(start, end, vals, - compute::greater< type_t >(), c_queue); + compute::greater< type_t >(), c_queue); } } } @@ -78,6 +85,110 @@ namespace opencl throw; } } + + template + void sortByKeyBatched(Param pKey, Param pVal) + { + typedef type_t Tk; + typedef type_t Tv; + typedef std::pair IndexPair; + + try { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pKey.info.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + dim4 keydims = inDims; + cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(unsigned)); + Param pSeq; + pSeq.data = key; + pSeq.info.offset = 0; + pSeq.info.dims[0] = keydims[0]; + pSeq.info.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pSeq.info.dims[i] = keydims[i]; + pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1]; + } + kernel::iota(pSeq, seqDims, tileDims); + + int elements = inDims.elements(); + + // Flat - Not required since inplace and both are continuous + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + //kernel::sort0_by_key(pVal, pKey); + compute::command_queue c_queue(getQueue()()); + compute::context c_context(getContext()()); + + // Create buffer iterators for seq + compute::buffer pSeq_buf((*pSeq.data)()); + compute::buffer_iterator seq0 = compute::make_buffer_iterator(pSeq_buf, 0); + compute::buffer_iterator seqN = compute::make_buffer_iterator(pSeq_buf, elements); + + // Copy key, val into X pair + cl::Buffer* X = bufferAlloc(elements * sizeof(IndexPair)); + // Use Tk_ and Tv_ here, not Tk and Tv + kernel::makePair(X, pKey.data, pVal.data, elements); + compute::buffer X_buf((*X)()); + compute::buffer_iterator X0 = compute::make_buffer_iterator(X_buf, 0); + compute::buffer_iterator XN = compute::make_buffer_iterator(X_buf, elements); + + // FIRST SORT CALL + compute::function IPCompare = + makeCompareFunction(); + + compute::sort_by_key(X0, XN, seq0, IPCompare, c_queue); + getQueue().finish(); + + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + // + // Because we use a pair as values, we need to use a custom comparator + BOOST_COMPUTE_FUNCTION(bool, Compare_Seq, (const unsigned lhs, const unsigned rhs), + { + return lhs < rhs; + } + ); + compute::sort_by_key(seq0, seqN, X0, Compare_Seq, c_queue); + getQueue().finish(); + + kernel::splitPair(pKey.data, pVal.data, X, elements); + + //// No need of doing moddims here because the original Array + //// dimensions have not been changed + ////val.modDims(inDims); + + CL_DEBUG_FINISH(getQueue()); + bufferFree(key); + bufferFree(X); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sort0ByKey(Param pKey, Param pVal) + { + int higherDims = pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 5) + kernel::sortByKeyBatched(pKey, pVal); + else + kernel::sort0ByKeyIterative(pKey, pVal); + } } } diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp index 07ab0eeb69..6ba9eff0ae 100644 --- a/src/backend/opencl/kernel/sort_helper.hpp +++ b/src/backend/opencl/kernel/sort_helper.hpp @@ -8,8 +8,41 @@ ********************************************************/ #pragma once +#include #include #include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +inline +boost::compute::function, const std::pair)> +makeCompareFunction() +{ + // Cannot use isAscending in BOOST_COMPUTE_FUNCTION + if(isAscending) { + BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair lhs, std::pair rhs), + { + return lhs.first < rhs.first; + } + ); + return IPCompare; + } else { + BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair lhs, std::pair rhs), + { + return lhs.first > rhs.first; + } + ); + return IPCompare; + } +} namespace opencl { @@ -42,5 +75,90 @@ namespace opencl cl_ulong, ltype_t >::type; + static const int copyPairIter = 4; + + template + void makePair(cl::Buffer *out, const cl::Buffer *first, const cl::Buffer *second, const unsigned N) + { + try { + static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; + static std::map makePairProgs; + static std::map makePairKernels; + + int device = getActiveDeviceId(); + + std::call_once( compileFlags[device], [device] () { + std::ostringstream options; + options << " -D Tk=" << dtype_traits::getName() + << " -D Tv=" << dtype_traits::getName() + << " -D copyPairIter=" << copyPairIter; + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + options << " -D USE_DOUBLE"; + } + Program prog; + buildProgram(prog, sort_make_pair_cl, sort_make_pair_cl_len, options.str()); + makePairProgs[device] = new Program(prog); + makePairKernels[device] = new Kernel(*makePairProgs[device], "make_pair_kernel"); + }); + + auto makePairOp = make_kernel + (*makePairKernels[device]); + + NDRange local(256, 1, 1); + NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); + + makePairOp(EnqueueArgs(getQueue(), global, local), *out, *first, *second, N); + + CL_DEBUG_FINISH(getQueue()); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void splitPair(cl::Buffer *first, cl::Buffer *second, const cl::Buffer *in, const unsigned N) + { + try { + static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; + static std::map splitPairProgs; + static std::map splitPairKernels; + + int device = getActiveDeviceId(); + + std::call_once( compileFlags[device], [device] () { + std::ostringstream options; + options << " -D Tk=" << dtype_traits::getName() + << " -D Tv=" << dtype_traits::getName() + << " -D copyPairIter=" << copyPairIter; + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + options << " -D USE_DOUBLE"; + } + Program prog; + buildProgram(prog, sort_make_pair_cl, sort_make_pair_cl_len, options.str()); + splitPairProgs[device] = new Program(prog); + splitPairKernels[device] = new Kernel(*splitPairProgs[device], "split_pair_kernel"); + }); + + auto splitPairOp = make_kernel + (*splitPairKernels[device]); + + NDRange local(256, 1, 1); + NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); + + splitPairOp(EnqueueArgs(getQueue(), global, local), *first, *second, *in, N); + + CL_DEBUG_FINISH(getQueue()); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } } } diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp index aae0a94ea6..ef5faa612f 100644 --- a/src/backend/opencl/kernel/sort_index.hpp +++ b/src/backend/opencl/kernel/sort_index.hpp @@ -16,15 +16,21 @@ #include #include #include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #include -#include #include #include #include +#include +#include +#include +#include +#include +#include namespace compute = boost::compute; @@ -41,7 +47,7 @@ namespace opencl namespace kernel { template - void sort0_index(Param val, Param idx) + void sort0IndexIterative(Param val, Param idx) { try { compute::command_queue c_queue(getQueue()()); @@ -60,19 +66,18 @@ namespace opencl int valOffset = valWZ + y * val.info.strides[1]; int idxOffset = idxWZ + y * idx.info.strides[1]; - compute::buffer_iterator idx_begin(idx_buf, idxOffset); - compute::iota(idx_begin, idx_begin + val.info.dims[0], 0, c_queue); - if(isAscending) { compute::sort_by_key( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), - idx_begin, compute::less< type_t >(), c_queue); + compute::make_buffer_iterator< type_t >(idx_buf, idxOffset), + compute::less< type_t >(), c_queue); } else { compute::sort_by_key( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), - idx_begin, compute::greater< type_t >(), c_queue); + compute::make_buffer_iterator< type_t >(idx_buf, idxOffset), + compute::greater< type_t >(), c_queue); } } } @@ -84,6 +89,110 @@ namespace opencl throw; } } + + template + void sortIndexBatched(Param pVal, Param pIdx) + { + typedef type_t Tk; + typedef uint Tv; + typedef std::pair IndexPair; + + try { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pVal.info.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + dim4 keydims = inDims; + cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(Tv)); + Param pSeq; + pSeq.data = key; + pSeq.info.offset = 0; + pSeq.info.dims[0] = keydims[0]; + pSeq.info.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pSeq.info.dims[i] = keydims[i]; + pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1]; + } + kernel::iota(pSeq, seqDims, tileDims); + + int elements = inDims.elements(); + + // Flat - Not required since inplace and both are continuous + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + //kernel::sort0_by_key(pVal, pKey); + compute::command_queue c_queue(getQueue()()); + compute::context c_context(getContext()()); + + // Create buffer iterators for seq + compute::buffer pSeq_buf((*pSeq.data)()); + compute::buffer_iterator seq0 = compute::make_buffer_iterator(pSeq_buf, 0); + compute::buffer_iterator seqN = compute::make_buffer_iterator(pSeq_buf, elements); + + // Copy val, idx into X pair + cl::Buffer* X = bufferAlloc(elements * sizeof(IndexPair)); + // Use T here, not Tk + kernel::makePair(X, pVal.data, pIdx.data, elements); + compute::buffer X_buf((*X)()); + compute::buffer_iterator X0 = compute::make_buffer_iterator(X_buf, 0); + compute::buffer_iterator XN = compute::make_buffer_iterator(X_buf, elements); + + // FIRST SORT CALL + compute::function IPCompare = + makeCompareFunction(); + + compute::sort_by_key(X0, XN, seq0, IPCompare, c_queue); + getQueue().finish(); + + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + // + // Because we use a pair as values, we need to use a custom comparator + BOOST_COMPUTE_FUNCTION(bool, Compare_Tv, (const Tv lhs, const Tv rhs), + { + return lhs < rhs; + } + ); + compute::sort_by_key(seq0, seqN, X0, Compare_Tv, c_queue); + getQueue().finish(); + + kernel::splitPair(pVal.data, pIdx.data, X, elements); + + //// No need of doing moddims here because the original Array + //// dimensions have not been changed + ////val.modDims(inDims); + + CL_DEBUG_FINISH(getQueue()); + bufferFree(key); + bufferFree(X); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sort0Index(Param val, Param idx) + { + int higherDims = val.info.dims[1] * val.info.dims[2] * val.info.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 5) + sortIndexBatched(val, idx); + else + kernel::sort0IndexIterative(val, idx); + } } } diff --git a/src/backend/opencl/kernel/sort_make_pair.cl b/src/backend/opencl/kernel/sort_make_pair.cl new file mode 100644 index 0000000000..f5e5413d73 --- /dev/null +++ b/src/backend/opencl/kernel/sort_make_pair.cl @@ -0,0 +1,43 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +struct IndexPair +{ + Tk first; + Tv second; +}; + +typedef struct IndexPair IndexPair_t; + +__kernel +void make_pair_kernel(__global IndexPair_t *out, + __global const Tk *first, __global const Tv *second, + const unsigned N) +{ + int tIdx = get_group_id(0) * get_local_size(0) * copyPairIter + get_local_id(0); + const int blockDimX = get_local_size(0); + + for(int i = tIdx; i < N; i += blockDimX) { + out[i].first = first[i]; + out[i].second = second[i]; + } +} + +__kernel +void split_pair_kernel( __global Tk *first, __global Tv *second, + __global const IndexPair_t *out, const unsigned N) +{ + int tIdx = get_group_id(0) * get_local_size(0) * copyPairIter + get_local_id(0); + const int blockDimX = get_local_size(0); + + for(int i = tIdx; i < N; i += blockDimX) { + first[i] = out[i].first; + second[i] = out[i].second; + } +} diff --git a/src/backend/opencl/sort_by_key/impl.hpp b/src/backend/opencl/sort_by_key/impl.hpp index 68c5ce70ae..f68fe91b3c 100644 --- a/src/backend/opencl/sort_by_key/impl.hpp +++ b/src/backend/opencl/sort_by_key/impl.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -24,12 +25,32 @@ namespace opencl try { okey = copyArray(ikey); oval = copyArray(ival); + switch(dim) { - case 0: kernel::sort0_by_key(okey, oval); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + case 0: kernel::sort0ByKey(okey, oval); break; + case 1: kernel::sortByKeyBatched(okey, oval); break; + case 2: kernel::sortByKeyBatched(okey, oval); break; + case 3: kernel::sortByKeyBatched(okey, oval); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + } + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); } - }catch(std::exception &ex) { + } catch(std::exception &ex) { AF_ERROR(ex.what(), AF_ERR_INTERNAL); } } diff --git a/src/backend/opencl/sort_index.cpp b/src/backend/opencl/sort_index.cpp index c7aaa70feb..49795c2eb5 100644 --- a/src/backend/opencl/sort_index.cpp +++ b/src/backend/opencl/sort_index.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include namespace opencl { @@ -22,17 +24,38 @@ namespace opencl { try { val = copyArray(in); - idx = createEmptyArray(in.dims()); + idx = range(in.dims(), dim); + idx.eval(); switch(dim) { - case 0: kernel::sort0_index(val, idx); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + case 0: kernel::sort0Index(val, idx); break; + case 1: kernel::sortIndexBatched(val, idx); break; + case 2: kernel::sortIndexBatched(val, idx); break; + case 3: kernel::sortIndexBatched(val, idx); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } - } catch (std::exception &ex) { + + if(dim != 0) { + af::dim4 preorderDims = val.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = val.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = val.dims()[i - 1]; + } + + val.setDataDims(preorderDims); + idx.setDataDims(preorderDims); + + val = reorder(val, reorderDims); + idx = reorder(idx, reorderDims); + } + } catch (std::exception &ex) { AF_ERROR(ex.what(), AF_ERR_INTERNAL); } } + #define INSTANTIATE(T) \ template void sort_index(Array &val, Array &idx, const Array &in, \ const uint dim); \ From efdc54264dde88b3635e0c6c0a1017293cd25353 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 13:18:18 -0400 Subject: [PATCH 16/28] Combine sort_index and sort_by_key kernels in CPU --- src/backend/cpu/kernel/sort_index.hpp | 140 -------------------------- src/backend/cpu/sort_index.cpp | 33 +++--- 2 files changed, 17 insertions(+), 156 deletions(-) delete mode 100644 src/backend/cpu/kernel/sort_index.hpp diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp deleted file mode 100644 index 7a23a7df49..0000000000 --- a/src/backend/cpu/kernel/sort_index.hpp +++ /dev/null @@ -1,140 +0,0 @@ -/******************************************************* - * Copyright (c) 2015, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cpu -{ -namespace kernel -{ - -template -void sort0IndexIterative(Array val, Array idx) -{ - // initialize original index locations - uint *idx_ptr = idx.get(); - T *val_ptr = val.get(); - - std::vector > X; - X.reserve(val.dims()[0]); - - for(dim_t w = 0; w < val.dims()[3]; w++) { - dim_t valW = w * val.strides()[3]; - dim_t idxW = w * idx.strides()[3]; - for(dim_t z = 0; z < val.dims()[2]; z++) { - dim_t valWZ = valW + z * val.strides()[2]; - dim_t idxWZ = idxW + z * idx.strides()[2]; - for(dim_t y = 0; y < val.dims()[1]; y++) { - dim_t valOffset = valWZ + y * val.strides()[1]; - dim_t idxOffset = idxWZ + y * idx.strides()[1]; - - X.clear(); - std::transform(val_ptr + valOffset, val_ptr + valOffset + val.dims()[0], - idx_ptr + idxOffset, - std::back_inserter(X), - [](T v_, uint i_) { return std::make_pair(v_, i_); } - ); - - //comp_ptr = &X.front(); - std::stable_sort(X.begin(), X.end(), IPCompare()); - - for(unsigned it = 0; it < X.size(); it++) { - val_ptr[valOffset + it] = X[it].first; - idx_ptr[idxOffset + it] = X[it].second; - } - } - } - } - - return; -} - -template -void sortIndexBatched(Array val, Array idx) -{ - af::dim4 inDims = val.dims(); - - af::dim4 tileDims(1); - af::dim4 seqDims = inDims; - tileDims[dim] = inDims[dim]; - seqDims[dim] = 1; - - uint* key = memAlloc(inDims.elements()); - // IOTA - { - af::dim4 dims = inDims; - uint* out = key; - af::dim4 strides(1); - for(int i = 1; i < 4; i++) - strides[i] = strides[i-1] * dims[i-1]; - - for(dim_t w = 0; w < dims[3]; w++) { - dim_t offW = w * strides[3]; - uint valW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; - for(dim_t z = 0; z < dims[2]; z++) { - dim_t offWZ = offW + z * strides[2]; - uint valZ = valW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; - for(dim_t y = 0; y < dims[1]; y++) { - dim_t offWZY = offWZ + y * strides[1]; - uint valY = valZ + (y % seqDims[1]) * seqDims[0]; - for(dim_t x = 0; x < dims[0]; x++) { - dim_t id = offWZY + x; - out[id] = valY + (x % seqDims[0]); - } - } - } - } - } - - // initialize original index locations - uint *idx_ptr = idx.get(); - T *val_ptr = val.get(); - - std::vector > X; - X.reserve(val.elements()); - - for(unsigned i = 0; i < val.elements(); i++) { - X.push_back(std::make_pair(std::make_pair(val_ptr[i], idx_ptr[i]), key[i])); - } - - memFree(key); // key is no longer required - - std::stable_sort(X.begin(), X.end(), KIPCompareV()); - - std::stable_sort(X.begin(), X.end(), KIPCompareK()); - - for(unsigned it = 0; it < val.elements(); it++) { - val_ptr[it] = X[it].first.first; - idx_ptr[it] = X[it].first.second; - } - - return; -} - -template -void sort0Index(Array val, Array idx) -{ - int higherDims = val.dims()[1] * val.dims()[2] * val.dims()[3]; - // TODO Make a better heurisitic - if(higherDims > 0) - kernel::sortIndexBatched(val, idx); - else - kernel::sort0IndexIterative(val, idx); -} - -} -} diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index 36ca57b3e8..b865db9c1c 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -17,43 +17,44 @@ #include #include #include -#include +#include namespace cpu { template -void sort_index(Array &val, Array &idx, const Array &in, const uint dim) +void sort_index(Array &okey, Array &oval, const Array &in, const uint dim) { in.eval(); - val = copyArray(in); - idx = range(in.dims(), dim); - idx.eval(); + // okey is values, oval is indices + okey = copyArray(in); + oval = range(in.dims(), dim); + oval.eval(); switch(dim) { - case 0: getQueue().enqueue(kernel::sort0Index, val, idx); break; - case 1: getQueue().enqueue(kernel::sortIndexBatched, val, idx); break; - case 2: getQueue().enqueue(kernel::sortIndexBatched, val, idx); break; - case 3: getQueue().enqueue(kernel::sortIndexBatched, val, idx); break; + case 0: getQueue().enqueue(kernel::sort0ByKey, okey, oval); break; + case 1: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; + case 2: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; + case 3: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } if(dim != 0) { - af::dim4 preorderDims = val.dims(); + af::dim4 preorderDims = okey.dims(); af::dim4 reorderDims(0, 1, 2, 3); reorderDims[dim] = 0; - preorderDims[0] = val.dims()[dim]; + preorderDims[0] = okey.dims()[dim]; for(int i = 1; i <= (int)dim; i++) { reorderDims[i - 1] = i; - preorderDims[i] = val.dims()[i - 1]; + preorderDims[i] = okey.dims()[i - 1]; } - val.setDataDims(preorderDims); - idx.setDataDims(preorderDims); + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); - val = reorder(val, reorderDims); - idx = reorder(idx, reorderDims); + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); } } From 460bf6c8baaf1b7fce4a758dbe3f9cd48c291499 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 13:29:17 -0400 Subject: [PATCH 17/28] Combine sort_index and sort_by_key kernels in CUDA --- src/backend/cuda/kernel/harris.hpp | 4 +- src/backend/cuda/kernel/orb.hpp | 4 +- src/backend/cuda/kernel/sort_by_key.hpp | 3 +- src/backend/cuda/kernel/sort_index.hpp | 141 ------------------------ src/backend/cuda/sort_index.cu | 32 +++--- 5 files changed, 22 insertions(+), 162 deletions(-) delete mode 100644 src/backend/cuda/kernel/sort_index.hpp diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp index 3cb28b2b2f..c773ae45c5 100644 --- a/src/backend/cuda/kernel/harris.hpp +++ b/src/backend/cuda/kernel/harris.hpp @@ -18,7 +18,7 @@ #include #include "convolve.hpp" #include "gradient.hpp" -#include "sort_index.hpp" +#include "sort_by_key.hpp" namespace cuda { @@ -339,7 +339,7 @@ void harris(unsigned* corners_out, harris_idx.ptr = memAlloc(sort_elem); // Sort Harris responses - sort0Index(harris_responses, harris_idx); + sort0ByKey(harris_responses, harris_idx); *x_out = memAlloc(*corners_out); *y_out = memAlloc(*corners_out); diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp index 8448418f8b..b5ed10340e 100644 --- a/src/backend/cuda/kernel/orb.hpp +++ b/src/backend/cuda/kernel/orb.hpp @@ -16,7 +16,7 @@ #include #include "convolve.hpp" #include "orb_patch.hpp" -#include "sort_index.hpp" +#include "sort_by_key.hpp" #include @@ -397,7 +397,7 @@ void orb(unsigned* out_feat, harris_idx.ptr = memAlloc(sort_elem); // Sort features according to Harris responses - sort0Index(harris_sorted, harris_idx); + sort0ByKey(harris_sorted, harris_idx); feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]); diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp index beffa5476e..1536d1bf53 100644 --- a/src/backend/cuda/kernel/sort_by_key.hpp +++ b/src/backend/cuda/kernel/sort_by_key.hpp @@ -109,6 +109,7 @@ namespace cuda POST_LAUNCH_CHECK(); // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); THRUST_SELECT(thrust::stable_sort_by_key, dSeq, dSeq + inDims.elements(), @@ -132,7 +133,7 @@ namespace cuda int higherDims = okey.dims[1] * okey.dims[2] * okey.dims[3]; // TODO Make a better heurisitic if(higherDims > 5) - sortByKeyBatched(okey, oval); + kernel::sortByKeyBatched(okey, oval); else kernel::sort0ByKeyIterative(okey, oval); } diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp deleted file mode 100644 index 40a5d59311..0000000000 --- a/src/backend/cuda/kernel/sort_index.hpp +++ /dev/null @@ -1,141 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace cuda -{ - namespace kernel - { - /////////////////////////////////////////////////////////////////////////// - // Wrapper functions - /////////////////////////////////////////////////////////////////////////// - template - void sort0IndexIterative(Param val, Param idx) - { - thrust::device_ptr val_ptr = thrust::device_pointer_cast(val.ptr); - thrust::device_ptr idx_ptr = thrust::device_pointer_cast(idx.ptr); - - for(int w = 0; w < val.dims[3]; w++) { - int valW = w * val.strides[3]; - int idxW = w * idx.strides[3]; - for(int z = 0; z < val.dims[2]; z++) { - int valWZ = valW + z * val.strides[2]; - int idxWZ = idxW + z * idx.strides[2]; - for(int y = 0; y < val.dims[1]; y++) { - - int valOffset = valWZ + y * val.strides[1]; - int idxOffset = idxWZ + y * idx.strides[1]; - - if(isAscending) { - THRUST_SELECT(thrust::stable_sort_by_key, - val_ptr + valOffset, val_ptr + valOffset + val.dims[0], - idx_ptr + idxOffset); - } else { - THRUST_SELECT(thrust::stable_sort_by_key, - val_ptr + valOffset, val_ptr + valOffset + val.dims[0], - idx_ptr + idxOffset, thrust::greater()); - } - } - } - } - POST_LAUNCH_CHECK(); - } - - template - void sortIndexBatched(Param pVal, Param pIdx) - { - af::dim4 inDims; - for(int i = 0; i < 4; i++) - inDims[i] = pVal.dims[i]; - - // Sort dimension - // tileDims * seqDims = inDims - af::dim4 tileDims(1); - af::dim4 seqDims = inDims; - tileDims[dim] = inDims[dim]; - seqDims[dim] = 1; - - // Create/call iota - // Array seq = iota(seqDims, tileDims); - dim4 keydims = inDims; - uint* key = memAlloc(keydims.elements()); - Param pSeq; - pSeq.ptr = key; - pSeq.strides[0] = 1; - pSeq.dims[0] = keydims[0]; - for(int i = 1; i < 4; i++) { - pSeq.dims[i] = keydims[i]; - pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1]; - } - cuda::kernel::iota(pSeq, seqDims, tileDims); - - // Flat - Not required since inplace and both are continuous - //val.modDims(inDims.elements()); - //key.modDims(inDims.elements()); - - // Make val, idx into a pair - thrust::device_vector > X(inDims.elements()); - IndexPair *Xptr = thrust::raw_pointer_cast(X.data()); - - const int threads = 256; - int blocks = divup(inDims.elements(), threads * copyPairIter); - CUDA_LAUNCH((makeIndexPair), blocks, threads, - Xptr, pVal.ptr, pIdx.ptr, inDims.elements()); - - // Sort indices - // sort_by_key(*resVal, *resKey, val, key, 0); - thrust::device_ptr dSeq = thrust::device_pointer_cast(pSeq.ptr); - THRUST_SELECT(thrust::stable_sort_by_key, - X.begin(), X.end(), - dSeq, - IPCompare()); - POST_LAUNCH_CHECK(); - - // Needs to be ascending (true) in order to maintain the indices properly - //kernel::sort0_by_key(pKey, pVal); - THRUST_SELECT(thrust::stable_sort_by_key, - dSeq, - dSeq + inDims.elements(), - X.begin()); - POST_LAUNCH_CHECK(); - - CUDA_LAUNCH((splitIndexPair), blocks, threads, - pVal.ptr, pIdx.ptr, Xptr, inDims.elements()); - POST_LAUNCH_CHECK(); - - // No need of doing moddims here because the original Array - // dimensions have not been changed - //val.modDims(inDims); - - memFree(key); - } - - template - void sort0Index(Param val, Param idx) - { - int higherDims = val.dims[1] * val.dims[2] * val.dims[3]; - // TODO Make a better heurisitic - if(higherDims > 5) - sortIndexBatched(val, idx); - else - kernel::sort0IndexIterative(val, idx); - } - } -} diff --git a/src/backend/cuda/sort_index.cu b/src/backend/cuda/sort_index.cu index ab54c24a9c..03c69ad4f3 100644 --- a/src/backend/cuda/sort_index.cu +++ b/src/backend/cuda/sort_index.cu @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include #include @@ -20,35 +20,35 @@ namespace cuda { template - void sort_index(Array &val, Array &idx, const Array &in, const uint dim) + void sort_index(Array &okey, Array &oval, const Array &in, const uint dim) { - val = copyArray(in); - idx = range(in.dims(), dim); - idx.eval(); + okey = copyArray(in); + oval = range(in.dims(), dim); + oval.eval(); switch(dim) { - case 0: kernel::sort0Index(val, idx); break; - case 1: kernel::sortIndexBatched(val, idx); break; - case 2: kernel::sortIndexBatched(val, idx); break; - case 3: kernel::sortIndexBatched(val, idx); break; + case 0: kernel::sort0ByKey(okey, oval); break; + case 1: kernel::sortByKeyBatched(okey, oval); break; + case 2: kernel::sortByKeyBatched(okey, oval); break; + case 3: kernel::sortByKeyBatched(okey, oval); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } if(dim != 0) { - af::dim4 preorderDims = val.dims(); + af::dim4 preorderDims = okey.dims(); af::dim4 reorderDims(0, 1, 2, 3); reorderDims[dim] = 0; - preorderDims[0] = val.dims()[dim]; + preorderDims[0] = okey.dims()[dim]; for(int i = 1; i <= (int)dim; i++) { reorderDims[i - 1] = i; - preorderDims[i] = val.dims()[i - 1]; + preorderDims[i] = okey.dims()[i - 1]; } - val.setDataDims(preorderDims); - idx.setDataDims(preorderDims); + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); - val = reorder(val, reorderDims); - idx = reorder(idx, reorderDims); + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); } } From 363e86e5031f64689db50efb08b3323dea443dba Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 13:29:46 -0400 Subject: [PATCH 18/28] Combine sort_index and sort_by_key kernels in OpenCL --- src/backend/opencl/kernel/harris.hpp | 4 +- src/backend/opencl/kernel/orb.hpp | 4 +- src/backend/opencl/kernel/sift_nonfree.hpp | 3 +- src/backend/opencl/kernel/sort_helper.hpp | 26 +-- src/backend/opencl/kernel/sort_index.hpp | 199 ------------------ .../{sort_make_pair.cl => sort_pair.cl} | 0 src/backend/opencl/sort_index.cpp | 33 +-- 7 files changed, 36 insertions(+), 233 deletions(-) delete mode 100644 src/backend/opencl/kernel/sort_index.hpp rename src/backend/opencl/kernel/{sort_make_pair.cl => sort_pair.cl} (100%) diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp index 3c0e531d6d..4c203c9377 100644 --- a/src/backend/opencl/kernel/harris.hpp +++ b/src/backend/opencl/kernel/harris.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -287,7 +287,7 @@ void harris(unsigned* corners_out, harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned)); // Sort Harris responses - sort0Index(harris_resp, harris_idx); + sort0ByKey(harris_resp, harris_idx); x_out.data = bufferAlloc(*corners_out * sizeof(float)); y_out.data = bufferAlloc(*corners_out * sizeof(float)); diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp index 317bb4e3d8..612edacfe0 100644 --- a/src/backend/opencl/kernel/orb.hpp +++ b/src/backend/opencl/kernel/orb.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -305,7 +305,7 @@ void orb(unsigned* out_feat, d_harris_sorted.data = d_score_harris; d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned)); - sort0Index(d_harris_sorted, d_harris_idx); + sort0ByKey(d_harris_sorted, d_harris_idx); cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float)); diff --git a/src/backend/opencl/kernel/sift_nonfree.hpp b/src/backend/opencl/kernel/sift_nonfree.hpp index c28f432fce..0b78ef02f3 100644 --- a/src/backend/opencl/kernel/sift_nonfree.hpp +++ b/src/backend/opencl/kernel/sift_nonfree.hpp @@ -92,11 +92,12 @@ #include #include #include -#include #include #include #include +namespace compute = boost::compute; + using cl::Buffer; using cl::Program; using cl::Kernel; diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp index 6ba9eff0ae..7f500c6bb0 100644 --- a/src/backend/opencl/kernel/sort_helper.hpp +++ b/src/backend/opencl/kernel/sort_helper.hpp @@ -8,7 +8,7 @@ ********************************************************/ #pragma once -#include +#include #include #include #include @@ -82,8 +82,8 @@ namespace opencl { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; - static std::map makePairProgs; - static std::map makePairKernels; + static std::map sortPairProgs; + static std::map sortPairKernels; int device = getActiveDeviceId(); @@ -99,13 +99,13 @@ namespace opencl options << " -D USE_DOUBLE"; } Program prog; - buildProgram(prog, sort_make_pair_cl, sort_make_pair_cl_len, options.str()); - makePairProgs[device] = new Program(prog); - makePairKernels[device] = new Kernel(*makePairProgs[device], "make_pair_kernel"); + buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str()); + sortPairProgs[device] = new Program(prog); + sortPairKernels[device] = new Kernel(*sortPairProgs[device], "make_pair_kernel"); }); auto makePairOp = make_kernel - (*makePairKernels[device]); + (*sortPairKernels[device]); NDRange local(256, 1, 1); NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); @@ -124,8 +124,8 @@ namespace opencl { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; - static std::map splitPairProgs; - static std::map splitPairKernels; + static std::map sortPairProgs; + static std::map sortPairKernels; int device = getActiveDeviceId(); @@ -141,13 +141,13 @@ namespace opencl options << " -D USE_DOUBLE"; } Program prog; - buildProgram(prog, sort_make_pair_cl, sort_make_pair_cl_len, options.str()); - splitPairProgs[device] = new Program(prog); - splitPairKernels[device] = new Kernel(*splitPairProgs[device], "split_pair_kernel"); + buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str()); + sortPairProgs[device] = new Program(prog); + sortPairKernels[device] = new Kernel(*sortPairProgs[device], "split_pair_kernel"); }); auto splitPairOp = make_kernel - (*splitPairKernels[device]); + (*sortPairKernels[device]); NDRange local(256, 1, 1); NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp deleted file mode 100644 index ef5faa612f..0000000000 --- a/src/backend/opencl/kernel/sort_index.hpp +++ /dev/null @@ -1,199 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace compute = boost::compute; - -using cl::Buffer; -using cl::Program; -using cl::Kernel; -using cl::make_kernel; -using cl::EnqueueArgs; -using cl::NDRange; -using std::string; - -namespace opencl -{ - namespace kernel - { - template - void sort0IndexIterative(Param val, Param idx) - { - try { - compute::command_queue c_queue(getQueue()()); - - compute::buffer val_buf((*val.data)()); - compute::buffer idx_buf((*idx.data)()); - - for(int w = 0; w < (int)val.info.dims[3]; w++) { - int valW = w * (int)val.info.strides[3]; - int idxW = w * idx.info.strides[3]; - for(int z = 0; z < (int)val.info.dims[2]; z++) { - int valWZ = valW + z * (int)val.info.strides[2]; - int idxWZ = idxW + z * idx.info.strides[2]; - for(int y = 0; y < (int)val.info.dims[1]; y++) { - - int valOffset = valWZ + y * val.info.strides[1]; - int idxOffset = idxWZ + y * idx.info.strides[1]; - - if(isAscending) { - compute::sort_by_key( - compute::make_buffer_iterator< type_t >(val_buf, valOffset), - compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), - compute::make_buffer_iterator< type_t >(idx_buf, idxOffset), - compute::less< type_t >(), c_queue); - } else { - compute::sort_by_key( - compute::make_buffer_iterator< type_t >(val_buf, valOffset), - compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), - compute::make_buffer_iterator< type_t >(idx_buf, idxOffset), - compute::greater< type_t >(), c_queue); - } - } - } - } - - CL_DEBUG_FINISH(getQueue()); - } catch (cl::Error err) { - CL_TO_AF_ERROR(err); - throw; - } - } - - template - void sortIndexBatched(Param pVal, Param pIdx) - { - typedef type_t Tk; - typedef uint Tv; - typedef std::pair IndexPair; - - try { - af::dim4 inDims; - for(int i = 0; i < 4; i++) - inDims[i] = pVal.info.dims[i]; - - // Sort dimension - // tileDims * seqDims = inDims - af::dim4 tileDims(1); - af::dim4 seqDims = inDims; - tileDims[dim] = inDims[dim]; - seqDims[dim] = 1; - - // Create/call iota - // Array key = iota(seqDims, tileDims); - dim4 keydims = inDims; - cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(Tv)); - Param pSeq; - pSeq.data = key; - pSeq.info.offset = 0; - pSeq.info.dims[0] = keydims[0]; - pSeq.info.strides[0] = 1; - for(int i = 1; i < 4; i++) { - pSeq.info.dims[i] = keydims[i]; - pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1]; - } - kernel::iota(pSeq, seqDims, tileDims); - - int elements = inDims.elements(); - - // Flat - Not required since inplace and both are continuous - //val.modDims(inDims.elements()); - //key.modDims(inDims.elements()); - - // Sort indices - // sort_by_key(*resVal, *resKey, val, key, 0); - //kernel::sort0_by_key(pVal, pKey); - compute::command_queue c_queue(getQueue()()); - compute::context c_context(getContext()()); - - // Create buffer iterators for seq - compute::buffer pSeq_buf((*pSeq.data)()); - compute::buffer_iterator seq0 = compute::make_buffer_iterator(pSeq_buf, 0); - compute::buffer_iterator seqN = compute::make_buffer_iterator(pSeq_buf, elements); - - // Copy val, idx into X pair - cl::Buffer* X = bufferAlloc(elements * sizeof(IndexPair)); - // Use T here, not Tk - kernel::makePair(X, pVal.data, pIdx.data, elements); - compute::buffer X_buf((*X)()); - compute::buffer_iterator X0 = compute::make_buffer_iterator(X_buf, 0); - compute::buffer_iterator XN = compute::make_buffer_iterator(X_buf, elements); - - // FIRST SORT CALL - compute::function IPCompare = - makeCompareFunction(); - - compute::sort_by_key(X0, XN, seq0, IPCompare, c_queue); - getQueue().finish(); - - // Needs to be ascending (true) in order to maintain the indices properly - //kernel::sort0_by_key(pKey, pVal); - // - // Because we use a pair as values, we need to use a custom comparator - BOOST_COMPUTE_FUNCTION(bool, Compare_Tv, (const Tv lhs, const Tv rhs), - { - return lhs < rhs; - } - ); - compute::sort_by_key(seq0, seqN, X0, Compare_Tv, c_queue); - getQueue().finish(); - - kernel::splitPair(pVal.data, pIdx.data, X, elements); - - //// No need of doing moddims here because the original Array - //// dimensions have not been changed - ////val.modDims(inDims); - - CL_DEBUG_FINISH(getQueue()); - bufferFree(key); - bufferFree(X); - } catch (cl::Error err) { - CL_TO_AF_ERROR(err); - throw; - } - } - - template - void sort0Index(Param val, Param idx) - { - int higherDims = val.info.dims[1] * val.info.dims[2] * val.info.dims[3]; - // TODO Make a better heurisitic - if(higherDims > 5) - sortIndexBatched(val, idx); - else - kernel::sort0IndexIterative(val, idx); - } - } -} - -#pragma GCC diagnostic pop diff --git a/src/backend/opencl/kernel/sort_make_pair.cl b/src/backend/opencl/kernel/sort_pair.cl similarity index 100% rename from src/backend/opencl/kernel/sort_make_pair.cl rename to src/backend/opencl/kernel/sort_pair.cl diff --git a/src/backend/opencl/sort_index.cpp b/src/backend/opencl/sort_index.cpp index 49795c2eb5..bb5474909d 100644 --- a/src/backend/opencl/sort_index.cpp +++ b/src/backend/opencl/sort_index.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include @@ -20,36 +20,37 @@ namespace opencl { template - void sort_index(Array &val, Array &idx, const Array &in, const uint dim) + void sort_index(Array &okey, Array &oval, const Array &in, const uint dim) { try { - val = copyArray(in); - idx = range(in.dims(), dim); - idx.eval(); + // okey contains values, oval contains indices + okey = copyArray(in); + oval = range(in.dims(), dim); + oval.eval(); switch(dim) { - case 0: kernel::sort0Index(val, idx); break; - case 1: kernel::sortIndexBatched(val, idx); break; - case 2: kernel::sortIndexBatched(val, idx); break; - case 3: kernel::sortIndexBatched(val, idx); break; + case 0: kernel::sort0ByKey(okey, oval); break; + case 1: kernel::sortByKeyBatched(okey, oval); break; + case 2: kernel::sortByKeyBatched(okey, oval); break; + case 3: kernel::sortByKeyBatched(okey, oval); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } if(dim != 0) { - af::dim4 preorderDims = val.dims(); + af::dim4 preorderDims = okey.dims(); af::dim4 reorderDims(0, 1, 2, 3); reorderDims[dim] = 0; - preorderDims[0] = val.dims()[dim]; + preorderDims[0] = okey.dims()[dim]; for(int i = 1; i <= (int)dim; i++) { reorderDims[i - 1] = i; - preorderDims[i] = val.dims()[i - 1]; + preorderDims[i] = okey.dims()[i - 1]; } - val.setDataDims(preorderDims); - idx.setDataDims(preorderDims); + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); - val = reorder(val, reorderDims); - idx = reorder(idx, reorderDims); + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); } } catch (std::exception &ex) { AF_ERROR(ex.what(), AF_ERR_INTERNAL); From 87513e07d16958cceed4b896ad135c55e4316028 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 13:42:54 -0400 Subject: [PATCH 19/28] Fix sort calls from harris and orb in CUDA --- src/backend/cuda/kernel/harris.hpp | 3 +++ src/backend/cuda/kernel/orb.hpp | 5 ++++- src/backend/cuda/kernel/range.hpp | 14 +++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp index c773ae45c5..9361b72e23 100644 --- a/src/backend/cuda/kernel/harris.hpp +++ b/src/backend/cuda/kernel/harris.hpp @@ -19,6 +19,7 @@ #include "convolve.hpp" #include "gradient.hpp" #include "sort_by_key.hpp" +#include "range.hpp" namespace cuda { @@ -336,7 +337,9 @@ void harris(unsigned* corners_out, int sort_elem = harris_responses.strides[3] * harris_responses.dims[3]; harris_responses.ptr = d_resp_corners; + // Create indices using range harris_idx.ptr = memAlloc(sort_elem); + kernel::range(harris_idx, 0); // Sort Harris responses sort0ByKey(harris_responses, harris_idx); diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp index b5ed10340e..8a2b535cee 100644 --- a/src/backend/cuda/kernel/orb.hpp +++ b/src/backend/cuda/kernel/orb.hpp @@ -17,6 +17,7 @@ #include "convolve.hpp" #include "orb_patch.hpp" #include "sort_by_key.hpp" +#include "range.hpp" #include @@ -394,10 +395,12 @@ void orb(unsigned* out_feat, int sort_elem = harris_sorted.strides[3] * harris_sorted.dims[3]; harris_sorted.ptr = d_score_harris; + // Create indices using range harris_idx.ptr = memAlloc(sort_elem); + kernel::range(harris_idx, 0); // Sort features according to Harris responses - sort0ByKey(harris_sorted, harris_idx); + kernel::sort0ByKey(harris_sorted, harris_idx); feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]); diff --git a/src/backend/cuda/kernel/range.hpp b/src/backend/cuda/kernel/range.hpp index 9670b07bd6..6880ed566a 100644 --- a/src/backend/cuda/kernel/range.hpp +++ b/src/backend/cuda/kernel/range.hpp @@ -18,10 +18,10 @@ namespace cuda namespace kernel { // Kernel Launch Config Values - static const unsigned TX = 32; - static const unsigned TY = 8; - static const unsigned TILEX = 512; - static const unsigned TILEY = 32; + static const unsigned RANGE_TX = 32; + static const unsigned RANGE_TY = 8; + static const unsigned RANGE_TILEX = 512; + static const unsigned RANGE_TILEY = 32; template __global__ @@ -74,10 +74,10 @@ namespace cuda template void range(Param out, const int dim) { - dim3 threads(TX, TY, 1); + dim3 threads(RANGE_TX, RANGE_TY, 1); - int blocksPerMatX = divup(out.dims[0], TILEX); - int blocksPerMatY = divup(out.dims[1], TILEY); + int blocksPerMatX = divup(out.dims[0], RANGE_TILEX); + int blocksPerMatY = divup(out.dims[1], RANGE_TILEY); dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1); From c6e08d5f14ca6a3d8f51f19c98c2223e2a158251 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 13:46:51 -0400 Subject: [PATCH 20/28] Fix sort calls from harris and orb in OpenCL --- src/backend/opencl/kernel/harris.hpp | 5 ++++- src/backend/opencl/kernel/orb.hpp | 5 ++++- src/backend/opencl/kernel/range.hpp | 14 +++++++------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp index 4c203c9377..442275d326 100644 --- a/src/backend/opencl/kernel/harris.hpp +++ b/src/backend/opencl/kernel/harris.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -284,10 +285,12 @@ void harris(unsigned* corners_out, int sort_elem = harris_resp.info.strides[3] * harris_resp.info.dims[3]; harris_resp.data = d_resp_corners; + // Create indices using range harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned)); + kernel::range(harris_idx, 0); // Sort Harris responses - sort0ByKey(harris_resp, harris_idx); + kernel::sort0ByKey(harris_resp, harris_idx); x_out.data = bufferAlloc(*corners_out * sizeof(float)); y_out.data = bufferAlloc(*corners_out * sizeof(float)); diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp index 612edacfe0..0c752d2c21 100644 --- a/src/backend/opencl/kernel/orb.hpp +++ b/src/backend/opencl/kernel/orb.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -303,9 +304,11 @@ void orb(unsigned* out_feat, d_harris_sorted.info.offset = 0; d_harris_idx.info.offset = 0; d_harris_sorted.data = d_score_harris; + // Create indices using range d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned)); + kernel::range(d_harris_idx, 0); - sort0ByKey(d_harris_sorted, d_harris_idx); + kernel::sort0ByKey(d_harris_sorted, d_harris_idx); cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float)); diff --git a/src/backend/opencl/kernel/range.hpp b/src/backend/opencl/kernel/range.hpp index 2f8be8cd4a..0299c030d4 100644 --- a/src/backend/opencl/kernel/range.hpp +++ b/src/backend/opencl/kernel/range.hpp @@ -31,10 +31,10 @@ namespace opencl namespace kernel { // Kernel Launch Config Values - static const int TX = 32; - static const int TY = 8; - static const int TILEX = 512; - static const int TILEY = 32; + static const int RANGE_TX = 32; + static const int RANGE_TY = 8; + static const int RANGE_TILEX = 512; + static const int RANGE_TILEY = 32; template void range(Param out, const int dim) @@ -62,10 +62,10 @@ namespace opencl auto rangeOp = make_kernel (*rangeKernels[device]); - NDRange local(TX, TY, 1); + NDRange local(RANGE_TX, RANGE_TY, 1); - int blocksPerMatX = divup(out.info.dims[0], TILEX); - int blocksPerMatY = divup(out.info.dims[1], TILEY); + int blocksPerMatX = divup(out.info.dims[0], RANGE_TILEX); + int blocksPerMatY = divup(out.info.dims[1], RANGE_TILEY); NDRange global(local[0] * blocksPerMatX * out.info.dims[2], local[1] * blocksPerMatY * out.info.dims[3], 1); From 8f48cdcd6a3cdc48ed050f0372a75c9a255b521d Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 15:02:26 -0400 Subject: [PATCH 21/28] Clean up sort tests --- test/sort.cpp | 2 +- test/sort_by_key.cpp | 2 +- test/sort_index.cpp | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/sort.cpp b/test/sort.cpp index 977b54b5c7..9a496f3236 100644 --- a/test/sort.cpp +++ b/test/sort.cpp @@ -106,7 +106,7 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, bool SORT_INIT(Sort1000False, sort_1000, false, 2); SORT_INIT(SortMedTrue, sort_med1, true, 0); SORT_INIT(SortMedFalse, sort_med1, false, 2); - // Takes too much time in current implementation. Enable when everything is parallel + SORT_INIT(SortMed5True, sort_med, true, 0); SORT_INIT(SortMed5False, sort_med, false, 2); SORT_INIT(SortLargeTrue, sort_large, true, 0); diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp index cbb13b8785..dae46bef54 100644 --- a/test/sort_by_key.cpp +++ b/test/sort_by_key.cpp @@ -118,7 +118,7 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(SortMedTrue, sort_by_key_med, true, 0, 1); SORT_INIT(Sort1000False, sort_by_key_1000, false, 2, 3); SORT_INIT(SortMedFalse, sort_by_key_med, false, 2, 3); - // Takes too much time in current implementation. Enable when everything is parallel + SORT_INIT(SortLargeTrue, sort_by_key_large, true, 0, 1); SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3); diff --git a/test/sort_index.cpp b/test/sort_index.cpp index eed85047bf..fe11462310 100644 --- a/test/sort_index.cpp +++ b/test/sort_index.cpp @@ -119,11 +119,11 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(SortMedTrue, sort_med1, true, 0, 1); SORT_INIT(Sort1000False, sort_1000, false, 2, 3); SORT_INIT(SortMedFalse, sort_med1, false, 2, 3); - // Takes too much time in current implementation. Enable when everything is parallel - //SORT_INIT(SortMed5True, sort_med, true, 0, 1); - //SORT_INIT(SortMed5False, sort_med, false, 2, 3); - //SORT_INIT(SortLargeTrue, sort_large, true, 0, 1); - //SORT_INIT(SortLargeFalse, sort_large, false, 2, 3); + + SORT_INIT(SortMed5True, sort_med, true, 0, 1); + SORT_INIT(SortMed5False, sort_med, false, 2, 3); + SORT_INIT(SortLargeTrue, sort_large, true, 0, 1); + SORT_INIT(SortLargeFalse, sort_large, false, 2, 3); //////////////////////////////////// CPP ///////////////////////////////// From b6a6a8732bb43e257c0744a6c183f7d602f8d56d Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 18:56:16 -0400 Subject: [PATCH 22/28] Fixed sort_by_key kernel for OpenCL --- src/backend/opencl/kernel/sort_by_key.hpp | 70 +++++++++++++---------- src/backend/opencl/kernel/sort_helper.hpp | 33 +++++++++++ test/sort_index.cpp | 2 +- 3 files changed, 75 insertions(+), 30 deletions(-) diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp index 33a020712e..6984603882 100644 --- a/src/backend/opencl/kernel/sort_by_key.hpp +++ b/src/backend/opencl/kernel/sort_by_key.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -91,7 +92,6 @@ namespace opencl { typedef type_t Tk; typedef type_t Tv; - typedef std::pair IndexPair; try { af::dim4 inDims; @@ -136,35 +136,46 @@ namespace opencl compute::buffer pSeq_buf((*pSeq.data)()); compute::buffer_iterator seq0 = compute::make_buffer_iterator(pSeq_buf, 0); compute::buffer_iterator seqN = compute::make_buffer_iterator(pSeq_buf, elements); - - // Copy key, val into X pair - cl::Buffer* X = bufferAlloc(elements * sizeof(IndexPair)); - // Use Tk_ and Tv_ here, not Tk and Tv - kernel::makePair(X, pKey.data, pVal.data, elements); - compute::buffer X_buf((*X)()); - compute::buffer_iterator X0 = compute::make_buffer_iterator(X_buf, 0); - compute::buffer_iterator XN = compute::make_buffer_iterator(X_buf, elements); - - // FIRST SORT CALL - compute::function IPCompare = - makeCompareFunction(); - - compute::sort_by_key(X0, XN, seq0, IPCompare, c_queue); - getQueue().finish(); - + // Create buffer iterators for key and val + compute::buffer pKey_buf((*pKey.data)()); + compute::buffer pVal_buf((*pVal.data)()); + compute::buffer_iterator key0 = compute::make_buffer_iterator(pKey_buf, 0); + compute::buffer_iterator keyN = compute::make_buffer_iterator(pKey_buf, elements); + compute::buffer_iterator val0 = compute::make_buffer_iterator(pVal_buf, 0); + compute::buffer_iterator valN = compute::make_buffer_iterator(pVal_buf, elements); + + // Sort By Key for descending is stable in the reverse + // (greater) order. Sorting in ascending with negated values + // will give the right result + if(!isAscending) compute::transform(key0, keyN, key0, flipFunction(), c_queue); + + // Create a copy of the pKey buffer + cl::Buffer* cKey = bufferAlloc(elements * sizeof(Tk)); + compute::buffer cKey_buf((*cKey)()); + compute::buffer_iterator cKey0 = compute::make_buffer_iterator(cKey_buf, 0); + compute::buffer_iterator cKeyN = compute::make_buffer_iterator(cKey_buf, elements); + compute::copy(key0, keyN, cKey0, c_queue); + + // FIRST SORT + compute::sort_by_key(key0, keyN, seq0, c_queue); + compute::sort_by_key(cKey0, cKeyN, val0, c_queue); + + // Create a copy of the seq buffer after first sort + cl::Buffer* cSeq = bufferAlloc(elements * sizeof(unsigned)); + compute::buffer cSeq_buf((*cSeq)()); + compute::buffer_iterator cSeq0 = compute::make_buffer_iterator(cSeq_buf, 0); + compute::buffer_iterator cSeqN = compute::make_buffer_iterator(cSeq_buf, elements); + compute::copy(seq0, seqN, cSeq0, c_queue); + + // SECOND SORT + // First call will sort key, second sort will sort val // Needs to be ascending (true) in order to maintain the indices properly //kernel::sort0_by_key(pKey, pVal); - // - // Because we use a pair as values, we need to use a custom comparator - BOOST_COMPUTE_FUNCTION(bool, Compare_Seq, (const unsigned lhs, const unsigned rhs), - { - return lhs < rhs; - } - ); - compute::sort_by_key(seq0, seqN, X0, Compare_Seq, c_queue); - getQueue().finish(); + compute::sort_by_key(seq0, seqN, key0, c_queue); + compute::sort_by_key(cSeq0, cSeqN, val0, c_queue); - kernel::splitPair(pKey.data, pVal.data, X, elements); + // If descending, flip it back + if(!isAscending) compute::transform(key0, keyN, key0, flipFunction(), c_queue); //// No need of doing moddims here because the original Array //// dimensions have not been changed @@ -172,7 +183,8 @@ namespace opencl CL_DEBUG_FINISH(getQueue()); bufferFree(key); - bufferFree(X); + bufferFree(cSeq); + bufferFree(cKey); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; @@ -184,7 +196,7 @@ namespace opencl { int higherDims = pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3]; // TODO Make a better heurisitic - if(higherDims > 5) + if(higherDims > 0) kernel::sortByKeyBatched(pKey, pVal); else kernel::sort0ByKeyIterative(pKey, pVal); diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp index 7f500c6bb0..899e0873fa 100644 --- a/src/backend/opencl/kernel/sort_helper.hpp +++ b/src/backend/opencl/kernel/sort_helper.hpp @@ -44,6 +44,39 @@ makeCompareFunction() } } +template +inline boost::compute::function +flipFunction() +{ + BOOST_COMPUTE_FUNCTION(Tk, negateFn, (const Tk x), + { + return -x; + } + ); + + return negateFn; +} + +#define INSTANTIATE_FLIP(TY, XMAX) \ +template<> inline boost::compute::function \ +flipFunction() \ +{ \ + BOOST_COMPUTE_FUNCTION(TY, negateFn, (const TY x), \ + { \ + return XMAX - x; \ + } \ + ); \ + \ + return negateFn; \ +} + +INSTANTIATE_FLIP(unsigned, UINT_MAX) +INSTANTIATE_FLIP(unsigned short, USHRT_MAX) +INSTANTIATE_FLIP(unsigned char, UCHAR_MAX) +INSTANTIATE_FLIP(cl_ulong, ULONG_MAX) + +#undef INSTANTIATE_FLIP + namespace opencl { namespace kernel diff --git a/test/sort_index.cpp b/test/sort_index.cpp index fe11462310..0df4744c02 100644 --- a/test/sort_index.cpp +++ b/test/sort_index.cpp @@ -258,7 +258,7 @@ TEST(SortIndex, CPPDim2) // Compare result for (size_t elIter = 0; elIter < nElems; ++elIter) { - EXPECT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl; + ASSERT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl; } // Delete From 75b1a6bc0f428fd43f3189239aa355cce9f50e1a Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 20:10:20 -0400 Subject: [PATCH 23/28] Instantiate sort_by_key kernels in separately --- src/backend/cuda/CMakeLists.txt | 2 +- src/backend/cuda/kernel/iota.hpp | 3 +- src/backend/cuda/kernel/sort_by_key.hpp | 119 +--------- .../cuda/{ => kernel}/sort_by_key/ascd_f32.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_f64.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_s16.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_s32.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_s64.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_s8.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_u16.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_u32.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_u64.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/ascd_u8.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_f32.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_f64.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_s16.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_s32.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_s64.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_s8.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_u16.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_u32.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_u64.cu | 5 +- .../cuda/{ => kernel}/sort_by_key/desc_u8.cu | 5 +- src/backend/cuda/kernel/sort_by_key_impl.hpp | 213 ++++++++++++++++++ src/backend/cuda/kernel/sort_helper.hpp | 66 ------ .../{sort_by_key_impl.hpp => sort_by_key.cu} | 48 ++-- 26 files changed, 331 insertions(+), 220 deletions(-) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_f32.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_f64.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_s16.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_s32.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_s64.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_s8.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_u16.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_u32.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_u64.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_u8.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_f32.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_f64.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_s16.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_s32.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_s64.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_s8.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_u16.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_u32.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_u64.cu (86%) rename src/backend/cuda/{ => kernel}/sort_by_key/desc_u8.cu (86%) create mode 100644 src/backend/cuda/kernel/sort_by_key_impl.hpp delete mode 100644 src/backend/cuda/kernel/sort_helper.hpp rename src/backend/cuda/{sort_by_key_impl.hpp => sort_by_key.cu} (63%) diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index b887a98d67..4efb42764a 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -158,7 +158,7 @@ FILE(GLOB cuda_headers FILE(GLOB cuda_sources "*.cu" "*.cpp" - "sort_by_key/*.cu" + "kernel/sort_by_key/*.cu" "kernel/*.cu") FILE(GLOB jit_sources diff --git a/src/backend/cuda/kernel/iota.hpp b/src/backend/cuda/kernel/iota.hpp index fc28c82882..e2f7e591fb 100644 --- a/src/backend/cuda/kernel/iota.hpp +++ b/src/backend/cuda/kernel/iota.hpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#include #include #include #include @@ -69,7 +70,7 @@ namespace cuda // Wrapper functions /////////////////////////////////////////////////////////////////////////// template - void iota(Param out, const dim4 &sdims, const dim4 &tdims) + void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims) { dim3 threads(IOTA_TX, IOTA_TY, 1); diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp index 1536d1bf53..35250a8ad1 100644 --- a/src/backend/cuda/kernel/sort_by_key.hpp +++ b/src/backend/cuda/kernel/sort_by_key.hpp @@ -12,130 +12,19 @@ #include #include #include -#include -#include - -#include -#include -#include namespace cuda { namespace kernel { - /////////////////////////////////////////////////////////////////////////// - // Wrapper functions - /////////////////////////////////////////////////////////////////////////// template - void sort0ByKeyIterative(Param okey, Param oval) - { - thrust::device_ptr okey_ptr = thrust::device_pointer_cast(okey.ptr); - thrust::device_ptr oval_ptr = thrust::device_pointer_cast(oval.ptr); - - for(int w = 0; w < okey.dims[3]; w++) { - int okeyW = w * okey.strides[3]; - int ovalW = w * oval.strides[3]; - for(int z = 0; z < okey.dims[2]; z++) { - int okeyWZ = okeyW + z * okey.strides[2]; - int ovalWZ = ovalW + z * oval.strides[2]; - for(int y = 0; y < okey.dims[1]; y++) { - - int okeyOffset = okeyWZ + y * okey.strides[1]; - int ovalOffset = ovalWZ + y * oval.strides[1]; - - if(isAscending) { - THRUST_SELECT(thrust::stable_sort_by_key, - okey_ptr + okeyOffset, - okey_ptr + okeyOffset + okey.dims[0], - oval_ptr + ovalOffset); - } else { - THRUST_SELECT(thrust::stable_sort_by_key, - okey_ptr + okeyOffset, - okey_ptr + okeyOffset + okey.dims[0], - oval_ptr + ovalOffset, thrust::greater()); - } - } - } - } - POST_LAUNCH_CHECK(); - } + void sort0ByKeyIterative(Param okey, Param oval); template - void sortByKeyBatched(Param pKey, Param pVal) - { - af::dim4 inDims; - for(int i = 0; i < 4; i++) - inDims[i] = pKey.dims[i]; - - // Sort dimension - // tileDims * seqDims = inDims - af::dim4 tileDims(1); - af::dim4 seqDims = inDims; - tileDims[dim] = inDims[dim]; - seqDims[dim] = 1; - - // Create/call iota - // Array key = iota(seqDims, tileDims); - dim4 keydims = inDims; - uint* key = memAlloc(keydims.elements()); - Param pSeq; - pSeq.ptr = key; - pSeq.strides[0] = 1; - pSeq.dims[0] = keydims[0]; - for(int i = 1; i < 4; i++) { - pSeq.dims[i] = keydims[i]; - pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1]; - } - cuda::kernel::iota(pSeq, seqDims, tileDims); - - // Make pkey, pVal into a pair - thrust::device_vector > X(inDims.elements()); - IndexPair *Xptr = thrust::raw_pointer_cast(X.data()); - - const int threads = 256; - int blocks = divup(inDims.elements(), threads * copyPairIter); - CUDA_LAUNCH((makeIndexPair), blocks, threads, - Xptr, pKey.ptr, pVal.ptr, inDims.elements()); - POST_LAUNCH_CHECK(); - - // Sort indices - // Need to convert pSeq to thrust::device_ptr, otherwise thrust - // throws weird errors for all *64 data types (double, intl, uintl etc) - thrust::device_ptr dSeq = thrust::device_pointer_cast(pSeq.ptr); - THRUST_SELECT(thrust::stable_sort_by_key, - X.begin(), X.end(), - dSeq, - IPCompare()); - POST_LAUNCH_CHECK(); - - // Needs to be ascending (true) in order to maintain the indices properly - //kernel::sort0_by_key(pKey, pVal); - THRUST_SELECT(thrust::stable_sort_by_key, - dSeq, - dSeq + inDims.elements(), - X.begin()); - POST_LAUNCH_CHECK(); - - CUDA_LAUNCH((splitIndexPair), blocks, threads, - pKey.ptr, pVal.ptr, Xptr, inDims.elements()); - POST_LAUNCH_CHECK(); - - // No need of doing moddims here because the original Array - // dimensions have not been changed - //val.modDims(inDims); - - memFree(key); - } + void sortByKeyBatched(Param pKey, Param pVal); template - void sort0ByKey(Param okey, Param oval) - { - int higherDims = okey.dims[1] * okey.dims[2] * okey.dims[3]; - // TODO Make a better heurisitic - if(higherDims > 5) - kernel::sortByKeyBatched(okey, oval); - else - kernel::sort0ByKeyIterative(okey, oval); - } + void sort0ByKey(Param okey, Param oval); + } } diff --git a/src/backend/cuda/sort_by_key/ascd_f32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_f32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_f32.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_f32.cu index 44b770402c..284e8b4938 100644 --- a/src/backend/cuda/sort_by_key/ascd_f32.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_f32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(float, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_f64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_f64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_f64.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_f64.cu index 17b54a3903..ba19ec447c 100644 --- a/src/backend/cuda/sort_by_key/ascd_f64.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_f64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(double, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_s16.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s16.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_s16.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_s16.cu index d51e9ae671..1be6e540ca 100644 --- a/src/backend/cuda/sort_by_key/ascd_s16.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_s16.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(short, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_s32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_s32.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_s32.cu index 75adbddc0b..8cee7c9b49 100644 --- a/src/backend/cuda/sort_by_key/ascd_s32.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_s32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(int, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_s64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_s64.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_s64.cu index 25a1e589f8..0e5a7c81a2 100644 --- a/src/backend/cuda/sort_by_key/ascd_s64.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_s64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(intl, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_s8.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s8.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_s8.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_s8.cu index f47a397727..81ed32952f 100644 --- a/src/backend/cuda/sort_by_key/ascd_s8.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_s8.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(char, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_u16.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u16.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_u16.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_u16.cu index e06036abc7..e232c08376 100644 --- a/src/backend/cuda/sort_by_key/ascd_u16.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_u16.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(ushort, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_u32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_u32.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_u32.cu index 6f7939aa12..34a4580936 100644 --- a/src/backend/cuda/sort_by_key/ascd_u32.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_u32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uint, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_u64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_u64.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_u64.cu index 63eec5fdd4..fc576e7f99 100644 --- a/src/backend/cuda/sort_by_key/ascd_u64.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_u64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uintl, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_u8.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u8.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_u8.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_u8.cu index a2e1dec887..ed8454d53e 100644 --- a/src/backend/cuda/sort_by_key/ascd_u8.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_u8.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uchar, true) } +} diff --git a/src/backend/cuda/sort_by_key/desc_f32.cu b/src/backend/cuda/kernel/sort_by_key/desc_f32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_f32.cu rename to src/backend/cuda/kernel/sort_by_key/desc_f32.cu index 1bbb10bbba..73459ac033 100644 --- a/src/backend/cuda/sort_by_key/desc_f32.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_f32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(float, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_f64.cu b/src/backend/cuda/kernel/sort_by_key/desc_f64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_f64.cu rename to src/backend/cuda/kernel/sort_by_key/desc_f64.cu index ecbed78878..be0536b1e3 100644 --- a/src/backend/cuda/sort_by_key/desc_f64.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_f64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(double, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_s16.cu b/src/backend/cuda/kernel/sort_by_key/desc_s16.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_s16.cu rename to src/backend/cuda/kernel/sort_by_key/desc_s16.cu index 63967b6117..0fc3b50827 100644 --- a/src/backend/cuda/sort_by_key/desc_s16.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_s16.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(short, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_s32.cu b/src/backend/cuda/kernel/sort_by_key/desc_s32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_s32.cu rename to src/backend/cuda/kernel/sort_by_key/desc_s32.cu index 49904437f4..cfda29c7de 100644 --- a/src/backend/cuda/sort_by_key/desc_s32.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_s32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(int, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_s64.cu b/src/backend/cuda/kernel/sort_by_key/desc_s64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_s64.cu rename to src/backend/cuda/kernel/sort_by_key/desc_s64.cu index a10ee11475..b334a91a99 100644 --- a/src/backend/cuda/sort_by_key/desc_s64.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_s64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(intl, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_s8.cu b/src/backend/cuda/kernel/sort_by_key/desc_s8.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_s8.cu rename to src/backend/cuda/kernel/sort_by_key/desc_s8.cu index cad78dfc84..f02d5ce2fe 100644 --- a/src/backend/cuda/sort_by_key/desc_s8.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_s8.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(char, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_u16.cu b/src/backend/cuda/kernel/sort_by_key/desc_u16.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_u16.cu rename to src/backend/cuda/kernel/sort_by_key/desc_u16.cu index 69dc01634b..9b0a77cb25 100644 --- a/src/backend/cuda/sort_by_key/desc_u16.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_u16.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(ushort, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_u32.cu b/src/backend/cuda/kernel/sort_by_key/desc_u32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_u32.cu rename to src/backend/cuda/kernel/sort_by_key/desc_u32.cu index ae2ad4bc84..1d02aec848 100644 --- a/src/backend/cuda/sort_by_key/desc_u32.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_u32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uint, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_u64.cu b/src/backend/cuda/kernel/sort_by_key/desc_u64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_u64.cu rename to src/backend/cuda/kernel/sort_by_key/desc_u64.cu index 43f60c075b..597bd2c1b4 100644 --- a/src/backend/cuda/sort_by_key/desc_u64.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_u64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uintl, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_u8.cu b/src/backend/cuda/kernel/sort_by_key/desc_u8.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_u8.cu rename to src/backend/cuda/kernel/sort_by_key/desc_u8.cu index 51d8096620..4f55479604 100644 --- a/src/backend/cuda/sort_by_key/desc_u8.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_u8.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uchar, false) } +} diff --git a/src/backend/cuda/kernel/sort_by_key_impl.hpp b/src/backend/cuda/kernel/sort_by_key_impl.hpp new file mode 100644 index 0000000000..66a6087401 --- /dev/null +++ b/src/backend/cuda/kernel/sort_by_key_impl.hpp @@ -0,0 +1,213 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +// This needs to be in global namespace as it is used by thrust +template +struct IndexPair +{ + Tk first; + Tv second; +}; + +template +struct IPCompare +{ + __host__ __device__ + bool operator()(const IndexPair &lhs, const IndexPair &rhs) const + { + // Check stable sort condition + if(isAscending) return (lhs.first < rhs.first); + else return (lhs.first > rhs.first); + } +}; + +namespace cuda +{ + namespace kernel + { + static const int copyPairIter = 4; + + template + __global__ + void makeIndexPair(IndexPair *out, const Tk *first, const Tv *second, const int N) + { + int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; + + for(int i = tIdx; i < N; i += blockDim.x) + { + out[i].first = first[i]; + out[i].second = second[i]; + } + } + + template + __global__ + void splitIndexPair(Tk *first, Tv *second, const IndexPair *out, const int N) + { + int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; + + for(int i = tIdx; i < N; i += blockDim.x) + { + first[i] = out[i].first; + second[i] = out[i].second; + } + } + + /////////////////////////////////////////////////////////////////////////// + // Wrapper functions + /////////////////////////////////////////////////////////////////////////// + template + void sort0ByKeyIterative(Param okey, Param oval) + { + thrust::device_ptr okey_ptr = thrust::device_pointer_cast(okey.ptr); + thrust::device_ptr oval_ptr = thrust::device_pointer_cast(oval.ptr); + + for(int w = 0; w < okey.dims[3]; w++) { + int okeyW = w * okey.strides[3]; + int ovalW = w * oval.strides[3]; + for(int z = 0; z < okey.dims[2]; z++) { + int okeyWZ = okeyW + z * okey.strides[2]; + int ovalWZ = ovalW + z * oval.strides[2]; + for(int y = 0; y < okey.dims[1]; y++) { + + int okeyOffset = okeyWZ + y * okey.strides[1]; + int ovalOffset = ovalWZ + y * oval.strides[1]; + + if(isAscending) { + THRUST_SELECT(thrust::stable_sort_by_key, + okey_ptr + okeyOffset, + okey_ptr + okeyOffset + okey.dims[0], + oval_ptr + ovalOffset); + } else { + THRUST_SELECT(thrust::stable_sort_by_key, + okey_ptr + okeyOffset, + okey_ptr + okeyOffset + okey.dims[0], + oval_ptr + ovalOffset, thrust::greater()); + } + } + } + } + POST_LAUNCH_CHECK(); + } + + template + void sortByKeyBatched(Param pKey, Param pVal) + { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pKey.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + af::dim4 keydims = inDims; + uint* key = memAlloc(keydims.elements()); + Param pSeq; + pSeq.ptr = key; + pSeq.strides[0] = 1; + pSeq.dims[0] = keydims[0]; + for(int i = 1; i < 4; i++) { + pSeq.dims[i] = keydims[i]; + pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1]; + } + cuda::kernel::iota(pSeq, seqDims, tileDims); + + // Make pkey, pVal into a pair + thrust::device_vector > X(inDims.elements()); + IndexPair *Xptr = thrust::raw_pointer_cast(X.data()); + + const int threads = 256; + int blocks = divup(inDims.elements(), threads * copyPairIter); + CUDA_LAUNCH((makeIndexPair), blocks, threads, + Xptr, pKey.ptr, pVal.ptr, inDims.elements()); + POST_LAUNCH_CHECK(); + + // Sort indices + // Need to convert pSeq to thrust::device_ptr, otherwise thrust + // throws weird errors for all *64 data types (double, intl, uintl etc) + thrust::device_ptr dSeq = thrust::device_pointer_cast(pSeq.ptr); + THRUST_SELECT(thrust::stable_sort_by_key, + X.begin(), X.end(), + dSeq, + IPCompare()); + POST_LAUNCH_CHECK(); + + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + THRUST_SELECT(thrust::stable_sort_by_key, + dSeq, + dSeq + inDims.elements(), + X.begin()); + POST_LAUNCH_CHECK(); + + CUDA_LAUNCH((splitIndexPair), blocks, threads, + pKey.ptr, pVal.ptr, Xptr, inDims.elements()); + POST_LAUNCH_CHECK(); + + // No need of doing moddims here because the original Array + // dimensions have not been changed + //val.modDims(inDims); + + memFree(key); + } + + template + void sort0ByKey(Param okey, Param oval) + { + int higherDims = okey.dims[1] * okey.dims[2] * okey.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 5) + kernel::sortByKeyBatched(okey, oval); + else + kernel::sort0ByKeyIterative(okey, oval); + } + +#define INSTANTIATE(Tk, Tv, dr) \ + template void sort0ByKey(Param okey, Param oval); \ + template void sort0ByKeyIterative(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + +#define INSTANTIATE1(Tk , dr) \ + INSTANTIATE(Tk, float , dr) \ + INSTANTIATE(Tk, double , dr) \ + INSTANTIATE(Tk, cfloat , dr) \ + INSTANTIATE(Tk, cdouble, dr) \ + INSTANTIATE(Tk, int , dr) \ + INSTANTIATE(Tk, uint , dr) \ + INSTANTIATE(Tk, short , dr) \ + INSTANTIATE(Tk, ushort , dr) \ + INSTANTIATE(Tk, char , dr) \ + INSTANTIATE(Tk, uchar , dr) \ + INSTANTIATE(Tk, intl , dr) \ + INSTANTIATE(Tk, uintl , dr) + } +} diff --git a/src/backend/cuda/kernel/sort_helper.hpp b/src/backend/cuda/kernel/sort_helper.hpp deleted file mode 100644 index 93fb33ac8f..0000000000 --- a/src/backend/cuda/kernel/sort_helper.hpp +++ /dev/null @@ -1,66 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include -#include -#include - -// This needs to be in global namespace as it is used by thrust -template -struct IndexPair -{ - Tk first; - Tv second; -}; - -template -struct IPCompare -{ - __host__ __device__ - bool operator()(const IndexPair &lhs, const IndexPair &rhs) const - { - // Check stable sort condition - if(isAscending) return (lhs.first < rhs.first); - else return (lhs.first > rhs.first); - } -}; - -namespace cuda -{ - namespace kernel - { - static const int copyPairIter = 4; - - template - __global__ - void makeIndexPair(IndexPair *out, const Tk *first, const Tv *second, const int N) - { - int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; - - for(int i = tIdx; i < N; i += blockDim.x) - { - out[i].first = first[i]; - out[i].second = second[i]; - } - } - - template - __global__ - void splitIndexPair(Tk *first, Tv *second, const IndexPair *out, const int N) - { - int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; - - for(int i = tIdx; i < N; i += blockDim.x) - { - first[i] = out[i].first; - second[i] = out[i].second; - } - } - } -} diff --git a/src/backend/cuda/sort_by_key_impl.hpp b/src/backend/cuda/sort_by_key.cu similarity index 63% rename from src/backend/cuda/sort_by_key_impl.hpp rename to src/backend/cuda/sort_by_key.cu index 8cc86b55db..2d5d68eef0 100644 --- a/src/backend/cuda/sort_by_key_impl.hpp +++ b/src/backend/cuda/sort_by_key.cu @@ -51,22 +51,36 @@ namespace cuda } } -#define INSTANTIATE(Tk, Tv, dr) \ - template void \ - sort_by_key(Array &okey, Array &oval, \ - const Array &ikey, const Array &ival, const uint dim); \ +#define INSTANTIATE(Tk, Tv) \ + template void sort_by_key(Array &okey, Array &oval, \ + const Array &ikey, const Array &ival, const uint dim); \ + template void sort_by_key(Array &okey, Array &oval, \ + const Array &ikey, const Array &ival, const uint dim); \ + +#define INSTANTIATE1(Tk ) \ + INSTANTIATE(Tk, float ) \ + INSTANTIATE(Tk, double ) \ + INSTANTIATE(Tk, cfloat ) \ + INSTANTIATE(Tk, cdouble) \ + INSTANTIATE(Tk, int ) \ + INSTANTIATE(Tk, uint ) \ + INSTANTIATE(Tk, short ) \ + INSTANTIATE(Tk, ushort ) \ + INSTANTIATE(Tk, char ) \ + INSTANTIATE(Tk, uchar ) \ + INSTANTIATE(Tk, intl ) \ + INSTANTIATE(Tk, uintl ) + + +INSTANTIATE1(float ) +INSTANTIATE1(double) +INSTANTIATE1(int ) +INSTANTIATE1(uint ) +INSTANTIATE1(short ) +INSTANTIATE1(ushort) +INSTANTIATE1(char ) +INSTANTIATE1(uchar ) +INSTANTIATE1(intl ) +INSTANTIATE1(uintl ) -#define INSTANTIATE1(Tk , dr) \ - INSTANTIATE(Tk, float , dr) \ - INSTANTIATE(Tk, double , dr) \ - INSTANTIATE(Tk, cfloat , dr) \ - INSTANTIATE(Tk, cdouble, dr) \ - INSTANTIATE(Tk, int , dr) \ - INSTANTIATE(Tk, uint , dr) \ - INSTANTIATE(Tk, short , dr) \ - INSTANTIATE(Tk, ushort , dr) \ - INSTANTIATE(Tk, char , dr) \ - INSTANTIATE(Tk, uchar , dr) \ - INSTANTIATE(Tk, intl , dr) \ - INSTANTIATE(Tk, uintl , dr) } From bbdae15c0875d633d18fa2b5ed63122a14b6c3a2 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 20:37:31 -0400 Subject: [PATCH 24/28] Instantiate sort_by_key kernels in separately in opencl --- src/backend/opencl/CMakeLists.txt | 9 +- src/backend/opencl/kernel/iota.hpp | 3 +- src/backend/opencl/kernel/sort_by_key.hpp | 184 +-------- .../opencl/{ => kernel}/sort_by_key/b8.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/f32.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/f64.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/s16.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/s32.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/s64.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/u16.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/u32.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/u64.cpp | 5 +- .../opencl/{ => kernel}/sort_by_key/u8.cpp | 5 +- .../opencl/kernel/sort_by_key_impl.hpp | 373 ++++++++++++++++++ src/backend/opencl/kernel/sort_helper.hpp | 151 +------ .../{sort_by_key/impl.hpp => sort_by_key.cpp} | 48 ++- src/backend/opencl/traits.hpp | 1 + 17 files changed, 453 insertions(+), 366 deletions(-) rename src/backend/opencl/{ => kernel}/sort_by_key/b8.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/f32.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/f64.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/s16.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/s32.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/s64.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/u16.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/u32.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/u64.cpp (87%) rename src/backend/opencl/{ => kernel}/sort_by_key/u8.cpp (87%) create mode 100644 src/backend/opencl/kernel/sort_by_key_impl.hpp rename src/backend/opencl/{sort_by_key/impl.hpp => sort_by_key.cpp} (65%) diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index 2cb8ddfdf9..b7eb77ded5 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -111,12 +111,10 @@ ENDIF() FILE(GLOB opencl_headers "*.hpp" - "*.h" - "sort_by_key/*.hpp") + "*.h") FILE(GLOB opencl_sources - "*.cpp" - "sort_by_key/*.cpp") + "*.cpp") FILE(GLOB jit_sources "jit/*.hpp") @@ -128,7 +126,8 @@ FILE(GLOB opencl_kernels "kernel/*.cl") FILE(GLOB kernel_sources - "kernel/*.cpp") + "kernel/*.cpp" + "kernel/sort_by_key/*.cpp") FILE(GLOB conv_ker_headers "kernel/convolve/*.hpp") diff --git a/src/backend/opencl/kernel/iota.hpp b/src/backend/opencl/kernel/iota.hpp index 210b6b202e..7cd8046d68 100644 --- a/src/backend/opencl/kernel/iota.hpp +++ b/src/backend/opencl/kernel/iota.hpp @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include #include @@ -37,7 +38,7 @@ namespace opencl static const int TILEY = 32; template - void iota(Param out, const dim4 &sdims, const dim4 &tdims) + void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp index 6984603882..224f6411ff 100644 --- a/src/backend/opencl/kernel/sort_by_key.hpp +++ b/src/backend/opencl/kernel/sort_by_key.hpp @@ -8,200 +8,22 @@ ********************************************************/ #pragma once -#include #include -#include -#include #include #include #include -#include -#include -#include - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace compute = boost::compute; - -using cl::Buffer; -using cl::Program; -using cl::Kernel; -using cl::make_kernel; -using cl::EnqueueArgs; -using cl::NDRange; -using std::string; namespace opencl { namespace kernel { template - void sort0ByKeyIterative(Param pKey, Param pVal) - { - try { - compute::command_queue c_queue(getQueue()()); - - compute::buffer pKey_buf((*pKey.data)()); - compute::buffer pVal_buf((*pVal.data)()); - - for(int w = 0; w < pKey.info.dims[3]; w++) { - int pKeyW = w * pKey.info.strides[3]; - int pValW = w * pVal.info.strides[3]; - for(int z = 0; z < pKey.info.dims[2]; z++) { - int pKeyWZ = pKeyW + z * pKey.info.strides[2]; - int pValWZ = pValW + z * pVal.info.strides[2]; - for(int y = 0; y < pKey.info.dims[1]; y++) { - - int pKeyOffset = pKeyWZ + y * pKey.info.strides[1]; - int pValOffset = pValWZ + y * pVal.info.strides[1]; - - compute::buffer_iterator< type_t > start= compute::make_buffer_iterator< type_t >(pKey_buf, pKeyOffset); - compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(pKey_buf, pKeyOffset + pKey.info.dims[0]); - compute::buffer_iterator< type_t > vals = compute::make_buffer_iterator< type_t >(pVal_buf, pValOffset); - if(isAscending) { - compute::sort_by_key(start, end, vals, c_queue); - } else { - compute::sort_by_key(start, end, vals, - compute::greater< type_t >(), c_queue); - } - } - } - } - - CL_DEBUG_FINISH(getQueue()); - } catch (cl::Error err) { - CL_TO_AF_ERROR(err); - throw; - } - } + void sort0ByKeyIterative(Param pKey, Param pVal); template - void sortByKeyBatched(Param pKey, Param pVal) - { - typedef type_t Tk; - typedef type_t Tv; - - try { - af::dim4 inDims; - for(int i = 0; i < 4; i++) - inDims[i] = pKey.info.dims[i]; - - // Sort dimension - // tileDims * seqDims = inDims - af::dim4 tileDims(1); - af::dim4 seqDims = inDims; - tileDims[dim] = inDims[dim]; - seqDims[dim] = 1; - - // Create/call iota - // Array key = iota(seqDims, tileDims); - dim4 keydims = inDims; - cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(unsigned)); - Param pSeq; - pSeq.data = key; - pSeq.info.offset = 0; - pSeq.info.dims[0] = keydims[0]; - pSeq.info.strides[0] = 1; - for(int i = 1; i < 4; i++) { - pSeq.info.dims[i] = keydims[i]; - pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1]; - } - kernel::iota(pSeq, seqDims, tileDims); - - int elements = inDims.elements(); - - // Flat - Not required since inplace and both are continuous - //val.modDims(inDims.elements()); - //key.modDims(inDims.elements()); - - // Sort indices - // sort_by_key(*resVal, *resKey, val, key, 0); - //kernel::sort0_by_key(pVal, pKey); - compute::command_queue c_queue(getQueue()()); - compute::context c_context(getContext()()); - - // Create buffer iterators for seq - compute::buffer pSeq_buf((*pSeq.data)()); - compute::buffer_iterator seq0 = compute::make_buffer_iterator(pSeq_buf, 0); - compute::buffer_iterator seqN = compute::make_buffer_iterator(pSeq_buf, elements); - // Create buffer iterators for key and val - compute::buffer pKey_buf((*pKey.data)()); - compute::buffer pVal_buf((*pVal.data)()); - compute::buffer_iterator key0 = compute::make_buffer_iterator(pKey_buf, 0); - compute::buffer_iterator keyN = compute::make_buffer_iterator(pKey_buf, elements); - compute::buffer_iterator val0 = compute::make_buffer_iterator(pVal_buf, 0); - compute::buffer_iterator valN = compute::make_buffer_iterator(pVal_buf, elements); - - // Sort By Key for descending is stable in the reverse - // (greater) order. Sorting in ascending with negated values - // will give the right result - if(!isAscending) compute::transform(key0, keyN, key0, flipFunction(), c_queue); - - // Create a copy of the pKey buffer - cl::Buffer* cKey = bufferAlloc(elements * sizeof(Tk)); - compute::buffer cKey_buf((*cKey)()); - compute::buffer_iterator cKey0 = compute::make_buffer_iterator(cKey_buf, 0); - compute::buffer_iterator cKeyN = compute::make_buffer_iterator(cKey_buf, elements); - compute::copy(key0, keyN, cKey0, c_queue); - - // FIRST SORT - compute::sort_by_key(key0, keyN, seq0, c_queue); - compute::sort_by_key(cKey0, cKeyN, val0, c_queue); - - // Create a copy of the seq buffer after first sort - cl::Buffer* cSeq = bufferAlloc(elements * sizeof(unsigned)); - compute::buffer cSeq_buf((*cSeq)()); - compute::buffer_iterator cSeq0 = compute::make_buffer_iterator(cSeq_buf, 0); - compute::buffer_iterator cSeqN = compute::make_buffer_iterator(cSeq_buf, elements); - compute::copy(seq0, seqN, cSeq0, c_queue); - - // SECOND SORT - // First call will sort key, second sort will sort val - // Needs to be ascending (true) in order to maintain the indices properly - //kernel::sort0_by_key(pKey, pVal); - compute::sort_by_key(seq0, seqN, key0, c_queue); - compute::sort_by_key(cSeq0, cSeqN, val0, c_queue); - - // If descending, flip it back - if(!isAscending) compute::transform(key0, keyN, key0, flipFunction(), c_queue); - - //// No need of doing moddims here because the original Array - //// dimensions have not been changed - ////val.modDims(inDims); - - CL_DEBUG_FINISH(getQueue()); - bufferFree(key); - bufferFree(cSeq); - bufferFree(cKey); - } catch (cl::Error err) { - CL_TO_AF_ERROR(err); - throw; - } - } + void sortByKeyBatched(Param pKey, Param pVal); template - void sort0ByKey(Param pKey, Param pVal) - { - int higherDims = pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3]; - // TODO Make a better heurisitic - if(higherDims > 0) - kernel::sortByKeyBatched(pKey, pVal); - else - kernel::sort0ByKeyIterative(pKey, pVal); - } + void sort0ByKey(Param pKey, Param pVal); } } - -#pragma GCC diagnostic pop diff --git a/src/backend/opencl/sort_by_key/b8.cpp b/src/backend/opencl/kernel/sort_by_key/b8.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/b8.cpp rename to src/backend/opencl/kernel/sort_by_key/b8.cpp index 118d20dc92..ad0d7f48ae 100644 --- a/src/backend/opencl/sort_by_key/b8.cpp +++ b/src/backend/opencl/kernel/sort_by_key/b8.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(char,true) INSTANTIATE1(char,false) } +} diff --git a/src/backend/opencl/sort_by_key/f32.cpp b/src/backend/opencl/kernel/sort_by_key/f32.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/f32.cpp rename to src/backend/opencl/kernel/sort_by_key/f32.cpp index a7baf486f1..a1e9ae5f1f 100644 --- a/src/backend/opencl/sort_by_key/f32.cpp +++ b/src/backend/opencl/kernel/sort_by_key/f32.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(float,true) INSTANTIATE1(float,false) } +} diff --git a/src/backend/opencl/sort_by_key/f64.cpp b/src/backend/opencl/kernel/sort_by_key/f64.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/f64.cpp rename to src/backend/opencl/kernel/sort_by_key/f64.cpp index 6971c90982..7fb7a79bd8 100644 --- a/src/backend/opencl/sort_by_key/f64.cpp +++ b/src/backend/opencl/kernel/sort_by_key/f64.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(double,true) INSTANTIATE1(double,false) } +} diff --git a/src/backend/opencl/sort_by_key/s16.cpp b/src/backend/opencl/kernel/sort_by_key/s16.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/s16.cpp rename to src/backend/opencl/kernel/sort_by_key/s16.cpp index 44e17b5030..491ea0e3a2 100644 --- a/src/backend/opencl/sort_by_key/s16.cpp +++ b/src/backend/opencl/kernel/sort_by_key/s16.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(short,true) INSTANTIATE1(short,false) } +} diff --git a/src/backend/opencl/sort_by_key/s32.cpp b/src/backend/opencl/kernel/sort_by_key/s32.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/s32.cpp rename to src/backend/opencl/kernel/sort_by_key/s32.cpp index 9fed1a53b3..67ba20e7dd 100644 --- a/src/backend/opencl/sort_by_key/s32.cpp +++ b/src/backend/opencl/kernel/sort_by_key/s32.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(int,true) INSTANTIATE1(int,false) } +} diff --git a/src/backend/opencl/sort_by_key/s64.cpp b/src/backend/opencl/kernel/sort_by_key/s64.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/s64.cpp rename to src/backend/opencl/kernel/sort_by_key/s64.cpp index e2ed8d687b..a48f36ee47 100644 --- a/src/backend/opencl/sort_by_key/s64.cpp +++ b/src/backend/opencl/kernel/sort_by_key/s64.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(intl,true) INSTANTIATE1(intl,false) } +} diff --git a/src/backend/opencl/sort_by_key/u16.cpp b/src/backend/opencl/kernel/sort_by_key/u16.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/u16.cpp rename to src/backend/opencl/kernel/sort_by_key/u16.cpp index c53b68fb53..36678d0a42 100644 --- a/src/backend/opencl/sort_by_key/u16.cpp +++ b/src/backend/opencl/kernel/sort_by_key/u16.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(ushort,true) INSTANTIATE1(ushort,false) } +} diff --git a/src/backend/opencl/sort_by_key/u32.cpp b/src/backend/opencl/kernel/sort_by_key/u32.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/u32.cpp rename to src/backend/opencl/kernel/sort_by_key/u32.cpp index c2e3e62163..f1e4b5322f 100644 --- a/src/backend/opencl/sort_by_key/u32.cpp +++ b/src/backend/opencl/kernel/sort_by_key/u32.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(uint,true) INSTANTIATE1(uint,false) } +} diff --git a/src/backend/opencl/sort_by_key/u64.cpp b/src/backend/opencl/kernel/sort_by_key/u64.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/u64.cpp rename to src/backend/opencl/kernel/sort_by_key/u64.cpp index 89649b1ba5..0a6f5b0c4f 100644 --- a/src/backend/opencl/sort_by_key/u64.cpp +++ b/src/backend/opencl/kernel/sort_by_key/u64.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(uintl,true) INSTANTIATE1(uintl,false) } +} diff --git a/src/backend/opencl/sort_by_key/u8.cpp b/src/backend/opencl/kernel/sort_by_key/u8.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/u8.cpp rename to src/backend/opencl/kernel/sort_by_key/u8.cpp index 2dfb4c3a73..45af011a86 100644 --- a/src/backend/opencl/sort_by_key/u8.cpp +++ b/src/backend/opencl/kernel/sort_by_key/u8.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(uchar,true) INSTANTIATE1(uchar,false) } +} diff --git a/src/backend/opencl/kernel/sort_by_key_impl.hpp b/src/backend/opencl/kernel/sort_by_key_impl.hpp new file mode 100644 index 0000000000..dc1aa2735f --- /dev/null +++ b/src/backend/opencl/kernel/sort_by_key_impl.hpp @@ -0,0 +1,373 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace compute = boost::compute; + +using cl::Buffer; +using cl::Program; +using cl::Kernel; +using cl::make_kernel; +using cl::EnqueueArgs; +using cl::NDRange; +using std::string; + +template +inline +boost::compute::function, const std::pair)> +makeCompareFunction() +{ + // Cannot use isAscending in BOOST_COMPUTE_FUNCTION + if(isAscending) { + BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair lhs, std::pair rhs), + { + return lhs.first < rhs.first; + } + ); + return IPCompare; + } else { + BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair lhs, std::pair rhs), + { + return lhs.first > rhs.first; + } + ); + return IPCompare; + } +} + +template +inline boost::compute::function +flipFunction() +{ + BOOST_COMPUTE_FUNCTION(Tk, negateFn, (const Tk x), + { + return -x; + } + ); + + return negateFn; +} + +#define INSTANTIATE_FLIP(TY, XMAX) \ +template<> inline boost::compute::function \ +flipFunction() \ +{ \ + BOOST_COMPUTE_FUNCTION(TY, negateFn, (const TY x), \ + { \ + return XMAX - x; \ + } \ + ); \ + \ + return negateFn; \ +} + +INSTANTIATE_FLIP(unsigned, UINT_MAX) +INSTANTIATE_FLIP(unsigned short, USHRT_MAX) +INSTANTIATE_FLIP(unsigned char, UCHAR_MAX) +INSTANTIATE_FLIP(cl_ulong, ULONG_MAX) + +#undef INSTANTIATE_FLIP + +namespace opencl +{ + namespace kernel + { + static const int copyPairIter = 4; + + template + void makePair(cl::Buffer *out, const cl::Buffer *first, const cl::Buffer *second, const unsigned N) + { + try { + static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; + static std::map sortPairProgs; + static std::map sortPairKernels; + + int device = getActiveDeviceId(); + + std::call_once( compileFlags[device], [device] () { + std::ostringstream options; + options << " -D Tk=" << dtype_traits::getName() + << " -D Tv=" << dtype_traits::getName() + << " -D copyPairIter=" << copyPairIter; + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + options << " -D USE_DOUBLE"; + } + Program prog; + buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str()); + sortPairProgs[device] = new Program(prog); + sortPairKernels[device] = new Kernel(*sortPairProgs[device], "make_pair_kernel"); + }); + + auto makePairOp = make_kernel + (*sortPairKernels[device]); + + NDRange local(256, 1, 1); + NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); + + makePairOp(EnqueueArgs(getQueue(), global, local), *out, *first, *second, N); + + CL_DEBUG_FINISH(getQueue()); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void splitPair(cl::Buffer *first, cl::Buffer *second, const cl::Buffer *in, const unsigned N) + { + try { + static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; + static std::map sortPairProgs; + static std::map sortPairKernels; + + int device = getActiveDeviceId(); + + std::call_once( compileFlags[device], [device] () { + std::ostringstream options; + options << " -D Tk=" << dtype_traits::getName() + << " -D Tv=" << dtype_traits::getName() + << " -D copyPairIter=" << copyPairIter; + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + options << " -D USE_DOUBLE"; + } + Program prog; + buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str()); + sortPairProgs[device] = new Program(prog); + sortPairKernels[device] = new Kernel(*sortPairProgs[device], "split_pair_kernel"); + }); + + auto splitPairOp = make_kernel + (*sortPairKernels[device]); + + NDRange local(256, 1, 1); + NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); + + splitPairOp(EnqueueArgs(getQueue(), global, local), *first, *second, *in, N); + + CL_DEBUG_FINISH(getQueue()); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sort0ByKeyIterative(Param pKey, Param pVal) + { + try { + compute::command_queue c_queue(getQueue()()); + + compute::buffer pKey_buf((*pKey.data)()); + compute::buffer pVal_buf((*pVal.data)()); + + for(int w = 0; w < pKey.info.dims[3]; w++) { + int pKeyW = w * pKey.info.strides[3]; + int pValW = w * pVal.info.strides[3]; + for(int z = 0; z < pKey.info.dims[2]; z++) { + int pKeyWZ = pKeyW + z * pKey.info.strides[2]; + int pValWZ = pValW + z * pVal.info.strides[2]; + for(int y = 0; y < pKey.info.dims[1]; y++) { + + int pKeyOffset = pKeyWZ + y * pKey.info.strides[1]; + int pValOffset = pValWZ + y * pVal.info.strides[1]; + + compute::buffer_iterator< type_t > start= compute::make_buffer_iterator< type_t >(pKey_buf, pKeyOffset); + compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(pKey_buf, pKeyOffset + pKey.info.dims[0]); + compute::buffer_iterator< type_t > vals = compute::make_buffer_iterator< type_t >(pVal_buf, pValOffset); + if(isAscending) { + compute::sort_by_key(start, end, vals, c_queue); + } else { + compute::sort_by_key(start, end, vals, + compute::greater< type_t >(), c_queue); + } + } + } + } + + CL_DEBUG_FINISH(getQueue()); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sortByKeyBatched(Param pKey, Param pVal) + { + typedef type_t Tk; + typedef type_t Tv; + + try { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pKey.info.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + cl::Buffer* key = bufferAlloc(inDims.elements() * sizeof(unsigned)); + Param pSeq; + pSeq.data = key; + pSeq.info.offset = 0; + pSeq.info.dims[0] = inDims[0]; + pSeq.info.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pSeq.info.dims[i] = inDims[i]; + pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1]; + } + kernel::iota(pSeq, seqDims, tileDims); + + int elements = inDims.elements(); + + // Flat - Not required since inplace and both are continuous + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + //kernel::sort0_by_key(pVal, pKey); + compute::command_queue c_queue(getQueue()()); + compute::context c_context(getContext()()); + + // Create buffer iterators for seq + compute::buffer pSeq_buf((*pSeq.data)()); + compute::buffer_iterator seq0 = compute::make_buffer_iterator(pSeq_buf, 0); + compute::buffer_iterator seqN = compute::make_buffer_iterator(pSeq_buf, elements); + // Create buffer iterators for key and val + compute::buffer pKey_buf((*pKey.data)()); + compute::buffer pVal_buf((*pVal.data)()); + compute::buffer_iterator key0 = compute::make_buffer_iterator(pKey_buf, 0); + compute::buffer_iterator keyN = compute::make_buffer_iterator(pKey_buf, elements); + compute::buffer_iterator val0 = compute::make_buffer_iterator(pVal_buf, 0); + compute::buffer_iterator valN = compute::make_buffer_iterator(pVal_buf, elements); + + // Sort By Key for descending is stable in the reverse + // (greater) order. Sorting in ascending with negated values + // will give the right result + if(!isAscending) compute::transform(key0, keyN, key0, flipFunction(), c_queue); + + // Create a copy of the pKey buffer + cl::Buffer* cKey = bufferAlloc(elements * sizeof(Tk)); + compute::buffer cKey_buf((*cKey)()); + compute::buffer_iterator cKey0 = compute::make_buffer_iterator(cKey_buf, 0); + compute::buffer_iterator cKeyN = compute::make_buffer_iterator(cKey_buf, elements); + compute::copy(key0, keyN, cKey0, c_queue); + + // FIRST SORT + compute::sort_by_key(key0, keyN, seq0, c_queue); + compute::sort_by_key(cKey0, cKeyN, val0, c_queue); + + // Create a copy of the seq buffer after first sort + cl::Buffer* cSeq = bufferAlloc(elements * sizeof(unsigned)); + compute::buffer cSeq_buf((*cSeq)()); + compute::buffer_iterator cSeq0 = compute::make_buffer_iterator(cSeq_buf, 0); + compute::buffer_iterator cSeqN = compute::make_buffer_iterator(cSeq_buf, elements); + compute::copy(seq0, seqN, cSeq0, c_queue); + + // SECOND SORT + // First call will sort key, second sort will sort val + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + compute::sort_by_key(seq0, seqN, key0, c_queue); + compute::sort_by_key(cSeq0, cSeqN, val0, c_queue); + + // If descending, flip it back + if(!isAscending) compute::transform(key0, keyN, key0, flipFunction(), c_queue); + + //// No need of doing moddims here because the original Array + //// dimensions have not been changed + ////val.modDims(inDims); + + CL_DEBUG_FINISH(getQueue()); + bufferFree(key); + bufferFree(cSeq); + bufferFree(cKey); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sort0ByKey(Param pKey, Param pVal) + { + int higherDims = pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 0) + kernel::sortByKeyBatched(pKey, pVal); + else + kernel::sort0ByKeyIterative(pKey, pVal); + } + +#define INSTANTIATE(Tk, Tv, dr) \ + template void sort0ByKey(Param okey, Param oval); \ + template void sort0ByKeyIterative(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + +#define INSTANTIATE1(Tk , dr) \ + INSTANTIATE(Tk, float , dr) \ + INSTANTIATE(Tk, double , dr) \ + INSTANTIATE(Tk, cfloat , dr) \ + INSTANTIATE(Tk, cdouble, dr) \ + INSTANTIATE(Tk, int , dr) \ + INSTANTIATE(Tk, uint , dr) \ + INSTANTIATE(Tk, short , dr) \ + INSTANTIATE(Tk, ushort , dr) \ + INSTANTIATE(Tk, char , dr) \ + INSTANTIATE(Tk, uchar , dr) \ + INSTANTIATE(Tk, intl , dr) \ + INSTANTIATE(Tk, uintl , dr) + } +} + +#pragma GCC diagnostic pop diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp index 899e0873fa..b8031c2314 100644 --- a/src/backend/opencl/kernel/sort_helper.hpp +++ b/src/backend/opencl/kernel/sort_helper.hpp @@ -8,75 +8,11 @@ ********************************************************/ #pragma once -#include -#include #include -#include -#include -#include -#include #include #include #include -#include - -template -inline -boost::compute::function, const std::pair)> -makeCompareFunction() -{ - // Cannot use isAscending in BOOST_COMPUTE_FUNCTION - if(isAscending) { - BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair lhs, std::pair rhs), - { - return lhs.first < rhs.first; - } - ); - return IPCompare; - } else { - BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair lhs, std::pair rhs), - { - return lhs.first > rhs.first; - } - ); - return IPCompare; - } -} - -template -inline boost::compute::function -flipFunction() -{ - BOOST_COMPUTE_FUNCTION(Tk, negateFn, (const Tk x), - { - return -x; - } - ); - - return negateFn; -} - -#define INSTANTIATE_FLIP(TY, XMAX) \ -template<> inline boost::compute::function \ -flipFunction() \ -{ \ - BOOST_COMPUTE_FUNCTION(TY, negateFn, (const TY x), \ - { \ - return XMAX - x; \ - } \ - ); \ - \ - return negateFn; \ -} - -INSTANTIATE_FLIP(unsigned, UINT_MAX) -INSTANTIATE_FLIP(unsigned short, USHRT_MAX) -INSTANTIATE_FLIP(unsigned char, UCHAR_MAX) -INSTANTIATE_FLIP(cl_ulong, ULONG_MAX) - -#undef INSTANTIATE_FLIP - namespace opencl { namespace kernel @@ -107,91 +43,6 @@ namespace opencl using type_t = typename conditional::value, cl_ulong, ltype_t >::type; - - static const int copyPairIter = 4; - - template - void makePair(cl::Buffer *out, const cl::Buffer *first, const cl::Buffer *second, const unsigned N) - { - try { - static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; - static std::map sortPairProgs; - static std::map sortPairKernels; - - int device = getActiveDeviceId(); - - std::call_once( compileFlags[device], [device] () { - std::ostringstream options; - options << " -D Tk=" << dtype_traits::getName() - << " -D Tv=" << dtype_traits::getName() - << " -D copyPairIter=" << copyPairIter; - if (std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) { - options << " -D USE_DOUBLE"; - } - Program prog; - buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str()); - sortPairProgs[device] = new Program(prog); - sortPairKernels[device] = new Kernel(*sortPairProgs[device], "make_pair_kernel"); - }); - - auto makePairOp = make_kernel - (*sortPairKernels[device]); - - NDRange local(256, 1, 1); - NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); - - makePairOp(EnqueueArgs(getQueue(), global, local), *out, *first, *second, N); - - CL_DEBUG_FINISH(getQueue()); - } catch (cl::Error err) { - CL_TO_AF_ERROR(err); - throw; - } - } - - template - void splitPair(cl::Buffer *first, cl::Buffer *second, const cl::Buffer *in, const unsigned N) - { - try { - static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; - static std::map sortPairProgs; - static std::map sortPairKernels; - - int device = getActiveDeviceId(); - - std::call_once( compileFlags[device], [device] () { - std::ostringstream options; - options << " -D Tk=" << dtype_traits::getName() - << " -D Tv=" << dtype_traits::getName() - << " -D copyPairIter=" << copyPairIter; - if (std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) { - options << " -D USE_DOUBLE"; - } - Program prog; - buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str()); - sortPairProgs[device] = new Program(prog); - sortPairKernels[device] = new Kernel(*sortPairProgs[device], "split_pair_kernel"); - }); - - auto splitPairOp = make_kernel - (*sortPairKernels[device]); - - NDRange local(256, 1, 1); - NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); - - splitPairOp(EnqueueArgs(getQueue(), global, local), *first, *second, *in, N); - - CL_DEBUG_FINISH(getQueue()); - } catch (cl::Error err) { - CL_TO_AF_ERROR(err); - throw; - } - } } } + diff --git a/src/backend/opencl/sort_by_key/impl.hpp b/src/backend/opencl/sort_by_key.cpp similarity index 65% rename from src/backend/opencl/sort_by_key/impl.hpp rename to src/backend/opencl/sort_by_key.cpp index f68fe91b3c..27c2dc2462 100644 --- a/src/backend/opencl/sort_by_key/impl.hpp +++ b/src/backend/opencl/sort_by_key.cpp @@ -55,26 +55,36 @@ namespace opencl } } -#define INSTANTIATE(Tk, Tv, isAscending) \ - template void \ - sort_by_key(Array &okey, Array &oval, \ - const Array &ikey, \ - const Array &ival, \ - const unsigned dim); \ +#define INSTANTIATE(Tk, Tv) \ + template void sort_by_key(Array &okey, Array &oval, \ + const Array &ikey, const Array &ival, const uint dim); \ + template void sort_by_key(Array &okey, Array &oval, \ + const Array &ikey, const Array &ival, const uint dim); \ +#define INSTANTIATE1(Tk ) \ + INSTANTIATE(Tk, float ) \ + INSTANTIATE(Tk, double ) \ + INSTANTIATE(Tk, cfloat ) \ + INSTANTIATE(Tk, cdouble) \ + INSTANTIATE(Tk, int ) \ + INSTANTIATE(Tk, uint ) \ + INSTANTIATE(Tk, short ) \ + INSTANTIATE(Tk, ushort ) \ + INSTANTIATE(Tk, char ) \ + INSTANTIATE(Tk, uchar ) \ + INSTANTIATE(Tk, intl ) \ + INSTANTIATE(Tk, uintl ) -#define INSTANTIATE1(Tk, isAscending) \ - INSTANTIATE(Tk, float , isAscending) \ - INSTANTIATE(Tk, double , isAscending) \ - INSTANTIATE(Tk, cfloat , isAscending) \ - INSTANTIATE(Tk, cdouble, isAscending) \ - INSTANTIATE(Tk, int , isAscending) \ - INSTANTIATE(Tk, uint , isAscending) \ - INSTANTIATE(Tk, char , isAscending) \ - INSTANTIATE(Tk, uchar , isAscending) \ - INSTANTIATE(Tk, short , isAscending) \ - INSTANTIATE(Tk, ushort , isAscending) \ - INSTANTIATE(Tk, intl , isAscending) \ - INSTANTIATE(Tk, uintl , isAscending) \ + +INSTANTIATE1(float ) +INSTANTIATE1(double) +INSTANTIATE1(int ) +INSTANTIATE1(uint ) +INSTANTIATE1(short ) +INSTANTIATE1(ushort) +INSTANTIATE1(char ) +INSTANTIATE1(uchar ) +INSTANTIATE1(intl ) +INSTANTIATE1(uintl ) } diff --git a/src/backend/opencl/traits.hpp b/src/backend/opencl/traits.hpp index 4e63095421..54ba158e8a 100644 --- a/src/backend/opencl/traits.hpp +++ b/src/backend/opencl/traits.hpp @@ -10,6 +10,7 @@ #pragma once #include +#include #include #include #include From cbefc1f2952ed780b4c4f71b98f6bb9bdf54d646 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 21:22:48 -0400 Subject: [PATCH 25/28] Instantiate sort_by_key kernels in separately in cpu --- src/backend/cpu/CMakeLists.txt | 3 +- src/backend/cpu/kernel/sort_by_key.hpp | 120 +------------- src/backend/cpu/kernel/sort_by_key/b8.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/f32.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/f64.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/s16.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/s32.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/s64.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/u16.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/u32.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/u64.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key/u8.cpp | 19 +++ src/backend/cpu/kernel/sort_by_key_impl.hpp | 166 ++++++++++++++++++++ 13 files changed, 361 insertions(+), 118 deletions(-) create mode 100644 src/backend/cpu/kernel/sort_by_key/b8.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/f32.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/f64.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/s16.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/s32.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/s64.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/u16.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/u32.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/u64.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key/u8.cpp create mode 100644 src/backend/cpu/kernel/sort_by_key_impl.hpp diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index f7857ec6d6..0ae74e8384 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -93,7 +93,8 @@ FILE(GLOB cpu_headers "*.h") FILE(GLOB cpu_sources - "*.cpp") + "*.cpp" + "kernel/sort_by_key/*.cpp") LIST(SORT cpu_headers) LIST(SORT cpu_sources) diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp index 1be4a94d3a..55d5a89337 100644 --- a/src/backend/cpu/kernel/sort_by_key.hpp +++ b/src/backend/cpu/kernel/sort_by_key.hpp @@ -8,15 +8,8 @@ ********************************************************/ #pragma once -#include #include -#include -#include -#include -#include #include -#include -#include namespace cpu { @@ -24,120 +17,13 @@ namespace kernel { template -void sort0ByKeyIterative(Array okey, Array oval) -{ - // Get pointers and initialize original index locations - Tk *okey_ptr = okey.get(); - Tv *oval_ptr = oval.get(); - - std::vector > X; - X.reserve(okey.dims()[0]); - - for(dim_t w = 0; w < okey.dims()[3]; w++) { - dim_t okeyW = w * okey.strides()[3]; - dim_t ovalW = w * oval.strides()[3]; - - for(dim_t z = 0; z < okey.dims()[2]; z++) { - dim_t okeyWZ = okeyW + z * okey.strides()[2]; - dim_t ovalWZ = ovalW + z * oval.strides()[2]; - - for(dim_t y = 0; y < okey.dims()[1]; y++) { - - dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; - dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; - - X.clear(); - std::transform(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims()[0], - oval_ptr + ovalOffset, - std::back_inserter(X), - [](Tk v_, Tv i_) { return std::make_pair(v_, i_); } - ); - - std::stable_sort(X.begin(), X.end(), IPCompare()); - - for(unsigned it = 0; it < X.size(); it++) { - okey_ptr[okeyOffset + it] = X[it].first; - oval_ptr[ovalOffset + it] = X[it].second; - } - } - } - } - - return; -} +void sort0ByKeyIterative(Array okey, Array oval); template -void sortByKeyBatched(Array okey, Array oval) -{ - af::dim4 inDims = okey.dims(); - - af::dim4 tileDims(1); - af::dim4 seqDims = inDims; - tileDims[dim] = inDims[dim]; - seqDims[dim] = 1; - - uint* key = memAlloc(inDims.elements()); - // IOTA - { - af::dim4 dims = inDims; - uint* out = key; - af::dim4 strides(1); - for(int i = 1; i < 4; i++) - strides[i] = strides[i-1] * dims[i-1]; - - for(dim_t w = 0; w < dims[3]; w++) { - dim_t offW = w * strides[3]; - uint okeyW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; - for(dim_t z = 0; z < dims[2]; z++) { - dim_t offWZ = offW + z * strides[2]; - uint okeyZ = okeyW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; - for(dim_t y = 0; y < dims[1]; y++) { - dim_t offWZY = offWZ + y * strides[1]; - uint okeyY = okeyZ + (y % seqDims[1]) * seqDims[0]; - for(dim_t x = 0; x < dims[0]; x++) { - dim_t id = offWZY + x; - out[id] = okeyY + (x % seqDims[0]); - } - } - } - } - } - - // initialize original index locations - Tk *okey_ptr = okey.get(); - Tv *oval_ptr = oval.get(); - - std::vector > X; - X.reserve(okey.elements()); - - for(unsigned i = 0; i < okey.elements(); i++) { - X.push_back(std::make_pair(std::make_pair(okey_ptr[i], oval_ptr[i]), key[i])); - } - - memFree(key); // key is no longer required - - std::stable_sort(X.begin(), X.end(), KIPCompareV()); - - std::stable_sort(X.begin(), X.end(), KIPCompareK()); - - for(unsigned it = 0; it < okey.elements(); it++) { - okey_ptr[it] = X[it].first.first; - oval_ptr[it] = X[it].first.second; - } - - return; -} +void sortByKeyBatched(Array okey, Array oval); template -void sort0ByKey(Array okey, Array oval) -{ - int higherDims = okey.dims()[1] * okey.dims()[2] * okey.dims()[3]; - // TODO Make a better heurisitic - if(higherDims > 0) - kernel::sortByKeyBatched(okey, oval); - else - kernel::sort0ByKeyIterative(okey, oval); -} +void sort0ByKey(Array okey, Array oval); } } diff --git a/src/backend/cpu/kernel/sort_by_key/b8.cpp b/src/backend/cpu/kernel/sort_by_key/b8.cpp new file mode 100644 index 0000000000..855e7a93ca --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/b8.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(char,true) + INSTANTIATE1(char,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/f32.cpp b/src/backend/cpu/kernel/sort_by_key/f32.cpp new file mode 100644 index 0000000000..11d8139957 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/f32.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(float,true) + INSTANTIATE1(float,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/f64.cpp b/src/backend/cpu/kernel/sort_by_key/f64.cpp new file mode 100644 index 0000000000..21746d773a --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/f64.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(double,true) + INSTANTIATE1(double,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/s16.cpp b/src/backend/cpu/kernel/sort_by_key/s16.cpp new file mode 100644 index 0000000000..50b718d04c --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/s16.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(short,true) + INSTANTIATE1(short,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/s32.cpp b/src/backend/cpu/kernel/sort_by_key/s32.cpp new file mode 100644 index 0000000000..c50efcdc26 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/s32.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(int,true) + INSTANTIATE1(int,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/s64.cpp b/src/backend/cpu/kernel/sort_by_key/s64.cpp new file mode 100644 index 0000000000..82946f820e --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/s64.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(intl,true) + INSTANTIATE1(intl,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/u16.cpp b/src/backend/cpu/kernel/sort_by_key/u16.cpp new file mode 100644 index 0000000000..feedd1d56e --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/u16.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(ushort,true) + INSTANTIATE1(ushort,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/u32.cpp b/src/backend/cpu/kernel/sort_by_key/u32.cpp new file mode 100644 index 0000000000..cd514af19a --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/u32.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(uint,true) + INSTANTIATE1(uint,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/u64.cpp b/src/backend/cpu/kernel/sort_by_key/u64.cpp new file mode 100644 index 0000000000..ec955b3de7 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/u64.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(uintl,true) + INSTANTIATE1(uintl,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/u8.cpp b/src/backend/cpu/kernel/sort_by_key/u8.cpp new file mode 100644 index 0000000000..fd58cbfaa1 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/u8.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(uchar,true) + INSTANTIATE1(uchar,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key_impl.hpp b/src/backend/cpu/kernel/sort_by_key_impl.hpp new file mode 100644 index 0000000000..fcd415c29c --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key_impl.hpp @@ -0,0 +1,166 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void sort0ByKeyIterative(Array okey, Array oval) +{ + // Get pointers and initialize original index locations + Tk *okey_ptr = okey.get(); + Tv *oval_ptr = oval.get(); + + std::vector > X; + X.reserve(okey.dims()[0]); + + for(dim_t w = 0; w < okey.dims()[3]; w++) { + dim_t okeyW = w * okey.strides()[3]; + dim_t ovalW = w * oval.strides()[3]; + + for(dim_t z = 0; z < okey.dims()[2]; z++) { + dim_t okeyWZ = okeyW + z * okey.strides()[2]; + dim_t ovalWZ = ovalW + z * oval.strides()[2]; + + for(dim_t y = 0; y < okey.dims()[1]; y++) { + + dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; + dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; + + X.clear(); + std::transform(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims()[0], + oval_ptr + ovalOffset, + std::back_inserter(X), + [](Tk v_, Tv i_) { return std::make_pair(v_, i_); } + ); + + std::stable_sort(X.begin(), X.end(), IPCompare()); + + for(unsigned it = 0; it < X.size(); it++) { + okey_ptr[okeyOffset + it] = X[it].first; + oval_ptr[ovalOffset + it] = X[it].second; + } + } + } + } + + return; +} + +template +void sortByKeyBatched(Array okey, Array oval) +{ + af::dim4 inDims = okey.dims(); + + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + uint* key = memAlloc(inDims.elements()); + // IOTA + { + af::dim4 dims = inDims; + uint* out = key; + af::dim4 strides(1); + for(int i = 1; i < 4; i++) + strides[i] = strides[i-1] * dims[i-1]; + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + uint okeyW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + uint okeyZ = okeyW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + uint okeyY = okeyZ + (y % seqDims[1]) * seqDims[0]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + out[id] = okeyY + (x % seqDims[0]); + } + } + } + } + } + + // initialize original index locations + Tk *okey_ptr = okey.get(); + Tv *oval_ptr = oval.get(); + + std::vector > X; + X.reserve(okey.elements()); + + for(unsigned i = 0; i < okey.elements(); i++) { + X.push_back(std::make_pair(std::make_pair(okey_ptr[i], oval_ptr[i]), key[i])); + } + + memFree(key); // key is no longer required + + std::stable_sort(X.begin(), X.end(), KIPCompareV()); + + std::stable_sort(X.begin(), X.end(), KIPCompareK()); + + for(unsigned it = 0; it < okey.elements(); it++) { + okey_ptr[it] = X[it].first.first; + oval_ptr[it] = X[it].first.second; + } + + return; +} + +template +void sort0ByKey(Array okey, Array oval) +{ + int higherDims = okey.dims()[1] * okey.dims()[2] * okey.dims()[3]; + // TODO Make a better heurisitic + if(higherDims > 4) + kernel::sortByKeyBatched(okey, oval); + else + kernel::sort0ByKeyIterative(okey, oval); +} + +#define INSTANTIATE(Tk, Tv, dr) \ + template void sort0ByKey(Array okey, Array oval); \ + template void sort0ByKeyIterative(Array okey, Array oval); \ + template void sortByKeyBatched(Array okey, Array oval); \ + template void sortByKeyBatched(Array okey, Array oval); \ + template void sortByKeyBatched(Array okey, Array oval); \ + template void sortByKeyBatched(Array okey, Array oval); \ + +#define INSTANTIATE1(Tk , dr) \ + INSTANTIATE(Tk, float , dr) \ + INSTANTIATE(Tk, double , dr) \ + INSTANTIATE(Tk, cfloat , dr) \ + INSTANTIATE(Tk, cdouble, dr) \ + INSTANTIATE(Tk, int , dr) \ + INSTANTIATE(Tk, uint , dr) \ + INSTANTIATE(Tk, short , dr) \ + INSTANTIATE(Tk, ushort , dr) \ + INSTANTIATE(Tk, char , dr) \ + INSTANTIATE(Tk, uchar , dr) \ + INSTANTIATE(Tk, intl , dr) \ + INSTANTIATE(Tk, uintl , dr) +} +} + From 45574db72c7bdc3470ec9d3f4f0fd212dd76dac1 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Thu, 21 Apr 2016 21:24:08 -0400 Subject: [PATCH 26/28] Sort by key cuda - create pair memory using memalloc, reasonable heuristic --- src/backend/cuda/kernel/sort_by_key_impl.hpp | 30 ++++++++++--------- .../opencl/kernel/sort_by_key_impl.hpp | 2 +- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/backend/cuda/kernel/sort_by_key_impl.hpp b/src/backend/cuda/kernel/sort_by_key_impl.hpp index 66a6087401..bcc2fefeb4 100644 --- a/src/backend/cuda/kernel/sort_by_key_impl.hpp +++ b/src/backend/cuda/kernel/sort_by_key_impl.hpp @@ -117,6 +117,8 @@ namespace cuda for(int i = 0; i < 4; i++) inDims[i] = pKey.dims[i]; + const dim_t elements = inDims.elements(); + // Sort dimension // tileDims * seqDims = inDims af::dim4 tileDims(1); @@ -126,34 +128,34 @@ namespace cuda // Create/call iota // Array key = iota(seqDims, tileDims); - af::dim4 keydims = inDims; - uint* key = memAlloc(keydims.elements()); + uint* key = memAlloc(elements); Param pSeq; pSeq.ptr = key; pSeq.strides[0] = 1; - pSeq.dims[0] = keydims[0]; + pSeq.dims[0] = inDims[0]; for(int i = 1; i < 4; i++) { - pSeq.dims[i] = keydims[i]; + pSeq.dims[i] = inDims[i]; pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1]; } cuda::kernel::iota(pSeq, seqDims, tileDims); // Make pkey, pVal into a pair - thrust::device_vector > X(inDims.elements()); - IndexPair *Xptr = thrust::raw_pointer_cast(X.data()); + IndexPair *Xptr = (IndexPair*)memAlloc(sizeof(IndexPair) * elements); const int threads = 256; - int blocks = divup(inDims.elements(), threads * copyPairIter); + int blocks = divup(elements, threads * copyPairIter); CUDA_LAUNCH((makeIndexPair), blocks, threads, - Xptr, pKey.ptr, pVal.ptr, inDims.elements()); + Xptr, pKey.ptr, pVal.ptr, elements); POST_LAUNCH_CHECK(); + thrust::device_ptr > X = thrust::device_pointer_cast(Xptr); + // Sort indices // Need to convert pSeq to thrust::device_ptr, otherwise thrust // throws weird errors for all *64 data types (double, intl, uintl etc) thrust::device_ptr dSeq = thrust::device_pointer_cast(pSeq.ptr); THRUST_SELECT(thrust::stable_sort_by_key, - X.begin(), X.end(), + X, X + elements, dSeq, IPCompare()); POST_LAUNCH_CHECK(); @@ -161,13 +163,12 @@ namespace cuda // Needs to be ascending (true) in order to maintain the indices properly //kernel::sort0_by_key(pKey, pVal); THRUST_SELECT(thrust::stable_sort_by_key, - dSeq, - dSeq + inDims.elements(), - X.begin()); + dSeq, dSeq + elements, + X); POST_LAUNCH_CHECK(); CUDA_LAUNCH((splitIndexPair), blocks, threads, - pKey.ptr, pVal.ptr, Xptr, inDims.elements()); + pKey.ptr, pVal.ptr, Xptr, elements); POST_LAUNCH_CHECK(); // No need of doing moddims here because the original Array @@ -175,6 +176,7 @@ namespace cuda //val.modDims(inDims); memFree(key); + memFree((char*)Xptr); } template @@ -182,7 +184,7 @@ namespace cuda { int higherDims = okey.dims[1] * okey.dims[2] * okey.dims[3]; // TODO Make a better heurisitic - if(higherDims > 5) + if(higherDims > 4) kernel::sortByKeyBatched(okey, oval); else kernel::sort0ByKeyIterative(okey, oval); diff --git a/src/backend/opencl/kernel/sort_by_key_impl.hpp b/src/backend/opencl/kernel/sort_by_key_impl.hpp index dc1aa2735f..243034541f 100644 --- a/src/backend/opencl/kernel/sort_by_key_impl.hpp +++ b/src/backend/opencl/kernel/sort_by_key_impl.hpp @@ -340,7 +340,7 @@ namespace opencl { int higherDims = pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3]; // TODO Make a better heurisitic - if(higherDims > 0) + if(higherDims > 5) kernel::sortByKeyBatched(pKey, pVal); else kernel::sort0ByKeyIterative(pKey, pVal); From 840ea28d28fa0f63e860989008490f491e5e1fc3 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 22 Apr 2016 14:20:14 -0400 Subject: [PATCH 27/28] Remove sort 0 dim restriction note from documentation --- include/af/algorithm.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/af/algorithm.h b/include/af/algorithm.h index 792d6e2f44..a25120ffe3 100644 --- a/include/af/algorithm.h +++ b/include/af/algorithm.h @@ -357,8 +357,6 @@ namespace af \return the sorted output \ingroup sort_func_sort - - \note \p dim is currently restricted to 0. */ AFAPI array sort(const array &in, const unsigned dim = 0, const bool isAscending = true); @@ -372,8 +370,6 @@ namespace af \param[in] isAscending specifies the sorting order \ingroup sort_func_sort_index - - \note \p dim is currently restricted to 0. */ AFAPI void sort(array &out, array &indices, const array &in, const unsigned dim = 0, const bool isAscending = true); @@ -388,8 +384,6 @@ namespace af \param[in] isAscending specifies the sorting order \ingroup sort_func_sort_keys - - \note \p dim is currently restricted to 0. */ AFAPI void sort(array &out_keys, array &out_values, const array &keys, const array &values, const unsigned dim = 0, const bool isAscending = true); @@ -794,8 +788,6 @@ extern "C" { \return \ref AF_SUCCESS if the execution completes properly \ingroup sort_func_sort - - \note \p dim is currently restricted to 0. */ AFAPI af_err af_sort(af_array *out, const af_array in, const unsigned dim, const bool isAscending); @@ -810,8 +802,6 @@ extern "C" { \return \ref AF_SUCCESS if the execution completes properly \ingroup sort_func_sort_index - - \note \p dim is currently restricted to 0. */ AFAPI af_err af_sort_index(af_array *out, af_array *indices, const af_array in, const unsigned dim, const bool isAscending); @@ -827,8 +817,6 @@ extern "C" { \return \ref AF_SUCCESS if the execution completes properly \ingroup sort_func_sort_keys - - \note \p dim is currently restricted to 0. */ AFAPI af_err af_sort_by_key(af_array *out_keys, af_array *out_values, const af_array keys, const af_array values, From f6eae071675f8fe4f2c5672b104a3c1394de66b7 Mon Sep 17 00:00:00 2001 From: Shehzan Mohammed Date: Fri, 22 Apr 2016 15:58:11 -0400 Subject: [PATCH 28/28] Add multi dimension support to median, tests --- src/api/c/median.cpp | 13 ++-- test/median.cpp | 156 ++++++++++++++++++++++++++++--------------- 2 files changed, 108 insertions(+), 61 deletions(-) diff --git a/src/api/c/median.cpp b/src/api/c/median.cpp index 50bcad25ee..b5c033f461 100644 --- a/src/api/c/median.cpp +++ b/src/api/c/median.cpp @@ -68,8 +68,8 @@ static af_array median(const af_array& in, const dim_t dim) const Array input = getArray(in); Array sortedIn = sort(input, dim); - int nElems = input.dims()[0]; - double mid = (nElems + 1) / 2; + int dimLength = input.dims()[dim]; + double mid = (dimLength + 1) / 2; af_array left = 0; af_seq slices[4] = {af_span, af_span, af_span, af_span}; @@ -78,7 +78,7 @@ static af_array median(const af_array& in, const dim_t dim) af_array sortedIn_handle = getHandle(sortedIn); AF_CHECK(af_index(&left, sortedIn_handle, input.ndims(), slices)); - if (nElems % 2 == 1) { + if (dimLength % 2 == 1) { // mid-1 is our guy if (input.isFloating()) return left; @@ -90,7 +90,7 @@ static af_array median(const af_array& in, const dim_t dim) return out; } else { // ((mid-1)+mid)/2 is our guy - dim4 dims = input.dims(); + dim4 dims = input.dims(); af_array right = 0; slices[dim] = af_make_seq(mid, mid, 1.0); @@ -100,7 +100,8 @@ static af_array median(const af_array& in, const dim_t dim) af_array carr = 0; af_array result = 0; - dim4 cdims = dim4(1, dims[1], dims[2], dims[3]); + dim4 cdims = dims; + cdims[dim] = 1; AF_CHECK(af_constant(&carr, 0.5, cdims.ndims(), cdims.get(), input.isDouble() ? f64 : f32)); if (!input.isFloating()) { @@ -148,7 +149,7 @@ af_err af_median_all(double *realVal, double *imagVal, const af_array in) af_err af_median(af_array* out, const af_array in, const dim_t dim) { try { - ARG_ASSERT(2, (dim>=0 && dim<=0)); + ARG_ASSERT(2, (dim >= 0 && dim <= 4)); af_array output = 0; ArrayInfo info = getInfo(in); diff --git a/test/median.cpp b/test/median.cpp index e0b21ba281..5b26a44a97 100644 --- a/test/median.cpp +++ b/test/median.cpp @@ -37,96 +37,142 @@ af::array generateArray(int nx, int ny, int nz, int nw) return a; } -template -void median0(int nx, int ny=1, int nz=1, int nw=1) +template +void median_flat(int nx, int ny=1, int nz=1, int nw=1) { if (noDoubleTests()) return; array a = generateArray(nx, ny, nz, nw); - array sa = sort(a); - Ti *h_sa = sa.host(); + // Verification + array sa = sort(flat(a)); + dim_t mid = (sa.dims(0) + 1) / 2; - To *h_b = NULL; - To val = 0; + To verify; - if (flat) { - val = median(a); - h_b = &val; + To *h_sa = sa.as((af_dtype)af::dtype_traits::af_type).host(); + if(sa.dims(0) % 2 == 1) { + verify = h_sa[mid - 1]; } else { - array b = median(a); - h_b = b.host(); + verify = (h_sa[mid - 1] + h_sa[mid]) / (To)2; } - for (int w = 0; w < nw; w++) { - for (int z = 0; z < nz; z++) { - for (int y = 0; y < ny; y++) { + // Test Part + To val = median(a); - int off = (y + ny * (z + nz * w)); - int id = nx / 2; + ASSERT_EQ(verify, val); - if (nx & 2) { - ASSERT_EQ(h_sa[id + off * nx], h_b[off]); - } else { - To left = h_sa[id + off * nx - 1]; - To right = h_sa[id + off * nx]; + delete[] h_sa; +} + +template +void median_test(int nx, int ny=1, int nz=1, int nw=1) +{ + if (noDoubleTests()) return; + + array a = generateArray(nx, ny, nz, nw); + + // If selected dim is higher than input ndims, then return + if(dim >= a.dims().ndims()) + return; + + array verify; + + // Verification + array sa = sort(a, dim); + + double mid = (a.dims(dim) + 1) / 2; + af::seq mSeq[4] = {span, span, span, span}; + mSeq[dim] = af::seq(mid, mid, 1.0); - ASSERT_NEAR((left + right) / 2, h_b[off], 1e-5); - } - } - } + if(sa.dims(dim) % 2 == 1) { + mSeq[dim] = mSeq[dim] - 1.0; + verify = sa(mSeq[0], mSeq[1], mSeq[2], mSeq[3]); + } else { + dim_t sdim[4] = {0}; + sdim[dim] = 1; + sa = sa.as((af_dtype)af::dtype_traits::af_type); + array sas = shift(sa, sdim[0], sdim[1], sdim[2], sdim[3]); + verify = ((sa + sas) / 2)(mSeq[0], mSeq[1], mSeq[2], mSeq[3]); } - delete[] h_sa; - if (!flat) delete[] h_b; + // Test Part + array out = median(a, dim); + + ASSERT_EQ(out.dims() == verify.dims(), true); + ASSERT_NEAR(0, sum(af::abs(out - verify)), 1e-5); } -#define MEDIAN0(To, Ti) \ - TEST(median0, Ti##_1D_even) \ +#define MEDIAN_FLAT(To, Ti) \ + TEST(MedianFlat, Ti##_flat_even) \ + { \ + median_flat(1000); \ + } \ + TEST(MedianFlat, Ti##_flat_odd) \ { \ - median0(1000); \ + median_flat(783); \ } \ - TEST(median0, Ti##_2D_even) \ + TEST(MedianFlat, Ti##_flat_multi_even) \ { \ - median0(1000, 100); \ + median_flat(24, 11, 3); \ } \ - TEST(median0, Ti##_3D_even) \ + TEST(MedianFlat, Ti##_flat_multi_odd) \ { \ - median0(1000, 25, 4); \ + median_flat(15, 21, 7); \ } \ - TEST(median0, Ti##_4D_even) \ + +MEDIAN_FLAT(float, float) +MEDIAN_FLAT(float, int) +MEDIAN_FLAT(float, uint) +MEDIAN_FLAT(float, uchar) +MEDIAN_FLAT(float, short) +MEDIAN_FLAT(float, ushort) +MEDIAN_FLAT(double, double) + +#define MEDIAN_TEST(To, Ti, dim) \ + TEST(Median, Ti##_1D_##dim##_even) \ { \ - median0(1000, 25, 2, 2); \ + median_test(1000); \ } \ - TEST(median0, Ti##_flat_even) \ + TEST(Median, Ti##_2D_##dim##_even) \ { \ - median0(1000); \ + median_test(1000, 25); \ } \ - TEST(median0, Ti##_1D_odd) \ + TEST(Median, Ti##_3D_##dim##_even) \ { \ - median0(783); \ + median_test(100, 25, 4); \ } \ - TEST(median0, Ti##_2D_odd) \ + TEST(Median, Ti##_4D_##dim##_even) \ { \ - median0(783, 100); \ + median_test(100, 25, 2, 2);\ } \ - TEST(median0, Ti##_3D_odd) \ + TEST(Median, Ti##_1D_##dim##_odd) \ { \ - median0(783, 25, 4); \ + median_test(783); \ } \ - TEST(median0, Ti##_4D_odd) \ + TEST(Median, Ti##_2D_##dim##_odd) \ { \ - median0(783, 25, 2, 2); \ + median_test(783, 25); \ } \ - TEST(median0, Ti##_flat_odd) \ + TEST(Median, Ti##_3D_##dim##_odd) \ { \ - median0(783); \ + median_test(123, 25, 3); \ } \ + TEST(Median, Ti##_4D_##dim##_odd) \ + { \ + median_test(123, 25, 3, 3);\ + } \ + +#define MEDIAN(To, Ti) \ + MEDIAN_TEST(To, Ti, 0) \ + MEDIAN_TEST(To, Ti, 1) \ + MEDIAN_TEST(To, Ti, 2) \ + MEDIAN_TEST(To, Ti, 3) \ -MEDIAN0(float, float) -MEDIAN0(float, int) -MEDIAN0(float, uint) -MEDIAN0(float, uchar) -MEDIAN0(float, short) -MEDIAN0(float, ushort) -MEDIAN0(double, double) +MEDIAN(float, float) +MEDIAN(float, int) +MEDIAN(float, uint) +MEDIAN(float, uchar) +MEDIAN(float, short) +MEDIAN(float, ushort) +MEDIAN(double, double)