diff --git a/include/af/algorithm.h b/include/af/algorithm.h index 792d6e2f44..a25120ffe3 100644 --- a/include/af/algorithm.h +++ b/include/af/algorithm.h @@ -357,8 +357,6 @@ namespace af \return the sorted output \ingroup sort_func_sort - - \note \p dim is currently restricted to 0. */ AFAPI array sort(const array &in, const unsigned dim = 0, const bool isAscending = true); @@ -372,8 +370,6 @@ namespace af \param[in] isAscending specifies the sorting order \ingroup sort_func_sort_index - - \note \p dim is currently restricted to 0. */ AFAPI void sort(array &out, array &indices, const array &in, const unsigned dim = 0, const bool isAscending = true); @@ -388,8 +384,6 @@ namespace af \param[in] isAscending specifies the sorting order \ingroup sort_func_sort_keys - - \note \p dim is currently restricted to 0. */ AFAPI void sort(array &out_keys, array &out_values, const array &keys, const array &values, const unsigned dim = 0, const bool isAscending = true); @@ -794,8 +788,6 @@ extern "C" { \return \ref AF_SUCCESS if the execution completes properly \ingroup sort_func_sort - - \note \p dim is currently restricted to 0. */ AFAPI af_err af_sort(af_array *out, const af_array in, const unsigned dim, const bool isAscending); @@ -810,8 +802,6 @@ extern "C" { \return \ref AF_SUCCESS if the execution completes properly \ingroup sort_func_sort_index - - \note \p dim is currently restricted to 0. */ AFAPI af_err af_sort_index(af_array *out, af_array *indices, const af_array in, const unsigned dim, const bool isAscending); @@ -827,8 +817,6 @@ extern "C" { \return \ref AF_SUCCESS if the execution completes properly \ingroup sort_func_sort_keys - - \note \p dim is currently restricted to 0. */ AFAPI af_err af_sort_by_key(af_array *out_keys, af_array *out_values, const af_array keys, const af_array values, diff --git a/src/api/c/median.cpp b/src/api/c/median.cpp index 50bcad25ee..b5c033f461 100644 --- a/src/api/c/median.cpp +++ b/src/api/c/median.cpp @@ -68,8 +68,8 @@ static af_array median(const af_array& in, const dim_t dim) const Array input = getArray(in); Array sortedIn = sort(input, dim); - int nElems = input.dims()[0]; - double mid = (nElems + 1) / 2; + int dimLength = input.dims()[dim]; + double mid = (dimLength + 1) / 2; af_array left = 0; af_seq slices[4] = {af_span, af_span, af_span, af_span}; @@ -78,7 +78,7 @@ static af_array median(const af_array& in, const dim_t dim) af_array sortedIn_handle = getHandle(sortedIn); AF_CHECK(af_index(&left, sortedIn_handle, input.ndims(), slices)); - if (nElems % 2 == 1) { + if (dimLength % 2 == 1) { // mid-1 is our guy if (input.isFloating()) return left; @@ -90,7 +90,7 @@ static af_array median(const af_array& in, const dim_t dim) return out; } else { // ((mid-1)+mid)/2 is our guy - dim4 dims = input.dims(); + dim4 dims = input.dims(); af_array right = 0; slices[dim] = af_make_seq(mid, mid, 1.0); @@ -100,7 +100,8 @@ static af_array median(const af_array& in, const dim_t dim) af_array carr = 0; af_array result = 0; - dim4 cdims = dim4(1, dims[1], dims[2], dims[3]); + dim4 cdims = dims; + cdims[dim] = 1; AF_CHECK(af_constant(&carr, 0.5, cdims.ndims(), cdims.get(), input.isDouble() ? f64 : f32)); if (!input.isFloating()) { @@ -148,7 +149,7 @@ af_err af_median_all(double *realVal, double *imagVal, const af_array in) af_err af_median(af_array* out, const af_array in, const dim_t dim) { try { - ARG_ASSERT(2, (dim>=0 && dim<=0)); + ARG_ASSERT(2, (dim >= 0 && dim <= 4)); af_array output = 0; ArrayInfo info = getInfo(in); diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp index b8f1fafa6c..1d326c0846 100644 --- a/src/api/c/moddims.cpp +++ b/src/api/c/moddims.cpp @@ -31,7 +31,6 @@ Array modDims(const Array& in, const af::dim4 &newDims) Out = copyArray(in); } - Out.modDims(newDims); Out.setDataDims(newDims); return Out; diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp index 1de63c5052..dd58175936 100644 --- a/src/api/c/sort.cpp +++ b/src/api/c/sort.cpp @@ -42,8 +42,6 @@ af_err af_sort(af_array *out, const af_array in, const unsigned dim, const bool af_dtype type = info.getType(); DIM_ASSERT(1, info.elements() > 0); - // Only Dim 0 supported - ARG_ASSERT(2, dim == 0); af_array val; @@ -93,8 +91,6 @@ af_err af_sort_index(af_array *out, af_array *indices, const af_array in, const af_dtype type = info.getType(); DIM_ASSERT(2, info.elements() > 0); - // Only Dim 0 supported - ARG_ASSERT(3, dim == 0); af_array val; af_array idx; @@ -150,6 +146,8 @@ void sort_by_key_tmplt(af_array *okey, af_array *oval, const af_array ikey, cons switch(vtype) { case f32: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; case f64: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; + case c32: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; + case c64: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; case s32: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; case u32: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; case s16: sort_by_key(okey, oval, ikey, ival, dim, isAscending); break; @@ -169,20 +167,20 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values, const unsigned dim, const bool isAscending) { try { - ArrayInfo info = getInfo(keys); - af_dtype type = info.getType(); + ArrayInfo kinfo = getInfo(keys); + af_dtype ktype = kinfo.getType(); ArrayInfo vinfo = getInfo(values); - DIM_ASSERT(3, info.elements() > 0); - DIM_ASSERT(4, info.dims() == vinfo.dims()); - // Only Dim 0 supported - ARG_ASSERT(5, dim == 0); + DIM_ASSERT(3, kinfo.elements() > 0); + DIM_ASSERT(4, kinfo.dims() == vinfo.dims()); + + TYPE_ASSERT(kinfo.isReal()); af_array oKey; af_array oVal; - switch(type) { + switch(ktype) { case f32: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; case f64: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; case s32: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; @@ -193,7 +191,7 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values, case u64: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; case u8: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; case b8: sort_by_key_tmplt(&oKey, &oVal, keys, values, dim, isAscending); break; - default: TYPE_ERROR(1, type); + default: TYPE_ERROR(1, ktype); } std::swap(*out_keys , oKey); std::swap(*out_values , oVal); diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp index 2809a2b80d..cf970d18c7 100644 --- a/src/backend/cpu/Array.hpp +++ b/src/backend/cpu/Array.hpp @@ -181,6 +181,7 @@ namespace cpu void setDataDims(const dim4 &new_dims) { + modDims(new_dims); data_dims = new_dims; } diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt index f7857ec6d6..0ae74e8384 100644 --- a/src/backend/cpu/CMakeLists.txt +++ b/src/backend/cpu/CMakeLists.txt @@ -93,7 +93,8 @@ FILE(GLOB cpu_headers "*.h") FILE(GLOB cpu_sources - "*.cpp") + "*.cpp" + "kernel/sort_by_key/*.cpp") LIST(SORT cpu_headers) LIST(SORT cpu_sources) diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp index 292c6383dc..e0ae62c932 100644 --- a/src/backend/cpu/kernel/sort.hpp +++ b/src/backend/cpu/kernel/sort.hpp @@ -23,7 +23,7 @@ namespace kernel // Based off of http://stackoverflow.com/a/12399290 template -void sort0(Array val) +void sort0Iterative(Array val) { // initialize original index locations T *val_ptr = val.get(); diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp index f9d391dc46..55d5a89337 100644 --- a/src/backend/cpu/kernel/sort_by_key.hpp +++ b/src/backend/cpu/kernel/sort_by_key.hpp @@ -8,14 +8,8 @@ ********************************************************/ #pragma once -#include #include -#include -#include -#include -#include #include -#include namespace cpu { @@ -23,64 +17,13 @@ namespace kernel { template -void sort0_by_key(Array okey, Array oval, Array oidx, - const Array ikey, const Array ival) -{ - function op = std::greater(); - if(isAscending) { op = std::less(); } - - // Get pointers and initialize original index locations - uint *oidx_ptr = oidx.get(); - Tk *okey_ptr = okey.get(); - Tv *oval_ptr = oval.get(); - const Tk *ikey_ptr = ikey.get(); - const Tv *ival_ptr = ival.get(); - - std::vector seq_vec(oidx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); - - const Tk *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < ikey.dims()[3]; w++) { - dim_t okeyW = w * okey.strides()[3]; - dim_t ovalW = w * oval.strides()[3]; - dim_t oidxW = w * oidx.strides()[3]; - dim_t ikeyW = w * ikey.strides()[3]; - dim_t ivalW = w * ival.strides()[3]; - - for(dim_t z = 0; z < ikey.dims()[2]; z++) { - dim_t okeyWZ = okeyW + z * okey.strides()[2]; - dim_t ovalWZ = ovalW + z * oval.strides()[2]; - dim_t oidxWZ = oidxW + z * oidx.strides()[2]; - dim_t ikeyWZ = ikeyW + z * ikey.strides()[2]; - dim_t ivalWZ = ivalW + z * ival.strides()[2]; +void sort0ByKeyIterative(Array okey, Array oval); - for(dim_t y = 0; y < ikey.dims()[1]; y++) { +template +void sortByKeyBatched(Array okey, Array oval); - dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; - dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; - dim_t oidxOffset = oidxWZ + y * oidx.strides()[1]; - dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1]; - dim_t ivalOffset = ivalWZ + y * ival.strides()[1]; - - uint *ptr = oidx_ptr + oidxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); - - comp_ptr = ikey_ptr + ikeyOffset; - std::stable_sort(ptr, ptr + ikey.dims()[0], comparator); - - for (dim_t i = 0; i < oval.dims()[0]; ++i){ - uint sortIdx = oidx_ptr[oidxOffset + i]; - okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx]; - oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx]; - } - } - } - } - - return; -} +template +void sort0ByKey(Array okey, Array oval); } } diff --git a/src/backend/cpu/kernel/sort_by_key/b8.cpp b/src/backend/cpu/kernel/sort_by_key/b8.cpp new file mode 100644 index 0000000000..855e7a93ca --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/b8.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(char,true) + INSTANTIATE1(char,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/f32.cpp b/src/backend/cpu/kernel/sort_by_key/f32.cpp new file mode 100644 index 0000000000..11d8139957 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/f32.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(float,true) + INSTANTIATE1(float,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/f64.cpp b/src/backend/cpu/kernel/sort_by_key/f64.cpp new file mode 100644 index 0000000000..21746d773a --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/f64.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(double,true) + INSTANTIATE1(double,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/s16.cpp b/src/backend/cpu/kernel/sort_by_key/s16.cpp new file mode 100644 index 0000000000..50b718d04c --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/s16.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(short,true) + INSTANTIATE1(short,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/s32.cpp b/src/backend/cpu/kernel/sort_by_key/s32.cpp new file mode 100644 index 0000000000..c50efcdc26 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/s32.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(int,true) + INSTANTIATE1(int,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/s64.cpp b/src/backend/cpu/kernel/sort_by_key/s64.cpp new file mode 100644 index 0000000000..82946f820e --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/s64.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(intl,true) + INSTANTIATE1(intl,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/u16.cpp b/src/backend/cpu/kernel/sort_by_key/u16.cpp new file mode 100644 index 0000000000..feedd1d56e --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/u16.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(ushort,true) + INSTANTIATE1(ushort,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/u32.cpp b/src/backend/cpu/kernel/sort_by_key/u32.cpp new file mode 100644 index 0000000000..cd514af19a --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/u32.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(uint,true) + INSTANTIATE1(uint,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/u64.cpp b/src/backend/cpu/kernel/sort_by_key/u64.cpp new file mode 100644 index 0000000000..ec955b3de7 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/u64.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(uintl,true) + INSTANTIATE1(uintl,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key/u8.cpp b/src/backend/cpu/kernel/sort_by_key/u8.cpp new file mode 100644 index 0000000000..fd58cbfaa1 --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key/u8.cpp @@ -0,0 +1,19 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ +namespace kernel +{ + INSTANTIATE1(uchar,true) + INSTANTIATE1(uchar,false) +} +} diff --git a/src/backend/cpu/kernel/sort_by_key_impl.hpp b/src/backend/cpu/kernel/sort_by_key_impl.hpp new file mode 100644 index 0000000000..fcd415c29c --- /dev/null +++ b/src/backend/cpu/kernel/sort_by_key_impl.hpp @@ -0,0 +1,166 @@ +/******************************************************* + * Copyright (c) 2015, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cpu +{ +namespace kernel +{ + +template +void sort0ByKeyIterative(Array okey, Array oval) +{ + // Get pointers and initialize original index locations + Tk *okey_ptr = okey.get(); + Tv *oval_ptr = oval.get(); + + std::vector > X; + X.reserve(okey.dims()[0]); + + for(dim_t w = 0; w < okey.dims()[3]; w++) { + dim_t okeyW = w * okey.strides()[3]; + dim_t ovalW = w * oval.strides()[3]; + + for(dim_t z = 0; z < okey.dims()[2]; z++) { + dim_t okeyWZ = okeyW + z * okey.strides()[2]; + dim_t ovalWZ = ovalW + z * oval.strides()[2]; + + for(dim_t y = 0; y < okey.dims()[1]; y++) { + + dim_t okeyOffset = okeyWZ + y * okey.strides()[1]; + dim_t ovalOffset = ovalWZ + y * oval.strides()[1]; + + X.clear(); + std::transform(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims()[0], + oval_ptr + ovalOffset, + std::back_inserter(X), + [](Tk v_, Tv i_) { return std::make_pair(v_, i_); } + ); + + std::stable_sort(X.begin(), X.end(), IPCompare()); + + for(unsigned it = 0; it < X.size(); it++) { + okey_ptr[okeyOffset + it] = X[it].first; + oval_ptr[ovalOffset + it] = X[it].second; + } + } + } + } + + return; +} + +template +void sortByKeyBatched(Array okey, Array oval) +{ + af::dim4 inDims = okey.dims(); + + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + uint* key = memAlloc(inDims.elements()); + // IOTA + { + af::dim4 dims = inDims; + uint* out = key; + af::dim4 strides(1); + for(int i = 1; i < 4; i++) + strides[i] = strides[i-1] * dims[i-1]; + + for(dim_t w = 0; w < dims[3]; w++) { + dim_t offW = w * strides[3]; + uint okeyW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2]; + for(dim_t z = 0; z < dims[2]; z++) { + dim_t offWZ = offW + z * strides[2]; + uint okeyZ = okeyW + (z % seqDims[2]) * seqDims[0] * seqDims[1]; + for(dim_t y = 0; y < dims[1]; y++) { + dim_t offWZY = offWZ + y * strides[1]; + uint okeyY = okeyZ + (y % seqDims[1]) * seqDims[0]; + for(dim_t x = 0; x < dims[0]; x++) { + dim_t id = offWZY + x; + out[id] = okeyY + (x % seqDims[0]); + } + } + } + } + } + + // initialize original index locations + Tk *okey_ptr = okey.get(); + Tv *oval_ptr = oval.get(); + + std::vector > X; + X.reserve(okey.elements()); + + for(unsigned i = 0; i < okey.elements(); i++) { + X.push_back(std::make_pair(std::make_pair(okey_ptr[i], oval_ptr[i]), key[i])); + } + + memFree(key); // key is no longer required + + std::stable_sort(X.begin(), X.end(), KIPCompareV()); + + std::stable_sort(X.begin(), X.end(), KIPCompareK()); + + for(unsigned it = 0; it < okey.elements(); it++) { + okey_ptr[it] = X[it].first.first; + oval_ptr[it] = X[it].first.second; + } + + return; +} + +template +void sort0ByKey(Array okey, Array oval) +{ + int higherDims = okey.dims()[1] * okey.dims()[2] * okey.dims()[3]; + // TODO Make a better heurisitic + if(higherDims > 4) + kernel::sortByKeyBatched(okey, oval); + else + kernel::sort0ByKeyIterative(okey, oval); +} + +#define INSTANTIATE(Tk, Tv, dr) \ + template void sort0ByKey(Array okey, Array oval); \ + template void sort0ByKeyIterative(Array okey, Array oval); \ + template void sortByKeyBatched(Array okey, Array oval); \ + template void sortByKeyBatched(Array okey, Array oval); \ + template void sortByKeyBatched(Array okey, Array oval); \ + template void sortByKeyBatched(Array okey, Array oval); \ + +#define INSTANTIATE1(Tk , dr) \ + INSTANTIATE(Tk, float , dr) \ + INSTANTIATE(Tk, double , dr) \ + INSTANTIATE(Tk, cfloat , dr) \ + INSTANTIATE(Tk, cdouble, dr) \ + INSTANTIATE(Tk, int , dr) \ + INSTANTIATE(Tk, uint , dr) \ + INSTANTIATE(Tk, short , dr) \ + INSTANTIATE(Tk, ushort , dr) \ + INSTANTIATE(Tk, char , dr) \ + INSTANTIATE(Tk, uchar , dr) \ + INSTANTIATE(Tk, intl , dr) \ + INSTANTIATE(Tk, uintl , dr) +} +} + diff --git a/src/backend/cpu/kernel/sort_helper.hpp b/src/backend/cpu/kernel/sort_helper.hpp new file mode 100644 index 0000000000..ff7da3560b --- /dev/null +++ b/src/backend/cpu/kernel/sort_helper.hpp @@ -0,0 +1,54 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include + +namespace cpu +{ + namespace kernel + { + template + using IndexPair = std::pair; + + template + struct IPCompare + { + bool operator()(const IndexPair &lhs, const IndexPair &rhs) + { + // Check stable sort condition + if(isAscending) return (lhs.first < rhs.first); + else return (lhs.first > rhs.first); + } + }; + + template + using KeyIndexPair = std::pair, uint>; + + template + struct KIPCompareV + { + bool operator()(const KeyIndexPair &lhs, const KeyIndexPair &rhs) + { + // Check stable sort condition + if(isAscending) return (lhs.first.first < rhs.first.first); + else return (lhs.first.first > rhs.first.first); + } + }; + + template + struct KIPCompareK + { + bool operator()(const KeyIndexPair &lhs, const KeyIndexPair &rhs) + { + if(isAscending) return (lhs.second < rhs.second); + else return (lhs.second > rhs.second); + } + }; + } +} diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp deleted file mode 100644 index b71cc47071..0000000000 --- a/src/backend/cpu/kernel/sort_index.hpp +++ /dev/null @@ -1,71 +0,0 @@ -/******************************************************* - * Copyright (c) 2015, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#pragma once -#include -#include -#include -#include -#include -#include -#include - -namespace cpu -{ -namespace kernel -{ - -template -void sort0_index(Array val, Array idx, const Array in) -{ - // initialize original index locations - uint *idx_ptr = idx.get(); - T *val_ptr = val.get(); - const T *in_ptr = in.get(); - function op = std::greater(); - if(isAscending) { op = std::less(); } - - std::vector seq_vec(idx.dims()[0]); - std::iota(seq_vec.begin(), seq_vec.end(), 0); - - const T *comp_ptr = nullptr; - auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);}; - - for(dim_t w = 0; w < in.dims()[3]; w++) { - dim_t valW = w * val.strides()[3]; - dim_t idxW = w * idx.strides()[3]; - dim_t inW = w * in.strides()[3]; - for(dim_t z = 0; z < in.dims()[2]; z++) { - dim_t valWZ = valW + z * val.strides()[2]; - dim_t idxWZ = idxW + z * idx.strides()[2]; - dim_t inWZ = inW + z * in.strides()[2]; - for(dim_t y = 0; y < in.dims()[1]; y++) { - - dim_t valOffset = valWZ + y * val.strides()[1]; - dim_t idxOffset = idxWZ + y * idx.strides()[1]; - dim_t inOffset = inWZ + y * in.strides()[1]; - - uint *ptr = idx_ptr + idxOffset; - std::copy(seq_vec.begin(), seq_vec.end(), ptr); - - comp_ptr = in_ptr + inOffset; - std::stable_sort(ptr, ptr + in.dims()[0], comparator); - - for (dim_t i = 0; i < val.dims()[0]; ++i){ - val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]]; - } - } - } - } - - return; -} - -} -} diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp index bc6396b258..4a649e0b23 100644 --- a/src/backend/cpu/sort.cpp +++ b/src/backend/cpu/sort.cpp @@ -15,11 +15,55 @@ #include #include #include +#include +#include +#include +#include +#include #include namespace cpu { +template +void sortBatched(Array& val) +{ + af::dim4 inDims = val.dims(); + + // Sort dimension + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + Array key = iota(seqDims, tileDims); + + Array resKey = createEmptyArray(dim4()); + Array resVal = createEmptyArray(dim4()); + + val.setDataDims(inDims.elements()); + key.setDataDims(inDims.elements()); + + sort_by_key(resVal, resKey, val, key, 0); + + // Needs to be ascending (true) in order to maintain the indices properly + sort_by_key(key, val, resKey, resVal, 0); + val.eval(); + + val.setDataDims(inDims); // This is correct only for dim0 +} + +template +void sort0(Array& val) +{ + int higherDims = val.elements() / val.dims()[0]; + // TODO Make a better heurisitic + if(higherDims > 10) + sortBatched(val); + else + getQueue().enqueue(kernel::sort0Iterative, val); +} + template Array sort(const Array &in, const unsigned dim) { @@ -27,9 +71,26 @@ Array sort(const Array &in, const unsigned dim) Array out = copyArray(in); switch(dim) { - case 0: getQueue().enqueue(kernel::sort0, out); break; + case 0: sort0(out); break; + case 1: sortBatched(out); break; + case 2: sortBatched(out); break; + case 3: sortBatched(out); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = out.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = out.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = out.dims()[i - 1]; + } + + out.setDataDims(preorderDims); + out = reorder(out, reorderDims); + } return out; } diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp index 5a99257033..46b06602b4 100644 --- a/src/backend/cpu/sort_by_key.cpp +++ b/src/backend/cpu/sort_by_key.cpp @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #include namespace cpu @@ -23,16 +26,33 @@ void sort_by_key(Array &okey, Array &oval, ikey.eval(); ival.eval(); - okey = createEmptyArray(ikey.dims()); - oval = createEmptyArray(ival.dims()); - Array oidx = createValueArray(ikey.dims(), 0u); - oidx.eval(); + okey = copyArray(ikey); + oval = copyArray(ival); switch(dim) { - case 0: getQueue().enqueue(kernel::sort0_by_key, - okey, oval, oidx, ikey, ival); break; + case 0: getQueue().enqueue(kernel::sort0ByKey, okey, oval); break; + case 1: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; + case 2: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; + case 3: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); + } } #define INSTANTIATE(Tk, Tv) \ @@ -46,6 +66,8 @@ void sort_by_key(Array &okey, Array &oval, #define INSTANTIATE1(Tk) \ INSTANTIATE(Tk, float) \ INSTANTIATE(Tk, double) \ + INSTANTIATE(Tk, cfloat) \ + INSTANTIATE(Tk, cdouble) \ INSTANTIATE(Tk, int) \ INSTANTIATE(Tk, uint) \ INSTANTIATE(Tk, char) \ diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp index 77860ede18..b865db9c1c 100644 --- a/src/backend/cpu/sort_index.cpp +++ b/src/backend/cpu/sort_index.cpp @@ -14,22 +14,48 @@ #include #include #include -#include +#include +#include +#include +#include namespace cpu { template -void sort_index(Array &val, Array &idx, const Array &in, const uint dim) +void sort_index(Array &okey, Array &oval, const Array &in, const uint dim) { in.eval(); - val = createEmptyArray(in.dims()); - idx = createEmptyArray(in.dims()); + // okey is values, oval is indices + okey = copyArray(in); + oval = range(in.dims(), dim); + oval.eval(); + switch(dim) { - case 0: getQueue().enqueue(kernel::sort0_index, val, idx, in); break; + case 0: getQueue().enqueue(kernel::sort0ByKey, okey, oval); break; + case 1: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; + case 2: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; + case 3: getQueue().enqueue(kernel::sortByKeyBatched, okey, oval); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); + } } #define INSTANTIATE(T) \ diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp index c2292087aa..1f9512fb8d 100644 --- a/src/backend/cuda/Array.hpp +++ b/src/backend/cuda/Array.hpp @@ -174,6 +174,7 @@ namespace cuda void setDataDims(const dim4 &new_dims) { + modDims(new_dims); data_dims = new_dims; } diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt index b887a98d67..4efb42764a 100644 --- a/src/backend/cuda/CMakeLists.txt +++ b/src/backend/cuda/CMakeLists.txt @@ -158,7 +158,7 @@ FILE(GLOB cuda_headers FILE(GLOB cuda_sources "*.cu" "*.cpp" - "sort_by_key/*.cu" + "kernel/sort_by_key/*.cu" "kernel/*.cu") FILE(GLOB jit_sources diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp index 44f98d92c1..9361b72e23 100644 --- a/src/backend/cuda/kernel/harris.hpp +++ b/src/backend/cuda/kernel/harris.hpp @@ -18,7 +18,8 @@ #include #include "convolve.hpp" #include "gradient.hpp" -#include "sort_index.hpp" +#include "sort_by_key.hpp" +#include "range.hpp" namespace cuda { @@ -336,10 +337,12 @@ void harris(unsigned* corners_out, int sort_elem = harris_responses.strides[3] * harris_responses.dims[3]; harris_responses.ptr = d_resp_corners; + // Create indices using range harris_idx.ptr = memAlloc(sort_elem); + kernel::range(harris_idx, 0); // Sort Harris responses - sort0_index(harris_responses, harris_idx); + sort0ByKey(harris_responses, harris_idx); *x_out = memAlloc(*corners_out); *y_out = memAlloc(*corners_out); diff --git a/src/backend/cuda/kernel/iota.hpp b/src/backend/cuda/kernel/iota.hpp index 2632266c92..e2f7e591fb 100644 --- a/src/backend/cuda/kernel/iota.hpp +++ b/src/backend/cuda/kernel/iota.hpp @@ -7,6 +7,7 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ +#include #include #include #include @@ -18,8 +19,8 @@ namespace cuda namespace kernel { // Kernel Launch Config Values - static const unsigned TX = 32; - static const unsigned TY = 8; + static const unsigned IOTA_TX = 32; + static const unsigned IOTA_TY = 8; static const unsigned TILEX = 512; static const unsigned TILEY = 32; @@ -69,9 +70,9 @@ namespace cuda // Wrapper functions /////////////////////////////////////////////////////////////////////////// template - void iota(Param out, const dim4 &sdims, const dim4 &tdims) + void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims) { - dim3 threads(TX, TY, 1); + dim3 threads(IOTA_TX, IOTA_TY, 1); int blocksPerMatX = divup(out.dims[0], TILEX); int blocksPerMatY = divup(out.dims[1], TILEY); diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp index 89de56065d..8a2b535cee 100644 --- a/src/backend/cuda/kernel/orb.hpp +++ b/src/backend/cuda/kernel/orb.hpp @@ -16,7 +16,8 @@ #include #include "convolve.hpp" #include "orb_patch.hpp" -#include "sort_index.hpp" +#include "sort_by_key.hpp" +#include "range.hpp" #include @@ -394,10 +395,12 @@ void orb(unsigned* out_feat, int sort_elem = harris_sorted.strides[3] * harris_sorted.dims[3]; harris_sorted.ptr = d_score_harris; + // Create indices using range harris_idx.ptr = memAlloc(sort_elem); + kernel::range(harris_idx, 0); // Sort features according to Harris responses - sort0_index(harris_sorted, harris_idx); + kernel::sort0ByKey(harris_sorted, harris_idx); feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]); diff --git a/src/backend/cuda/kernel/range.hpp b/src/backend/cuda/kernel/range.hpp index 9670b07bd6..6880ed566a 100644 --- a/src/backend/cuda/kernel/range.hpp +++ b/src/backend/cuda/kernel/range.hpp @@ -18,10 +18,10 @@ namespace cuda namespace kernel { // Kernel Launch Config Values - static const unsigned TX = 32; - static const unsigned TY = 8; - static const unsigned TILEX = 512; - static const unsigned TILEY = 32; + static const unsigned RANGE_TX = 32; + static const unsigned RANGE_TY = 8; + static const unsigned RANGE_TILEX = 512; + static const unsigned RANGE_TILEY = 32; template __global__ @@ -74,10 +74,10 @@ namespace cuda template void range(Param out, const int dim) { - dim3 threads(TX, TY, 1); + dim3 threads(RANGE_TX, RANGE_TY, 1); - int blocksPerMatX = divup(out.dims[0], TILEX); - int blocksPerMatY = divup(out.dims[1], TILEY); + int blocksPerMatX = divup(out.dims[0], RANGE_TILEX); + int blocksPerMatY = divup(out.dims[1], RANGE_TILEY); dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3], 1); diff --git a/src/backend/cuda/kernel/sort.hpp b/src/backend/cuda/kernel/sort.hpp index b23e308633..f0095b144d 100644 --- a/src/backend/cuda/kernel/sort.hpp +++ b/src/backend/cuda/kernel/sort.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,15 +20,11 @@ namespace cuda { namespace kernel { - // Kernel Launch Config Values - static const unsigned TX = 32; - static const unsigned TY = 8; - /////////////////////////////////////////////////////////////////////////// // Wrapper functions /////////////////////////////////////////////////////////////////////////// template - void sort0(Param val) + void sort0Iterative(Param val) { thrust::device_ptr val_ptr = thrust::device_pointer_cast(val.ptr); @@ -49,5 +46,93 @@ namespace cuda } POST_LAUNCH_CHECK(); } + + template + void sortBatched(Param pVal) + { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pVal.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + dim4 keydims = inDims; + uint* key = memAlloc(keydims.elements()); + Param pKey; + pKey.ptr = key; + pKey.strides[0] = 1; + pKey.dims[0] = keydims[0]; + for(int i = 1; i < 4; i++) { + pKey.dims[i] = keydims[i]; + pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1]; + } + kernel::iota(pKey, seqDims, tileDims); + + // Flat + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + pKey.dims[0] = inDims.elements(); + pKey.strides[0] = 1; + pVal.dims[0] = inDims.elements(); + pVal.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pKey.dims[i] = 1; + pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1]; + pVal.dims[i] = 1; + pVal.strides[i] = pVal.strides[i - 1] * pVal.dims[i - 1]; + } + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + //kernel::sort0_by_key(pVal, pKey); + thrust::device_ptr pVal_ptr = thrust::device_pointer_cast(pVal.ptr); + thrust::device_ptr pKey_ptr = thrust::device_pointer_cast(pKey.ptr); + if(isAscending) { + THRUST_SELECT(thrust::stable_sort_by_key, + pVal_ptr, + pVal_ptr + pVal.dims[0], + pKey_ptr); + } else { + THRUST_SELECT(thrust::stable_sort_by_key, + pVal_ptr, + pVal_ptr + pVal.dims[0], + pKey_ptr, thrust::greater()); + } + POST_LAUNCH_CHECK(); + + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + THRUST_SELECT(thrust::stable_sort_by_key, + pKey_ptr, + pKey_ptr + pVal.dims[0], + pVal_ptr); + POST_LAUNCH_CHECK(); + + // No need of doing moddims here because the original Array + // dimensions have not been changed + //val.modDims(inDims); + + // Not really necessary + // CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId()))); + memFree(key); + } + + template + void sort0(Param val) + { + int higherDims = val.dims[1] * val.dims[2] * val.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 10) + sortBatched(val); + else + kernel::sort0Iterative(val); + } } } diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp index 42a3256a1c..35250a8ad1 100644 --- a/src/backend/cuda/kernel/sort_by_key.hpp +++ b/src/backend/cuda/kernel/sort_by_key.hpp @@ -12,46 +12,19 @@ #include #include #include -#include -#include namespace cuda { namespace kernel { - // Kernel Launch Config Values - static const unsigned TX = 32; - static const unsigned TY = 8; - - /////////////////////////////////////////////////////////////////////////// - // Wrapper functions - /////////////////////////////////////////////////////////////////////////// template - void sort0_by_key(Param okey, Param oval) - { - thrust::device_ptr okey_ptr = thrust::device_pointer_cast(okey.ptr); - thrust::device_ptr oval_ptr = thrust::device_pointer_cast(oval.ptr); + void sort0ByKeyIterative(Param okey, Param oval); - for(int w = 0; w < okey.dims[3]; w++) { - int okeyW = w * okey.strides[3]; - int ovalW = w * oval.strides[3]; - for(int z = 0; z < okey.dims[2]; z++) { - int okeyWZ = okeyW + z * okey.strides[2]; - int ovalWZ = ovalW + z * oval.strides[2]; - for(int y = 0; y < okey.dims[1]; y++) { + template + void sortByKeyBatched(Param pKey, Param pVal); - int okeyOffset = okeyWZ + y * okey.strides[1]; - int ovalOffset = ovalWZ + y * oval.strides[1]; + template + void sort0ByKey(Param okey, Param oval); - if(isAscending) { - THRUST_SELECT(thrust::sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset); - } else { - THRUST_SELECT(thrust::sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset, thrust::greater()); - } - } - } - } - POST_LAUNCH_CHECK(); - } } } diff --git a/src/backend/cuda/sort_by_key/ascd_f32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_f32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_f32.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_f32.cu index 44b770402c..284e8b4938 100644 --- a/src/backend/cuda/sort_by_key/ascd_f32.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_f32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(float, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_f64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_f64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_f64.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_f64.cu index 17b54a3903..ba19ec447c 100644 --- a/src/backend/cuda/sort_by_key/ascd_f64.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_f64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(double, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_s16.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s16.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_s16.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_s16.cu index d51e9ae671..1be6e540ca 100644 --- a/src/backend/cuda/sort_by_key/ascd_s16.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_s16.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(short, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_s32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_s32.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_s32.cu index 75adbddc0b..8cee7c9b49 100644 --- a/src/backend/cuda/sort_by_key/ascd_s32.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_s32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(int, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_s64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_s64.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_s64.cu index 25a1e589f8..0e5a7c81a2 100644 --- a/src/backend/cuda/sort_by_key/ascd_s64.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_s64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(intl, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_s8.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s8.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_s8.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_s8.cu index f47a397727..81ed32952f 100644 --- a/src/backend/cuda/sort_by_key/ascd_s8.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_s8.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(char, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_u16.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u16.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_u16.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_u16.cu index e06036abc7..e232c08376 100644 --- a/src/backend/cuda/sort_by_key/ascd_u16.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_u16.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(ushort, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_u32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_u32.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_u32.cu index 6f7939aa12..34a4580936 100644 --- a/src/backend/cuda/sort_by_key/ascd_u32.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_u32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uint, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_u64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_u64.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_u64.cu index 63eec5fdd4..fc576e7f99 100644 --- a/src/backend/cuda/sort_by_key/ascd_u64.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_u64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uintl, true) } +} diff --git a/src/backend/cuda/sort_by_key/ascd_u8.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u8.cu similarity index 86% rename from src/backend/cuda/sort_by_key/ascd_u8.cu rename to src/backend/cuda/kernel/sort_by_key/ascd_u8.cu index a2e1dec887..ed8454d53e 100644 --- a/src/backend/cuda/sort_by_key/ascd_u8.cu +++ b/src/backend/cuda/kernel/sort_by_key/ascd_u8.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uchar, true) } +} diff --git a/src/backend/cuda/sort_by_key/desc_f32.cu b/src/backend/cuda/kernel/sort_by_key/desc_f32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_f32.cu rename to src/backend/cuda/kernel/sort_by_key/desc_f32.cu index 1bbb10bbba..73459ac033 100644 --- a/src/backend/cuda/sort_by_key/desc_f32.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_f32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(float, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_f64.cu b/src/backend/cuda/kernel/sort_by_key/desc_f64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_f64.cu rename to src/backend/cuda/kernel/sort_by_key/desc_f64.cu index ecbed78878..be0536b1e3 100644 --- a/src/backend/cuda/sort_by_key/desc_f64.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_f64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(double, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_s16.cu b/src/backend/cuda/kernel/sort_by_key/desc_s16.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_s16.cu rename to src/backend/cuda/kernel/sort_by_key/desc_s16.cu index 63967b6117..0fc3b50827 100644 --- a/src/backend/cuda/sort_by_key/desc_s16.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_s16.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(short, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_s32.cu b/src/backend/cuda/kernel/sort_by_key/desc_s32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_s32.cu rename to src/backend/cuda/kernel/sort_by_key/desc_s32.cu index 49904437f4..cfda29c7de 100644 --- a/src/backend/cuda/sort_by_key/desc_s32.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_s32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(int, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_s64.cu b/src/backend/cuda/kernel/sort_by_key/desc_s64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_s64.cu rename to src/backend/cuda/kernel/sort_by_key/desc_s64.cu index a10ee11475..b334a91a99 100644 --- a/src/backend/cuda/sort_by_key/desc_s64.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_s64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(intl, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_s8.cu b/src/backend/cuda/kernel/sort_by_key/desc_s8.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_s8.cu rename to src/backend/cuda/kernel/sort_by_key/desc_s8.cu index cad78dfc84..f02d5ce2fe 100644 --- a/src/backend/cuda/sort_by_key/desc_s8.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_s8.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(char, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_u16.cu b/src/backend/cuda/kernel/sort_by_key/desc_u16.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_u16.cu rename to src/backend/cuda/kernel/sort_by_key/desc_u16.cu index 69dc01634b..9b0a77cb25 100644 --- a/src/backend/cuda/sort_by_key/desc_u16.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_u16.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(ushort, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_u32.cu b/src/backend/cuda/kernel/sort_by_key/desc_u32.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_u32.cu rename to src/backend/cuda/kernel/sort_by_key/desc_u32.cu index ae2ad4bc84..1d02aec848 100644 --- a/src/backend/cuda/sort_by_key/desc_u32.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_u32.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uint, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_u64.cu b/src/backend/cuda/kernel/sort_by_key/desc_u64.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_u64.cu rename to src/backend/cuda/kernel/sort_by_key/desc_u64.cu index 43f60c075b..597bd2c1b4 100644 --- a/src/backend/cuda/sort_by_key/desc_u64.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_u64.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uintl, false) } +} diff --git a/src/backend/cuda/sort_by_key/desc_u8.cu b/src/backend/cuda/kernel/sort_by_key/desc_u8.cu similarity index 86% rename from src/backend/cuda/sort_by_key/desc_u8.cu rename to src/backend/cuda/kernel/sort_by_key/desc_u8.cu index 51d8096620..4f55479604 100644 --- a/src/backend/cuda/sort_by_key/desc_u8.cu +++ b/src/backend/cuda/kernel/sort_by_key/desc_u8.cu @@ -7,9 +7,12 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include +#include namespace cuda +{ +namespace kernel { INSTANTIATE1(uchar, false) } +} diff --git a/src/backend/cuda/kernel/sort_by_key_impl.hpp b/src/backend/cuda/kernel/sort_by_key_impl.hpp new file mode 100644 index 0000000000..bcc2fefeb4 --- /dev/null +++ b/src/backend/cuda/kernel/sort_by_key_impl.hpp @@ -0,0 +1,215 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +// This needs to be in global namespace as it is used by thrust +template +struct IndexPair +{ + Tk first; + Tv second; +}; + +template +struct IPCompare +{ + __host__ __device__ + bool operator()(const IndexPair &lhs, const IndexPair &rhs) const + { + // Check stable sort condition + if(isAscending) return (lhs.first < rhs.first); + else return (lhs.first > rhs.first); + } +}; + +namespace cuda +{ + namespace kernel + { + static const int copyPairIter = 4; + + template + __global__ + void makeIndexPair(IndexPair *out, const Tk *first, const Tv *second, const int N) + { + int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; + + for(int i = tIdx; i < N; i += blockDim.x) + { + out[i].first = first[i]; + out[i].second = second[i]; + } + } + + template + __global__ + void splitIndexPair(Tk *first, Tv *second, const IndexPair *out, const int N) + { + int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x; + + for(int i = tIdx; i < N; i += blockDim.x) + { + first[i] = out[i].first; + second[i] = out[i].second; + } + } + + /////////////////////////////////////////////////////////////////////////// + // Wrapper functions + /////////////////////////////////////////////////////////////////////////// + template + void sort0ByKeyIterative(Param okey, Param oval) + { + thrust::device_ptr okey_ptr = thrust::device_pointer_cast(okey.ptr); + thrust::device_ptr oval_ptr = thrust::device_pointer_cast(oval.ptr); + + for(int w = 0; w < okey.dims[3]; w++) { + int okeyW = w * okey.strides[3]; + int ovalW = w * oval.strides[3]; + for(int z = 0; z < okey.dims[2]; z++) { + int okeyWZ = okeyW + z * okey.strides[2]; + int ovalWZ = ovalW + z * oval.strides[2]; + for(int y = 0; y < okey.dims[1]; y++) { + + int okeyOffset = okeyWZ + y * okey.strides[1]; + int ovalOffset = ovalWZ + y * oval.strides[1]; + + if(isAscending) { + THRUST_SELECT(thrust::stable_sort_by_key, + okey_ptr + okeyOffset, + okey_ptr + okeyOffset + okey.dims[0], + oval_ptr + ovalOffset); + } else { + THRUST_SELECT(thrust::stable_sort_by_key, + okey_ptr + okeyOffset, + okey_ptr + okeyOffset + okey.dims[0], + oval_ptr + ovalOffset, thrust::greater()); + } + } + } + } + POST_LAUNCH_CHECK(); + } + + template + void sortByKeyBatched(Param pKey, Param pVal) + { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pKey.dims[i]; + + const dim_t elements = inDims.elements(); + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + uint* key = memAlloc(elements); + Param pSeq; + pSeq.ptr = key; + pSeq.strides[0] = 1; + pSeq.dims[0] = inDims[0]; + for(int i = 1; i < 4; i++) { + pSeq.dims[i] = inDims[i]; + pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1]; + } + cuda::kernel::iota(pSeq, seqDims, tileDims); + + // Make pkey, pVal into a pair + IndexPair *Xptr = (IndexPair*)memAlloc(sizeof(IndexPair) * elements); + + const int threads = 256; + int blocks = divup(elements, threads * copyPairIter); + CUDA_LAUNCH((makeIndexPair), blocks, threads, + Xptr, pKey.ptr, pVal.ptr, elements); + POST_LAUNCH_CHECK(); + + thrust::device_ptr > X = thrust::device_pointer_cast(Xptr); + + // Sort indices + // Need to convert pSeq to thrust::device_ptr, otherwise thrust + // throws weird errors for all *64 data types (double, intl, uintl etc) + thrust::device_ptr dSeq = thrust::device_pointer_cast(pSeq.ptr); + THRUST_SELECT(thrust::stable_sort_by_key, + X, X + elements, + dSeq, + IPCompare()); + POST_LAUNCH_CHECK(); + + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + THRUST_SELECT(thrust::stable_sort_by_key, + dSeq, dSeq + elements, + X); + POST_LAUNCH_CHECK(); + + CUDA_LAUNCH((splitIndexPair), blocks, threads, + pKey.ptr, pVal.ptr, Xptr, elements); + POST_LAUNCH_CHECK(); + + // No need of doing moddims here because the original Array + // dimensions have not been changed + //val.modDims(inDims); + + memFree(key); + memFree((char*)Xptr); + } + + template + void sort0ByKey(Param okey, Param oval) + { + int higherDims = okey.dims[1] * okey.dims[2] * okey.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 4) + kernel::sortByKeyBatched(okey, oval); + else + kernel::sort0ByKeyIterative(okey, oval); + } + +#define INSTANTIATE(Tk, Tv, dr) \ + template void sort0ByKey(Param okey, Param oval); \ + template void sort0ByKeyIterative(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + +#define INSTANTIATE1(Tk , dr) \ + INSTANTIATE(Tk, float , dr) \ + INSTANTIATE(Tk, double , dr) \ + INSTANTIATE(Tk, cfloat , dr) \ + INSTANTIATE(Tk, cdouble, dr) \ + INSTANTIATE(Tk, int , dr) \ + INSTANTIATE(Tk, uint , dr) \ + INSTANTIATE(Tk, short , dr) \ + INSTANTIATE(Tk, ushort , dr) \ + INSTANTIATE(Tk, char , dr) \ + INSTANTIATE(Tk, uchar , dr) \ + INSTANTIATE(Tk, intl , dr) \ + INSTANTIATE(Tk, uintl , dr) + } +} diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp deleted file mode 100644 index 9d29914f23..0000000000 --- a/src/backend/cuda/kernel/sort_index.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuda -{ - namespace kernel - { - /////////////////////////////////////////////////////////////////////////// - // Wrapper functions - /////////////////////////////////////////////////////////////////////////// - template - void sort0_index(Param val, Param idx) - { - thrust::device_ptr val_ptr = thrust::device_pointer_cast(val.ptr); - thrust::device_ptr idx_ptr = thrust::device_pointer_cast(idx.ptr); - - for(int w = 0; w < val.dims[3]; w++) { - int valW = w * val.strides[3]; - int idxW = w * idx.strides[3]; - for(int z = 0; z < val.dims[2]; z++) { - int valWZ = valW + z * val.strides[2]; - int idxWZ = idxW + z * idx.strides[2]; - for(int y = 0; y < val.dims[1]; y++) { - - int valOffset = valWZ + y * val.strides[1]; - int idxOffset = idxWZ + y * idx.strides[1]; - - THRUST_SELECT(thrust::sequence, idx_ptr + idxOffset, idx_ptr + idxOffset + idx.dims[0]); - if(isAscending) { - THRUST_SELECT(thrust::sort_by_key, - val_ptr + valOffset, val_ptr + valOffset + val.dims[0], - idx_ptr + idxOffset); - } else { - THRUST_SELECT(thrust::sort_by_key, - val_ptr + valOffset, val_ptr + valOffset + val.dims[0], - idx_ptr + idxOffset, thrust::greater()); - } - } - } - } - POST_LAUNCH_CHECK(); - } - } -} diff --git a/src/backend/cuda/sort.cu b/src/backend/cuda/sort.cu index 6d14c0309f..9b0f4c53af 100644 --- a/src/backend/cuda/sort.cu +++ b/src/backend/cuda/sort.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -22,10 +23,25 @@ namespace cuda { Array out = copyArray(in); switch(dim) { + case 0: kernel::sort0(out); break; + case 1: kernel::sortBatched(out); break; + case 2: kernel::sortBatched(out); break; + case 3: kernel::sortBatched(out); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + } + + if(dim != 0) { + af::dim4 preorderDims = out.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = out.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = out.dims()[i - 1]; + } - case 0: kernel::sort0(out); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + out.setDataDims(preorderDims); + out = reorder(out, reorderDims); } return out; } diff --git a/src/backend/cuda/sort_by_key.cu b/src/backend/cuda/sort_by_key.cu new file mode 100644 index 0000000000..2d5d68eef0 --- /dev/null +++ b/src/backend/cuda/sort_by_key.cu @@ -0,0 +1,86 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuda +{ + template + void sort_by_key(Array &okey, Array &oval, + const Array &ikey, const Array &ival, const uint dim) + { + okey = copyArray(ikey); + oval = copyArray(ival); + + switch(dim) { + case 0: kernel::sort0ByKey(okey, oval); break; + case 1: kernel::sortByKeyBatched(okey, oval); break; + case 2: kernel::sortByKeyBatched(okey, oval); break; + case 3: kernel::sortByKeyBatched(okey, oval); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + } + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); + } + } + +#define INSTANTIATE(Tk, Tv) \ + template void sort_by_key(Array &okey, Array &oval, \ + const Array &ikey, const Array &ival, const uint dim); \ + template void sort_by_key(Array &okey, Array &oval, \ + const Array &ikey, const Array &ival, const uint dim); \ + +#define INSTANTIATE1(Tk ) \ + INSTANTIATE(Tk, float ) \ + INSTANTIATE(Tk, double ) \ + INSTANTIATE(Tk, cfloat ) \ + INSTANTIATE(Tk, cdouble) \ + INSTANTIATE(Tk, int ) \ + INSTANTIATE(Tk, uint ) \ + INSTANTIATE(Tk, short ) \ + INSTANTIATE(Tk, ushort ) \ + INSTANTIATE(Tk, char ) \ + INSTANTIATE(Tk, uchar ) \ + INSTANTIATE(Tk, intl ) \ + INSTANTIATE(Tk, uintl ) + + +INSTANTIATE1(float ) +INSTANTIATE1(double) +INSTANTIATE1(int ) +INSTANTIATE1(uint ) +INSTANTIATE1(short ) +INSTANTIATE1(ushort) +INSTANTIATE1(char ) +INSTANTIATE1(uchar ) +INSTANTIATE1(intl ) +INSTANTIATE1(uintl ) + +} diff --git a/src/backend/cuda/sort_by_key_impl.hpp b/src/backend/cuda/sort_by_key_impl.hpp deleted file mode 100644 index d01ace404e..0000000000 --- a/src/backend/cuda/sort_by_key_impl.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include -#include -#include -#include -#include -#include -#include - -namespace cuda -{ - template - void sort_by_key(Array &okey, Array &oval, - const Array &ikey, const Array &ival, const uint dim) - { - okey = copyArray(ikey); - oval = copyArray(ival); - switch(dim) { - case 0: kernel::sort0_by_key(okey, oval); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); - } - } - -#define INSTANTIATE(Tk, Tv, dr) \ - template void \ - sort_by_key(Array &okey, Array &oval, \ - const Array &ikey, const Array &ival, const uint dim); \ - -#define INSTANTIATE1(Tk, dr) \ - INSTANTIATE(Tk, float, dr) \ - INSTANTIATE(Tk, double, dr) \ - INSTANTIATE(Tk, int, dr) \ - INSTANTIATE(Tk, uint, dr) \ - INSTANTIATE(Tk, short, dr) \ - INSTANTIATE(Tk, ushort, dr) \ - INSTANTIATE(Tk, char, dr) \ - INSTANTIATE(Tk, uchar, dr) \ - INSTANTIATE(Tk, intl, dr) \ - INSTANTIATE(Tk, uintl, dr) -} diff --git a/src/backend/cuda/sort_index.cu b/src/backend/cuda/sort_index.cu index 606aab4eb1..03c69ad4f3 100644 --- a/src/backend/cuda/sort_index.cu +++ b/src/backend/cuda/sort_index.cu @@ -9,24 +9,47 @@ #include #include +#include #include -#include #include #include #include +#include +#include namespace cuda { template - void sort_index(Array &val, Array &idx, const Array &in, const uint dim) + void sort_index(Array &okey, Array &oval, const Array &in, const uint dim) { - val = copyArray(in); - idx = createEmptyArray(in.dims()); + okey = copyArray(in); + oval = range(in.dims(), dim); + oval.eval(); + switch(dim) { - case 0: kernel::sort0_index(val, idx); - break; + case 0: kernel::sort0ByKey(okey, oval); break; + case 1: kernel::sortByKeyBatched(okey, oval); break; + case 2: kernel::sortByKeyBatched(okey, oval); break; + case 3: kernel::sortByKeyBatched(okey, oval); break; default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); + } } #define INSTANTIATE(T) \ diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp index ada3b41dc3..f83d5c0120 100644 --- a/src/backend/opencl/Array.hpp +++ b/src/backend/opencl/Array.hpp @@ -211,6 +211,7 @@ namespace opencl void setDataDims(const dim4 &new_dims) { + modDims(new_dims); data_dims = new_dims; } diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt index 2cb8ddfdf9..b7eb77ded5 100644 --- a/src/backend/opencl/CMakeLists.txt +++ b/src/backend/opencl/CMakeLists.txt @@ -111,12 +111,10 @@ ENDIF() FILE(GLOB opencl_headers "*.hpp" - "*.h" - "sort_by_key/*.hpp") + "*.h") FILE(GLOB opencl_sources - "*.cpp" - "sort_by_key/*.cpp") + "*.cpp") FILE(GLOB jit_sources "jit/*.hpp") @@ -128,7 +126,8 @@ FILE(GLOB opencl_kernels "kernel/*.cl") FILE(GLOB kernel_sources - "kernel/*.cpp") + "kernel/*.cpp" + "kernel/sort_by_key/*.cpp") FILE(GLOB conv_ker_headers "kernel/convolve/*.hpp") diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp index 7fffdee423..442275d326 100644 --- a/src/backend/opencl/kernel/harris.hpp +++ b/src/backend/opencl/kernel/harris.hpp @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -284,10 +285,12 @@ void harris(unsigned* corners_out, int sort_elem = harris_resp.info.strides[3] * harris_resp.info.dims[3]; harris_resp.data = d_resp_corners; + // Create indices using range harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned)); + kernel::range(harris_idx, 0); // Sort Harris responses - sort0_index(harris_resp, harris_idx); + kernel::sort0ByKey(harris_resp, harris_idx); x_out.data = bufferAlloc(*corners_out * sizeof(float)); y_out.data = bufferAlloc(*corners_out * sizeof(float)); diff --git a/src/backend/opencl/kernel/iota.hpp b/src/backend/opencl/kernel/iota.hpp index bad486abd2..7cd8046d68 100644 --- a/src/backend/opencl/kernel/iota.hpp +++ b/src/backend/opencl/kernel/iota.hpp @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include #include @@ -31,13 +32,13 @@ namespace opencl namespace kernel { // Kernel Launch Config Values - static const int TX = 32; - static const int TY = 8; + static const int IOTA_TX = 32; + static const int IOTA_TY = 8; static const int TILEX = 512; static const int TILEY = 32; template - void iota(Param out, const dim4 &sdims, const dim4 &tdims) + void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; @@ -64,7 +65,7 @@ namespace opencl const int, const int, const int, const int, const int, const int> (*iotaKernels[device]); - NDRange local(TX, TY, 1); + NDRange local(IOTA_TX, IOTA_TY, 1); int blocksPerMatX = divup(out.info.dims[0], TILEX); int blocksPerMatY = divup(out.info.dims[1], TILEY); diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp index 69c1176210..0c752d2c21 100644 --- a/src/backend/opencl/kernel/orb.hpp +++ b/src/backend/opencl/kernel/orb.hpp @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -303,9 +304,11 @@ void orb(unsigned* out_feat, d_harris_sorted.info.offset = 0; d_harris_idx.info.offset = 0; d_harris_sorted.data = d_score_harris; + // Create indices using range d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned)); + kernel::range(d_harris_idx, 0); - sort0_index(d_harris_sorted, d_harris_idx); + kernel::sort0ByKey(d_harris_sorted, d_harris_idx); cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float)); cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float)); diff --git a/src/backend/opencl/kernel/range.hpp b/src/backend/opencl/kernel/range.hpp index 2f8be8cd4a..0299c030d4 100644 --- a/src/backend/opencl/kernel/range.hpp +++ b/src/backend/opencl/kernel/range.hpp @@ -31,10 +31,10 @@ namespace opencl namespace kernel { // Kernel Launch Config Values - static const int TX = 32; - static const int TY = 8; - static const int TILEX = 512; - static const int TILEY = 32; + static const int RANGE_TX = 32; + static const int RANGE_TY = 8; + static const int RANGE_TILEX = 512; + static const int RANGE_TILEY = 32; template void range(Param out, const int dim) @@ -62,10 +62,10 @@ namespace opencl auto rangeOp = make_kernel (*rangeKernels[device]); - NDRange local(TX, TY, 1); + NDRange local(RANGE_TX, RANGE_TY, 1); - int blocksPerMatX = divup(out.info.dims[0], TILEX); - int blocksPerMatY = divup(out.info.dims[1], TILEY); + int blocksPerMatX = divup(out.info.dims[0], RANGE_TILEX); + int blocksPerMatY = divup(out.info.dims[1], RANGE_TILEY); NDRange global(local[0] * blocksPerMatX * out.info.dims[2], local[1] * blocksPerMatY * out.info.dims[3], 1); diff --git a/src/backend/opencl/kernel/sift_nonfree.hpp b/src/backend/opencl/kernel/sift_nonfree.hpp index c28f432fce..0b78ef02f3 100644 --- a/src/backend/opencl/kernel/sift_nonfree.hpp +++ b/src/backend/opencl/kernel/sift_nonfree.hpp @@ -92,11 +92,12 @@ #include #include #include -#include #include #include #include +namespace compute = boost::compute; + using cl::Buffer; using cl::Program; using cl::Kernel; diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp index 013d8c53a9..98ba75977a 100644 --- a/src/backend/opencl/kernel/sort.hpp +++ b/src/backend/opencl/kernel/sort.hpp @@ -15,12 +15,15 @@ #include #include #include +#include +#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #include -#include +#include +#include #include #include @@ -38,18 +41,8 @@ namespace opencl { namespace kernel { - using std::conditional; - using std::is_same; - template - using ltype_t = typename conditional::value, cl_long, T>::type; - - template - using type_t = typename conditional::value, - cl_ulong, ltype_t - >::type; - template - void sort0(Param val) + void sort0Iterative(Param val) { try { compute::command_queue c_queue(getQueue()()); @@ -65,12 +58,12 @@ namespace opencl int valOffset = valWZ + y * val.info.strides[1]; if(isAscending) { - compute::stable_sort( + compute::sort( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), compute::less< type_t >(), c_queue); } else { - compute::stable_sort( + compute::sort( compute::make_buffer_iterator< type_t >(val_buf, valOffset), compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), compute::greater< type_t >(), c_queue); @@ -85,6 +78,95 @@ namespace opencl throw; } } + + template + void sortBatched(Param pVal) + { + try{ + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pVal.info.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + dim4 keydims = inDims; + cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(uint)); + Param pKey; + pKey.data = key; + pKey.info.offset = 0; + pKey.info.dims[0] = keydims[0]; + pKey.info.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pKey.info.dims[i] = keydims[i]; + pKey.info.strides[i] = pKey.info.strides[i - 1] * pKey.info.dims[i - 1]; + } + kernel::iota(pKey, seqDims, tileDims); + + // Flat + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + pKey.info.dims[0] = inDims.elements(); + pKey.info.strides[0] = 1; + pVal.info.dims[0] = inDims.elements(); + pVal.info.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pKey.info.dims[i] = 1; + pKey.info.strides[i] = pKey.info.strides[i - 1] * pKey.info.dims[i - 1]; + pVal.info.dims[i] = 1; + pVal.info.strides[i] = pVal.info.strides[i - 1] * pVal.info.dims[i - 1]; + } + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + //kernel::sort0_by_key(pVal, pKey); + compute::command_queue c_queue(getQueue()()); + + compute::buffer pKey_buf((*pKey.data)()); + compute::buffer pVal_buf((*pVal.data)()); + + compute::buffer_iterator > val0 = compute::make_buffer_iterator >(pVal_buf, 0); + compute::buffer_iterator > valN = compute::make_buffer_iterator >(pVal_buf,+ pVal.info.dims[0]); + compute::buffer_iterator key0 = compute::make_buffer_iterator(pKey_buf, 0); + compute::buffer_iterator keyN = compute::make_buffer_iterator(pKey_buf, pKey.info.dims[0]); + if(isAscending) { + compute::sort_by_key(val0, valN, key0, c_queue); + } else { + compute::sort_by_key(val0, valN, key0, compute::greater< type_t >(), c_queue); + } + + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + compute::sort_by_key(key0, keyN, val0, c_queue); + + // No need of doing moddims here because the original Array + // dimensions have not been changed + //val.modDims(inDims); + + CL_DEBUG_FINISH(getQueue()); + bufferFree(key); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sort0(Param val) + { + int higherDims = val.info.dims[1] * val.info.dims[2] * val.info.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 10) + sortBatched(val); + else + kernel::sort0Iterative(val); + } } } diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp index 0cb9cb042d..224f6411ff 100644 --- a/src/backend/opencl/kernel/sort_by_key.hpp +++ b/src/backend/opencl/kernel/sort_by_key.hpp @@ -8,86 +8,22 @@ ********************************************************/ #pragma once -#include #include -#include -#include #include #include #include -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -#include -#include -#include -#include - -namespace compute = boost::compute; - -using cl::Buffer; -using cl::Program; -using cl::Kernel; -using cl::make_kernel; -using cl::EnqueueArgs; -using cl::NDRange; -using std::string; - namespace opencl { namespace kernel { - using std::conditional; - using std::is_same; - template - using ltype_t = typename conditional::value, cl_long, T>::type; - - template - using type_t = typename conditional::value, - cl_ulong, ltype_t - >::type; - template - void sort0_by_key(Param okey, Param oval) - { - try { - compute::command_queue c_queue(getQueue()()); + void sort0ByKeyIterative(Param pKey, Param pVal); - compute::buffer okey_buf((*okey.data)()); - compute::buffer oval_buf((*oval.data)()); + template + void sortByKeyBatched(Param pKey, Param pVal); - for(int w = 0; w < okey.info.dims[3]; w++) { - int okeyW = w * okey.info.strides[3]; - int ovalW = w * oval.info.strides[3]; - for(int z = 0; z < okey.info.dims[2]; z++) { - int okeyWZ = okeyW + z * okey.info.strides[2]; - int ovalWZ = ovalW + z * oval.info.strides[2]; - for(int y = 0; y < okey.info.dims[1]; y++) { - - int okeyOffset = okeyWZ + y * okey.info.strides[1]; - int ovalOffset = ovalWZ + y * oval.info.strides[1]; - - compute::buffer_iterator< type_t > start= compute::make_buffer_iterator< type_t >(okey_buf, okeyOffset); - compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(okey_buf, okeyOffset + okey.info.dims[0]); - compute::buffer_iterator< type_t > vals = compute::make_buffer_iterator< type_t >(oval_buf, ovalOffset); - if(isAscending) { - compute::sort_by_key(start, end, vals, c_queue); - } else { - compute::sort_by_key(start, end, vals, - compute::greater< type_t >(), c_queue); - } - } - } - } - - CL_DEBUG_FINISH(getQueue()); - } catch (cl::Error err) { - CL_TO_AF_ERROR(err); - throw; - } - } + template + void sort0ByKey(Param pKey, Param pVal); } } - -#pragma GCC diagnostic pop diff --git a/src/backend/opencl/sort_by_key/b8.cpp b/src/backend/opencl/kernel/sort_by_key/b8.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/b8.cpp rename to src/backend/opencl/kernel/sort_by_key/b8.cpp index 118d20dc92..ad0d7f48ae 100644 --- a/src/backend/opencl/sort_by_key/b8.cpp +++ b/src/backend/opencl/kernel/sort_by_key/b8.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(char,true) INSTANTIATE1(char,false) } +} diff --git a/src/backend/opencl/sort_by_key/f32.cpp b/src/backend/opencl/kernel/sort_by_key/f32.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/f32.cpp rename to src/backend/opencl/kernel/sort_by_key/f32.cpp index a7baf486f1..a1e9ae5f1f 100644 --- a/src/backend/opencl/sort_by_key/f32.cpp +++ b/src/backend/opencl/kernel/sort_by_key/f32.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(float,true) INSTANTIATE1(float,false) } +} diff --git a/src/backend/opencl/sort_by_key/f64.cpp b/src/backend/opencl/kernel/sort_by_key/f64.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/f64.cpp rename to src/backend/opencl/kernel/sort_by_key/f64.cpp index 6971c90982..7fb7a79bd8 100644 --- a/src/backend/opencl/sort_by_key/f64.cpp +++ b/src/backend/opencl/kernel/sort_by_key/f64.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(double,true) INSTANTIATE1(double,false) } +} diff --git a/src/backend/opencl/sort_by_key/s16.cpp b/src/backend/opencl/kernel/sort_by_key/s16.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/s16.cpp rename to src/backend/opencl/kernel/sort_by_key/s16.cpp index 44e17b5030..491ea0e3a2 100644 --- a/src/backend/opencl/sort_by_key/s16.cpp +++ b/src/backend/opencl/kernel/sort_by_key/s16.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(short,true) INSTANTIATE1(short,false) } +} diff --git a/src/backend/opencl/sort_by_key/s32.cpp b/src/backend/opencl/kernel/sort_by_key/s32.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/s32.cpp rename to src/backend/opencl/kernel/sort_by_key/s32.cpp index 9fed1a53b3..67ba20e7dd 100644 --- a/src/backend/opencl/sort_by_key/s32.cpp +++ b/src/backend/opencl/kernel/sort_by_key/s32.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(int,true) INSTANTIATE1(int,false) } +} diff --git a/src/backend/opencl/sort_by_key/s64.cpp b/src/backend/opencl/kernel/sort_by_key/s64.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/s64.cpp rename to src/backend/opencl/kernel/sort_by_key/s64.cpp index e2ed8d687b..a48f36ee47 100644 --- a/src/backend/opencl/sort_by_key/s64.cpp +++ b/src/backend/opencl/kernel/sort_by_key/s64.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(intl,true) INSTANTIATE1(intl,false) } +} diff --git a/src/backend/opencl/sort_by_key/u16.cpp b/src/backend/opencl/kernel/sort_by_key/u16.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/u16.cpp rename to src/backend/opencl/kernel/sort_by_key/u16.cpp index c53b68fb53..36678d0a42 100644 --- a/src/backend/opencl/sort_by_key/u16.cpp +++ b/src/backend/opencl/kernel/sort_by_key/u16.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(ushort,true) INSTANTIATE1(ushort,false) } +} diff --git a/src/backend/opencl/sort_by_key/u32.cpp b/src/backend/opencl/kernel/sort_by_key/u32.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/u32.cpp rename to src/backend/opencl/kernel/sort_by_key/u32.cpp index c2e3e62163..f1e4b5322f 100644 --- a/src/backend/opencl/sort_by_key/u32.cpp +++ b/src/backend/opencl/kernel/sort_by_key/u32.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(uint,true) INSTANTIATE1(uint,false) } +} diff --git a/src/backend/opencl/sort_by_key/u64.cpp b/src/backend/opencl/kernel/sort_by_key/u64.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/u64.cpp rename to src/backend/opencl/kernel/sort_by_key/u64.cpp index 89649b1ba5..0a6f5b0c4f 100644 --- a/src/backend/opencl/sort_by_key/u64.cpp +++ b/src/backend/opencl/kernel/sort_by_key/u64.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(uintl,true) INSTANTIATE1(uintl,false) } +} diff --git a/src/backend/opencl/sort_by_key/u8.cpp b/src/backend/opencl/kernel/sort_by_key/u8.cpp similarity index 87% rename from src/backend/opencl/sort_by_key/u8.cpp rename to src/backend/opencl/kernel/sort_by_key/u8.cpp index 2dfb4c3a73..45af011a86 100644 --- a/src/backend/opencl/sort_by_key/u8.cpp +++ b/src/backend/opencl/kernel/sort_by_key/u8.cpp @@ -7,10 +7,13 @@ * http://arrayfire.com/licenses/BSD-3-Clause ********************************************************/ -#include "impl.hpp" +#include namespace opencl +{ +namespace kernel { INSTANTIATE1(uchar,true) INSTANTIATE1(uchar,false) } +} diff --git a/src/backend/opencl/kernel/sort_by_key_impl.hpp b/src/backend/opencl/kernel/sort_by_key_impl.hpp new file mode 100644 index 0000000000..243034541f --- /dev/null +++ b/src/backend/opencl/kernel/sort_by_key_impl.hpp @@ -0,0 +1,373 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace compute = boost::compute; + +using cl::Buffer; +using cl::Program; +using cl::Kernel; +using cl::make_kernel; +using cl::EnqueueArgs; +using cl::NDRange; +using std::string; + +template +inline +boost::compute::function, const std::pair)> +makeCompareFunction() +{ + // Cannot use isAscending in BOOST_COMPUTE_FUNCTION + if(isAscending) { + BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair lhs, std::pair rhs), + { + return lhs.first < rhs.first; + } + ); + return IPCompare; + } else { + BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair lhs, std::pair rhs), + { + return lhs.first > rhs.first; + } + ); + return IPCompare; + } +} + +template +inline boost::compute::function +flipFunction() +{ + BOOST_COMPUTE_FUNCTION(Tk, negateFn, (const Tk x), + { + return -x; + } + ); + + return negateFn; +} + +#define INSTANTIATE_FLIP(TY, XMAX) \ +template<> inline boost::compute::function \ +flipFunction() \ +{ \ + BOOST_COMPUTE_FUNCTION(TY, negateFn, (const TY x), \ + { \ + return XMAX - x; \ + } \ + ); \ + \ + return negateFn; \ +} + +INSTANTIATE_FLIP(unsigned, UINT_MAX) +INSTANTIATE_FLIP(unsigned short, USHRT_MAX) +INSTANTIATE_FLIP(unsigned char, UCHAR_MAX) +INSTANTIATE_FLIP(cl_ulong, ULONG_MAX) + +#undef INSTANTIATE_FLIP + +namespace opencl +{ + namespace kernel + { + static const int copyPairIter = 4; + + template + void makePair(cl::Buffer *out, const cl::Buffer *first, const cl::Buffer *second, const unsigned N) + { + try { + static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; + static std::map sortPairProgs; + static std::map sortPairKernels; + + int device = getActiveDeviceId(); + + std::call_once( compileFlags[device], [device] () { + std::ostringstream options; + options << " -D Tk=" << dtype_traits::getName() + << " -D Tv=" << dtype_traits::getName() + << " -D copyPairIter=" << copyPairIter; + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + options << " -D USE_DOUBLE"; + } + Program prog; + buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str()); + sortPairProgs[device] = new Program(prog); + sortPairKernels[device] = new Kernel(*sortPairProgs[device], "make_pair_kernel"); + }); + + auto makePairOp = make_kernel + (*sortPairKernels[device]); + + NDRange local(256, 1, 1); + NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); + + makePairOp(EnqueueArgs(getQueue(), global, local), *out, *first, *second, N); + + CL_DEBUG_FINISH(getQueue()); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void splitPair(cl::Buffer *first, cl::Buffer *second, const cl::Buffer *in, const unsigned N) + { + try { + static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; + static std::map sortPairProgs; + static std::map sortPairKernels; + + int device = getActiveDeviceId(); + + std::call_once( compileFlags[device], [device] () { + std::ostringstream options; + options << " -D Tk=" << dtype_traits::getName() + << " -D Tv=" << dtype_traits::getName() + << " -D copyPairIter=" << copyPairIter; + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + options << " -D USE_DOUBLE"; + } + Program prog; + buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str()); + sortPairProgs[device] = new Program(prog); + sortPairKernels[device] = new Kernel(*sortPairProgs[device], "split_pair_kernel"); + }); + + auto splitPairOp = make_kernel + (*sortPairKernels[device]); + + NDRange local(256, 1, 1); + NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1); + + splitPairOp(EnqueueArgs(getQueue(), global, local), *first, *second, *in, N); + + CL_DEBUG_FINISH(getQueue()); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sort0ByKeyIterative(Param pKey, Param pVal) + { + try { + compute::command_queue c_queue(getQueue()()); + + compute::buffer pKey_buf((*pKey.data)()); + compute::buffer pVal_buf((*pVal.data)()); + + for(int w = 0; w < pKey.info.dims[3]; w++) { + int pKeyW = w * pKey.info.strides[3]; + int pValW = w * pVal.info.strides[3]; + for(int z = 0; z < pKey.info.dims[2]; z++) { + int pKeyWZ = pKeyW + z * pKey.info.strides[2]; + int pValWZ = pValW + z * pVal.info.strides[2]; + for(int y = 0; y < pKey.info.dims[1]; y++) { + + int pKeyOffset = pKeyWZ + y * pKey.info.strides[1]; + int pValOffset = pValWZ + y * pVal.info.strides[1]; + + compute::buffer_iterator< type_t > start= compute::make_buffer_iterator< type_t >(pKey_buf, pKeyOffset); + compute::buffer_iterator< type_t > end = compute::make_buffer_iterator< type_t >(pKey_buf, pKeyOffset + pKey.info.dims[0]); + compute::buffer_iterator< type_t > vals = compute::make_buffer_iterator< type_t >(pVal_buf, pValOffset); + if(isAscending) { + compute::sort_by_key(start, end, vals, c_queue); + } else { + compute::sort_by_key(start, end, vals, + compute::greater< type_t >(), c_queue); + } + } + } + } + + CL_DEBUG_FINISH(getQueue()); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sortByKeyBatched(Param pKey, Param pVal) + { + typedef type_t Tk; + typedef type_t Tv; + + try { + af::dim4 inDims; + for(int i = 0; i < 4; i++) + inDims[i] = pKey.info.dims[i]; + + // Sort dimension + // tileDims * seqDims = inDims + af::dim4 tileDims(1); + af::dim4 seqDims = inDims; + tileDims[dim] = inDims[dim]; + seqDims[dim] = 1; + + // Create/call iota + // Array key = iota(seqDims, tileDims); + cl::Buffer* key = bufferAlloc(inDims.elements() * sizeof(unsigned)); + Param pSeq; + pSeq.data = key; + pSeq.info.offset = 0; + pSeq.info.dims[0] = inDims[0]; + pSeq.info.strides[0] = 1; + for(int i = 1; i < 4; i++) { + pSeq.info.dims[i] = inDims[i]; + pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1]; + } + kernel::iota(pSeq, seqDims, tileDims); + + int elements = inDims.elements(); + + // Flat - Not required since inplace and both are continuous + //val.modDims(inDims.elements()); + //key.modDims(inDims.elements()); + + // Sort indices + // sort_by_key(*resVal, *resKey, val, key, 0); + //kernel::sort0_by_key(pVal, pKey); + compute::command_queue c_queue(getQueue()()); + compute::context c_context(getContext()()); + + // Create buffer iterators for seq + compute::buffer pSeq_buf((*pSeq.data)()); + compute::buffer_iterator seq0 = compute::make_buffer_iterator(pSeq_buf, 0); + compute::buffer_iterator seqN = compute::make_buffer_iterator(pSeq_buf, elements); + // Create buffer iterators for key and val + compute::buffer pKey_buf((*pKey.data)()); + compute::buffer pVal_buf((*pVal.data)()); + compute::buffer_iterator key0 = compute::make_buffer_iterator(pKey_buf, 0); + compute::buffer_iterator keyN = compute::make_buffer_iterator(pKey_buf, elements); + compute::buffer_iterator val0 = compute::make_buffer_iterator(pVal_buf, 0); + compute::buffer_iterator valN = compute::make_buffer_iterator(pVal_buf, elements); + + // Sort By Key for descending is stable in the reverse + // (greater) order. Sorting in ascending with negated values + // will give the right result + if(!isAscending) compute::transform(key0, keyN, key0, flipFunction(), c_queue); + + // Create a copy of the pKey buffer + cl::Buffer* cKey = bufferAlloc(elements * sizeof(Tk)); + compute::buffer cKey_buf((*cKey)()); + compute::buffer_iterator cKey0 = compute::make_buffer_iterator(cKey_buf, 0); + compute::buffer_iterator cKeyN = compute::make_buffer_iterator(cKey_buf, elements); + compute::copy(key0, keyN, cKey0, c_queue); + + // FIRST SORT + compute::sort_by_key(key0, keyN, seq0, c_queue); + compute::sort_by_key(cKey0, cKeyN, val0, c_queue); + + // Create a copy of the seq buffer after first sort + cl::Buffer* cSeq = bufferAlloc(elements * sizeof(unsigned)); + compute::buffer cSeq_buf((*cSeq)()); + compute::buffer_iterator cSeq0 = compute::make_buffer_iterator(cSeq_buf, 0); + compute::buffer_iterator cSeqN = compute::make_buffer_iterator(cSeq_buf, elements); + compute::copy(seq0, seqN, cSeq0, c_queue); + + // SECOND SORT + // First call will sort key, second sort will sort val + // Needs to be ascending (true) in order to maintain the indices properly + //kernel::sort0_by_key(pKey, pVal); + compute::sort_by_key(seq0, seqN, key0, c_queue); + compute::sort_by_key(cSeq0, cSeqN, val0, c_queue); + + // If descending, flip it back + if(!isAscending) compute::transform(key0, keyN, key0, flipFunction(), c_queue); + + //// No need of doing moddims here because the original Array + //// dimensions have not been changed + ////val.modDims(inDims); + + CL_DEBUG_FINISH(getQueue()); + bufferFree(key); + bufferFree(cSeq); + bufferFree(cKey); + } catch (cl::Error err) { + CL_TO_AF_ERROR(err); + throw; + } + } + + template + void sort0ByKey(Param pKey, Param pVal) + { + int higherDims = pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3]; + // TODO Make a better heurisitic + if(higherDims > 5) + kernel::sortByKeyBatched(pKey, pVal); + else + kernel::sort0ByKeyIterative(pKey, pVal); + } + +#define INSTANTIATE(Tk, Tv, dr) \ + template void sort0ByKey(Param okey, Param oval); \ + template void sort0ByKeyIterative(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + template void sortByKeyBatched(Param okey, Param oval); \ + +#define INSTANTIATE1(Tk , dr) \ + INSTANTIATE(Tk, float , dr) \ + INSTANTIATE(Tk, double , dr) \ + INSTANTIATE(Tk, cfloat , dr) \ + INSTANTIATE(Tk, cdouble, dr) \ + INSTANTIATE(Tk, int , dr) \ + INSTANTIATE(Tk, uint , dr) \ + INSTANTIATE(Tk, short , dr) \ + INSTANTIATE(Tk, ushort , dr) \ + INSTANTIATE(Tk, char , dr) \ + INSTANTIATE(Tk, uchar , dr) \ + INSTANTIATE(Tk, intl , dr) \ + INSTANTIATE(Tk, uintl , dr) + } +} + +#pragma GCC diagnostic pop diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp new file mode 100644 index 0000000000..b8031c2314 --- /dev/null +++ b/src/backend/opencl/kernel/sort_helper.hpp @@ -0,0 +1,48 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#pragma once +#include +#include +#include +#include + +namespace opencl +{ + namespace kernel + { + using std::conditional; + using std::is_same; + + // If type is cdouble, return std::complex, else return T + template + using ztype_t = typename conditional::value, + std::complex, T + >::type; + + // If type is cfloat, return std::complex, else return ztype_t + template + using ctype_t = typename conditional::value, + std::complex, ztype_t + >::type; + + // If type is intl, return cl_long, else return ctype_t + template + using ltype_t = typename conditional::value, + cl_long, ctype_t + >::type; + + // If type is uintl, return cl_ulong, else return ltype_t + template + using type_t = typename conditional::value, + cl_ulong, ltype_t + >::type; + } +} + diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp deleted file mode 100644 index 3a8ab1401e..0000000000 --- a/src/backend/opencl/kernel/sort_index.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#pragma once -#include -#include -#include -#include -#include -#include -#include - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - -#include -#include -#include -#include -#include - -namespace compute = boost::compute; - -using cl::Buffer; -using cl::Program; -using cl::Kernel; -using cl::make_kernel; -using cl::EnqueueArgs; -using cl::NDRange; -using std::string; - -namespace opencl -{ - namespace kernel - { - using std::conditional; - using std::is_same; - template - using ltype_t = typename conditional::value, cl_long, T>::type; - - template - using type_t = typename conditional::value, - cl_ulong, ltype_t - >::type; - - template - void sort0_index(Param val, Param idx) - { - try { - compute::command_queue c_queue(getQueue()()); - - compute::buffer val_buf((*val.data)()); - compute::buffer idx_buf((*idx.data)()); - - for(int w = 0; w < (int)val.info.dims[3]; w++) { - int valW = w * (int)val.info.strides[3]; - int idxW = w * idx.info.strides[3]; - for(int z = 0; z < (int)val.info.dims[2]; z++) { - int valWZ = valW + z * (int)val.info.strides[2]; - int idxWZ = idxW + z * idx.info.strides[2]; - for(int y = 0; y < (int)val.info.dims[1]; y++) { - - int valOffset = valWZ + y * val.info.strides[1]; - int idxOffset = idxWZ + y * idx.info.strides[1]; - - compute::buffer_iterator idx_begin(idx_buf, idxOffset); - compute::iota(idx_begin, idx_begin + val.info.dims[0], 0, c_queue); - - if(isAscending) { - compute::sort_by_key( - compute::make_buffer_iterator< type_t >(val_buf, valOffset), - compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), - idx_begin, compute::less< type_t >(), c_queue); - } else { - compute::sort_by_key( - compute::make_buffer_iterator< type_t >(val_buf, valOffset), - compute::make_buffer_iterator< type_t >(val_buf, valOffset + val.info.dims[0]), - idx_begin, compute::greater< type_t >(), c_queue); - } - } - } - } - - CL_DEBUG_FINISH(getQueue()); - } catch (cl::Error err) { - CL_TO_AF_ERROR(err); - throw; - } - } - } -} - -#pragma GCC diagnostic pop diff --git a/src/backend/opencl/kernel/sort_pair.cl b/src/backend/opencl/kernel/sort_pair.cl new file mode 100644 index 0000000000..f5e5413d73 --- /dev/null +++ b/src/backend/opencl/kernel/sort_pair.cl @@ -0,0 +1,43 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +struct IndexPair +{ + Tk first; + Tv second; +}; + +typedef struct IndexPair IndexPair_t; + +__kernel +void make_pair_kernel(__global IndexPair_t *out, + __global const Tk *first, __global const Tv *second, + const unsigned N) +{ + int tIdx = get_group_id(0) * get_local_size(0) * copyPairIter + get_local_id(0); + const int blockDimX = get_local_size(0); + + for(int i = tIdx; i < N; i += blockDimX) { + out[i].first = first[i]; + out[i].second = second[i]; + } +} + +__kernel +void split_pair_kernel( __global Tk *first, __global Tv *second, + __global const IndexPair_t *out, const unsigned N) +{ + int tIdx = get_group_id(0) * get_local_size(0) * copyPairIter + get_local_id(0); + const int blockDimX = get_local_size(0); + + for(int i = tIdx; i < N; i += blockDimX) { + first[i] = out[i].first; + second[i] = out[i].second; + } +} diff --git a/src/backend/opencl/sort.cpp b/src/backend/opencl/sort.cpp index 762d815095..1548f27472 100644 --- a/src/backend/opencl/sort.cpp +++ b/src/backend/opencl/sort.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -23,9 +24,25 @@ namespace opencl try { Array out = copyArray(in); switch(dim) { - case 0: kernel::sort0(out); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + case 0: kernel::sort0(out); break; + case 1: kernel::sortBatched(out); break; + case 2: kernel::sortBatched(out); break; + case 3: kernel::sortBatched(out); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + } + + if(dim != 0) { + af::dim4 preorderDims = out.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = out.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = out.dims()[i - 1]; + } + + out.setDataDims(preorderDims); + out = reorder(out, reorderDims); } return out; } catch (std::exception &ex) { diff --git a/src/backend/opencl/sort_by_key.cpp b/src/backend/opencl/sort_by_key.cpp new file mode 100644 index 0000000000..27c2dc2462 --- /dev/null +++ b/src/backend/opencl/sort_by_key.cpp @@ -0,0 +1,90 @@ +/******************************************************* + * Copyright (c) 2014, ArrayFire + * All rights reserved. + * + * This file is distributed under 3-clause BSD license. + * The complete license agreement can be obtained at: + * http://arrayfire.com/licenses/BSD-3-Clause + ********************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace opencl +{ + template + void sort_by_key(Array &okey, Array &oval, + const Array &ikey, const Array &ival, const unsigned dim) + { + try { + okey = copyArray(ikey); + oval = copyArray(ival); + + switch(dim) { + case 0: kernel::sort0ByKey(okey, oval); break; + case 1: kernel::sortByKeyBatched(okey, oval); break; + case 2: kernel::sortByKeyBatched(okey, oval); break; + case 3: kernel::sortByKeyBatched(okey, oval); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + } + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); + } + } catch(std::exception &ex) { + AF_ERROR(ex.what(), AF_ERR_INTERNAL); + } + } + +#define INSTANTIATE(Tk, Tv) \ + template void sort_by_key(Array &okey, Array &oval, \ + const Array &ikey, const Array &ival, const uint dim); \ + template void sort_by_key(Array &okey, Array &oval, \ + const Array &ikey, const Array &ival, const uint dim); \ + +#define INSTANTIATE1(Tk ) \ + INSTANTIATE(Tk, float ) \ + INSTANTIATE(Tk, double ) \ + INSTANTIATE(Tk, cfloat ) \ + INSTANTIATE(Tk, cdouble) \ + INSTANTIATE(Tk, int ) \ + INSTANTIATE(Tk, uint ) \ + INSTANTIATE(Tk, short ) \ + INSTANTIATE(Tk, ushort ) \ + INSTANTIATE(Tk, char ) \ + INSTANTIATE(Tk, uchar ) \ + INSTANTIATE(Tk, intl ) \ + INSTANTIATE(Tk, uintl ) + + +INSTANTIATE1(float ) +INSTANTIATE1(double) +INSTANTIATE1(int ) +INSTANTIATE1(uint ) +INSTANTIATE1(short ) +INSTANTIATE1(ushort) +INSTANTIATE1(char ) +INSTANTIATE1(uchar ) +INSTANTIATE1(intl ) +INSTANTIATE1(uintl ) + +} diff --git a/src/backend/opencl/sort_by_key/impl.hpp b/src/backend/opencl/sort_by_key/impl.hpp deleted file mode 100644 index 49d184113f..0000000000 --- a/src/backend/opencl/sort_by_key/impl.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/******************************************************* - * Copyright (c) 2014, ArrayFire - * All rights reserved. - * - * This file is distributed under 3-clause BSD license. - * The complete license agreement can be obtained at: - * http://arrayfire.com/licenses/BSD-3-Clause - ********************************************************/ - -#include -#include -#include -#include -#include -#include -#include - -namespace opencl -{ - template - void sort_by_key(Array &okey, Array &oval, - const Array &ikey, const Array &ival, const unsigned dim) - { - try { - okey = copyArray(ikey); - oval = copyArray(ival); - switch(dim) { - case 0: kernel::sort0_by_key(okey, oval); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); - } - }catch(std::exception &ex) { - AF_ERROR(ex.what(), AF_ERR_INTERNAL); - } - } - -#define INSTANTIATE(Tk, Tv, isAscending) \ - template void \ - sort_by_key(Array &okey, Array &oval, \ - const Array &ikey, \ - const Array &ival, \ - const unsigned dim); \ - - -#define INSTANTIATE1(Tk, isAscending) \ - INSTANTIATE(Tk, float , isAscending) \ - INSTANTIATE(Tk, double, isAscending) \ - INSTANTIATE(Tk, int , isAscending) \ - INSTANTIATE(Tk, uint , isAscending) \ - INSTANTIATE(Tk, char , isAscending) \ - INSTANTIATE(Tk, uchar , isAscending) \ - INSTANTIATE(Tk, short , isAscending) \ - INSTANTIATE(Tk, ushort, isAscending) \ - INSTANTIATE(Tk, intl , isAscending) \ - INSTANTIATE(Tk, uintl , isAscending) \ - -} diff --git a/src/backend/opencl/sort_index.cpp b/src/backend/opencl/sort_index.cpp index c7aaa70feb..bb5474909d 100644 --- a/src/backend/opencl/sort_index.cpp +++ b/src/backend/opencl/sort_index.cpp @@ -10,29 +10,53 @@ #include #include #include -#include +#include #include #include #include +#include +#include namespace opencl { template - void sort_index(Array &val, Array &idx, const Array &in, const uint dim) + void sort_index(Array &okey, Array &oval, const Array &in, const uint dim) { try { - val = copyArray(in); - idx = createEmptyArray(in.dims()); + // okey contains values, oval contains indices + okey = copyArray(in); + oval = range(in.dims(), dim); + oval.eval(); switch(dim) { - case 0: kernel::sort0_index(val, idx); - break; - default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); + case 0: kernel::sort0ByKey(okey, oval); break; + case 1: kernel::sortByKeyBatched(okey, oval); break; + case 2: kernel::sortByKeyBatched(okey, oval); break; + case 3: kernel::sortByKeyBatched(okey, oval); break; + default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED); } - } catch (std::exception &ex) { + + if(dim != 0) { + af::dim4 preorderDims = okey.dims(); + af::dim4 reorderDims(0, 1, 2, 3); + reorderDims[dim] = 0; + preorderDims[0] = okey.dims()[dim]; + for(int i = 1; i <= (int)dim; i++) { + reorderDims[i - 1] = i; + preorderDims[i] = okey.dims()[i - 1]; + } + + okey.setDataDims(preorderDims); + oval.setDataDims(preorderDims); + + okey = reorder(okey, reorderDims); + oval = reorder(oval, reorderDims); + } + } catch (std::exception &ex) { AF_ERROR(ex.what(), AF_ERR_INTERNAL); } } + #define INSTANTIATE(T) \ template void sort_index(Array &val, Array &idx, const Array &in, \ const uint dim); \ diff --git a/src/backend/opencl/traits.hpp b/src/backend/opencl/traits.hpp index 4e63095421..54ba158e8a 100644 --- a/src/backend/opencl/traits.hpp +++ b/src/backend/opencl/traits.hpp @@ -10,6 +10,7 @@ #pragma once #include +#include #include #include #include diff --git a/test/median.cpp b/test/median.cpp index e0b21ba281..5b26a44a97 100644 --- a/test/median.cpp +++ b/test/median.cpp @@ -37,96 +37,142 @@ af::array generateArray(int nx, int ny, int nz, int nw) return a; } -template -void median0(int nx, int ny=1, int nz=1, int nw=1) +template +void median_flat(int nx, int ny=1, int nz=1, int nw=1) { if (noDoubleTests()) return; array a = generateArray(nx, ny, nz, nw); - array sa = sort(a); - Ti *h_sa = sa.host(); + // Verification + array sa = sort(flat(a)); + dim_t mid = (sa.dims(0) + 1) / 2; - To *h_b = NULL; - To val = 0; + To verify; - if (flat) { - val = median(a); - h_b = &val; + To *h_sa = sa.as((af_dtype)af::dtype_traits::af_type).host(); + if(sa.dims(0) % 2 == 1) { + verify = h_sa[mid - 1]; } else { - array b = median(a); - h_b = b.host(); + verify = (h_sa[mid - 1] + h_sa[mid]) / (To)2; } - for (int w = 0; w < nw; w++) { - for (int z = 0; z < nz; z++) { - for (int y = 0; y < ny; y++) { + // Test Part + To val = median(a); - int off = (y + ny * (z + nz * w)); - int id = nx / 2; + ASSERT_EQ(verify, val); - if (nx & 2) { - ASSERT_EQ(h_sa[id + off * nx], h_b[off]); - } else { - To left = h_sa[id + off * nx - 1]; - To right = h_sa[id + off * nx]; + delete[] h_sa; +} + +template +void median_test(int nx, int ny=1, int nz=1, int nw=1) +{ + if (noDoubleTests()) return; + + array a = generateArray(nx, ny, nz, nw); + + // If selected dim is higher than input ndims, then return + if(dim >= a.dims().ndims()) + return; + + array verify; + + // Verification + array sa = sort(a, dim); + + double mid = (a.dims(dim) + 1) / 2; + af::seq mSeq[4] = {span, span, span, span}; + mSeq[dim] = af::seq(mid, mid, 1.0); - ASSERT_NEAR((left + right) / 2, h_b[off], 1e-5); - } - } - } + if(sa.dims(dim) % 2 == 1) { + mSeq[dim] = mSeq[dim] - 1.0; + verify = sa(mSeq[0], mSeq[1], mSeq[2], mSeq[3]); + } else { + dim_t sdim[4] = {0}; + sdim[dim] = 1; + sa = sa.as((af_dtype)af::dtype_traits::af_type); + array sas = shift(sa, sdim[0], sdim[1], sdim[2], sdim[3]); + verify = ((sa + sas) / 2)(mSeq[0], mSeq[1], mSeq[2], mSeq[3]); } - delete[] h_sa; - if (!flat) delete[] h_b; + // Test Part + array out = median(a, dim); + + ASSERT_EQ(out.dims() == verify.dims(), true); + ASSERT_NEAR(0, sum(af::abs(out - verify)), 1e-5); } -#define MEDIAN0(To, Ti) \ - TEST(median0, Ti##_1D_even) \ +#define MEDIAN_FLAT(To, Ti) \ + TEST(MedianFlat, Ti##_flat_even) \ + { \ + median_flat(1000); \ + } \ + TEST(MedianFlat, Ti##_flat_odd) \ { \ - median0(1000); \ + median_flat(783); \ } \ - TEST(median0, Ti##_2D_even) \ + TEST(MedianFlat, Ti##_flat_multi_even) \ { \ - median0(1000, 100); \ + median_flat(24, 11, 3); \ } \ - TEST(median0, Ti##_3D_even) \ + TEST(MedianFlat, Ti##_flat_multi_odd) \ { \ - median0(1000, 25, 4); \ + median_flat(15, 21, 7); \ } \ - TEST(median0, Ti##_4D_even) \ + +MEDIAN_FLAT(float, float) +MEDIAN_FLAT(float, int) +MEDIAN_FLAT(float, uint) +MEDIAN_FLAT(float, uchar) +MEDIAN_FLAT(float, short) +MEDIAN_FLAT(float, ushort) +MEDIAN_FLAT(double, double) + +#define MEDIAN_TEST(To, Ti, dim) \ + TEST(Median, Ti##_1D_##dim##_even) \ { \ - median0(1000, 25, 2, 2); \ + median_test(1000); \ } \ - TEST(median0, Ti##_flat_even) \ + TEST(Median, Ti##_2D_##dim##_even) \ { \ - median0(1000); \ + median_test(1000, 25); \ } \ - TEST(median0, Ti##_1D_odd) \ + TEST(Median, Ti##_3D_##dim##_even) \ { \ - median0(783); \ + median_test(100, 25, 4); \ } \ - TEST(median0, Ti##_2D_odd) \ + TEST(Median, Ti##_4D_##dim##_even) \ { \ - median0(783, 100); \ + median_test(100, 25, 2, 2);\ } \ - TEST(median0, Ti##_3D_odd) \ + TEST(Median, Ti##_1D_##dim##_odd) \ { \ - median0(783, 25, 4); \ + median_test(783); \ } \ - TEST(median0, Ti##_4D_odd) \ + TEST(Median, Ti##_2D_##dim##_odd) \ { \ - median0(783, 25, 2, 2); \ + median_test(783, 25); \ } \ - TEST(median0, Ti##_flat_odd) \ + TEST(Median, Ti##_3D_##dim##_odd) \ { \ - median0(783); \ + median_test(123, 25, 3); \ } \ + TEST(Median, Ti##_4D_##dim##_odd) \ + { \ + median_test(123, 25, 3, 3);\ + } \ + +#define MEDIAN(To, Ti) \ + MEDIAN_TEST(To, Ti, 0) \ + MEDIAN_TEST(To, Ti, 1) \ + MEDIAN_TEST(To, Ti, 2) \ + MEDIAN_TEST(To, Ti, 3) \ -MEDIAN0(float, float) -MEDIAN0(float, int) -MEDIAN0(float, uint) -MEDIAN0(float, uchar) -MEDIAN0(float, short) -MEDIAN0(float, ushort) -MEDIAN0(double, double) +MEDIAN(float, float) +MEDIAN(float, int) +MEDIAN(float, uint) +MEDIAN(float, uchar) +MEDIAN(float, short) +MEDIAN(float, ushort) +MEDIAN(double, double) diff --git a/test/sort.cpp b/test/sort.cpp index 7ec6f5565e..9a496f3236 100644 --- a/test/sort.cpp +++ b/test/sort.cpp @@ -106,16 +106,16 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, bool SORT_INIT(Sort1000False, sort_1000, false, 2); SORT_INIT(SortMedTrue, sort_med1, true, 0); SORT_INIT(SortMedFalse, sort_med1, false, 2); - // Takes too much time in current implementation. Enable when everything is parallel - //SORT_INIT(SortMed5True, sort_med, true, 0); - //SORT_INIT(SortMed5False, sort_med, false, 2); - //SORT_INIT(SortLargeTrue, sort_large, true, 0); - //SORT_INIT(SortLargeFalse, sort_large, false, 2); + + SORT_INIT(SortMed5True, sort_med, true, 0); + SORT_INIT(SortMed5False, sort_med, false, 2); + SORT_INIT(SortLargeTrue, sort_large, true, 0); + SORT_INIT(SortLargeFalse, sort_large, false, 2); ////////////////////////////////////// CPP //////////////////////////////// // -TEST(Sort, CPP) +TEST(Sort, CPPDim0) { if (noDoubleTests()) return; @@ -147,3 +147,74 @@ TEST(Sort, CPP) delete[] sxData; } +TEST(Sort, CPPDim1) +{ + if (noDoubleTests()) return; + + const bool dir = true; + const unsigned resultIdx0 = 0; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_10x10.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array input(idims, &(in[0].front())); + + af::array input_ = reorder(input, 1, 0, 2, 3); + + af::array output = af::sort(input_, 1, dir); + + output = reorder(output, 1, 0, 2, 3); // Required for checking with test data + + size_t nElems = tests[resultIdx0].size(); + + // Get result + float* sxData = new float[tests[resultIdx0].size()]; + output.host((void*)sxData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] sxData; +} + +TEST(Sort, CPPDim2) +{ + if (noDoubleTests()) return; + + const bool dir = false; + const unsigned resultIdx0 = 2; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_med.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array input(idims, &(in[0].front())); + + af::array input_ = reorder(input, 1, 2, 0, 3); + + af::array output = af::sort(input_, 2, dir); + + output = reorder(output, 2, 0, 1, 3); // Required for checking with test data + + size_t nElems = tests[resultIdx0].size(); + + // Get result + float* sxData = new float[tests[resultIdx0].size()]; + output.host((void*)sxData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] sxData; +} diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp index ed827c9da5..dae46bef54 100644 --- a/test/sort_by_key.cpp +++ b/test/sort_by_key.cpp @@ -118,16 +118,16 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(SortMedTrue, sort_by_key_med, true, 0, 1); SORT_INIT(Sort1000False, sort_by_key_1000, false, 2, 3); SORT_INIT(SortMedFalse, sort_by_key_med, false, 2, 3); - // Takes too much time in current implementation. Enable when everything is parallel - //SORT_INIT(SortLargeTrue, sort_by_key_large, true, 0, 1); - //SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3); + + SORT_INIT(SortLargeTrue, sort_by_key_large, true, 0, 1); + SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3); ////////////////////////////////////// CPP /////////////////////////////// // -TEST(SortByKey, CPP) +TEST(SortByKey, CPPDim0) { if (noDoubleTests()) return; @@ -168,3 +168,101 @@ TEST(SortByKey, CPP) delete[] keyData; delete[] valData; } + +TEST(SortByKey, CPPDim1) +{ + if (noDoubleTests()) return; + + const bool dir = true; + const unsigned resultIdx0 = 0; + const unsigned resultIdx1 = 1; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_by_key_large.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array keys(idims, &(in[0].front())); + af::array vals(idims, &(in[1].front())); + + af::array keys_ = reorder(keys, 1, 0, 2, 3); + af::array vals_ = reorder(vals, 1, 0, 2, 3); + + af::array out_keys, out_vals; + af::sort(out_keys, out_vals, keys_, vals_, 1, dir); + + out_keys = reorder(out_keys, 1, 0, 2, 3); + out_vals = reorder(out_vals, 1, 0, 2, 3); + + size_t nElems = tests[resultIdx0].size(); + // Get result + float* keyData = new float[tests[resultIdx0].size()]; + out_keys.host((void*)keyData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], keyData[elIter]) << "at: " << elIter << std::endl; + } + + float* valData = new float[tests[resultIdx1].size()]; + out_vals.host((void*)valData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx1][elIter], valData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] keyData; + delete[] valData; +} + +TEST(SortByKey, CPPDim2) +{ + if (noDoubleTests()) return; + + const bool dir = false; + const unsigned resultIdx0 = 2; + const unsigned resultIdx1 = 3; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_by_key_large.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array keys(idims, &(in[0].front())); + af::array vals(idims, &(in[1].front())); + + af::array keys_ = reorder(keys, 1, 2, 0, 3); + af::array vals_ = reorder(vals, 1, 2, 0, 3); + + af::array out_keys, out_vals; + af::sort(out_keys, out_vals, keys_, vals_, 2, dir); + + out_keys = reorder(out_keys, 2, 0, 1, 3); + out_vals = reorder(out_vals, 2, 0, 1, 3); + + size_t nElems = tests[resultIdx0].size(); + // Get result + float* keyData = new float[tests[resultIdx0].size()]; + out_keys.host((void*)keyData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], keyData[elIter]) << "at: " << elIter << std::endl; + } + + float* valData = new float[tests[resultIdx1].size()]; + out_vals.host((void*)valData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx1][elIter], valData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] keyData; + delete[] valData; +} diff --git a/test/sort_index.cpp b/test/sort_index.cpp index 6aa240d5a5..0df4744c02 100644 --- a/test/sort_index.cpp +++ b/test/sort_index.cpp @@ -119,17 +119,16 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const SORT_INIT(SortMedTrue, sort_med1, true, 0, 1); SORT_INIT(Sort1000False, sort_1000, false, 2, 3); SORT_INIT(SortMedFalse, sort_med1, false, 2, 3); - // Takes too much time in current implementation. Enable when everything is parallel - //SORT_INIT(SortMed5True, sort_med, true, 0, 1); - //SORT_INIT(SortMed5False, sort_med, false, 2, 3); - //SORT_INIT(SortLargeTrue, sort_large, true, 0, 1); - //SORT_INIT(SortLargeFalse, sort_large, false, 2, 3); -; + + SORT_INIT(SortMed5True, sort_med, true, 0, 1); + SORT_INIT(SortMed5False, sort_med, false, 2, 3); + SORT_INIT(SortLargeTrue, sort_large, true, 0, 1); + SORT_INIT(SortLargeFalse, sort_large, false, 2, 3); //////////////////////////////////// CPP ///////////////////////////////// // -TEST(SortIndex, CPP) +TEST(SortIndex, CPPDim0) { if (noDoubleTests()) return; @@ -171,3 +170,98 @@ TEST(SortIndex, CPP) delete[] sxData; delete[] ixData; } + +TEST(SortIndex, CPPDim1) +{ + if (noDoubleTests()) return; + + const bool dir = true; + const unsigned resultIdx0 = 0; + const unsigned resultIdx1 = 1; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_10x10.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array input_(idims, &(in[0].front())); + af::array input = reorder(input_, 1, 0, 2, 3); + + af::array outValues, outIndices; + af::sort(outValues, outIndices, input, 1, dir); + + outValues = reorder(outValues, 1, 0, 2, 3); + outIndices = reorder(outIndices, 1, 0, 2, 3); + + size_t nElems = tests[resultIdx0].size(); + + // Get result + float* sxData = new float[tests[resultIdx0].size()]; + outValues.host((void*)sxData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl; + } + + // Get result + unsigned* ixData = new unsigned[tests[resultIdx1].size()]; + outIndices.host((void*)ixData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] sxData; + delete[] ixData; +} + +TEST(SortIndex, CPPDim2) +{ + if (noDoubleTests()) return; + + const bool dir = false; + const unsigned resultIdx0 = 2; + const unsigned resultIdx1 = 3; + + vector numDims; + vector > in; + vector > tests; + readTests(string(TEST_DIR"/sort/sort_med.test"),numDims,in,tests); + + af::dim4 idims = numDims[0]; + af::array input_(idims, &(in[0].front())); + af::array input = reorder(input_, 1, 2, 0, 3); + + af::array outValues, outIndices; + af::sort(outValues, outIndices, input, 2, dir); + + outValues = reorder(outValues, 2, 0, 1, 3); + outIndices = reorder(outIndices, 2, 0, 1, 3); + size_t nElems = tests[resultIdx0].size(); + + // Get result + float* sxData = new float[tests[resultIdx0].size()]; + outValues.host((void*)sxData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl; + } + + // Get result + unsigned* ixData = new unsigned[tests[resultIdx1].size()]; + outIndices.host((void*)ixData); + + // Compare result + for (size_t elIter = 0; elIter < nElems; ++elIter) { + ASSERT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl; + } + + // Delete + delete[] sxData; + delete[] ixData; +}