From 98e023d7a10e301f793c04246c80bcc1c1bcfd1b Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Wed, 30 Mar 2016 15:22:00 -0400
Subject: [PATCH 01/28] Updated boost.compute to latest develop commit

---
 CMakeModules/build_boost_compute.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/CMakeModules/build_boost_compute.cmake b/CMakeModules/build_boost_compute.cmake
index 03c20435a8..eca4e32771 100644
--- a/CMakeModules/build_boost_compute.cmake
+++ b/CMakeModules/build_boost_compute.cmake
@@ -1,9 +1,9 @@
 # If using a commit, remove the v prefix to VER in URL.
 # If using a tag, don't use v in VER
 # This is because of how github handles it's release tar balls
-SET(VER 0.5)
-SET(URL https://github.com/boostorg/compute/archive/v${VER}.tar.gz)
-SET(MD5 69a52598ac539d3b7f6005a3dd2b6f58)
+SET(VER 523d8e974559977fab006190e9d40eb2e4f87bd0)
+SET(URL https://github.com/boostorg/compute/archive/${VER}.tar.gz)
+SET(MD5 bbce9e2730e449db5c8f88eae160ea12)
 
 SET(thirdPartyDir "${CMAKE_BINARY_DIR}/third_party")
 SET(srcDir "${thirdPartyDir}/compute-${VER}")

From 8026cdb82984d610037667a25f4460f191115ef7 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Wed, 30 Mar 2016 15:22:39 -0400
Subject: [PATCH 02/28] Use stable sort for thrust and boost compute

---
 src/backend/cuda/kernel/sort_by_key.hpp   | 4 ++--
 src/backend/cuda/kernel/sort_index.hpp    | 4 ++--
 src/backend/opencl/kernel/sort_by_key.hpp | 8 ++++----
 src/backend/opencl/kernel/sort_index.hpp  | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp
index 42a3256a1c..e06d9202b6 100644
--- a/src/backend/cuda/kernel/sort_by_key.hpp
+++ b/src/backend/cuda/kernel/sort_by_key.hpp
@@ -44,9 +44,9 @@ namespace cuda
                         int ovalOffset = ovalWZ + y * oval.strides[1];
 
                         if(isAscending) {
-                            THRUST_SELECT(thrust::sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset);
+                            THRUST_SELECT(thrust::stable_sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset);
                         } else {
-                            THRUST_SELECT(thrust::sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset, thrust::greater<Tk>());
+                            THRUST_SELECT(thrust::stable_sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset, thrust::greater<Tk>());
                         }
                     }
                 }
diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp
index 9d29914f23..8762f28a4a 100644
--- a/src/backend/cuda/kernel/sort_index.hpp
+++ b/src/backend/cuda/kernel/sort_index.hpp
@@ -42,11 +42,11 @@ namespace cuda
 
                         THRUST_SELECT(thrust::sequence, idx_ptr + idxOffset, idx_ptr + idxOffset + idx.dims[0]);
                         if(isAscending) {
-                            THRUST_SELECT(thrust::sort_by_key,
+                            THRUST_SELECT(thrust::stable_sort_by_key,
                                     val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
                                     idx_ptr + idxOffset);
                         } else {
-                            THRUST_SELECT(thrust::sort_by_key,
+                            THRUST_SELECT(thrust::stable_sort_by_key,
                                         val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
                                         idx_ptr + idxOffset, thrust::greater<T>());
                         }
diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp
index 0cb9cb042d..555e39cdf1 100644
--- a/src/backend/opencl/kernel/sort_by_key.hpp
+++ b/src/backend/opencl/kernel/sort_by_key.hpp
@@ -20,7 +20,7 @@
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 #include <boost/compute/core.hpp>
-#include <boost/compute/algorithm/sort_by_key.hpp>
+#include <boost/compute/algorithm/stable_sort_by_key.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <boost/compute/iterator/buffer_iterator.hpp>
 
@@ -72,10 +72,10 @@ namespace opencl
                             compute::buffer_iterator< type_t<Tk> > end = compute::make_buffer_iterator< type_t<Tk> >(okey_buf, okeyOffset + okey.info.dims[0]);
                             compute::buffer_iterator< type_t<Tv> > vals = compute::make_buffer_iterator< type_t<Tv> >(oval_buf, ovalOffset);
                             if(isAscending) {
-                                compute::sort_by_key(start, end, vals, c_queue);
+                                compute::stable_sort_by_key(start, end, vals, c_queue);
                             } else {
-                                compute::sort_by_key(start, end, vals,
-                                                     compute::greater< type_t<Tk> >(), c_queue);
+                                compute::stable_sort_by_key(start, end, vals,
+                                                            compute::greater< type_t<Tk> >(), c_queue);
                             }
                         }
                     }
diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp
index 3a8ab1401e..4926dc34c3 100644
--- a/src/backend/opencl/kernel/sort_index.hpp
+++ b/src/backend/opencl/kernel/sort_index.hpp
@@ -21,7 +21,7 @@
 
 #include <boost/compute/core.hpp>
 #include <boost/compute/algorithm/iota.hpp>
-#include <boost/compute/algorithm/sort_by_key.hpp>
+#include <boost/compute/algorithm/stable_sort_by_key.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <boost/compute/iterator/buffer_iterator.hpp>
 
@@ -73,12 +73,12 @@ namespace opencl
                             compute::iota(idx_begin, idx_begin + val.info.dims[0], 0, c_queue);
 
                             if(isAscending) {
-                                compute::sort_by_key(
+                                compute::stable_sort_by_key(
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
                                         idx_begin, compute::less< type_t<T> >(), c_queue);
                             } else {
-                                compute::sort_by_key(
+                                compute::stable_sort_by_key(
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
                                         idx_begin, compute::greater< type_t<T> >(), c_queue);

From 09129b00a072c5ada07cb3321b16cf582f0f0d32 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Wed, 30 Mar 2016 16:49:21 -0400
Subject: [PATCH 03/28] Allow complex types as value type in sort_by_key

This allows complex sorting based on a key, which can be the abs, real, imag
etc of the value array. So the user can choose which metric they want to use.
---
 src/api/c/sort.cpp                        | 16 +++++++++------
 src/backend/cpu/sort_by_key.cpp           |  2 ++
 src/backend/cuda/sort_by_key_impl.hpp     | 24 ++++++++++++-----------
 src/backend/opencl/kernel/sort_by_key.hpp | 19 +++++++++++++++++-
 src/backend/opencl/sort_by_key/impl.hpp   | 22 +++++++++++----------
 5 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp
index 1de63c5052..66ffce9eb1 100644
--- a/src/api/c/sort.cpp
+++ b/src/api/c/sort.cpp
@@ -150,6 +150,8 @@ void sort_by_key_tmplt(af_array *okey, af_array *oval, const af_array ikey, cons
     switch(vtype) {
     case f32: sort_by_key<Tk, float  >(okey, oval, ikey, ival, dim, isAscending);  break;
     case f64: sort_by_key<Tk, double >(okey, oval, ikey, ival, dim, isAscending);  break;
+    case c32: sort_by_key<Tk, cfloat >(okey, oval, ikey, ival, dim, isAscending);  break;
+    case c64: sort_by_key<Tk, cdouble>(okey, oval, ikey, ival, dim, isAscending);  break;
     case s32: sort_by_key<Tk, int    >(okey, oval, ikey, ival, dim, isAscending);  break;
     case u32: sort_by_key<Tk, uint   >(okey, oval, ikey, ival, dim, isAscending);  break;
     case s16: sort_by_key<Tk, short  >(okey, oval, ikey, ival, dim, isAscending);  break;
@@ -169,20 +171,22 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values,
                       const unsigned dim, const bool isAscending)
 {
     try {
-        ArrayInfo info = getInfo(keys);
-        af_dtype type = info.getType();
+        ArrayInfo kinfo = getInfo(keys);
+        af_dtype ktype = kinfo.getType();
 
         ArrayInfo vinfo = getInfo(values);
 
-        DIM_ASSERT(3, info.elements() > 0);
-        DIM_ASSERT(4, info.dims() == vinfo.dims());
+        DIM_ASSERT(3, kinfo.elements() > 0);
+        DIM_ASSERT(4, kinfo.dims() == vinfo.dims());
         // Only Dim 0 supported
         ARG_ASSERT(5, dim == 0);
 
+        TYPE_ASSERT(kinfo.isReal());
+
         af_array oKey;
         af_array oVal;
 
-        switch(type) {
+        switch(ktype) {
             case f32: sort_by_key_tmplt<float  >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             case f64: sort_by_key_tmplt<double >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             case s32: sort_by_key_tmplt<int    >(&oKey, &oVal, keys, values, dim, isAscending);  break;
@@ -193,7 +197,7 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values,
             case u64: sort_by_key_tmplt<uintl  >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             case u8:  sort_by_key_tmplt<uchar  >(&oKey, &oVal, keys, values, dim, isAscending);  break;
             case b8:  sort_by_key_tmplt<char   >(&oKey, &oVal, keys, values, dim, isAscending);  break;
-            default:  TYPE_ERROR(1, type);
+            default:  TYPE_ERROR(1, ktype);
         }
         std::swap(*out_keys , oKey);
         std::swap(*out_values , oVal);
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp
index 5a99257033..46ced4b9ef 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/sort_by_key.cpp
@@ -46,6 +46,8 @@ void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
 #define INSTANTIATE1(Tk)       \
     INSTANTIATE(Tk, float)     \
     INSTANTIATE(Tk, double)    \
+    INSTANTIATE(Tk, cfloat)    \
+    INSTANTIATE(Tk, cdouble)   \
     INSTANTIATE(Tk, int)       \
     INSTANTIATE(Tk, uint)      \
     INSTANTIATE(Tk, char)      \
diff --git a/src/backend/cuda/sort_by_key_impl.hpp b/src/backend/cuda/sort_by_key_impl.hpp
index d01ace404e..217b17dc8a 100644
--- a/src/backend/cuda/sort_by_key_impl.hpp
+++ b/src/backend/cuda/sort_by_key_impl.hpp
@@ -35,15 +35,17 @@ namespace cuda
     sort_by_key<Tk, Tv, dr>(Array<Tk> &okey, Array<Tv> &oval,           \
                             const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim); \
 
-#define INSTANTIATE1(Tk,    dr) \
-    INSTANTIATE(Tk, float,  dr) \
-    INSTANTIATE(Tk, double, dr) \
-    INSTANTIATE(Tk, int,    dr) \
-    INSTANTIATE(Tk, uint,   dr) \
-    INSTANTIATE(Tk, short,  dr) \
-    INSTANTIATE(Tk, ushort, dr) \
-    INSTANTIATE(Tk, char,   dr) \
-    INSTANTIATE(Tk, uchar,  dr) \
-    INSTANTIATE(Tk, intl,   dr) \
-    INSTANTIATE(Tk, uintl,  dr)
+#define INSTANTIATE1(Tk    , dr) \
+    INSTANTIATE(Tk, float  , dr) \
+    INSTANTIATE(Tk, double , dr) \
+    INSTANTIATE(Tk, cfloat , dr) \
+    INSTANTIATE(Tk, cdouble, dr) \
+    INSTANTIATE(Tk, int    , dr) \
+    INSTANTIATE(Tk, uint   , dr) \
+    INSTANTIATE(Tk, short  , dr) \
+    INSTANTIATE(Tk, ushort , dr) \
+    INSTANTIATE(Tk, char   , dr) \
+    INSTANTIATE(Tk, uchar  , dr) \
+    INSTANTIATE(Tk, intl   , dr) \
+    INSTANTIATE(Tk, uintl  , dr)
 }
diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp
index 555e39cdf1..e3306e6e68 100644
--- a/src/backend/opencl/kernel/sort_by_key.hpp
+++ b/src/backend/opencl/kernel/sort_by_key.hpp
@@ -40,9 +40,26 @@ namespace opencl
     {
         using std::conditional;
         using std::is_same;
+
+        // If type is cdouble, return std::complex<double>, else return T
+        template<typename T>
+        using ztype_t = typename conditional<is_same<T, cdouble>::value,
+                                             std::complex<double>, T
+                                            >::type;
+
+        // If type is cfloat, return std::complex<float>, else return ztype_t
+        template<typename T>
+        using ctype_t = typename conditional<is_same<T, cfloat>::value,
+                                             std::complex<float>, ztype_t<T>
+                                            >::type;
+
+        // If type is intl, return cl_long, else return ctype_t
         template<typename T>
-        using ltype_t = typename conditional<is_same<T, intl>::value, cl_long, T>::type;
+        using ltype_t = typename conditional<is_same<T, intl>::value,
+                                             cl_long, ctype_t<T>
+                                            >::type;
 
+        // If type is uintl, return cl_ulong, else return ltype_t
         template<typename T>
         using type_t = typename conditional<is_same<T, uintl>::value,
                                             cl_ulong, ltype_t<T>
diff --git a/src/backend/opencl/sort_by_key/impl.hpp b/src/backend/opencl/sort_by_key/impl.hpp
index 49d184113f..68c5ce70ae 100644
--- a/src/backend/opencl/sort_by_key/impl.hpp
+++ b/src/backend/opencl/sort_by_key/impl.hpp
@@ -43,15 +43,17 @@ namespace opencl
 
 
 #define INSTANTIATE1(Tk, isAscending)           \
-    INSTANTIATE(Tk, float , isAscending)        \
-    INSTANTIATE(Tk, double, isAscending)        \
-    INSTANTIATE(Tk, int   , isAscending)        \
-    INSTANTIATE(Tk, uint  , isAscending)        \
-    INSTANTIATE(Tk, char  , isAscending)        \
-    INSTANTIATE(Tk, uchar , isAscending)        \
-    INSTANTIATE(Tk, short , isAscending)        \
-    INSTANTIATE(Tk, ushort, isAscending)        \
-    INSTANTIATE(Tk, intl  , isAscending)        \
-    INSTANTIATE(Tk, uintl , isAscending)        \
+    INSTANTIATE(Tk, float  , isAscending)       \
+    INSTANTIATE(Tk, double , isAscending)       \
+    INSTANTIATE(Tk, cfloat , isAscending)       \
+    INSTANTIATE(Tk, cdouble, isAscending)       \
+    INSTANTIATE(Tk, int    , isAscending)       \
+    INSTANTIATE(Tk, uint   , isAscending)       \
+    INSTANTIATE(Tk, char   , isAscending)       \
+    INSTANTIATE(Tk, uchar  , isAscending)       \
+    INSTANTIATE(Tk, short  , isAscending)       \
+    INSTANTIATE(Tk, ushort , isAscending)       \
+    INSTANTIATE(Tk, intl   , isAscending)       \
+    INSTANTIATE(Tk, uintl  , isAscending)       \
 
 }

From cda059fcb22a22446210e0c9d120353821102382 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 31 Mar 2016 17:28:02 -0400
Subject: [PATCH 04/28] Improvements to sort

* Sort now allows all dimensions
* Sort if much faster by using batched mode. This takes up more memory though.
* Enabled large tests for sort
* Added tests for sorting on dim1 and dim2
---
 src/api/c/sort.cpp                        |  2 -
 src/backend/cpu/kernel/sort.hpp           |  2 +-
 src/backend/cpu/sort.cpp                  | 48 +++++++++++-
 src/backend/cuda/kernel/sort.hpp          | 77 ++++++++++++++++++--
 src/backend/cuda/kernel/sort_by_key.hpp   | 14 ++--
 src/backend/cuda/sort.cu                  |  9 ++-
 src/backend/opencl/kernel/sort.hpp        | 89 ++++++++++++++++++++---
 src/backend/opencl/kernel/sort_by_key.hpp | 28 +------
 src/backend/opencl/kernel/sort_helper.hpp | 46 ++++++++++++
 src/backend/opencl/kernel/sort_index.hpp  | 11 +--
 src/backend/opencl/sort.cpp               |  8 +-
 test/sort.cpp                             | 77 ++++++++++++++++++--
 12 files changed, 336 insertions(+), 75 deletions(-)
 create mode 100644 src/backend/opencl/kernel/sort_helper.hpp

diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp
index 66ffce9eb1..e3f3ae35da 100644
--- a/src/api/c/sort.cpp
+++ b/src/api/c/sort.cpp
@@ -42,8 +42,6 @@ af_err af_sort(af_array *out, const af_array in, const unsigned dim, const bool
         af_dtype type = info.getType();
 
         DIM_ASSERT(1, info.elements() > 0);
-        // Only Dim 0 supported
-        ARG_ASSERT(2, dim == 0);
 
         af_array val;
 
diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp
index 292c6383dc..e0ae62c932 100644
--- a/src/backend/cpu/kernel/sort.hpp
+++ b/src/backend/cpu/kernel/sort.hpp
@@ -23,7 +23,7 @@ namespace kernel
 
 // Based off of http://stackoverflow.com/a/12399290
 template<typename T, bool isAscending>
-void sort0(Array<T> val)
+void sort0Iterative(Array<T> val)
 {
     // initialize original index locations
     T *val_ptr = val.get();
diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index bc6396b258..c3c5286fbf 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -15,11 +15,54 @@
 #include <functional>
 #include <platform.hpp>
 #include <queue.hpp>
+#include <range.hpp>
+#include <iota.hpp>
+#include <handle.hpp>
+#include <sort_by_key.hpp>
 #include <kernel/sort.hpp>
 
 namespace cpu
 {
 
+template<typename T, bool isAscending, int dim>
+void sortBatched(Array<T>& val)
+{
+    af::dim4 inDims = val.dims();
+
+    // Sort dimension
+    af::dim4 tileDims(1);
+    af::dim4 seqDims = inDims;
+    tileDims[dim] = inDims[dim];
+    seqDims[dim] = 1;
+
+    Array<uint> key = iota<uint>(seqDims, tileDims);
+
+    Array<uint> *resKey = initArray<uint>();
+    Array<T   > *resVal = initArray<T>();
+
+    val.modDims(inDims.elements());
+    key.modDims(inDims.elements());
+
+    sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
+
+    // Needs to be ascending (true) in order to maintain the indices properly
+    sort_by_key<uint, T, true>(key, val, *resKey, *resVal, 0);
+    val.eval();
+
+    val.modDims(inDims);
+}
+
+template<typename T, bool isAscending>
+void sort0(Array<T>& val)
+{
+    int higherDims = val.elements() / val.dims()[0];
+    // TODO Make a better heurisitic
+    if(higherDims > 10)
+        sortBatched<T, isAscending, 0>(val);
+    else
+        getQueue().enqueue(kernel::sort0Iterative<T, isAscending>, val);
+}
+
 template<typename T, bool isAscending>
 Array<T> sort(const Array<T> &in, const unsigned dim)
 {
@@ -27,7 +70,10 @@ Array<T> sort(const Array<T> &in, const unsigned dim)
 
     Array<T> out = copyArray<T>(in);
     switch(dim) {
-        case 0: getQueue().enqueue(kernel::sort0<T, isAscending>, out); break;
+        case 0: sort0<T, isAscending>(out); break;
+        case 1: sortBatched<T, isAscending, 1>(out); break;
+        case 2: sortBatched<T, isAscending, 2>(out); break;
+        case 3: sortBatched<T, isAscending, 3>(out); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
     return out;
diff --git a/src/backend/cuda/kernel/sort.hpp b/src/backend/cuda/kernel/sort.hpp
index b23e308633..21d9122d64 100644
--- a/src/backend/cuda/kernel/sort.hpp
+++ b/src/backend/cuda/kernel/sort.hpp
@@ -10,6 +10,8 @@
 #include <math.hpp>
 #include <dispatch.hpp>
 #include <Param.hpp>
+#include <kernel/iota.hpp>
+#include <kernel/sort_by_key.hpp>
 #include <err_cuda.hpp>
 #include <debug_cuda.hpp>
 #include <thrust/device_ptr.h>
@@ -19,15 +21,11 @@ namespace cuda
 {
     namespace kernel
     {
-        // Kernel Launch Config Values
-        static const unsigned TX = 32;
-        static const unsigned TY = 8;
-
         ///////////////////////////////////////////////////////////////////////////
         // Wrapper functions
         ///////////////////////////////////////////////////////////////////////////
         template<typename T, bool isAscending>
-        void sort0(Param<T> val)
+        void sort0Iterative(Param<T> val)
         {
             thrust::device_ptr<T> val_ptr = thrust::device_pointer_cast(val.ptr);
 
@@ -49,5 +47,74 @@ namespace cuda
             }
             POST_LAUNCH_CHECK();
         }
+
+        template<typename T, bool isAscending, int dim>
+        void sortBatched(Param<T> pVal)
+        {
+            af::dim4 inDims;
+            for(int i = 0; i < 4; i++)
+                inDims[i] = pVal.dims[i];
+
+            // Sort dimension
+            // tileDims * seqDims = inDims
+            af::dim4 tileDims(1);
+            af::dim4 seqDims = inDims;
+            tileDims[dim] = inDims[dim];
+            seqDims[dim] = 1;
+
+            // Create/call iota
+            // Array<uint> key = iota<uint>(seqDims, tileDims);
+            dim4 keydims = inDims;
+            uint* key = memAlloc<uint>(keydims.elements());
+            Param<uint> pKey;
+            pKey.ptr = key;
+            pKey.strides[0] = 1;
+            pKey.dims[0] = keydims[0];
+            for(int i = 1; i < 4; i++) {
+                pKey.dims[i] = keydims[i];
+                pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1];
+            }
+            kernel::iota<uint>(pKey, seqDims, tileDims);
+
+            // Flat
+            //val.modDims(inDims.elements());
+            //key.modDims(inDims.elements());
+            pKey.dims[0] = inDims.elements();
+            pKey.strides[0] = 1;
+            pVal.dims[0] = inDims.elements();
+            pVal.strides[0] = 1;
+            for(int i = 1; i < 4; i++) {
+                pKey.dims[i] = 1;
+                pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1];
+                pVal.dims[i] = 1;
+                pVal.strides[i] = pVal.strides[i - 1] * pVal.dims[i - 1];
+            }
+
+            // Sort indices
+            // sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
+            kernel::sort0_by_key<T, uint, isAscending>(pVal, pKey);
+
+            // Needs to be ascending (true) in order to maintain the indices properly
+            kernel::sort0_by_key<uint, T, true>(pKey, pVal);
+
+            // No need of doing moddims here because the original Array<T>
+            // dimensions have not been changed
+            //val.modDims(inDims);
+
+            // Not really necessary
+            // CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));
+            memFree(key);
+        }
+
+        template<typename T, bool isAscending>
+        void sort0(Param<T> val)
+        {
+            int higherDims =  val.dims[1] * val.dims[2] * val.dims[3];
+            // TODO Make a better heurisitic
+            if(higherDims > 10)
+                sortBatched<T, isAscending, 0>(val);
+            else
+                kernel::sort0Iterative<T, isAscending>(val);
+        }
     }
 }
diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp
index e06d9202b6..bfaa79a311 100644
--- a/src/backend/cuda/kernel/sort_by_key.hpp
+++ b/src/backend/cuda/kernel/sort_by_key.hpp
@@ -19,10 +19,6 @@ namespace cuda
 {
     namespace kernel
     {
-        // Kernel Launch Config Values
-        static const unsigned TX = 32;
-        static const unsigned TY = 8;
-
         ///////////////////////////////////////////////////////////////////////////
         // Wrapper functions
         ///////////////////////////////////////////////////////////////////////////
@@ -44,9 +40,15 @@ namespace cuda
                         int ovalOffset = ovalWZ + y * oval.strides[1];
 
                         if(isAscending) {
-                            THRUST_SELECT(thrust::stable_sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset);
+                            THRUST_SELECT(thrust::stable_sort_by_key,
+                                          okey_ptr + okeyOffset,
+                                          okey_ptr + okeyOffset + okey.dims[0],
+                                          oval_ptr + ovalOffset);
                         } else {
-                            THRUST_SELECT(thrust::stable_sort_by_key, okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0], oval_ptr + ovalOffset, thrust::greater<Tk>());
+                            THRUST_SELECT(thrust::stable_sort_by_key,
+                                          okey_ptr + okeyOffset,
+                                          okey_ptr + okeyOffset + okey.dims[0],
+                                          oval_ptr + ovalOffset, thrust::greater<Tk>());
                         }
                     }
                 }
diff --git a/src/backend/cuda/sort.cu b/src/backend/cuda/sort.cu
index 6d14c0309f..4ae3b759fb 100644
--- a/src/backend/cuda/sort.cu
+++ b/src/backend/cuda/sort.cu
@@ -22,10 +22,11 @@ namespace cuda
     {
         Array<T> out = copyArray<T>(in);
         switch(dim) {
-
-        case 0: kernel::sort0<T, isAscending>(out);
-            break;
-        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
+            case 0: kernel::sort0<T, isAscending>(out); break;
+            case 1: kernel::sortBatched<T, isAscending, 1>(out); break;
+            case 2: kernel::sortBatched<T, isAscending, 2>(out); break;
+            case 3: kernel::sortBatched<T, isAscending, 3>(out); break;
+            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
         }
         return out;
     }
diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp
index 013d8c53a9..7b7799ca89 100644
--- a/src/backend/opencl/kernel/sort.hpp
+++ b/src/backend/opencl/kernel/sort.hpp
@@ -15,6 +15,9 @@
 #include <dispatch.hpp>
 #include <Param.hpp>
 #include <debug_opencl.hpp>
+#include <kernel/sort_helper.hpp>
+#include <kernel/iota.hpp>
+#include <kernel/sort_by_key.hpp>
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
@@ -38,18 +41,8 @@ namespace opencl
 {
     namespace kernel
     {
-        using std::conditional;
-        using std::is_same;
-        template<typename T>
-        using ltype_t = typename conditional<is_same<T, intl>::value, cl_long, T>::type;
-
-        template<typename T>
-        using type_t = typename conditional<is_same<T, uintl>::value,
-                                            cl_ulong, ltype_t<T>
-                                           >::type;
-
         template<typename T, bool isAscending>
-        void sort0(Param val)
+        void sort0Iterative(Param val)
         {
             try {
                 compute::command_queue c_queue(getQueue()());
@@ -85,6 +78,80 @@ namespace opencl
                 throw;
             }
         }
+
+        template<typename T, bool isAscending, int dim>
+        void sortBatched(Param pVal)
+        {
+            try{
+                af::dim4 inDims;
+                for(int i = 0; i < 4; i++)
+                    inDims[i] = pVal.info.dims[i];
+
+                // Sort dimension
+                // tileDims * seqDims = inDims
+                af::dim4 tileDims(1);
+                af::dim4 seqDims = inDims;
+                tileDims[dim] = inDims[dim];
+                seqDims[dim] = 1;
+
+                // Create/call iota
+                // Array<uint> key = iota<uint>(seqDims, tileDims);
+                dim4 keydims = inDims;
+                cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(uint));
+                Param pKey;
+                pKey.data = key;
+                pKey.info.offset = 0;
+                pKey.info.dims[0] = keydims[0];
+                pKey.info.strides[0] = 1;
+                for(int i = 1; i < 4; i++) {
+                    pKey.info.dims[i] = keydims[i];
+                    pKey.info.strides[i] = pKey.info.strides[i - 1] * pKey.info.dims[i - 1];
+                }
+                kernel::iota<uint>(pKey, seqDims, tileDims);
+
+                // Flat
+                //val.modDims(inDims.elements());
+                //key.modDims(inDims.elements());
+                pKey.info.dims[0] = inDims.elements();
+                pKey.info.strides[0] = 1;
+                pVal.info.dims[0] = inDims.elements();
+                pVal.info.strides[0] = 1;
+                for(int i = 1; i < 4; i++) {
+                    pKey.info.dims[i] = 1;
+                    pKey.info.strides[i] = pKey.info.strides[i - 1] * pKey.info.dims[i - 1];
+                    pVal.info.dims[i] = 1;
+                    pVal.info.strides[i] = pVal.info.strides[i - 1] * pVal.info.dims[i - 1];
+                }
+
+                // Sort indices
+                // sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
+                kernel::sort0_by_key<T, uint, isAscending>(pVal, pKey);
+
+                // Needs to be ascending (true) in order to maintain the indices properly
+                kernel::sort0_by_key<uint, T, true>(pKey, pVal);
+
+                // No need of doing moddims here because the original Array<T>
+                // dimensions have not been changed
+                //val.modDims(inDims);
+
+                CL_DEBUG_FINISH(getQueue());
+                bufferFree(key);
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
+
+        template<typename T, bool isAscending>
+        void sort0(Param val)
+        {
+            int higherDims =  val.info.dims[1] * val.info.dims[2] * val.info.dims[3];
+            // TODO Make a better heurisitic
+            if(higherDims > 10)
+                sortBatched<T, isAscending, 0>(val);
+            else
+                kernel::sort0Iterative<T, isAscending>(val);
+        }
     }
 }
 
diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp
index e3306e6e68..c3807f7a31 100644
--- a/src/backend/opencl/kernel/sort_by_key.hpp
+++ b/src/backend/opencl/kernel/sort_by_key.hpp
@@ -15,6 +15,7 @@
 #include <dispatch.hpp>
 #include <Param.hpp>
 #include <debug_opencl.hpp>
+#include <kernel/sort_helper.hpp>
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
@@ -38,33 +39,6 @@ namespace opencl
 {
     namespace kernel
     {
-        using std::conditional;
-        using std::is_same;
-
-        // If type is cdouble, return std::complex<double>, else return T
-        template<typename T>
-        using ztype_t = typename conditional<is_same<T, cdouble>::value,
-                                             std::complex<double>, T
-                                            >::type;
-
-        // If type is cfloat, return std::complex<float>, else return ztype_t
-        template<typename T>
-        using ctype_t = typename conditional<is_same<T, cfloat>::value,
-                                             std::complex<float>, ztype_t<T>
-                                            >::type;
-
-        // If type is intl, return cl_long, else return ctype_t
-        template<typename T>
-        using ltype_t = typename conditional<is_same<T, intl>::value,
-                                             cl_long, ctype_t<T>
-                                            >::type;
-
-        // If type is uintl, return cl_ulong, else return ltype_t
-        template<typename T>
-        using type_t = typename conditional<is_same<T, uintl>::value,
-                                            cl_ulong, ltype_t<T>
-                                           >::type;
-
         template<typename Tk, typename Tv, bool isAscending>
         void sort0_by_key(Param okey, Param oval)
         {
diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp
new file mode 100644
index 0000000000..07ab0eeb69
--- /dev/null
+++ b/src/backend/opencl/kernel/sort_helper.hpp
@@ -0,0 +1,46 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <program.hpp>
+#include <traits.hpp>
+
+namespace opencl
+{
+    namespace kernel
+    {
+        using std::conditional;
+        using std::is_same;
+
+        // If type is cdouble, return std::complex<double>, else return T
+        template<typename T>
+        using ztype_t = typename conditional<is_same<T, cdouble>::value,
+                                             std::complex<double>, T
+                                            >::type;
+
+        // If type is cfloat, return std::complex<float>, else return ztype_t
+        template<typename T>
+        using ctype_t = typename conditional<is_same<T, cfloat>::value,
+                                             std::complex<float>, ztype_t<T>
+                                            >::type;
+
+        // If type is intl, return cl_long, else return ctype_t
+        template<typename T>
+        using ltype_t = typename conditional<is_same<T, intl>::value,
+                                             cl_long, ctype_t<T>
+                                            >::type;
+
+        // If type is uintl, return cl_ulong, else return ltype_t
+        template<typename T>
+        using type_t = typename conditional<is_same<T, uintl>::value,
+                                            cl_ulong, ltype_t<T>
+                                           >::type;
+
+    }
+}
diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp
index 4926dc34c3..0fa4847fc1 100644
--- a/src/backend/opencl/kernel/sort_index.hpp
+++ b/src/backend/opencl/kernel/sort_index.hpp
@@ -15,6 +15,7 @@
 #include <dispatch.hpp>
 #include <Param.hpp>
 #include <debug_opencl.hpp>
+#include <kernel/sort_helper.hpp>
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
@@ -39,16 +40,6 @@ namespace opencl
 {
     namespace kernel
     {
-        using std::conditional;
-        using std::is_same;
-        template<typename T>
-        using ltype_t = typename conditional<is_same<T, intl>::value, cl_long, T>::type;
-
-        template<typename T>
-        using type_t = typename conditional<is_same<T, uintl>::value,
-                                            cl_ulong, ltype_t<T>
-                                           >::type;
-
         template<typename T, bool isAscending>
         void sort0_index(Param val, Param idx)
         {
diff --git a/src/backend/opencl/sort.cpp b/src/backend/opencl/sort.cpp
index 762d815095..0bf2dc04cd 100644
--- a/src/backend/opencl/sort.cpp
+++ b/src/backend/opencl/sort.cpp
@@ -23,9 +23,11 @@ namespace opencl
         try {
             Array<T> out = copyArray<T>(in);
             switch(dim) {
-            case 0: kernel::sort0<T, isAscending>(out);
-                break;
-            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
+                case 0: kernel::sort0<T, isAscending>(out); break;
+                case 1: kernel::sortBatched<T, isAscending, 1>(out); break;
+                case 2: kernel::sortBatched<T, isAscending, 2>(out); break;
+                case 3: kernel::sortBatched<T, isAscending, 3>(out); break;
+                default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
             }
             return out;
         } catch (std::exception &ex) {
diff --git a/test/sort.cpp b/test/sort.cpp
index 7ec6f5565e..116b136abe 100644
--- a/test/sort.cpp
+++ b/test/sort.cpp
@@ -107,15 +107,15 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, bool
     SORT_INIT(SortMedTrue,    sort_med1,  true,  0);
     SORT_INIT(SortMedFalse,   sort_med1,  false, 2);
     // Takes too much time in current implementation. Enable when everything is parallel
-    //SORT_INIT(SortMed5True,   sort_med,   true,  0);
-    //SORT_INIT(SortMed5False,  sort_med,   false, 2);
-    //SORT_INIT(SortLargeTrue,  sort_large, true,  0);
-    //SORT_INIT(SortLargeFalse, sort_large, false, 2);
+    SORT_INIT(SortMed5True,   sort_med,   true,  0);
+    SORT_INIT(SortMed5False,  sort_med,   false, 2);
+    SORT_INIT(SortLargeTrue,  sort_large, true,  0);
+    SORT_INIT(SortLargeFalse, sort_large, false, 2);
 
 
 ////////////////////////////////////// CPP ////////////////////////////////
 //
-TEST(Sort, CPP)
+TEST(Sort, CPPDim0)
 {
     if (noDoubleTests<float>()) return;
 
@@ -147,3 +147,70 @@ TEST(Sort, CPP)
     delete[] sxData;
 }
 
+TEST(Sort, CPPDim1)
+{
+    if (noDoubleTests<float>()) return;
+
+    const bool dir = true;
+    const unsigned resultIdx0 = 0;
+
+    vector<af::dim4> numDims;
+    vector<vector<float> > in;
+    vector<vector<float> > tests;
+    readTests<float, float, int>(string(TEST_DIR"/sort/sort_10x10.test"),numDims,in,tests);
+
+    af::dim4 idims = numDims[0];
+    af::array input(idims, &(in[0].front()));
+
+    af::array input_ = reorder(input, 1, 0, 2, 3);
+
+    af::array output = af::sort(input_, 1, dir);
+
+    size_t nElems = tests[resultIdx0].size();
+
+    // Get result
+    float* sxData = new float[tests[resultIdx0].size()];
+    output.host((void*)sxData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    // Delete
+    delete[] sxData;
+}
+
+TEST(Sort, CPPDim2)
+{
+    if (noDoubleTests<float>()) return;
+
+    const bool dir = false;
+    const unsigned resultIdx0 = 2;
+
+    vector<af::dim4> numDims;
+    vector<vector<float> > in;
+    vector<vector<float> > tests;
+    readTests<float, float, int>(string(TEST_DIR"/sort/sort_med.test"),numDims,in,tests);
+
+    af::dim4 idims = numDims[0];
+    af::array input(idims, &(in[0].front()));
+
+    af::array input_ = reorder(input, 1, 2, 0, 3);
+
+    af::array output = af::sort(input_, 2, dir);
+
+    size_t nElems = tests[resultIdx0].size();
+
+    // Get result
+    float* sxData = new float[tests[resultIdx0].size()];
+    output.host((void*)sxData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    // Delete
+    delete[] sxData;
+}

From e6c9e934a33f5e97a1ff97061cde490d8d5b898e Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 31 Mar 2016 17:29:44 -0400
Subject: [PATCH 05/28] Revert "Updated boost.compute to latest develop commit"

This reverts commit 98e023d7a10e301f793c04246c80bcc1c1bcfd1b.
---
 CMakeModules/build_boost_compute.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeModules/build_boost_compute.cmake b/CMakeModules/build_boost_compute.cmake
index eca4e32771..03c20435a8 100644
--- a/CMakeModules/build_boost_compute.cmake
+++ b/CMakeModules/build_boost_compute.cmake
@@ -1,9 +1,9 @@
 # If using a commit, remove the v prefix to VER in URL.
 # If using a tag, don't use v in VER
 # This is because of how github handles it's release tar balls
-SET(VER 523d8e974559977fab006190e9d40eb2e4f87bd0)
-SET(URL https://github.com/boostorg/compute/archive/${VER}.tar.gz)
-SET(MD5 bbce9e2730e449db5c8f88eae160ea12)
+SET(VER 0.5)
+SET(URL https://github.com/boostorg/compute/archive/v${VER}.tar.gz)
+SET(MD5 69a52598ac539d3b7f6005a3dd2b6f58)
 
 SET(thirdPartyDir "${CMAKE_BINARY_DIR}/third_party")
 SET(srcDir "${thirdPartyDir}/compute-${VER}")

From aaa0a056744d46c4d9027ff9ef25ceed3fe8fe3a Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 31 Mar 2016 17:40:13 -0400
Subject: [PATCH 06/28] Boost.Compute sort/sort_by_key are stable in v0.5. So
 revert to that

---
 src/backend/opencl/kernel/sort.hpp        | 6 +++---
 src/backend/opencl/kernel/sort_by_key.hpp | 8 ++++----
 src/backend/opencl/kernel/sort_index.hpp  | 6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp
index 7b7799ca89..63f1658208 100644
--- a/src/backend/opencl/kernel/sort.hpp
+++ b/src/backend/opencl/kernel/sort.hpp
@@ -23,7 +23,7 @@
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 #include <boost/compute/core.hpp>
-#include <boost/compute/algorithm/stable_sort.hpp>
+#include <boost/compute/algorithm/sort.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <boost/compute/iterator/buffer_iterator.hpp>
 
@@ -58,12 +58,12 @@ namespace opencl
                             int valOffset = valWZ + y * val.info.strides[1];
 
                             if(isAscending) {
-                                compute::stable_sort(
+                                compute::sort(
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
                                         compute::less< type_t<T> >(), c_queue);
                             } else {
-                                compute::stable_sort(
+                                compute::sort(
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
                                         compute::greater< type_t<T> >(), c_queue);
diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp
index c3807f7a31..513ddbfb6d 100644
--- a/src/backend/opencl/kernel/sort_by_key.hpp
+++ b/src/backend/opencl/kernel/sort_by_key.hpp
@@ -21,7 +21,7 @@
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 #include <boost/compute/core.hpp>
-#include <boost/compute/algorithm/stable_sort_by_key.hpp>
+#include <boost/compute/algorithm/sort_by_key.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <boost/compute/iterator/buffer_iterator.hpp>
 
@@ -60,12 +60,12 @@ namespace opencl
                             int ovalOffset = ovalWZ + y * oval.info.strides[1];
 
                             compute::buffer_iterator< type_t<Tk> > start= compute::make_buffer_iterator< type_t<Tk> >(okey_buf, okeyOffset);
-                            compute::buffer_iterator< type_t<Tk> > end = compute::make_buffer_iterator< type_t<Tk> >(okey_buf, okeyOffset + okey.info.dims[0]);
+                            compute::buffer_iterator< type_t<Tk> > end  = compute::make_buffer_iterator< type_t<Tk> >(okey_buf, okeyOffset + okey.info.dims[0]);
                             compute::buffer_iterator< type_t<Tv> > vals = compute::make_buffer_iterator< type_t<Tv> >(oval_buf, ovalOffset);
                             if(isAscending) {
-                                compute::stable_sort_by_key(start, end, vals, c_queue);
+                                compute::sort_by_key(start, end, vals, c_queue);
                             } else {
-                                compute::stable_sort_by_key(start, end, vals,
+                                compute::sort_by_key(start, end, vals,
                                                             compute::greater< type_t<Tk> >(), c_queue);
                             }
                         }
diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp
index 0fa4847fc1..aae0a94ea6 100644
--- a/src/backend/opencl/kernel/sort_index.hpp
+++ b/src/backend/opencl/kernel/sort_index.hpp
@@ -22,7 +22,7 @@
 
 #include <boost/compute/core.hpp>
 #include <boost/compute/algorithm/iota.hpp>
-#include <boost/compute/algorithm/stable_sort_by_key.hpp>
+#include <boost/compute/algorithm/sort_by_key.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <boost/compute/iterator/buffer_iterator.hpp>
 
@@ -64,12 +64,12 @@ namespace opencl
                             compute::iota(idx_begin, idx_begin + val.info.dims[0], 0, c_queue);
 
                             if(isAscending) {
-                                compute::stable_sort_by_key(
+                                compute::sort_by_key(
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
                                         idx_begin, compute::less< type_t<T> >(), c_queue);
                             } else {
-                                compute::stable_sort_by_key(
+                                compute::sort_by_key(
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
                                         idx_begin, compute::greater< type_t<T> >(), c_queue);

From 3422d012df209db8a1c563d5937b12b8b8b90235 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Fri, 1 Apr 2016 11:45:09 -0400
Subject: [PATCH 07/28] Call modDims when setDataDims is called

---
 src/api/c/moddims.cpp        | 1 -
 src/backend/cpu/Array.hpp    | 1 +
 src/backend/cuda/Array.hpp   | 1 +
 src/backend/opencl/Array.hpp | 1 +
 4 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/api/c/moddims.cpp b/src/api/c/moddims.cpp
index b8f1fafa6c..1d326c0846 100644
--- a/src/api/c/moddims.cpp
+++ b/src/api/c/moddims.cpp
@@ -31,7 +31,6 @@ Array<T> modDims(const Array<T>& in, const af::dim4 &newDims)
         Out = copyArray<T>(in);
     }
 
-    Out.modDims(newDims);
     Out.setDataDims(newDims);
 
     return Out;
diff --git a/src/backend/cpu/Array.hpp b/src/backend/cpu/Array.hpp
index 2809a2b80d..cf970d18c7 100644
--- a/src/backend/cpu/Array.hpp
+++ b/src/backend/cpu/Array.hpp
@@ -181,6 +181,7 @@ namespace cpu
 
         void setDataDims(const dim4 &new_dims)
         {
+            modDims(new_dims);
             data_dims = new_dims;
         }
 
diff --git a/src/backend/cuda/Array.hpp b/src/backend/cuda/Array.hpp
index c2292087aa..1f9512fb8d 100644
--- a/src/backend/cuda/Array.hpp
+++ b/src/backend/cuda/Array.hpp
@@ -174,6 +174,7 @@ namespace cuda
 
         void setDataDims(const dim4 &new_dims)
         {
+            modDims(new_dims);
             data_dims = new_dims;
         }
 
diff --git a/src/backend/opencl/Array.hpp b/src/backend/opencl/Array.hpp
index ada3b41dc3..f83d5c0120 100644
--- a/src/backend/opencl/Array.hpp
+++ b/src/backend/opencl/Array.hpp
@@ -211,6 +211,7 @@ namespace opencl
 
         void setDataDims(const dim4 &new_dims)
         {
+            modDims(new_dims);
             data_dims = new_dims;
         }
 

From 3fea8a2649a0a077dadb2de5fba7174c67c8c70b Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Fri, 1 Apr 2016 11:46:49 -0400
Subject: [PATCH 08/28] Fix temp Array T and moddims in CPU batched sort

---
 src/backend/cpu/sort.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index c3c5286fbf..fbf613f962 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -37,19 +37,19 @@ void sortBatched(Array<T>& val)
 
     Array<uint> key = iota<uint>(seqDims, tileDims);
 
-    Array<uint> *resKey = initArray<uint>();
-    Array<T   > *resVal = initArray<T>();
+    Array<uint> resKey = createEmptyArray<uint>(dim4());
+    Array<T   > resVal = createEmptyArray<T>(dim4());
 
-    val.modDims(inDims.elements());
-    key.modDims(inDims.elements());
+    val.setDataDims(inDims.elements());
+    key.setDataDims(inDims.elements());
 
-    sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
+    sort_by_key<T, uint, isAscending>(resVal, resKey, val, key, 0);
 
     // Needs to be ascending (true) in order to maintain the indices properly
-    sort_by_key<uint, T, true>(key, val, *resKey, *resVal, 0);
+    sort_by_key<uint, T, true>(key, val, resKey, resVal, 0);
     val.eval();
 
-    val.modDims(inDims);
+    val.setDataDims(inDims);
 }
 
 template<typename T, bool isAscending>

From 03ce51f87c7fd3ff69808ff41cb75e33b64edb13 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Mon, 4 Apr 2016 15:40:25 -0400
Subject: [PATCH 09/28] Call thrust/compute::sort_by_key instead of
 kernel::sort_by_key wrapper

---
 src/backend/cuda/kernel/sort.hpp   | 24 +++++++++++++++++++++---
 src/backend/opencl/kernel/sort.hpp | 21 ++++++++++++++++++---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/src/backend/cuda/kernel/sort.hpp b/src/backend/cuda/kernel/sort.hpp
index 21d9122d64..f0095b144d 100644
--- a/src/backend/cuda/kernel/sort.hpp
+++ b/src/backend/cuda/kernel/sort.hpp
@@ -11,7 +11,6 @@
 #include <dispatch.hpp>
 #include <Param.hpp>
 #include <kernel/iota.hpp>
-#include <kernel/sort_by_key.hpp>
 #include <err_cuda.hpp>
 #include <debug_cuda.hpp>
 #include <thrust/device_ptr.h>
@@ -92,10 +91,29 @@ namespace cuda
 
             // Sort indices
             // sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
-            kernel::sort0_by_key<T, uint, isAscending>(pVal, pKey);
+            //kernel::sort0_by_key<T, uint, isAscending>(pVal, pKey);
+            thrust::device_ptr<T>    pVal_ptr = thrust::device_pointer_cast(pVal.ptr);
+            thrust::device_ptr<uint> pKey_ptr = thrust::device_pointer_cast(pKey.ptr);
+            if(isAscending) {
+                THRUST_SELECT(thrust::stable_sort_by_key,
+                              pVal_ptr,
+                              pVal_ptr + pVal.dims[0],
+                              pKey_ptr);
+            } else {
+                THRUST_SELECT(thrust::stable_sort_by_key,
+                              pVal_ptr,
+                              pVal_ptr + pVal.dims[0],
+                              pKey_ptr, thrust::greater<T>());
+            }
+            POST_LAUNCH_CHECK();
 
             // Needs to be ascending (true) in order to maintain the indices properly
-            kernel::sort0_by_key<uint, T, true>(pKey, pVal);
+            //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
+            THRUST_SELECT(thrust::stable_sort_by_key,
+                          pKey_ptr,
+                          pKey_ptr + pVal.dims[0],
+                          pVal_ptr);
+            POST_LAUNCH_CHECK();
 
             // No need of doing moddims here because the original Array<T>
             // dimensions have not been changed
diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp
index 63f1658208..b9a7d39b28 100644
--- a/src/backend/opencl/kernel/sort.hpp
+++ b/src/backend/opencl/kernel/sort.hpp
@@ -17,13 +17,13 @@
 #include <debug_opencl.hpp>
 #include <kernel/sort_helper.hpp>
 #include <kernel/iota.hpp>
-#include <kernel/sort_by_key.hpp>
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 #include <boost/compute/core.hpp>
 #include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/algorithm/sort_by_key.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <boost/compute/iterator/buffer_iterator.hpp>
 
@@ -125,10 +125,25 @@ namespace opencl
 
                 // Sort indices
                 // sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
-                kernel::sort0_by_key<T, uint, isAscending>(pVal, pKey);
+                //kernel::sort0_by_key<T, uint, isAscending>(pVal, pKey);
+                compute::command_queue c_queue(getQueue()());
+
+                compute::buffer pKey_buf((*pKey.data)());
+                compute::buffer pVal_buf((*pVal.data)());
+
+                compute::buffer_iterator< type_t<T>    > val0 = compute::make_buffer_iterator< type_t<T>    >(pVal_buf, 0);
+                compute::buffer_iterator< type_t<T>    > valN = compute::make_buffer_iterator< type_t<T>    >(pVal_buf,+ pVal.info.dims[0]);
+                compute::buffer_iterator< type_t<uint> > key0 = compute::make_buffer_iterator< type_t<uint> >(pKey_buf, 0);
+                compute::buffer_iterator< type_t<uint> > keyN = compute::make_buffer_iterator< type_t<uint> >(pKey_buf, pKey.info.dims[0]);
+                if(isAscending) {
+                    compute::sort_by_key(val0, valN, key0, c_queue);
+                } else {
+                    compute::sort_by_key(val0, valN, key0, compute::greater< type_t<T> >(), c_queue);
+                }
 
                 // Needs to be ascending (true) in order to maintain the indices properly
-                kernel::sort0_by_key<uint, T, true>(pKey, pVal);
+                //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
+                compute::sort_by_key(key0, keyN, val0, c_queue);
 
                 // No need of doing moddims here because the original Array<T>
                 // dimensions have not been changed

From 6c47b15657019ad815db966512a096b212ca8d8e Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Mon, 4 Apr 2016 19:28:22 -0400
Subject: [PATCH 10/28] PERF Add batched sort to sort_index for CPU and CUDA

---
 src/api/c/sort.cpp                      |   2 -
 src/backend/cpu/kernel/sort_helper.hpp  |  56 ++++++++++++
 src/backend/cpu/kernel/sort_index.hpp   | 117 +++++++++++++++++++-----
 src/backend/cpu/sort_index.cpp          |  13 ++-
 src/backend/cuda/kernel/harris.hpp      |   2 +-
 src/backend/cuda/kernel/iota.hpp        |   6 +-
 src/backend/cuda/kernel/orb.hpp         |   2 +-
 src/backend/cuda/kernel/sort_helper.hpp |  66 +++++++++++++
 src/backend/cuda/kernel/sort_index.hpp  |  87 +++++++++++++++++-
 src/backend/cuda/sort_index.cu          |  14 ++-
 test/sort_index.cpp                     |   3 +-
 11 files changed, 325 insertions(+), 43 deletions(-)
 create mode 100644 src/backend/cpu/kernel/sort_helper.hpp
 create mode 100644 src/backend/cuda/kernel/sort_helper.hpp

diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp
index e3f3ae35da..7f81fbc540 100644
--- a/src/api/c/sort.cpp
+++ b/src/api/c/sort.cpp
@@ -91,8 +91,6 @@ af_err af_sort_index(af_array *out, af_array *indices, const af_array in, const
         af_dtype type = info.getType();
 
         DIM_ASSERT(2, info.elements() > 0);
-        // Only Dim 0 supported
-        ARG_ASSERT(3, dim == 0);
 
         af_array val;
         af_array idx;
diff --git a/src/backend/cpu/kernel/sort_helper.hpp b/src/backend/cpu/kernel/sort_helper.hpp
new file mode 100644
index 0000000000..4b8f2f95b0
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_helper.hpp
@@ -0,0 +1,56 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <err_cpu.hpp>
+
+namespace cpu
+{
+    namespace kernel
+    {
+        static const int copyPairIter = 4;
+
+        template <typename T>
+        using IndexPair = std::pair<T, uint>;
+
+        template <typename T, bool isAscending>
+        struct IPCompare
+        {
+            bool operator()(const IndexPair<T> &lhs, const IndexPair<T> &rhs)
+            {
+                // Check stable sort condition
+                if(isAscending) return (lhs.first < rhs.first);
+                else return (lhs.first > rhs.first);
+            }
+        };
+
+        template <typename T>
+        using KeyIndexPair = std::pair<IndexPair<T>, uint>;
+
+        template <typename T, bool isAscending>
+        struct KIPCompareV
+        {
+            bool operator()(const KeyIndexPair<T> &lhs, const KeyIndexPair<T> &rhs)
+            {
+                // Check stable sort condition
+                if(isAscending) return (lhs.first.first < rhs.first.first);
+                else return (lhs.first.first > rhs.first.first);
+            }
+        };
+
+        template <typename T, bool isAscending>
+        struct KIPCompareK
+        {
+            bool operator()(const KeyIndexPair<T> &lhs, const KeyIndexPair<T> &rhs)
+            {
+                if(isAscending) return (lhs.second < rhs.second);
+                else return (lhs.second > rhs.second);
+            }
+        };
+    }
+}
diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp
index b71cc47071..1b86507aae 100644
--- a/src/backend/cpu/kernel/sort_index.hpp
+++ b/src/backend/cpu/kernel/sort_index.hpp
@@ -15,6 +15,7 @@
 #include <numeric>
 #include <err_cpu.hpp>
 #include <functional>
+#include <kernel/sort_helper.hpp>
 
 namespace cpu
 {
@@ -22,50 +23,118 @@ namespace kernel
 {
 
 template<typename T, bool isAscending>
-void sort0_index(Array<T> val, Array<uint> idx, const Array<T> in)
+void sort0IndexIterative(Array<T> val, Array<uint> idx)
 {
     // initialize original index locations
-       uint *idx_ptr = idx.get();
-          T *val_ptr = val.get();
-    const T *in_ptr  = in.get();
-    function<bool(T, T)> op = std::greater<T>();
-    if(isAscending) { op = std::less<T>(); }
+    uint *idx_ptr = idx.get();
+       T *val_ptr = val.get();
 
-    std::vector<uint> seq_vec(idx.dims()[0]);
-    std::iota(seq_vec.begin(), seq_vec.end(), 0);
+    std::vector<IndexPair<T> > X;
+    X.reserve(val.dims()[0]);
 
-    const T *comp_ptr = nullptr;
-    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
-    for(dim_t w = 0; w < in.dims()[3]; w++) {
+    for(dim_t w = 0; w < val.dims()[3]; w++) {
         dim_t valW = w * val.strides()[3];
         dim_t idxW = w * idx.strides()[3];
-        dim_t  inW = w *  in.strides()[3];
-        for(dim_t z = 0; z < in.dims()[2]; z++) {
+        for(dim_t z = 0; z < val.dims()[2]; z++) {
             dim_t valWZ = valW + z * val.strides()[2];
             dim_t idxWZ = idxW + z * idx.strides()[2];
-            dim_t  inWZ =  inW + z *  in.strides()[2];
-            for(dim_t y = 0; y < in.dims()[1]; y++) {
-
+            for(dim_t y = 0; y < val.dims()[1]; y++) {
                 dim_t valOffset = valWZ + y * val.strides()[1];
                 dim_t idxOffset = idxWZ + y * idx.strides()[1];
-                dim_t inOffset  =  inWZ + y *  in.strides()[1];
 
-                uint *ptr = idx_ptr + idxOffset;
-                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+                X.clear();
+                std::transform(val_ptr + valOffset, val_ptr + valOffset + val.dims()[0],
+                               idx_ptr + idxOffset,
+                               std::back_inserter(X),
+                               [](T v_, uint i_) { return std::make_pair(v_, i_); }
+                               );
+
+                //comp_ptr = &X.front();
+                std::stable_sort(X.begin(), X.end(), IPCompare<T, isAscending>());
+
+                for(unsigned it = 0; it < X.size(); it++) {
+                    val_ptr[valOffset + it] = X[it].first;
+                    idx_ptr[idxOffset + it] = X[it].second;
+                }
+            }
+        }
+    }
+
+    return;
+}
+
+template<typename T, bool isAscending, int dim>
+void sortIndexBatched(Array<T> val, Array<uint> idx)
+{
+    af::dim4 inDims = val.dims();
+
+    af::dim4 tileDims(1);
+    af::dim4 seqDims = inDims;
+    tileDims[dim] = inDims[dim];
+    seqDims[dim] = 1;
 
-                comp_ptr = in_ptr + inOffset;
-                std::stable_sort(ptr, ptr + in.dims()[0], comparator);
+    uint* key = memAlloc<uint>(inDims.elements());
+    // IOTA
+    {
+        af::dim4 dims    = inDims;
+        uint* out        = key;
+        af::dim4 strides(1);
+        for(int i = 1; i < 4; i++)
+            strides[i] = strides[i-1] * dims[i-1];
 
-                for (dim_t i = 0; i < val.dims()[0]; ++i){
-                    val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
+        for(dim_t w = 0; w < dims[3]; w++) {
+            dim_t offW = w * strides[3];
+            T valW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2];
+            for(dim_t z = 0; z < dims[2]; z++) {
+                dim_t offWZ = offW + z * strides[2];
+                T valZ = valW + (z % seqDims[2]) * seqDims[0] * seqDims[1];
+                for(dim_t y = 0; y < dims[1]; y++) {
+                    dim_t offWZY = offWZ + y * strides[1];
+                    T valY = valZ + (y % seqDims[1]) * seqDims[0];
+                    for(dim_t x = 0; x < dims[0]; x++) {
+                        dim_t id = offWZY + x;
+                        out[id] = valY + (x % seqDims[0]);
+                    }
                 }
             }
         }
     }
 
+    // initialize original index locations
+    uint *idx_ptr = idx.get();
+       T *val_ptr = val.get();
+
+    std::vector<KeyIndexPair<T> > X;
+    X.reserve(val.elements());
+
+    for(unsigned i = 0; i < val.elements(); i++) {
+        X.push_back(std::make_pair(std::make_pair(val_ptr[i], idx_ptr[i]), key[i]));
+    }
+
+    memFree(key); // key is no longer required
+
+    std::stable_sort(X.begin(), X.end(), KIPCompareV<T, isAscending>());
+
+    std::stable_sort(X.begin(), X.end(), KIPCompareK<T, true>());
+
+    for(unsigned it = 0; it < val.elements(); it++) {
+        val_ptr[it] = X[it].first.first;
+        idx_ptr[it] = X[it].first.second;
+    }
+
     return;
 }
 
+template<typename T, bool isAscending>
+void sort0Index(Array<T> val, Array<unsigned> idx)
+{
+    int higherDims =  val.dims()[1] * val.dims()[2] * val.dims()[3];
+    // TODO Make a better heurisitic
+    if(higherDims > 0)
+        kernel::sortIndexBatched<T, isAscending, 0>(val, idx);
+    else
+        kernel::sort0IndexIterative<T, isAscending>(val, idx);
+}
+
 }
 }
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index 77860ede18..883cb24bb3 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -14,6 +14,8 @@
 #include <numeric>
 #include <platform.hpp>
 #include <queue.hpp>
+#include <range.hpp>
+#include <copy.hpp>
 #include <kernel/sort_index.hpp>
 
 namespace cpu
@@ -24,10 +26,15 @@ void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint
 {
     in.eval();
 
-    val = createEmptyArray<T>(in.dims());
-    idx = createEmptyArray<uint>(in.dims());
+    val = copyArray<T>(in);
+    idx = range<uint>(in.dims(), dim);
+    idx.eval();
+
     switch(dim) {
-        case 0: getQueue().enqueue(kernel::sort0_index<T, isAscending>, val, idx, in); break;
+        case 0: getQueue().enqueue(kernel::sort0Index<T, isAscending>, val, idx); break;
+        case 1: getQueue().enqueue(kernel::sortIndexBatched<T, isAscending, 1>, val, idx); break;
+        case 2: getQueue().enqueue(kernel::sortIndexBatched<T, isAscending, 2>, val, idx); break;
+        case 3: getQueue().enqueue(kernel::sortIndexBatched<T, isAscending, 3>, val, idx); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
 }
diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index 44f98d92c1..3cb28b2b2f 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -339,7 +339,7 @@ void harris(unsigned* corners_out,
         harris_idx.ptr = memAlloc<unsigned>(sort_elem);
 
         // Sort Harris responses
-        sort0_index<float, false>(harris_responses, harris_idx);
+        sort0Index<float, false>(harris_responses, harris_idx);
 
         *x_out = memAlloc<float>(*corners_out);
         *y_out = memAlloc<float>(*corners_out);
diff --git a/src/backend/cuda/kernel/iota.hpp b/src/backend/cuda/kernel/iota.hpp
index 2632266c92..fc28c82882 100644
--- a/src/backend/cuda/kernel/iota.hpp
+++ b/src/backend/cuda/kernel/iota.hpp
@@ -18,8 +18,8 @@ namespace cuda
     namespace kernel
     {
         // Kernel Launch Config Values
-        static const unsigned TX = 32;
-        static const unsigned TY = 8;
+        static const unsigned IOTA_TX = 32;
+        static const unsigned IOTA_TY = 8;
         static const unsigned TILEX = 512;
         static const unsigned TILEY = 32;
 
@@ -71,7 +71,7 @@ namespace cuda
         template<typename T>
         void iota(Param<T> out, const dim4 &sdims, const dim4 &tdims)
         {
-            dim3 threads(TX, TY, 1);
+            dim3 threads(IOTA_TX, IOTA_TY, 1);
 
             int blocksPerMatX = divup(out.dims[0], TILEX);
             int blocksPerMatY = divup(out.dims[1], TILEY);
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index 89de56065d..8448418f8b 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -397,7 +397,7 @@ void orb(unsigned* out_feat,
         harris_idx.ptr = memAlloc<unsigned>(sort_elem);
 
         // Sort features according to Harris responses
-        sort0_index<float, false>(harris_sorted, harris_idx);
+        sort0Index<float, false>(harris_sorted, harris_idx);
 
         feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]);
 
diff --git a/src/backend/cuda/kernel/sort_helper.hpp b/src/backend/cuda/kernel/sort_helper.hpp
new file mode 100644
index 0000000000..445e9a28f5
--- /dev/null
+++ b/src/backend/cuda/kernel/sort_helper.hpp
@@ -0,0 +1,66 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <dispatch.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+
+// This needs to be in global namespace as it is used by thrust
+template<typename T>
+struct IndexPair
+{
+    T val;
+    uint idx;
+};
+
+template <typename T, bool isAscending>
+struct IPCompare
+{
+    __host__ __device__
+    bool operator()(const IndexPair<T> &lhs, const IndexPair<T> &rhs) const
+    {
+        // Check stable sort condition
+        if(isAscending) return (lhs.val < rhs.val);
+        else return (lhs.val > rhs.val);
+    }
+};
+
+namespace cuda
+{
+    namespace kernel
+    {
+        static const int copyPairIter = 4;
+
+        template <typename Tk, typename Tv>
+        __global__
+        void makeIndexPair(IndexPair<Tv> *out, const Tk *key, const Tv *val, const int N)
+        {
+            int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x;
+
+            for(int i = tIdx; i < N; i += blockDim.x)
+            {
+                out[i].val = val[i];
+                out[i].idx = key[i];
+            }
+        }
+
+        template <typename Tk, typename Tv>
+        __global__
+        void splitIndexPair(Tk *key, Tv *val, const IndexPair<Tv> *out, const int N)
+        {
+            int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x;
+
+            for(int i = tIdx; i < N; i += blockDim.x)
+            {
+                val[i] = out[i].val;
+                key[i] = out[i].idx;
+            }
+        }
+    }
+}
diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp
index 8762f28a4a..d23c503005 100644
--- a/src/backend/cuda/kernel/sort_index.hpp
+++ b/src/backend/cuda/kernel/sort_index.hpp
@@ -12,8 +12,11 @@
 #include <Param.hpp>
 #include <err_cuda.hpp>
 #include <debug_cuda.hpp>
+#include <kernel/sort_helper.hpp>
+#include <kernel/iota.hpp>
+
 #include <thrust/device_ptr.h>
-#include <thrust/sequence.h>
+#include <thrust/device_vector.h>
 #include <thrust/sort.h>
 
 namespace cuda
@@ -24,7 +27,7 @@ namespace cuda
         // Wrapper functions
         ///////////////////////////////////////////////////////////////////////////
         template<typename T, bool isAscending>
-        void sort0_index(Param<T> val, Param<unsigned> idx)
+        void sort0IndexIterative(Param<T> val, Param<unsigned> idx)
         {
             thrust::device_ptr<T>        val_ptr = thrust::device_pointer_cast(val.ptr);
             thrust::device_ptr<unsigned> idx_ptr = thrust::device_pointer_cast(idx.ptr);
@@ -40,7 +43,6 @@ namespace cuda
                         int valOffset = valWZ + y * val.strides[1];
                         int idxOffset = idxWZ + y * idx.strides[1];
 
-                        THRUST_SELECT(thrust::sequence, idx_ptr + idxOffset, idx_ptr + idxOffset + idx.dims[0]);
                         if(isAscending) {
                             THRUST_SELECT(thrust::stable_sort_by_key,
                                     val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
@@ -55,5 +57,84 @@ namespace cuda
             }
             POST_LAUNCH_CHECK();
         }
+
+        template<typename T, bool isAscending, int dim>
+        void sortIndexBatched(Param<T> pVal, Param<unsigned> pIdx)
+        {
+            af::dim4 inDims;
+            for(int i = 0; i < 4; i++)
+                inDims[i] = pVal.dims[i];
+
+            // Sort dimension
+            // tileDims * seqDims = inDims
+            af::dim4 tileDims(1);
+            af::dim4 seqDims = inDims;
+            tileDims[dim] = inDims[dim];
+            seqDims[dim] = 1;
+
+            // Create/call iota
+            // Array<uint> key = iota<uint>(seqDims, tileDims);
+            dim4 keydims = inDims;
+            uint* key = memAlloc<uint>(keydims.elements());
+            Param<uint> pKey;
+            pKey.ptr = key;
+            pKey.strides[0] = 1;
+            pKey.dims[0] = keydims[0];
+            for(int i = 1; i < 4; i++) {
+                pKey.dims[i] = keydims[i];
+                pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1];
+            }
+            cuda::kernel::iota<uint>(pKey, seqDims, tileDims);
+
+            // Flat - Not required since inplace and both are continuous
+            //val.modDims(inDims.elements());
+            //key.modDims(inDims.elements());
+
+            // Make val, idx into a pair
+            thrust::device_vector<IndexPair<T> > X(inDims.elements());
+            IndexPair<T> *Xptr = thrust::raw_pointer_cast(X.data());
+
+            const int threads = 256;
+            int blocks = divup(inDims.elements(), threads * copyPairIter);
+            CUDA_LAUNCH((makeIndexPair<unsigned, T>), blocks, threads,
+                        Xptr, pIdx.ptr, pVal.ptr, inDims.elements());
+
+            // Sort indices
+            // sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
+            THRUST_SELECT(thrust::stable_sort_by_key,
+                          X.begin(), X.end(),
+                          pKey.ptr,
+                          IPCompare<T, isAscending>());
+            POST_LAUNCH_CHECK();
+
+            // Needs to be ascending (true) in order to maintain the indices properly
+            //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
+            THRUST_SELECT(thrust::stable_sort_by_key,
+                          pKey.ptr,
+                          pKey.ptr + inDims.elements(),
+                          X.begin());
+            POST_LAUNCH_CHECK();
+
+            CUDA_LAUNCH((splitIndexPair<unsigned, T>), blocks, threads,
+                        pIdx.ptr, pVal.ptr, Xptr, inDims.elements());
+            POST_LAUNCH_CHECK();
+
+            // No need of doing moddims here because the original Array<T>
+            // dimensions have not been changed
+            //val.modDims(inDims);
+
+            memFree(key);
+        }
+
+        template<typename T, bool isAscending>
+        void sort0Index(Param<T> val, Param<unsigned> idx)
+        {
+            int higherDims =  val.dims[1] * val.dims[2] * val.dims[3];
+            // TODO Make a better heurisitic
+            if(higherDims > 5)
+                sortIndexBatched<T, isAscending, 0>(val, idx);
+            else
+                kernel::sort0IndexIterative<T, isAscending>(val, idx);
+        }
     }
 }
diff --git a/src/backend/cuda/sort_index.cu b/src/backend/cuda/sort_index.cu
index 606aab4eb1..270df30128 100644
--- a/src/backend/cuda/sort_index.cu
+++ b/src/backend/cuda/sort_index.cu
@@ -9,11 +9,13 @@
 
 #include <Array.hpp>
 #include <sort_index.hpp>
-#include <copy.hpp>
 #include <kernel/sort_index.hpp>
+#include <copy.hpp>
 #include <math.hpp>
 #include <stdexcept>
 #include <err_cuda.hpp>
+#include <range.hpp>
+#include <reorder.hpp>
 
 namespace cuda
 {
@@ -21,10 +23,14 @@ namespace cuda
     void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
     {
         val = copyArray<T>(in);
-        idx = createEmptyArray<uint>(in.dims());
+        idx = range<uint>(in.dims(), dim);
+        idx.eval();
+
         switch(dim) {
-            case 0: kernel::sort0_index<T, isAscending>(val, idx);
-                    break;
+            case 0: kernel::sort0Index<T, isAscending>(val, idx); break;
+            case 1: kernel::sortIndexBatched<T, isAscending, 1>(val, idx); break;
+            case 2: kernel::sortIndexBatched<T, isAscending, 2>(val, idx); break;
+            case 3: kernel::sortIndexBatched<T, isAscending, 3>(val, idx); break;
             default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
         }
     }
diff --git a/test/sort_index.cpp b/test/sort_index.cpp
index 6aa240d5a5..2326ed706f 100644
--- a/test/sort_index.cpp
+++ b/test/sort_index.cpp
@@ -124,12 +124,11 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     //SORT_INIT(SortMed5False,  sort_med,   false, 2, 3);
     //SORT_INIT(SortLargeTrue,  sort_large, true,  0, 1);
     //SORT_INIT(SortLargeFalse, sort_large, false, 2, 3);
-;
 
 
 //////////////////////////////////// CPP /////////////////////////////////
 //
-TEST(SortIndex, CPP)
+TEST(SortIndex, CPPDim0)
 {
     if (noDoubleTests<float>()) return;
 

From 48dcfb0cb5c7d6c94d028f13eafcba7852a67741 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Mon, 18 Apr 2016 18:08:44 -0400
Subject: [PATCH 11/28] Fix ordering of data from sort

---
 src/backend/cpu/sort.cpp    | 16 +++++++++++++++-
 src/backend/cuda/sort.cu    | 14 ++++++++++++++
 src/backend/opencl/sort.cpp | 14 ++++++++++++++
 test/sort.cpp               |  4 ++++
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index fbf613f962..7fa8769f11 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -19,6 +19,7 @@
 #include <iota.hpp>
 #include <handle.hpp>
 #include <sort_by_key.hpp>
+#include <reorder.hpp>
 #include <kernel/sort.hpp>
 
 namespace cpu
@@ -49,7 +50,7 @@ void sortBatched(Array<T>& val)
     sort_by_key<uint, T, true>(key, val, resKey, resVal, 0);
     val.eval();
 
-    val.setDataDims(inDims);
+    val.setDataDims(inDims); // This is correct only for dim0
 }
 
 template<typename T, bool isAscending>
@@ -76,6 +77,19 @@ Array<T> sort(const Array<T> &in, const unsigned dim)
         case 3: sortBatched<T, isAscending, 3>(out); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
+
+    if(dim != 0) {
+        af::dim4 preorderDims = out.dims();
+        af::dim4 reorderDims(0, 1, 2, 3);
+        reorderDims[dim] = 0;
+        preorderDims[0] = out.dims()[dim];
+        for(int i = 1; i <= (int)dim; i++) {
+            reorderDims[i - 1] = i;
+            preorderDims[i] = out.dims()[i - 1];
+        }
+
+        out = reorder<T>(out, reorderDims);
+    }
     return out;
 }
 
diff --git a/src/backend/cuda/sort.cu b/src/backend/cuda/sort.cu
index 4ae3b759fb..99b42d4196 100644
--- a/src/backend/cuda/sort.cu
+++ b/src/backend/cuda/sort.cu
@@ -12,6 +12,7 @@
 #include <sort.hpp>
 #include <kernel/sort.hpp>
 #include <math.hpp>
+#include <reorder.hpp>
 #include <stdexcept>
 #include <err_cuda.hpp>
 
@@ -28,6 +29,19 @@ namespace cuda
             case 3: kernel::sortBatched<T, isAscending, 3>(out); break;
             default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
         }
+
+        if(dim != 0) {
+            af::dim4 preorderDims = out.dims();
+            af::dim4 reorderDims(0, 1, 2, 3);
+            reorderDims[dim] = 0;
+            preorderDims[0] = out.dims()[dim];
+            for(int i = 1; i <= (int)dim; i++) {
+                reorderDims[i - 1] = i;
+                preorderDims[i] = out.dims()[i - 1];
+            }
+
+            out = reorder<T>(out, reorderDims);
+        }
         return out;
     }
 
diff --git a/src/backend/opencl/sort.cpp b/src/backend/opencl/sort.cpp
index 0bf2dc04cd..c7bd774ecd 100644
--- a/src/backend/opencl/sort.cpp
+++ b/src/backend/opencl/sort.cpp
@@ -12,6 +12,7 @@
 #include <copy.hpp>
 #include <kernel/sort.hpp>
 #include <math.hpp>
+#include <reorder.hpp>
 #include <stdexcept>
 #include <err_opencl.hpp>
 
@@ -29,6 +30,19 @@ namespace opencl
                 case 3: kernel::sortBatched<T, isAscending, 3>(out); break;
                 default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
             }
+
+            if(dim != 0) {
+                af::dim4 preorderDims = out.dims();
+                af::dim4 reorderDims(0, 1, 2, 3);
+                reorderDims[dim] = 0;
+                preorderDims[0] = out.dims()[dim];
+                for(int i = 1; i <= (int)dim; i++) {
+                    reorderDims[i - 1] = i;
+                    preorderDims[i] = out.dims()[i - 1];
+                }
+
+                out = reorder<T>(out, reorderDims);
+            }
             return out;
         } catch (std::exception &ex) {
             AF_ERROR(ex.what(), AF_ERR_INTERNAL);
diff --git a/test/sort.cpp b/test/sort.cpp
index 116b136abe..977b54b5c7 100644
--- a/test/sort.cpp
+++ b/test/sort.cpp
@@ -166,6 +166,8 @@ TEST(Sort, CPPDim1)
 
     af::array output = af::sort(input_, 1, dir);
 
+    output = reorder(output, 1, 0, 2, 3); // Required for checking with test data
+
     size_t nElems = tests[resultIdx0].size();
 
     // Get result
@@ -200,6 +202,8 @@ TEST(Sort, CPPDim2)
 
     af::array output = af::sort(input_, 2, dir);
 
+    output = reorder(output, 2, 0, 1, 3); // Required for checking with test data
+
     size_t nElems = tests[resultIdx0].size();
 
     // Get result

From c79b26fa58dfd3d8493a1cea5460fa573cced894 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Tue, 19 Apr 2016 10:46:23 -0400
Subject: [PATCH 12/28] Fix reordering of data for sort_index in cpu and cuda

---
 src/backend/cpu/sort.cpp       |  1 +
 src/backend/cpu/sort_index.cpp | 18 ++++++++++++++++++
 src/backend/cuda/sort.cu       |  1 +
 src/backend/cuda/sort_index.cu | 17 +++++++++++++++++
 src/backend/opencl/sort.cpp    |  1 +
 5 files changed, 38 insertions(+)

diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index 7fa8769f11..4a649e0b23 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -88,6 +88,7 @@ Array<T> sort(const Array<T> &in, const unsigned dim)
             preorderDims[i] = out.dims()[i - 1];
         }
 
+        out.setDataDims(preorderDims);
         out = reorder<T>(out, reorderDims);
     }
     return out;
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index 883cb24bb3..36ca57b3e8 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -16,6 +16,7 @@
 #include <queue.hpp>
 #include <range.hpp>
 #include <copy.hpp>
+#include <reorder.hpp>
 #include <kernel/sort_index.hpp>
 
 namespace cpu
@@ -37,6 +38,23 @@ void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint
         case 3: getQueue().enqueue(kernel::sortIndexBatched<T, isAscending, 3>, val, idx); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
+
+    if(dim != 0) {
+        af::dim4 preorderDims = val.dims();
+        af::dim4 reorderDims(0, 1, 2, 3);
+        reorderDims[dim] = 0;
+        preorderDims[0] = val.dims()[dim];
+        for(int i = 1; i <= (int)dim; i++) {
+            reorderDims[i - 1] = i;
+            preorderDims[i] = val.dims()[i - 1];
+        }
+
+        val.setDataDims(preorderDims);
+        idx.setDataDims(preorderDims);
+
+        val = reorder<T>(val, reorderDims);
+        idx = reorder<uint>(idx, reorderDims);
+    }
 }
 
 #define INSTANTIATE(T)                                                  \
diff --git a/src/backend/cuda/sort.cu b/src/backend/cuda/sort.cu
index 99b42d4196..9b0f4c53af 100644
--- a/src/backend/cuda/sort.cu
+++ b/src/backend/cuda/sort.cu
@@ -40,6 +40,7 @@ namespace cuda
                 preorderDims[i] = out.dims()[i - 1];
             }
 
+            out.setDataDims(preorderDims);
             out = reorder<T>(out, reorderDims);
         }
         return out;
diff --git a/src/backend/cuda/sort_index.cu b/src/backend/cuda/sort_index.cu
index 270df30128..ab54c24a9c 100644
--- a/src/backend/cuda/sort_index.cu
+++ b/src/backend/cuda/sort_index.cu
@@ -33,6 +33,23 @@ namespace cuda
             case 3: kernel::sortIndexBatched<T, isAscending, 3>(val, idx); break;
             default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
         }
+
+        if(dim != 0) {
+            af::dim4 preorderDims = val.dims();
+            af::dim4 reorderDims(0, 1, 2, 3);
+            reorderDims[dim] = 0;
+            preorderDims[0] = val.dims()[dim];
+            for(int i = 1; i <= (int)dim; i++) {
+                reorderDims[i - 1] = i;
+                preorderDims[i] = val.dims()[i - 1];
+            }
+
+            val.setDataDims(preorderDims);
+            idx.setDataDims(preorderDims);
+
+            val = reorder<T>(val, reorderDims);
+            idx = reorder<uint>(idx, reorderDims);
+        }
     }
 
 #define INSTANTIATE(T)                                                  \
diff --git a/src/backend/opencl/sort.cpp b/src/backend/opencl/sort.cpp
index c7bd774ecd..1548f27472 100644
--- a/src/backend/opencl/sort.cpp
+++ b/src/backend/opencl/sort.cpp
@@ -41,6 +41,7 @@ namespace opencl
                     preorderDims[i] = out.dims()[i - 1];
                 }
 
+                out.setDataDims(preorderDims);
                 out = reorder<T>(out, reorderDims);
             }
             return out;

From 08613a817564f89d18397f53de83b2633ed9d97b Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Wed, 20 Apr 2016 13:22:00 -0400
Subject: [PATCH 13/28] Added sort_by_key batching to CPU and CUDA

---
 src/api/c/sort.cpp                      |   2 -
 src/backend/cpu/kernel/sort_by_key.hpp  | 127 +++++++++++++++++-------
 src/backend/cpu/kernel/sort_helper.hpp  |  22 ++--
 src/backend/cpu/kernel/sort_index.hpp   |  16 +--
 src/backend/cpu/sort_by_key.cpp         |  32 ++++--
 src/backend/cuda/kernel/sort_by_key.hpp |  83 +++++++++++++++-
 src/backend/cuda/kernel/sort_helper.hpp |  26 ++---
 src/backend/cuda/kernel/sort_index.hpp  |  37 +++----
 src/backend/cuda/sort_by_key_impl.hpp   |  27 ++++-
 9 files changed, 274 insertions(+), 98 deletions(-)

diff --git a/src/api/c/sort.cpp b/src/api/c/sort.cpp
index 7f81fbc540..dd58175936 100644
--- a/src/api/c/sort.cpp
+++ b/src/api/c/sort.cpp
@@ -174,8 +174,6 @@ af_err af_sort_by_key(af_array *out_keys, af_array *out_values,
 
         DIM_ASSERT(3, kinfo.elements() > 0);
         DIM_ASSERT(4, kinfo.dims() == vinfo.dims());
-        // Only Dim 0 supported
-        ARG_ASSERT(5, dim == 0);
 
         TYPE_ASSERT(kinfo.isReal());
 
diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp
index f9d391dc46..1be4a94d3a 100644
--- a/src/backend/cpu/kernel/sort_by_key.hpp
+++ b/src/backend/cpu/kernel/sort_by_key.hpp
@@ -16,6 +16,7 @@
 #include <queue>
 #include <err_cpu.hpp>
 #include <functional>
+#include <kernel/sort_helper.hpp>
 
 namespace cpu
 {
@@ -23,57 +24,40 @@ namespace kernel
 {
 
 template<typename Tk, typename Tv, bool isAscending>
-void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
-                  const Array<Tk> ikey, const Array<Tv> ival)
+void sort0ByKeyIterative(Array<Tk> okey, Array<Tv> oval)
 {
-    function<bool(Tk, Tk)> op = std::greater<Tk>();
-    if(isAscending) { op = std::less<Tk>(); }
-
     // Get pointers and initialize original index locations
-        uint *oidx_ptr = oidx.get();
-          Tk *okey_ptr = okey.get();
-          Tv *oval_ptr = oval.get();
-    const Tk *ikey_ptr = ikey.get();
-    const Tv *ival_ptr = ival.get();
-
-    std::vector<uint> seq_vec(oidx.dims()[0]);
-    std::iota(seq_vec.begin(), seq_vec.end(), 0);
+    Tk *okey_ptr = okey.get();
+    Tv *oval_ptr = oval.get();
 
-    const Tk *comp_ptr = nullptr;
-    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
+    std::vector<IndexPair<Tk, Tv> > X;
+    X.reserve(okey.dims()[0]);
 
-    for(dim_t w = 0; w < ikey.dims()[3]; w++) {
+    for(dim_t w = 0; w < okey.dims()[3]; w++) {
         dim_t okeyW = w * okey.strides()[3];
         dim_t ovalW = w * oval.strides()[3];
-        dim_t oidxW = w * oidx.strides()[3];
-        dim_t ikeyW = w * ikey.strides()[3];
-        dim_t ivalW = w * ival.strides()[3];
 
-        for(dim_t z = 0; z < ikey.dims()[2]; z++) {
+        for(dim_t z = 0; z < okey.dims()[2]; z++) {
             dim_t okeyWZ = okeyW + z * okey.strides()[2];
             dim_t ovalWZ = ovalW + z * oval.strides()[2];
-            dim_t oidxWZ = oidxW + z * oidx.strides()[2];
-            dim_t ikeyWZ = ikeyW + z * ikey.strides()[2];
-            dim_t ivalWZ = ivalW + z * ival.strides()[2];
 
-            for(dim_t y = 0; y < ikey.dims()[1]; y++) {
+            for(dim_t y = 0; y < okey.dims()[1]; y++) {
 
                 dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
                 dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
-                dim_t oidxOffset = oidxWZ + y * oidx.strides()[1];
-                dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1];
-                dim_t ivalOffset = ivalWZ + y * ival.strides()[1];
 
-                uint *ptr = oidx_ptr + oidxOffset;
-                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
+                X.clear();
+                std::transform(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims()[0],
+                               oval_ptr + ovalOffset,
+                               std::back_inserter(X),
+                               [](Tk v_, Tv i_) { return std::make_pair(v_, i_); }
+                               );
 
-                comp_ptr = ikey_ptr + ikeyOffset;
-                std::stable_sort(ptr, ptr + ikey.dims()[0], comparator);
+                std::stable_sort(X.begin(), X.end(), IPCompare<Tk, Tv, isAscending>());
 
-                for (dim_t i = 0; i < oval.dims()[0]; ++i){
-                    uint sortIdx = oidx_ptr[oidxOffset + i];
-                    okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx];
-                    oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx];
+                for(unsigned it = 0; it < X.size(); it++) {
+                    okey_ptr[okeyOffset + it] = X[it].first;
+                    oval_ptr[ovalOffset + it] = X[it].second;
                 }
             }
         }
@@ -82,5 +66,78 @@ void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
     return;
 }
 
+template<typename Tk, typename Tv, bool isAscending, int dim>
+void sortByKeyBatched(Array<Tk> okey, Array<Tv> oval)
+{
+    af::dim4 inDims = okey.dims();
+
+    af::dim4 tileDims(1);
+    af::dim4 seqDims = inDims;
+    tileDims[dim] = inDims[dim];
+    seqDims[dim] = 1;
+
+    uint* key = memAlloc<uint>(inDims.elements());
+    // IOTA
+    {
+        af::dim4 dims    = inDims;
+        uint* out        = key;
+        af::dim4 strides(1);
+        for(int i = 1; i < 4; i++)
+            strides[i] = strides[i-1] * dims[i-1];
+
+        for(dim_t w = 0; w < dims[3]; w++) {
+            dim_t offW = w * strides[3];
+            uint okeyW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2];
+            for(dim_t z = 0; z < dims[2]; z++) {
+                dim_t offWZ = offW + z * strides[2];
+                uint okeyZ = okeyW + (z % seqDims[2]) * seqDims[0] * seqDims[1];
+                for(dim_t y = 0; y < dims[1]; y++) {
+                    dim_t offWZY = offWZ + y * strides[1];
+                    uint okeyY = okeyZ + (y % seqDims[1]) * seqDims[0];
+                    for(dim_t x = 0; x < dims[0]; x++) {
+                        dim_t id = offWZY + x;
+                        out[id] = okeyY + (x % seqDims[0]);
+                    }
+                }
+            }
+        }
+    }
+
+    // initialize original index locations
+    Tk *okey_ptr = okey.get();
+    Tv *oval_ptr = oval.get();
+
+    std::vector<KeyIndexPair<Tk, Tv> > X;
+    X.reserve(okey.elements());
+
+    for(unsigned i = 0; i < okey.elements(); i++) {
+        X.push_back(std::make_pair(std::make_pair(okey_ptr[i], oval_ptr[i]), key[i]));
+    }
+
+    memFree(key); // key is no longer required
+
+    std::stable_sort(X.begin(), X.end(), KIPCompareV<Tk, Tv, isAscending>());
+
+    std::stable_sort(X.begin(), X.end(), KIPCompareK<Tk, Tv, true>());
+
+    for(unsigned it = 0; it < okey.elements(); it++) {
+        okey_ptr[it] = X[it].first.first;
+        oval_ptr[it] = X[it].first.second;
+    }
+
+    return;
+}
+
+template<typename Tk, typename Tv, bool isAscending>
+void sort0ByKey(Array<Tk> okey, Array<Tv> oval)
+{
+    int higherDims =  okey.dims()[1] * okey.dims()[2] * okey.dims()[3];
+    // TODO Make a better heurisitic
+    if(higherDims > 0)
+        kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
+    else
+        kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(okey, oval);
+}
+
 }
 }
diff --git a/src/backend/cpu/kernel/sort_helper.hpp b/src/backend/cpu/kernel/sort_helper.hpp
index 4b8f2f95b0..ff7da3560b 100644
--- a/src/backend/cpu/kernel/sort_helper.hpp
+++ b/src/backend/cpu/kernel/sort_helper.hpp
@@ -13,15 +13,13 @@ namespace cpu
 {
     namespace kernel
     {
-        static const int copyPairIter = 4;
+        template <typename Tk, typename Tv>
+        using IndexPair = std::pair<Tk, Tv>;
 
-        template <typename T>
-        using IndexPair = std::pair<T, uint>;
-
-        template <typename T, bool isAscending>
+        template <typename Tk, typename Tv, bool isAscending>
         struct IPCompare
         {
-            bool operator()(const IndexPair<T> &lhs, const IndexPair<T> &rhs)
+            bool operator()(const IndexPair<Tk, Tv> &lhs, const IndexPair<Tk, Tv> &rhs)
             {
                 // Check stable sort condition
                 if(isAscending) return (lhs.first < rhs.first);
@@ -29,13 +27,13 @@ namespace cpu
             }
         };
 
-        template <typename T>
-        using KeyIndexPair = std::pair<IndexPair<T>, uint>;
+        template <typename Tk, typename Tv>
+        using KeyIndexPair = std::pair<IndexPair<Tk, Tv>, uint>;
 
-        template <typename T, bool isAscending>
+        template <typename Tk, typename Tv, bool isAscending>
         struct KIPCompareV
         {
-            bool operator()(const KeyIndexPair<T> &lhs, const KeyIndexPair<T> &rhs)
+            bool operator()(const KeyIndexPair<Tk, Tv> &lhs, const KeyIndexPair<Tk, Tv> &rhs)
             {
                 // Check stable sort condition
                 if(isAscending) return (lhs.first.first < rhs.first.first);
@@ -43,10 +41,10 @@ namespace cpu
             }
         };
 
-        template <typename T, bool isAscending>
+        template <typename Tk, typename Tv, bool isAscending>
         struct KIPCompareK
         {
-            bool operator()(const KeyIndexPair<T> &lhs, const KeyIndexPair<T> &rhs)
+            bool operator()(const KeyIndexPair<Tk, Tv> &lhs, const KeyIndexPair<Tk, Tv> &rhs)
             {
                 if(isAscending) return (lhs.second < rhs.second);
                 else return (lhs.second > rhs.second);
diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp
index 1b86507aae..7a23a7df49 100644
--- a/src/backend/cpu/kernel/sort_index.hpp
+++ b/src/backend/cpu/kernel/sort_index.hpp
@@ -29,7 +29,7 @@ void sort0IndexIterative(Array<T> val, Array<uint> idx)
     uint *idx_ptr = idx.get();
        T *val_ptr = val.get();
 
-    std::vector<IndexPair<T> > X;
+    std::vector<IndexPair<T, uint> > X;
     X.reserve(val.dims()[0]);
 
     for(dim_t w = 0; w < val.dims()[3]; w++) {
@@ -50,7 +50,7 @@ void sort0IndexIterative(Array<T> val, Array<uint> idx)
                                );
 
                 //comp_ptr = &X.front();
-                std::stable_sort(X.begin(), X.end(), IPCompare<T, isAscending>());
+                std::stable_sort(X.begin(), X.end(), IPCompare<T, uint, isAscending>());
 
                 for(unsigned it = 0; it < X.size(); it++) {
                     val_ptr[valOffset + it] = X[it].first;
@@ -84,13 +84,13 @@ void sortIndexBatched(Array<T> val, Array<uint> idx)
 
         for(dim_t w = 0; w < dims[3]; w++) {
             dim_t offW = w * strides[3];
-            T valW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2];
+            uint valW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2];
             for(dim_t z = 0; z < dims[2]; z++) {
                 dim_t offWZ = offW + z * strides[2];
-                T valZ = valW + (z % seqDims[2]) * seqDims[0] * seqDims[1];
+                uint valZ = valW + (z % seqDims[2]) * seqDims[0] * seqDims[1];
                 for(dim_t y = 0; y < dims[1]; y++) {
                     dim_t offWZY = offWZ + y * strides[1];
-                    T valY = valZ + (y % seqDims[1]) * seqDims[0];
+                    uint valY = valZ + (y % seqDims[1]) * seqDims[0];
                     for(dim_t x = 0; x < dims[0]; x++) {
                         dim_t id = offWZY + x;
                         out[id] = valY + (x % seqDims[0]);
@@ -104,7 +104,7 @@ void sortIndexBatched(Array<T> val, Array<uint> idx)
     uint *idx_ptr = idx.get();
        T *val_ptr = val.get();
 
-    std::vector<KeyIndexPair<T> > X;
+    std::vector<KeyIndexPair<T, uint> > X;
     X.reserve(val.elements());
 
     for(unsigned i = 0; i < val.elements(); i++) {
@@ -113,9 +113,9 @@ void sortIndexBatched(Array<T> val, Array<uint> idx)
 
     memFree(key); // key is no longer required
 
-    std::stable_sort(X.begin(), X.end(), KIPCompareV<T, isAscending>());
+    std::stable_sort(X.begin(), X.end(), KIPCompareV<T, uint, isAscending>());
 
-    std::stable_sort(X.begin(), X.end(), KIPCompareK<T, true>());
+    std::stable_sort(X.begin(), X.end(), KIPCompareK<T, uint, true>());
 
     for(unsigned it = 0; it < val.elements(); it++) {
         val_ptr[it] = X[it].first.first;
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp
index 46ced4b9ef..46b06602b4 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/sort_by_key.cpp
@@ -11,6 +11,9 @@
 #include <sort_by_key.hpp>
 #include <platform.hpp>
 #include <queue.hpp>
+#include <copy.hpp>
+#include <range.hpp>
+#include <reorder.hpp>
 #include <kernel/sort_by_key.hpp>
 
 namespace cpu
@@ -23,16 +26,33 @@ void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
     ikey.eval();
     ival.eval();
 
-    okey = createEmptyArray<Tk>(ikey.dims());
-    oval = createEmptyArray<Tv>(ival.dims());
-    Array<uint> oidx = createValueArray(ikey.dims(), 0u);
-    oidx.eval();
+    okey = copyArray<Tk>(ikey);
+    oval = copyArray<Tv>(ival);
 
     switch(dim) {
-        case 0: getQueue().enqueue(kernel::sort0_by_key<Tk, Tv, isAscending>,
-                                   okey, oval, oidx, ikey, ival); break;
+        case 0: getQueue().enqueue(kernel::sort0ByKey<Tk, Tv, isAscending>, okey, oval); break;
+        case 1: getQueue().enqueue(kernel::sortByKeyBatched<Tk, Tv, isAscending, 1>, okey, oval); break;
+        case 2: getQueue().enqueue(kernel::sortByKeyBatched<Tk, Tv, isAscending, 2>, okey, oval); break;
+        case 3: getQueue().enqueue(kernel::sortByKeyBatched<Tk, Tv, isAscending, 3>, okey, oval); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
+
+    if(dim != 0) {
+        af::dim4 preorderDims = okey.dims();
+        af::dim4 reorderDims(0, 1, 2, 3);
+        reorderDims[dim] = 0;
+        preorderDims[0] = okey.dims()[dim];
+        for(int i = 1; i <= (int)dim; i++) {
+            reorderDims[i - 1] = i;
+            preorderDims[i] = okey.dims()[i - 1];
+        }
+
+        okey.setDataDims(preorderDims);
+        oval.setDataDims(preorderDims);
+
+        okey = reorder<Tk>(okey, reorderDims);
+        oval = reorder<Tv>(oval, reorderDims);
+    }
 }
 
 #define INSTANTIATE(Tk, Tv)                                             \
diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp
index bfaa79a311..beffa5476e 100644
--- a/src/backend/cuda/kernel/sort_by_key.hpp
+++ b/src/backend/cuda/kernel/sort_by_key.hpp
@@ -12,7 +12,11 @@
 #include <Param.hpp>
 #include <err_cuda.hpp>
 #include <debug_cuda.hpp>
+#include <kernel/sort_helper.hpp>
+#include <kernel/iota.hpp>
+
 #include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
 #include <thrust/sort.h>
 
 namespace cuda
@@ -23,7 +27,7 @@ namespace cuda
         // Wrapper functions
         ///////////////////////////////////////////////////////////////////////////
         template<typename Tk, typename Tv, bool isAscending>
-        void sort0_by_key(Param<Tk> okey, Param<Tv> oval)
+        void sort0ByKeyIterative(Param<Tk> okey, Param<Tv> oval)
         {
             thrust::device_ptr<Tk>       okey_ptr = thrust::device_pointer_cast(okey.ptr);
             thrust::device_ptr<Tv>       oval_ptr = thrust::device_pointer_cast(oval.ptr);
@@ -55,5 +59,82 @@ namespace cuda
             }
             POST_LAUNCH_CHECK();
         }
+
+        template<typename Tk, typename Tv, bool isAscending, int dim>
+        void sortByKeyBatched(Param<Tk> pKey, Param<Tv> pVal)
+        {
+            af::dim4 inDims;
+            for(int i = 0; i < 4; i++)
+                inDims[i] = pKey.dims[i];
+
+            // Sort dimension
+            // tileDims * seqDims = inDims
+            af::dim4 tileDims(1);
+            af::dim4 seqDims = inDims;
+            tileDims[dim] = inDims[dim];
+            seqDims[dim] = 1;
+
+            // Create/call iota
+            // Array<uint> key = iota<uint>(seqDims, tileDims);
+            dim4 keydims = inDims;
+            uint* key = memAlloc<uint>(keydims.elements());
+            Param<uint> pSeq;
+            pSeq.ptr = key;
+            pSeq.strides[0] = 1;
+            pSeq.dims[0] = keydims[0];
+            for(int i = 1; i < 4; i++) {
+                pSeq.dims[i] = keydims[i];
+                pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1];
+            }
+            cuda::kernel::iota<uint>(pSeq, seqDims, tileDims);
+
+            // Make pkey, pVal into a pair
+            thrust::device_vector<IndexPair<Tk, Tv> > X(inDims.elements());
+            IndexPair<Tk, Tv> *Xptr = thrust::raw_pointer_cast(X.data());
+
+            const int threads = 256;
+            int blocks = divup(inDims.elements(), threads * copyPairIter);
+            CUDA_LAUNCH((makeIndexPair<Tk, Tv>), blocks, threads,
+                        Xptr, pKey.ptr, pVal.ptr, inDims.elements());
+            POST_LAUNCH_CHECK();
+
+            // Sort indices
+            // Need to convert pSeq to thrust::device_ptr, otherwise thrust
+            // throws weird errors for all *64 data types (double, intl, uintl etc)
+            thrust::device_ptr<uint> dSeq = thrust::device_pointer_cast(pSeq.ptr);
+            THRUST_SELECT(thrust::stable_sort_by_key,
+                          X.begin(), X.end(),
+                          dSeq,
+                          IPCompare<Tk, Tv, isAscending>());
+            POST_LAUNCH_CHECK();
+
+            // Needs to be ascending (true) in order to maintain the indices properly
+            THRUST_SELECT(thrust::stable_sort_by_key,
+                          dSeq,
+                          dSeq + inDims.elements(),
+                          X.begin());
+            POST_LAUNCH_CHECK();
+
+            CUDA_LAUNCH((splitIndexPair<Tk, Tv>), blocks, threads,
+                        pKey.ptr, pVal.ptr, Xptr, inDims.elements());
+            POST_LAUNCH_CHECK();
+
+            // No need of doing moddims here because the original Array<T>
+            // dimensions have not been changed
+            //val.modDims(inDims);
+
+            memFree(key);
+        }
+
+        template<typename Tk, typename Tv, bool isAscending>
+        void sort0ByKey(Param<Tk> okey, Param<Tv> oval)
+        {
+            int higherDims =  okey.dims[1] * okey.dims[2] * okey.dims[3];
+            // TODO Make a better heurisitic
+            if(higherDims > 5)
+                sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
+            else
+                kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(okey, oval);
+        }
     }
 }
diff --git a/src/backend/cuda/kernel/sort_helper.hpp b/src/backend/cuda/kernel/sort_helper.hpp
index 445e9a28f5..93fb33ac8f 100644
--- a/src/backend/cuda/kernel/sort_helper.hpp
+++ b/src/backend/cuda/kernel/sort_helper.hpp
@@ -12,22 +12,22 @@
 #include <debug_cuda.hpp>
 
 // This needs to be in global namespace as it is used by thrust
-template<typename T>
+template<typename Tk, typename Tv>
 struct IndexPair
 {
-    T val;
-    uint idx;
+    Tk first;
+    Tv second;
 };
 
-template <typename T, bool isAscending>
+template <typename Tk, typename Tv, bool isAscending>
 struct IPCompare
 {
     __host__ __device__
-    bool operator()(const IndexPair<T> &lhs, const IndexPair<T> &rhs) const
+    bool operator()(const IndexPair<Tk, Tv> &lhs, const IndexPair<Tk, Tv> &rhs) const
     {
         // Check stable sort condition
-        if(isAscending) return (lhs.val < rhs.val);
-        else return (lhs.val > rhs.val);
+        if(isAscending) return (lhs.first < rhs.first);
+        else return (lhs.first > rhs.first);
     }
 };
 
@@ -39,27 +39,27 @@ namespace cuda
 
         template <typename Tk, typename Tv>
         __global__
-        void makeIndexPair(IndexPair<Tv> *out, const Tk *key, const Tv *val, const int N)
+        void makeIndexPair(IndexPair<Tk, Tv> *out, const Tk *first, const Tv *second, const int N)
         {
             int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x;
 
             for(int i = tIdx; i < N; i += blockDim.x)
             {
-                out[i].val = val[i];
-                out[i].idx = key[i];
+                out[i].first = first[i];
+                out[i].second = second[i];
             }
         }
 
         template <typename Tk, typename Tv>
         __global__
-        void splitIndexPair(Tk *key, Tv *val, const IndexPair<Tv> *out, const int N)
+        void splitIndexPair(Tk *first, Tv *second, const IndexPair<Tk, Tv> *out, const int N)
         {
             int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x;
 
             for(int i = tIdx; i < N; i += blockDim.x)
             {
-                val[i] = out[i].val;
-                key[i] = out[i].idx;
+                first[i]  = out[i].first;
+                second[i] = out[i].second;
             }
         }
     }
diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp
index d23c503005..40a5d59311 100644
--- a/src/backend/cuda/kernel/sort_index.hpp
+++ b/src/backend/cuda/kernel/sort_index.hpp
@@ -73,50 +73,51 @@ namespace cuda
             seqDims[dim] = 1;
 
             // Create/call iota
-            // Array<uint> key = iota<uint>(seqDims, tileDims);
+            // Array<uint> seq = iota<uint>(seqDims, tileDims);
             dim4 keydims = inDims;
             uint* key = memAlloc<uint>(keydims.elements());
-            Param<uint> pKey;
-            pKey.ptr = key;
-            pKey.strides[0] = 1;
-            pKey.dims[0] = keydims[0];
+            Param<uint> pSeq;
+            pSeq.ptr = key;
+            pSeq.strides[0] = 1;
+            pSeq.dims[0] = keydims[0];
             for(int i = 1; i < 4; i++) {
-                pKey.dims[i] = keydims[i];
-                pKey.strides[i] = pKey.strides[i - 1] * pKey.dims[i - 1];
+                pSeq.dims[i] = keydims[i];
+                pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1];
             }
-            cuda::kernel::iota<uint>(pKey, seqDims, tileDims);
+            cuda::kernel::iota<uint>(pSeq, seqDims, tileDims);
 
             // Flat - Not required since inplace and both are continuous
             //val.modDims(inDims.elements());
             //key.modDims(inDims.elements());
 
             // Make val, idx into a pair
-            thrust::device_vector<IndexPair<T> > X(inDims.elements());
-            IndexPair<T> *Xptr = thrust::raw_pointer_cast(X.data());
+            thrust::device_vector<IndexPair<T, uint> > X(inDims.elements());
+            IndexPair<T, uint> *Xptr = thrust::raw_pointer_cast(X.data());
 
             const int threads = 256;
             int blocks = divup(inDims.elements(), threads * copyPairIter);
-            CUDA_LAUNCH((makeIndexPair<unsigned, T>), blocks, threads,
-                        Xptr, pIdx.ptr, pVal.ptr, inDims.elements());
+            CUDA_LAUNCH((makeIndexPair<T, unsigned>), blocks, threads,
+                        Xptr, pVal.ptr, pIdx.ptr, inDims.elements());
 
             // Sort indices
             // sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
+            thrust::device_ptr<uint> dSeq = thrust::device_pointer_cast(pSeq.ptr);
             THRUST_SELECT(thrust::stable_sort_by_key,
                           X.begin(), X.end(),
-                          pKey.ptr,
-                          IPCompare<T, isAscending>());
+                          dSeq,
+                          IPCompare<T, uint, isAscending>());
             POST_LAUNCH_CHECK();
 
             // Needs to be ascending (true) in order to maintain the indices properly
             //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
             THRUST_SELECT(thrust::stable_sort_by_key,
-                          pKey.ptr,
-                          pKey.ptr + inDims.elements(),
+                          dSeq,
+                          dSeq + inDims.elements(),
                           X.begin());
             POST_LAUNCH_CHECK();
 
-            CUDA_LAUNCH((splitIndexPair<unsigned, T>), blocks, threads,
-                        pIdx.ptr, pVal.ptr, Xptr, inDims.elements());
+            CUDA_LAUNCH((splitIndexPair<T, unsigned>), blocks, threads,
+                        pVal.ptr, pIdx.ptr, Xptr, inDims.elements());
             POST_LAUNCH_CHECK();
 
             // No need of doing moddims here because the original Array<T>
diff --git a/src/backend/cuda/sort_by_key_impl.hpp b/src/backend/cuda/sort_by_key_impl.hpp
index 217b17dc8a..8cc86b55db 100644
--- a/src/backend/cuda/sort_by_key_impl.hpp
+++ b/src/backend/cuda/sort_by_key_impl.hpp
@@ -12,6 +12,7 @@
 #include <sort_by_key.hpp>
 #include <kernel/sort_by_key.hpp>
 #include <math.hpp>
+#include <reorder.hpp>
 #include <stdexcept>
 #include <err_cuda.hpp>
 
@@ -23,10 +24,30 @@ namespace cuda
     {
         okey = copyArray<Tk>(ikey);
         oval = copyArray<Tv>(ival);
+
         switch(dim) {
-        case 0: kernel::sort0_by_key<Tk, Tv, isAscending>(okey, oval);
-            break;
-        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
+            case 0: kernel::sort0ByKey<Tk, Tv, isAscending>(okey, oval); break;
+            case 1: kernel::sortByKeyBatched<Tk, Tv, isAscending, 1>(okey, oval); break;
+            case 2: kernel::sortByKeyBatched<Tk, Tv, isAscending, 2>(okey, oval); break;
+            case 3: kernel::sortByKeyBatched<Tk, Tv, isAscending, 3>(okey, oval); break;
+            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
+        }
+
+        if(dim != 0) {
+            af::dim4 preorderDims = okey.dims();
+            af::dim4 reorderDims(0, 1, 2, 3);
+            reorderDims[dim] = 0;
+            preorderDims[0] = okey.dims()[dim];
+            for(int i = 1; i <= (int)dim; i++) {
+                reorderDims[i - 1] = i;
+                preorderDims[i] = okey.dims()[i - 1];
+            }
+
+            okey.setDataDims(preorderDims);
+            oval.setDataDims(preorderDims);
+
+            okey = reorder<Tk>(okey, reorderDims);
+            oval = reorder<Tv>(oval, reorderDims);
         }
     }
 

From aaa13f6cb514395dd723a1b6e09388f3825ee258 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Wed, 20 Apr 2016 13:50:59 -0400
Subject: [PATCH 14/28] Added tests for sort_index and sort_by_key for higher
 dimensions

---
 test/sort_by_key.cpp | 104 +++++++++++++++++++++++++++++++++++++++++--
 test/sort_index.cpp  |  95 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+), 3 deletions(-)

diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp
index ed827c9da5..cbb13b8785 100644
--- a/test/sort_by_key.cpp
+++ b/test/sort_by_key.cpp
@@ -119,15 +119,15 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     SORT_INIT(Sort1000False,  sort_by_key_1000,  false, 2, 3);
     SORT_INIT(SortMedFalse,   sort_by_key_med,   false, 2, 3);
     // Takes too much time in current implementation. Enable when everything is parallel
-    //SORT_INIT(SortLargeTrue,  sort_by_key_large, true,  0, 1);
-    //SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3);
+    SORT_INIT(SortLargeTrue,  sort_by_key_large, true,  0, 1);
+    SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3);
 
 
 
 
 ////////////////////////////////////// CPP ///////////////////////////////
 //
-TEST(SortByKey, CPP)
+TEST(SortByKey, CPPDim0)
 {
     if (noDoubleTests<float>()) return;
 
@@ -168,3 +168,101 @@ TEST(SortByKey, CPP)
     delete[] keyData;
     delete[] valData;
 }
+
+TEST(SortByKey, CPPDim1)
+{
+    if (noDoubleTests<float>()) return;
+
+    const bool dir = true;
+    const unsigned resultIdx0 = 0;
+    const unsigned resultIdx1 = 1;
+
+    vector<af::dim4> numDims;
+    vector<vector<float> > in;
+    vector<vector<float> > tests;
+    readTests<float, float, int>(string(TEST_DIR"/sort/sort_by_key_large.test"),numDims,in,tests);
+
+    af::dim4 idims = numDims[0];
+    af::array keys(idims, &(in[0].front()));
+    af::array vals(idims, &(in[1].front()));
+
+    af::array keys_ = reorder(keys, 1, 0, 2, 3);
+    af::array vals_ = reorder(vals, 1, 0, 2, 3);
+
+    af::array out_keys, out_vals;
+    af::sort(out_keys, out_vals, keys_, vals_, 1, dir);
+
+    out_keys = reorder(out_keys, 1, 0, 2, 3);
+    out_vals = reorder(out_vals, 1, 0, 2, 3);
+
+    size_t nElems = tests[resultIdx0].size();
+    // Get result
+    float* keyData = new float[tests[resultIdx0].size()];
+    out_keys.host((void*)keyData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx0][elIter], keyData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    float* valData = new float[tests[resultIdx1].size()];
+    out_vals.host((void*)valData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx1][elIter], valData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    // Delete
+    delete[] keyData;
+    delete[] valData;
+}
+
+TEST(SortByKey, CPPDim2)
+{
+    if (noDoubleTests<float>()) return;
+
+    const bool dir = false;
+    const unsigned resultIdx0 = 2;
+    const unsigned resultIdx1 = 3;
+
+    vector<af::dim4> numDims;
+    vector<vector<float> > in;
+    vector<vector<float> > tests;
+    readTests<float, float, int>(string(TEST_DIR"/sort/sort_by_key_large.test"),numDims,in,tests);
+
+    af::dim4 idims = numDims[0];
+    af::array keys(idims, &(in[0].front()));
+    af::array vals(idims, &(in[1].front()));
+
+    af::array keys_ = reorder(keys, 1, 2, 0, 3);
+    af::array vals_ = reorder(vals, 1, 2, 0, 3);
+
+    af::array out_keys, out_vals;
+    af::sort(out_keys, out_vals, keys_, vals_, 2, dir);
+
+    out_keys = reorder(out_keys, 2, 0, 1, 3);
+    out_vals = reorder(out_vals, 2, 0, 1, 3);
+
+    size_t nElems = tests[resultIdx0].size();
+    // Get result
+    float* keyData = new float[tests[resultIdx0].size()];
+    out_keys.host((void*)keyData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx0][elIter], keyData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    float* valData = new float[tests[resultIdx1].size()];
+    out_vals.host((void*)valData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx1][elIter], valData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    // Delete
+    delete[] keyData;
+    delete[] valData;
+}
diff --git a/test/sort_index.cpp b/test/sort_index.cpp
index 2326ed706f..eed85047bf 100644
--- a/test/sort_index.cpp
+++ b/test/sort_index.cpp
@@ -170,3 +170,98 @@ TEST(SortIndex, CPPDim0)
     delete[] sxData;
     delete[] ixData;
 }
+
+TEST(SortIndex, CPPDim1)
+{
+    if (noDoubleTests<float>()) return;
+
+    const bool dir = true;
+    const unsigned resultIdx0 = 0;
+    const unsigned resultIdx1 = 1;
+
+    vector<af::dim4> numDims;
+    vector<vector<float> > in;
+    vector<vector<float> > tests;
+    readTests<float, float, int>(string(TEST_DIR"/sort/sort_10x10.test"),numDims,in,tests);
+
+    af::dim4 idims = numDims[0];
+    af::array input_(idims, &(in[0].front()));
+    af::array input = reorder(input_, 1, 0, 2, 3);
+
+    af::array outValues, outIndices;
+    af::sort(outValues, outIndices, input, 1, dir);
+
+    outValues  = reorder(outValues,  1, 0, 2, 3);
+    outIndices = reorder(outIndices, 1, 0, 2, 3);
+
+    size_t nElems = tests[resultIdx0].size();
+
+    // Get result
+    float* sxData = new float[tests[resultIdx0].size()];
+    outValues.host((void*)sxData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    // Get result
+    unsigned* ixData = new unsigned[tests[resultIdx1].size()];
+    outIndices.host((void*)ixData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    // Delete
+    delete[] sxData;
+    delete[] ixData;
+}
+
+TEST(SortIndex, CPPDim2)
+{
+    if (noDoubleTests<float>()) return;
+
+    const bool dir = false;
+    const unsigned resultIdx0 = 2;
+    const unsigned resultIdx1 = 3;
+
+    vector<af::dim4> numDims;
+    vector<vector<float> > in;
+    vector<vector<float> > tests;
+    readTests<float, float, int>(string(TEST_DIR"/sort/sort_med.test"),numDims,in,tests);
+
+    af::dim4 idims = numDims[0];
+    af::array input_(idims, &(in[0].front()));
+    af::array input = reorder(input_, 1, 2, 0, 3);
+
+    af::array outValues, outIndices;
+    af::sort(outValues, outIndices, input, 2, dir);
+
+    outValues  = reorder(outValues,  2, 0, 1, 3);
+    outIndices = reorder(outIndices, 2, 0, 1, 3);
+    size_t nElems = tests[resultIdx0].size();
+
+    // Get result
+    float* sxData = new float[tests[resultIdx0].size()];
+    outValues.host((void*)sxData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        ASSERT_EQ(tests[resultIdx0][elIter], sxData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    // Get result
+    unsigned* ixData = new unsigned[tests[resultIdx1].size()];
+    outIndices.host((void*)ixData);
+
+    // Compare result
+    for (size_t elIter = 0; elIter < nElems; ++elIter) {
+        EXPECT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl;
+    }
+
+    // Delete
+    delete[] sxData;
+    delete[] ixData;
+}

From 8dad427391087265bc9df8b3ec6f528b3fd5cc11 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 12:00:24 -0400
Subject: [PATCH 15/28] Add sort index, sort by key batching to OpenCL

---
 src/backend/opencl/kernel/harris.hpp        |   2 +-
 src/backend/opencl/kernel/iota.hpp          |   6 +-
 src/backend/opencl/kernel/orb.hpp           |   2 +-
 src/backend/opencl/kernel/sort.hpp          |   8 +-
 src/backend/opencl/kernel/sort_by_key.hpp   | 143 +++++++++++++++++---
 src/backend/opencl/kernel/sort_helper.hpp   | 118 ++++++++++++++++
 src/backend/opencl/kernel/sort_index.hpp    | 123 ++++++++++++++++-
 src/backend/opencl/kernel/sort_make_pair.cl |  43 ++++++
 src/backend/opencl/sort_by_key/impl.hpp     |  29 +++-
 src/backend/opencl/sort_index.cpp           |  33 ++++-
 10 files changed, 466 insertions(+), 41 deletions(-)
 create mode 100644 src/backend/opencl/kernel/sort_make_pair.cl

diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp
index 7fffdee423..3c0e531d6d 100644
--- a/src/backend/opencl/kernel/harris.hpp
+++ b/src/backend/opencl/kernel/harris.hpp
@@ -287,7 +287,7 @@ void harris(unsigned* corners_out,
             harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned));
 
             // Sort Harris responses
-            sort0_index<float, false>(harris_resp, harris_idx);
+            sort0Index<float, false>(harris_resp, harris_idx);
 
             x_out.data = bufferAlloc(*corners_out * sizeof(float));
             y_out.data = bufferAlloc(*corners_out * sizeof(float));
diff --git a/src/backend/opencl/kernel/iota.hpp b/src/backend/opencl/kernel/iota.hpp
index bad486abd2..210b6b202e 100644
--- a/src/backend/opencl/kernel/iota.hpp
+++ b/src/backend/opencl/kernel/iota.hpp
@@ -31,8 +31,8 @@ namespace opencl
     namespace kernel
     {
         // Kernel Launch Config Values
-        static const int TX = 32;
-        static const int TY = 8;
+        static const int IOTA_TX = 32;
+        static const int IOTA_TY = 8;
         static const int TILEX = 512;
         static const int TILEY = 32;
 
@@ -64,7 +64,7 @@ namespace opencl
                                           const int, const int, const int, const int,
                                           const int, const int> (*iotaKernels[device]);
 
-                NDRange local(TX, TY, 1);
+                NDRange local(IOTA_TX, IOTA_TY, 1);
 
                 int blocksPerMatX = divup(out.info.dims[0], TILEX);
                 int blocksPerMatY = divup(out.info.dims[1], TILEY);
diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp
index 69c1176210..317bb4e3d8 100644
--- a/src/backend/opencl/kernel/orb.hpp
+++ b/src/backend/opencl/kernel/orb.hpp
@@ -305,7 +305,7 @@ void orb(unsigned* out_feat,
             d_harris_sorted.data = d_score_harris;
             d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned));
 
-            sort0_index<float, false>(d_harris_sorted, d_harris_idx);
+            sort0Index<float, false>(d_harris_sorted, d_harris_idx);
 
             cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float));
             cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float));
diff --git a/src/backend/opencl/kernel/sort.hpp b/src/backend/opencl/kernel/sort.hpp
index b9a7d39b28..98ba75977a 100644
--- a/src/backend/opencl/kernel/sort.hpp
+++ b/src/backend/opencl/kernel/sort.hpp
@@ -131,10 +131,10 @@ namespace opencl
                 compute::buffer pKey_buf((*pKey.data)());
                 compute::buffer pVal_buf((*pVal.data)());
 
-                compute::buffer_iterator< type_t<T>    > val0 = compute::make_buffer_iterator< type_t<T>    >(pVal_buf, 0);
-                compute::buffer_iterator< type_t<T>    > valN = compute::make_buffer_iterator< type_t<T>    >(pVal_buf,+ pVal.info.dims[0]);
-                compute::buffer_iterator< type_t<uint> > key0 = compute::make_buffer_iterator< type_t<uint> >(pKey_buf, 0);
-                compute::buffer_iterator< type_t<uint> > keyN = compute::make_buffer_iterator< type_t<uint> >(pKey_buf, pKey.info.dims[0]);
+                compute::buffer_iterator<type_t<T> > val0 = compute::make_buffer_iterator<type_t<T> >(pVal_buf, 0);
+                compute::buffer_iterator<type_t<T> > valN = compute::make_buffer_iterator<type_t<T> >(pVal_buf,+ pVal.info.dims[0]);
+                compute::buffer_iterator<uint      > key0 = compute::make_buffer_iterator<uint>(pKey_buf, 0);
+                compute::buffer_iterator<uint      > keyN = compute::make_buffer_iterator<uint>(pKey_buf, pKey.info.dims[0]);
                 if(isAscending) {
                     compute::sort_by_key(val0, valN, key0, c_queue);
                 } else {
diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp
index 513ddbfb6d..33a020712e 100644
--- a/src/backend/opencl/kernel/sort_by_key.hpp
+++ b/src/backend/opencl/kernel/sort_by_key.hpp
@@ -16,6 +16,7 @@
 #include <Param.hpp>
 #include <debug_opencl.hpp>
 #include <kernel/sort_helper.hpp>
+#include <kernel/iota.hpp>
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
@@ -24,6 +25,12 @@
 #include <boost/compute/algorithm/sort_by_key.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/get.hpp>
+#include <boost/compute/functional/field.hpp>
+#include <boost/compute/types/pair.hpp>
 
 namespace compute = boost::compute;
 
@@ -40,33 +47,33 @@ namespace opencl
     namespace kernel
     {
         template<typename Tk, typename Tv, bool isAscending>
-        void sort0_by_key(Param okey, Param oval)
+        void sort0ByKeyIterative(Param pKey, Param pVal)
         {
             try {
                 compute::command_queue c_queue(getQueue()());
 
-                compute::buffer okey_buf((*okey.data)());
-                compute::buffer oval_buf((*oval.data)());
+                compute::buffer pKey_buf((*pKey.data)());
+                compute::buffer pVal_buf((*pVal.data)());
 
-                for(int w = 0; w < okey.info.dims[3]; w++) {
-                    int okeyW = w * okey.info.strides[3];
-                    int ovalW = w * oval.info.strides[3];
-                    for(int z = 0; z < okey.info.dims[2]; z++) {
-                        int okeyWZ = okeyW + z * okey.info.strides[2];
-                        int ovalWZ = ovalW + z * oval.info.strides[2];
-                        for(int y = 0; y < okey.info.dims[1]; y++) {
+                for(int w = 0; w < pKey.info.dims[3]; w++) {
+                    int pKeyW = w * pKey.info.strides[3];
+                    int pValW = w * pVal.info.strides[3];
+                    for(int z = 0; z < pKey.info.dims[2]; z++) {
+                        int pKeyWZ = pKeyW + z * pKey.info.strides[2];
+                        int pValWZ = pValW + z * pVal.info.strides[2];
+                        for(int y = 0; y < pKey.info.dims[1]; y++) {
 
-                            int okeyOffset = okeyWZ + y * okey.info.strides[1];
-                            int ovalOffset = ovalWZ + y * oval.info.strides[1];
+                            int pKeyOffset = pKeyWZ + y * pKey.info.strides[1];
+                            int pValOffset = pValWZ + y * pVal.info.strides[1];
 
-                            compute::buffer_iterator< type_t<Tk> > start= compute::make_buffer_iterator< type_t<Tk> >(okey_buf, okeyOffset);
-                            compute::buffer_iterator< type_t<Tk> > end  = compute::make_buffer_iterator< type_t<Tk> >(okey_buf, okeyOffset + okey.info.dims[0]);
-                            compute::buffer_iterator< type_t<Tv> > vals = compute::make_buffer_iterator< type_t<Tv> >(oval_buf, ovalOffset);
+                            compute::buffer_iterator< type_t<Tk> > start= compute::make_buffer_iterator< type_t<Tk> >(pKey_buf, pKeyOffset);
+                            compute::buffer_iterator< type_t<Tk> > end  = compute::make_buffer_iterator< type_t<Tk> >(pKey_buf, pKeyOffset + pKey.info.dims[0]);
+                            compute::buffer_iterator< type_t<Tv> > vals = compute::make_buffer_iterator< type_t<Tv> >(pVal_buf, pValOffset);
                             if(isAscending) {
                                 compute::sort_by_key(start, end, vals, c_queue);
                             } else {
                                 compute::sort_by_key(start, end, vals,
-                                                            compute::greater< type_t<Tk> >(), c_queue);
+                                                     compute::greater< type_t<Tk> >(), c_queue);
                             }
                         }
                     }
@@ -78,6 +85,110 @@ namespace opencl
                 throw;
             }
         }
+
+        template<typename Tk_, typename Tv_, bool isAscending, int dim>
+        void sortByKeyBatched(Param pKey, Param pVal)
+        {
+            typedef type_t<Tk_> Tk;
+            typedef type_t<Tv_> Tv;
+            typedef std::pair<Tk, Tv> IndexPair;
+
+            try {
+                af::dim4 inDims;
+                for(int i = 0; i < 4; i++)
+                    inDims[i] = pKey.info.dims[i];
+
+                // Sort dimension
+                // tileDims * seqDims = inDims
+                af::dim4 tileDims(1);
+                af::dim4 seqDims = inDims;
+                tileDims[dim] = inDims[dim];
+                seqDims[dim] = 1;
+
+                // Create/call iota
+                // Array<Tv> key = iota<Tv>(seqDims, tileDims);
+                dim4 keydims = inDims;
+                cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(unsigned));
+                Param pSeq;
+                pSeq.data = key;
+                pSeq.info.offset = 0;
+                pSeq.info.dims[0] = keydims[0];
+                pSeq.info.strides[0] = 1;
+                for(int i = 1; i < 4; i++) {
+                    pSeq.info.dims[i] = keydims[i];
+                    pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1];
+                }
+                kernel::iota<unsigned>(pSeq, seqDims, tileDims);
+
+                int elements = inDims.elements();
+
+                // Flat - Not required since inplace and both are continuous
+                //val.modDims(inDims.elements());
+                //key.modDims(inDims.elements());
+
+                // Sort indices
+                // sort_by_key<T, Tv, isAscending>(*resVal, *resKey, val, key, 0);
+                //kernel::sort0_by_key<T, Tv, isAscending>(pVal, pKey);
+                compute::command_queue c_queue(getQueue()());
+                compute::context c_context(getContext()());
+
+                // Create buffer iterators for seq
+                compute::buffer pSeq_buf((*pSeq.data)());
+                compute::buffer_iterator<unsigned> seq0 = compute::make_buffer_iterator<unsigned>(pSeq_buf, 0);
+                compute::buffer_iterator<unsigned> seqN = compute::make_buffer_iterator<unsigned>(pSeq_buf, elements);
+
+                // Copy key, val into X pair
+                cl::Buffer* X = bufferAlloc(elements * sizeof(IndexPair));
+                // Use Tk_ and Tv_ here, not Tk and Tv
+                kernel::makePair<Tk_, Tv_>(X, pKey.data, pVal.data, elements);
+                compute::buffer X_buf((*X)());
+                compute::buffer_iterator<IndexPair> X0 = compute::make_buffer_iterator<IndexPair>(X_buf, 0);
+                compute::buffer_iterator<IndexPair> XN = compute::make_buffer_iterator<IndexPair>(X_buf, elements);
+
+                // FIRST SORT CALL
+                compute::function<bool(IndexPair, IndexPair)> IPCompare =
+                    makeCompareFunction<Tk, Tv, isAscending>();
+
+                compute::sort_by_key(X0, XN, seq0, IPCompare, c_queue);
+                getQueue().finish();
+
+                // Needs to be ascending (true) in order to maintain the indices properly
+                //kernel::sort0_by_key<Tv, T, true>(pKey, pVal);
+                //
+                // Because we use a pair as values, we need to use a custom comparator
+                BOOST_COMPUTE_FUNCTION(bool, Compare_Seq, (const unsigned lhs, const unsigned rhs),
+                    {
+                        return lhs < rhs;
+                    }
+                );
+                compute::sort_by_key(seq0, seqN, X0, Compare_Seq, c_queue);
+                getQueue().finish();
+
+                kernel::splitPair<Tk_, Tv_>(pKey.data, pVal.data, X, elements);
+
+                //// No need of doing moddims here because the original Array<T>
+                //// dimensions have not been changed
+                ////val.modDims(inDims);
+
+                CL_DEBUG_FINISH(getQueue());
+                bufferFree(key);
+                bufferFree(X);
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
+
+        template<typename Tk, typename Tv, bool isAscending>
+        void sort0ByKey(Param pKey, Param pVal)
+        {
+            int higherDims =  pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3];
+            // TODO Make a better heurisitic
+            if(higherDims > 5)
+                kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(pKey, pVal);
+            else
+                kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(pKey, pVal);
+        }
     }
 }
 
diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp
index 07ab0eeb69..6ba9eff0ae 100644
--- a/src/backend/opencl/kernel/sort_helper.hpp
+++ b/src/backend/opencl/kernel/sort_helper.hpp
@@ -8,8 +8,41 @@
  ********************************************************/
 
 #pragma once
+#include <kernel_headers/sort_make_pair.hpp>
 #include <program.hpp>
 #include <traits.hpp>
+#include <string>
+#include <mutex>
+#include <map>
+#include <utility>
+#include <dispatch.hpp>
+#include <Param.hpp>
+#include <debug_opencl.hpp>
+
+#include <boost/compute/function.hpp>
+
+template<typename Tk, typename Tv, bool isAscending>
+inline
+boost::compute::function<bool(const std::pair<Tk, Tv>, const std::pair<Tk, Tv>)>
+makeCompareFunction()
+{
+    // Cannot use isAscending in BOOST_COMPUTE_FUNCTION
+    if(isAscending) {
+        BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair<Tk, Tv> lhs, std::pair<Tk, Tv> rhs),
+            {
+                return lhs.first < rhs.first;
+            }
+        );
+        return IPCompare;
+    } else {
+        BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair<Tk, Tv> lhs, std::pair<Tk, Tv> rhs),
+            {
+                return lhs.first > rhs.first;
+            }
+        );
+        return IPCompare;
+    }
+}
 
 namespace opencl
 {
@@ -42,5 +75,90 @@ namespace opencl
                                             cl_ulong, ltype_t<T>
                                            >::type;
 
+        static const int copyPairIter = 4;
+
+        template<typename Tk, typename Tv>
+        void makePair(cl::Buffer *out, const cl::Buffer *first, const cl::Buffer *second, const unsigned N)
+        {
+            try {
+                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+                static std::map<int, Program*>   makePairProgs;
+                static std::map<int, Kernel*>  makePairKernels;
+
+                int device = getActiveDeviceId();
+
+                std::call_once( compileFlags[device], [device] () {
+                    std::ostringstream options;
+                    options << " -D Tk="      << dtype_traits<Tk>::getName()
+                            << " -D Tv="      << dtype_traits<Tv>::getName()
+                            << " -D copyPairIter=" << copyPairIter;
+                    if (std::is_same<Tk, double >::value ||
+                        std::is_same<Tk, cdouble>::value ||
+                        std::is_same<Tv, double >::value ||
+                        std::is_same<Tv, cdouble>::value) {
+                        options << " -D USE_DOUBLE";
+                    }
+                    Program prog;
+                    buildProgram(prog, sort_make_pair_cl, sort_make_pair_cl_len, options.str());
+                    makePairProgs[device]   = new Program(prog);
+                    makePairKernels[device] = new Kernel(*makePairProgs[device], "make_pair_kernel");
+                });
+
+                auto makePairOp = make_kernel<Buffer, const Buffer, const Buffer, const unsigned>
+                                          (*makePairKernels[device]);
+
+                NDRange local(256, 1, 1);
+                NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1);
+
+                makePairOp(EnqueueArgs(getQueue(), global, local), *out, *first, *second, N);
+
+                CL_DEBUG_FINISH(getQueue());
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
+
+        template<typename Tk, typename Tv>
+        void splitPair(cl::Buffer *first, cl::Buffer *second, const cl::Buffer *in, const unsigned N)
+        {
+            try {
+                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+                static std::map<int, Program*>   splitPairProgs;
+                static std::map<int, Kernel*>  splitPairKernels;
+
+                int device = getActiveDeviceId();
+
+                std::call_once( compileFlags[device], [device] () {
+                    std::ostringstream options;
+                    options << " -D Tk="      << dtype_traits<Tk>::getName()
+                            << " -D Tv="      << dtype_traits<Tv>::getName()
+                            << " -D copyPairIter=" << copyPairIter;
+                    if (std::is_same<Tk, double >::value ||
+                        std::is_same<Tk, cdouble>::value ||
+                        std::is_same<Tv, double >::value ||
+                        std::is_same<Tv, cdouble>::value) {
+                        options << " -D USE_DOUBLE";
+                    }
+                    Program prog;
+                    buildProgram(prog, sort_make_pair_cl, sort_make_pair_cl_len, options.str());
+                    splitPairProgs[device]   = new Program(prog);
+                    splitPairKernels[device] = new Kernel(*splitPairProgs[device], "split_pair_kernel");
+                });
+
+                auto splitPairOp = make_kernel<Buffer, Buffer, const Buffer, const unsigned>
+                                          (*splitPairKernels[device]);
+
+                NDRange local(256, 1, 1);
+                NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1);
+
+                splitPairOp(EnqueueArgs(getQueue(), global, local), *first, *second, *in, N);
+
+                CL_DEBUG_FINISH(getQueue());
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
     }
 }
diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp
index aae0a94ea6..ef5faa612f 100644
--- a/src/backend/opencl/kernel/sort_index.hpp
+++ b/src/backend/opencl/kernel/sort_index.hpp
@@ -16,15 +16,21 @@
 #include <Param.hpp>
 #include <debug_opencl.hpp>
 #include <kernel/sort_helper.hpp>
+#include <kernel/iota.hpp>
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 #include <boost/compute/core.hpp>
-#include <boost/compute/algorithm/iota.hpp>
 #include <boost/compute/algorithm/sort_by_key.hpp>
 #include <boost/compute/functional/operator.hpp>
 #include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/get.hpp>
+#include <boost/compute/functional/field.hpp>
+#include <boost/compute/types/pair.hpp>
 
 namespace compute = boost::compute;
 
@@ -41,7 +47,7 @@ namespace opencl
     namespace kernel
     {
         template<typename T, bool isAscending>
-        void sort0_index(Param val, Param idx)
+        void sort0IndexIterative(Param val, Param idx)
         {
             try {
                 compute::command_queue c_queue(getQueue()());
@@ -60,19 +66,18 @@ namespace opencl
                             int valOffset = valWZ + y * val.info.strides[1];
                             int idxOffset = idxWZ + y * idx.info.strides[1];
 
-                            compute::buffer_iterator<unsigned> idx_begin(idx_buf, idxOffset);
-                            compute::iota(idx_begin, idx_begin + val.info.dims[0], 0, c_queue);
-
                             if(isAscending) {
                                 compute::sort_by_key(
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
-                                        idx_begin, compute::less< type_t<T> >(), c_queue);
+                                        compute::make_buffer_iterator< type_t<uint> >(idx_buf, idxOffset),
+                                        compute::less< type_t<T> >(), c_queue);
                             } else {
                                 compute::sort_by_key(
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
                                         compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
-                                        idx_begin, compute::greater< type_t<T> >(), c_queue);
+                                        compute::make_buffer_iterator< type_t<uint> >(idx_buf, idxOffset),
+                                        compute::greater< type_t<T> >(), c_queue);
                             }
                         }
                     }
@@ -84,6 +89,110 @@ namespace opencl
                 throw;
             }
         }
+
+        template<typename T, bool isAscending, int dim>
+        void sortIndexBatched(Param pVal, Param pIdx)
+        {
+            typedef type_t<T> Tk;
+            typedef uint Tv;
+            typedef std::pair<Tk, Tv> IndexPair;
+
+            try {
+                af::dim4 inDims;
+                for(int i = 0; i < 4; i++)
+                    inDims[i] = pVal.info.dims[i];
+
+                // Sort dimension
+                // tileDims * seqDims = inDims
+                af::dim4 tileDims(1);
+                af::dim4 seqDims = inDims;
+                tileDims[dim] = inDims[dim];
+                seqDims[dim] = 1;
+
+                // Create/call iota
+                // Array<Tv> key = iota<Tv>(seqDims, tileDims);
+                dim4 keydims = inDims;
+                cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(Tv));
+                Param pSeq;
+                pSeq.data = key;
+                pSeq.info.offset = 0;
+                pSeq.info.dims[0] = keydims[0];
+                pSeq.info.strides[0] = 1;
+                for(int i = 1; i < 4; i++) {
+                    pSeq.info.dims[i] = keydims[i];
+                    pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1];
+                }
+                kernel::iota<Tv>(pSeq, seqDims, tileDims);
+
+                int elements = inDims.elements();
+
+                // Flat - Not required since inplace and both are continuous
+                //val.modDims(inDims.elements());
+                //key.modDims(inDims.elements());
+
+                // Sort indices
+                // sort_by_key<T, Tv, isAscending>(*resVal, *resKey, val, key, 0);
+                //kernel::sort0_by_key<T, Tv, isAscending>(pVal, pKey);
+                compute::command_queue c_queue(getQueue()());
+                compute::context c_context(getContext()());
+
+                // Create buffer iterators for seq
+                compute::buffer pSeq_buf((*pSeq.data)());
+                compute::buffer_iterator<Tv> seq0 = compute::make_buffer_iterator<Tv>(pSeq_buf, 0);
+                compute::buffer_iterator<Tv> seqN = compute::make_buffer_iterator<Tv>(pSeq_buf, elements);
+
+                // Copy val, idx into X pair
+                cl::Buffer* X = bufferAlloc(elements * sizeof(IndexPair));
+                // Use T here, not Tk
+                kernel::makePair<T, Tv>(X, pVal.data, pIdx.data, elements);
+                compute::buffer X_buf((*X)());
+                compute::buffer_iterator<IndexPair> X0 = compute::make_buffer_iterator<IndexPair>(X_buf, 0);
+                compute::buffer_iterator<IndexPair> XN = compute::make_buffer_iterator<IndexPair>(X_buf, elements);
+
+                // FIRST SORT CALL
+                compute::function<bool(IndexPair, IndexPair)> IPCompare =
+                    makeCompareFunction<Tk, Tv, isAscending>();
+
+                compute::sort_by_key(X0, XN, seq0, IPCompare, c_queue);
+                getQueue().finish();
+
+                // Needs to be ascending (true) in order to maintain the indices properly
+                //kernel::sort0_by_key<Tv, T, true>(pKey, pVal);
+                //
+                // Because we use a pair as values, we need to use a custom comparator
+                BOOST_COMPUTE_FUNCTION(bool, Compare_Tv, (const Tv lhs, const Tv rhs),
+                    {
+                        return lhs < rhs;
+                    }
+                );
+                compute::sort_by_key(seq0, seqN, X0, Compare_Tv, c_queue);
+                getQueue().finish();
+
+                kernel::splitPair<T, Tv>(pVal.data, pIdx.data, X, elements);
+
+                //// No need of doing moddims here because the original Array<T>
+                //// dimensions have not been changed
+                ////val.modDims(inDims);
+
+                CL_DEBUG_FINISH(getQueue());
+                bufferFree(key);
+                bufferFree(X);
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
+
+        template<typename T, bool isAscending>
+        void sort0Index(Param val, Param idx)
+        {
+            int higherDims =  val.info.dims[1] * val.info.dims[2] * val.info.dims[3];
+            // TODO Make a better heurisitic
+            if(higherDims > 5)
+              sortIndexBatched<T, isAscending, 0>(val, idx);
+            else
+                kernel::sort0IndexIterative<T, isAscending>(val, idx);
+        }
     }
 }
 
diff --git a/src/backend/opencl/kernel/sort_make_pair.cl b/src/backend/opencl/kernel/sort_make_pair.cl
new file mode 100644
index 0000000000..f5e5413d73
--- /dev/null
+++ b/src/backend/opencl/kernel/sort_make_pair.cl
@@ -0,0 +1,43 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+struct IndexPair
+{
+    Tk first;
+    Tv second;
+};
+
+typedef struct IndexPair IndexPair_t;
+
+__kernel
+void make_pair_kernel(__global IndexPair_t *out,
+                      __global const Tk *first, __global const Tv *second,
+                      const unsigned N)
+{
+    int tIdx = get_group_id(0) * get_local_size(0) * copyPairIter + get_local_id(0);
+    const int blockDimX = get_local_size(0);
+
+    for(int i = tIdx; i < N; i += blockDimX) {
+        out[i].first  = first[i];
+        out[i].second = second[i];
+    }
+}
+
+__kernel
+void split_pair_kernel( __global Tk *first, __global Tv *second,
+                       __global const IndexPair_t *out, const unsigned N)
+{
+    int tIdx = get_group_id(0) * get_local_size(0) * copyPairIter + get_local_id(0);
+    const int blockDimX = get_local_size(0);
+
+    for(int i = tIdx; i < N; i += blockDimX) {
+        first[i]  = out[i].first;
+        second[i] = out[i].second;
+    }
+}
diff --git a/src/backend/opencl/sort_by_key/impl.hpp b/src/backend/opencl/sort_by_key/impl.hpp
index 68c5ce70ae..f68fe91b3c 100644
--- a/src/backend/opencl/sort_by_key/impl.hpp
+++ b/src/backend/opencl/sort_by_key/impl.hpp
@@ -11,6 +11,7 @@
 #include <copy.hpp>
 #include <sort_by_key.hpp>
 #include <kernel/sort_by_key.hpp>
+#include <reorder.hpp>
 #include <math.hpp>
 #include <stdexcept>
 #include <err_opencl.hpp>
@@ -24,12 +25,32 @@ namespace opencl
         try {
             okey = copyArray<Tk>(ikey);
             oval = copyArray<Tv>(ival);
+
             switch(dim) {
-            case 0: kernel::sort0_by_key<Tk, Tv, isAscending>(okey, oval);
-                break;
-            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
+                case 0: kernel::sort0ByKey<Tk, Tv, isAscending>(okey, oval); break;
+                case 1: kernel::sortByKeyBatched<Tk, Tv, isAscending, 1>(okey, oval); break;
+                case 2: kernel::sortByKeyBatched<Tk, Tv, isAscending, 2>(okey, oval); break;
+                case 3: kernel::sortByKeyBatched<Tk, Tv, isAscending, 3>(okey, oval); break;
+                default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
+            }
+
+            if(dim != 0) {
+                af::dim4 preorderDims = okey.dims();
+                af::dim4 reorderDims(0, 1, 2, 3);
+                reorderDims[dim] = 0;
+                preorderDims[0] = okey.dims()[dim];
+                for(int i = 1; i <= (int)dim; i++) {
+                    reorderDims[i - 1] = i;
+                    preorderDims[i] = okey.dims()[i - 1];
+                }
+
+                okey.setDataDims(preorderDims);
+                oval.setDataDims(preorderDims);
+
+                okey = reorder<Tk>(okey, reorderDims);
+                oval = reorder<Tv>(oval, reorderDims);
             }
-        }catch(std::exception &ex) {
+        } catch(std::exception &ex) {
             AF_ERROR(ex.what(), AF_ERR_INTERNAL);
         }
     }
diff --git a/src/backend/opencl/sort_index.cpp b/src/backend/opencl/sort_index.cpp
index c7aaa70feb..49795c2eb5 100644
--- a/src/backend/opencl/sort_index.cpp
+++ b/src/backend/opencl/sort_index.cpp
@@ -14,6 +14,8 @@
 #include <math.hpp>
 #include <stdexcept>
 #include <err_opencl.hpp>
+#include <reorder.hpp>
+#include <range.hpp>
 
 namespace opencl
 {
@@ -22,17 +24,38 @@ namespace opencl
     {
         try {
             val = copyArray<T>(in);
-            idx = createEmptyArray<uint>(in.dims());
+            idx = range<uint>(in.dims(), dim);
+            idx.eval();
 
             switch(dim) {
-            case 0: kernel::sort0_index<T, isAscending>(val, idx);
-                break;
-            default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
+                case 0: kernel::sort0Index<T, isAscending>(val, idx); break;
+                case 1: kernel::sortIndexBatched<T, isAscending, 1>(val, idx); break;
+                case 2: kernel::sortIndexBatched<T, isAscending, 2>(val, idx); break;
+                case 3: kernel::sortIndexBatched<T, isAscending, 3>(val, idx); break;
+                default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
             }
-        }         catch (std::exception &ex) {
+
+            if(dim != 0) {
+                af::dim4 preorderDims = val.dims();
+                af::dim4 reorderDims(0, 1, 2, 3);
+                reorderDims[dim] = 0;
+                preorderDims[0] = val.dims()[dim];
+                for(int i = 1; i <= (int)dim; i++) {
+                    reorderDims[i - 1] = i;
+                    preorderDims[i] = val.dims()[i - 1];
+                }
+
+                val.setDataDims(preorderDims);
+                idx.setDataDims(preorderDims);
+
+                val = reorder<T>(val, reorderDims);
+                idx = reorder<uint>(idx, reorderDims);
+            }
+        } catch (std::exception &ex) {
             AF_ERROR(ex.what(), AF_ERR_INTERNAL);
         }
     }
+
 #define INSTANTIATE(T)                                                  \
     template void sort_index<T, true>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
                                       const uint dim);                  \

From efdc54264dde88b3635e0c6c0a1017293cd25353 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 13:18:18 -0400
Subject: [PATCH 16/28] Combine sort_index and sort_by_key kernels in CPU

---
 src/backend/cpu/kernel/sort_index.hpp | 140 --------------------------
 src/backend/cpu/sort_index.cpp        |  33 +++---
 2 files changed, 17 insertions(+), 156 deletions(-)
 delete mode 100644 src/backend/cpu/kernel/sort_index.hpp

diff --git a/src/backend/cpu/kernel/sort_index.hpp b/src/backend/cpu/kernel/sort_index.hpp
deleted file mode 100644
index 7a23a7df49..0000000000
--- a/src/backend/cpu/kernel/sort_index.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*******************************************************
- * Copyright (c) 2015, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#pragma once
-#include <af/defines.h>
-#include <Array.hpp>
-#include <math.hpp>
-#include <algorithm>
-#include <numeric>
-#include <err_cpu.hpp>
-#include <functional>
-#include <kernel/sort_helper.hpp>
-
-namespace cpu
-{
-namespace kernel
-{
-
-template<typename T, bool isAscending>
-void sort0IndexIterative(Array<T> val, Array<uint> idx)
-{
-    // initialize original index locations
-    uint *idx_ptr = idx.get();
-       T *val_ptr = val.get();
-
-    std::vector<IndexPair<T, uint> > X;
-    X.reserve(val.dims()[0]);
-
-    for(dim_t w = 0; w < val.dims()[3]; w++) {
-        dim_t valW = w * val.strides()[3];
-        dim_t idxW = w * idx.strides()[3];
-        for(dim_t z = 0; z < val.dims()[2]; z++) {
-            dim_t valWZ = valW + z * val.strides()[2];
-            dim_t idxWZ = idxW + z * idx.strides()[2];
-            for(dim_t y = 0; y < val.dims()[1]; y++) {
-                dim_t valOffset = valWZ + y * val.strides()[1];
-                dim_t idxOffset = idxWZ + y * idx.strides()[1];
-
-                X.clear();
-                std::transform(val_ptr + valOffset, val_ptr + valOffset + val.dims()[0],
-                               idx_ptr + idxOffset,
-                               std::back_inserter(X),
-                               [](T v_, uint i_) { return std::make_pair(v_, i_); }
-                               );
-
-                //comp_ptr = &X.front();
-                std::stable_sort(X.begin(), X.end(), IPCompare<T, uint, isAscending>());
-
-                for(unsigned it = 0; it < X.size(); it++) {
-                    val_ptr[valOffset + it] = X[it].first;
-                    idx_ptr[idxOffset + it] = X[it].second;
-                }
-            }
-        }
-    }
-
-    return;
-}
-
-template<typename T, bool isAscending, int dim>
-void sortIndexBatched(Array<T> val, Array<uint> idx)
-{
-    af::dim4 inDims = val.dims();
-
-    af::dim4 tileDims(1);
-    af::dim4 seqDims = inDims;
-    tileDims[dim] = inDims[dim];
-    seqDims[dim] = 1;
-
-    uint* key = memAlloc<uint>(inDims.elements());
-    // IOTA
-    {
-        af::dim4 dims    = inDims;
-        uint* out        = key;
-        af::dim4 strides(1);
-        for(int i = 1; i < 4; i++)
-            strides[i] = strides[i-1] * dims[i-1];
-
-        for(dim_t w = 0; w < dims[3]; w++) {
-            dim_t offW = w * strides[3];
-            uint valW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2];
-            for(dim_t z = 0; z < dims[2]; z++) {
-                dim_t offWZ = offW + z * strides[2];
-                uint valZ = valW + (z % seqDims[2]) * seqDims[0] * seqDims[1];
-                for(dim_t y = 0; y < dims[1]; y++) {
-                    dim_t offWZY = offWZ + y * strides[1];
-                    uint valY = valZ + (y % seqDims[1]) * seqDims[0];
-                    for(dim_t x = 0; x < dims[0]; x++) {
-                        dim_t id = offWZY + x;
-                        out[id] = valY + (x % seqDims[0]);
-                    }
-                }
-            }
-        }
-    }
-
-    // initialize original index locations
-    uint *idx_ptr = idx.get();
-       T *val_ptr = val.get();
-
-    std::vector<KeyIndexPair<T, uint> > X;
-    X.reserve(val.elements());
-
-    for(unsigned i = 0; i < val.elements(); i++) {
-        X.push_back(std::make_pair(std::make_pair(val_ptr[i], idx_ptr[i]), key[i]));
-    }
-
-    memFree(key); // key is no longer required
-
-    std::stable_sort(X.begin(), X.end(), KIPCompareV<T, uint, isAscending>());
-
-    std::stable_sort(X.begin(), X.end(), KIPCompareK<T, uint, true>());
-
-    for(unsigned it = 0; it < val.elements(); it++) {
-        val_ptr[it] = X[it].first.first;
-        idx_ptr[it] = X[it].first.second;
-    }
-
-    return;
-}
-
-template<typename T, bool isAscending>
-void sort0Index(Array<T> val, Array<unsigned> idx)
-{
-    int higherDims =  val.dims()[1] * val.dims()[2] * val.dims()[3];
-    // TODO Make a better heurisitic
-    if(higherDims > 0)
-        kernel::sortIndexBatched<T, isAscending, 0>(val, idx);
-    else
-        kernel::sort0IndexIterative<T, isAscending>(val, idx);
-}
-
-}
-}
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index 36ca57b3e8..b865db9c1c 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -17,43 +17,44 @@
 #include <range.hpp>
 #include <copy.hpp>
 #include <reorder.hpp>
-#include <kernel/sort_index.hpp>
+#include <kernel/sort_by_key.hpp>
 
 namespace cpu
 {
 
 template<typename T, bool isAscending>
-void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
+void sort_index(Array<T> &okey, Array<uint> &oval, const Array<T> &in, const uint dim)
 {
     in.eval();
 
-    val = copyArray<T>(in);
-    idx = range<uint>(in.dims(), dim);
-    idx.eval();
+    // okey is values, oval is indices
+    okey = copyArray<T>(in);
+    oval = range<uint>(in.dims(), dim);
+    oval.eval();
 
     switch(dim) {
-        case 0: getQueue().enqueue(kernel::sort0Index<T, isAscending>, val, idx); break;
-        case 1: getQueue().enqueue(kernel::sortIndexBatched<T, isAscending, 1>, val, idx); break;
-        case 2: getQueue().enqueue(kernel::sortIndexBatched<T, isAscending, 2>, val, idx); break;
-        case 3: getQueue().enqueue(kernel::sortIndexBatched<T, isAscending, 3>, val, idx); break;
+        case 0: getQueue().enqueue(kernel::sort0ByKey<T, uint, isAscending>, okey, oval); break;
+        case 1: getQueue().enqueue(kernel::sortByKeyBatched<T, uint, isAscending, 1>, okey, oval); break;
+        case 2: getQueue().enqueue(kernel::sortByKeyBatched<T, uint, isAscending, 2>, okey, oval); break;
+        case 3: getQueue().enqueue(kernel::sortByKeyBatched<T, uint, isAscending, 3>, okey, oval); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
 
     if(dim != 0) {
-        af::dim4 preorderDims = val.dims();
+        af::dim4 preorderDims = okey.dims();
         af::dim4 reorderDims(0, 1, 2, 3);
         reorderDims[dim] = 0;
-        preorderDims[0] = val.dims()[dim];
+        preorderDims[0] = okey.dims()[dim];
         for(int i = 1; i <= (int)dim; i++) {
             reorderDims[i - 1] = i;
-            preorderDims[i] = val.dims()[i - 1];
+            preorderDims[i] = okey.dims()[i - 1];
         }
 
-        val.setDataDims(preorderDims);
-        idx.setDataDims(preorderDims);
+        okey.setDataDims(preorderDims);
+        oval.setDataDims(preorderDims);
 
-        val = reorder<T>(val, reorderDims);
-        idx = reorder<uint>(idx, reorderDims);
+        okey = reorder<T>(okey, reorderDims);
+        oval = reorder<uint>(oval, reorderDims);
     }
 }
 

From 460bf6c8baaf1b7fce4a758dbe3f9cd48c291499 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 13:29:17 -0400
Subject: [PATCH 17/28] Combine sort_index and sort_by_key kernels in CUDA

---
 src/backend/cuda/kernel/harris.hpp      |   4 +-
 src/backend/cuda/kernel/orb.hpp         |   4 +-
 src/backend/cuda/kernel/sort_by_key.hpp |   3 +-
 src/backend/cuda/kernel/sort_index.hpp  | 141 ------------------------
 src/backend/cuda/sort_index.cu          |  32 +++---
 5 files changed, 22 insertions(+), 162 deletions(-)
 delete mode 100644 src/backend/cuda/kernel/sort_index.hpp

diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index 3cb28b2b2f..c773ae45c5 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -18,7 +18,7 @@
 #include <convolve_common.hpp>
 #include "convolve.hpp"
 #include "gradient.hpp"
-#include "sort_index.hpp"
+#include "sort_by_key.hpp"
 
 namespace cuda
 {
@@ -339,7 +339,7 @@ void harris(unsigned* corners_out,
         harris_idx.ptr = memAlloc<unsigned>(sort_elem);
 
         // Sort Harris responses
-        sort0Index<float, false>(harris_responses, harris_idx);
+        sort0ByKey<float, uint, false>(harris_responses, harris_idx);
 
         *x_out = memAlloc<float>(*corners_out);
         *y_out = memAlloc<float>(*corners_out);
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index 8448418f8b..b5ed10340e 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -16,7 +16,7 @@
 #include <convolve_common.hpp>
 #include "convolve.hpp"
 #include "orb_patch.hpp"
-#include "sort_index.hpp"
+#include "sort_by_key.hpp"
 
 #include <boost/scoped_ptr.hpp>
 
@@ -397,7 +397,7 @@ void orb(unsigned* out_feat,
         harris_idx.ptr = memAlloc<unsigned>(sort_elem);
 
         // Sort features according to Harris responses
-        sort0Index<float, false>(harris_sorted, harris_idx);
+        sort0ByKey<float, uint, false>(harris_sorted, harris_idx);
 
         feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]);
 
diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp
index beffa5476e..1536d1bf53 100644
--- a/src/backend/cuda/kernel/sort_by_key.hpp
+++ b/src/backend/cuda/kernel/sort_by_key.hpp
@@ -109,6 +109,7 @@ namespace cuda
             POST_LAUNCH_CHECK();
 
             // Needs to be ascending (true) in order to maintain the indices properly
+            //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
             THRUST_SELECT(thrust::stable_sort_by_key,
                           dSeq,
                           dSeq + inDims.elements(),
@@ -132,7 +133,7 @@ namespace cuda
             int higherDims =  okey.dims[1] * okey.dims[2] * okey.dims[3];
             // TODO Make a better heurisitic
             if(higherDims > 5)
-                sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
+                kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
             else
                 kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(okey, oval);
         }
diff --git a/src/backend/cuda/kernel/sort_index.hpp b/src/backend/cuda/kernel/sort_index.hpp
deleted file mode 100644
index 40a5d59311..0000000000
--- a/src/backend/cuda/kernel/sort_index.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <math.hpp>
-#include <dispatch.hpp>
-#include <Param.hpp>
-#include <err_cuda.hpp>
-#include <debug_cuda.hpp>
-#include <kernel/sort_helper.hpp>
-#include <kernel/iota.hpp>
-
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-#include <thrust/sort.h>
-
-namespace cuda
-{
-    namespace kernel
-    {
-        ///////////////////////////////////////////////////////////////////////////
-        // Wrapper functions
-        ///////////////////////////////////////////////////////////////////////////
-        template<typename T, bool isAscending>
-        void sort0IndexIterative(Param<T> val, Param<unsigned> idx)
-        {
-            thrust::device_ptr<T>        val_ptr = thrust::device_pointer_cast(val.ptr);
-            thrust::device_ptr<unsigned> idx_ptr = thrust::device_pointer_cast(idx.ptr);
-
-            for(int w = 0; w < val.dims[3]; w++) {
-                int valW = w * val.strides[3];
-                int idxW = w * idx.strides[3];
-                for(int z = 0; z < val.dims[2]; z++) {
-                    int valWZ = valW + z * val.strides[2];
-                    int idxWZ = idxW + z * idx.strides[2];
-                    for(int y = 0; y < val.dims[1]; y++) {
-
-                        int valOffset = valWZ + y * val.strides[1];
-                        int idxOffset = idxWZ + y * idx.strides[1];
-
-                        if(isAscending) {
-                            THRUST_SELECT(thrust::stable_sort_by_key,
-                                    val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
-                                    idx_ptr + idxOffset);
-                        } else {
-                            THRUST_SELECT(thrust::stable_sort_by_key,
-                                        val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
-                                        idx_ptr + idxOffset, thrust::greater<T>());
-                        }
-                    }
-                }
-            }
-            POST_LAUNCH_CHECK();
-        }
-
-        template<typename T, bool isAscending, int dim>
-        void sortIndexBatched(Param<T> pVal, Param<unsigned> pIdx)
-        {
-            af::dim4 inDims;
-            for(int i = 0; i < 4; i++)
-                inDims[i] = pVal.dims[i];
-
-            // Sort dimension
-            // tileDims * seqDims = inDims
-            af::dim4 tileDims(1);
-            af::dim4 seqDims = inDims;
-            tileDims[dim] = inDims[dim];
-            seqDims[dim] = 1;
-
-            // Create/call iota
-            // Array<uint> seq = iota<uint>(seqDims, tileDims);
-            dim4 keydims = inDims;
-            uint* key = memAlloc<uint>(keydims.elements());
-            Param<uint> pSeq;
-            pSeq.ptr = key;
-            pSeq.strides[0] = 1;
-            pSeq.dims[0] = keydims[0];
-            for(int i = 1; i < 4; i++) {
-                pSeq.dims[i] = keydims[i];
-                pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1];
-            }
-            cuda::kernel::iota<uint>(pSeq, seqDims, tileDims);
-
-            // Flat - Not required since inplace and both are continuous
-            //val.modDims(inDims.elements());
-            //key.modDims(inDims.elements());
-
-            // Make val, idx into a pair
-            thrust::device_vector<IndexPair<T, uint> > X(inDims.elements());
-            IndexPair<T, uint> *Xptr = thrust::raw_pointer_cast(X.data());
-
-            const int threads = 256;
-            int blocks = divup(inDims.elements(), threads * copyPairIter);
-            CUDA_LAUNCH((makeIndexPair<T, unsigned>), blocks, threads,
-                        Xptr, pVal.ptr, pIdx.ptr, inDims.elements());
-
-            // Sort indices
-            // sort_by_key<T, uint, isAscending>(*resVal, *resKey, val, key, 0);
-            thrust::device_ptr<uint> dSeq = thrust::device_pointer_cast(pSeq.ptr);
-            THRUST_SELECT(thrust::stable_sort_by_key,
-                          X.begin(), X.end(),
-                          dSeq,
-                          IPCompare<T, uint, isAscending>());
-            POST_LAUNCH_CHECK();
-
-            // Needs to be ascending (true) in order to maintain the indices properly
-            //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
-            THRUST_SELECT(thrust::stable_sort_by_key,
-                          dSeq,
-                          dSeq + inDims.elements(),
-                          X.begin());
-            POST_LAUNCH_CHECK();
-
-            CUDA_LAUNCH((splitIndexPair<T, unsigned>), blocks, threads,
-                        pVal.ptr, pIdx.ptr, Xptr, inDims.elements());
-            POST_LAUNCH_CHECK();
-
-            // No need of doing moddims here because the original Array<T>
-            // dimensions have not been changed
-            //val.modDims(inDims);
-
-            memFree(key);
-        }
-
-        template<typename T, bool isAscending>
-        void sort0Index(Param<T> val, Param<unsigned> idx)
-        {
-            int higherDims =  val.dims[1] * val.dims[2] * val.dims[3];
-            // TODO Make a better heurisitic
-            if(higherDims > 5)
-                sortIndexBatched<T, isAscending, 0>(val, idx);
-            else
-                kernel::sort0IndexIterative<T, isAscending>(val, idx);
-        }
-    }
-}
diff --git a/src/backend/cuda/sort_index.cu b/src/backend/cuda/sort_index.cu
index ab54c24a9c..03c69ad4f3 100644
--- a/src/backend/cuda/sort_index.cu
+++ b/src/backend/cuda/sort_index.cu
@@ -9,7 +9,7 @@
 
 #include <Array.hpp>
 #include <sort_index.hpp>
-#include <kernel/sort_index.hpp>
+#include <kernel/sort_by_key.hpp>
 #include <copy.hpp>
 #include <math.hpp>
 #include <stdexcept>
@@ -20,35 +20,35 @@
 namespace cuda
 {
     template<typename T, bool isAscending>
-    void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
+    void sort_index(Array<T> &okey, Array<uint> &oval, const Array<T> &in, const uint dim)
     {
-        val = copyArray<T>(in);
-        idx = range<uint>(in.dims(), dim);
-        idx.eval();
+        okey = copyArray<T>(in);
+        oval = range<uint>(in.dims(), dim);
+        oval.eval();
 
         switch(dim) {
-            case 0: kernel::sort0Index<T, isAscending>(val, idx); break;
-            case 1: kernel::sortIndexBatched<T, isAscending, 1>(val, idx); break;
-            case 2: kernel::sortIndexBatched<T, isAscending, 2>(val, idx); break;
-            case 3: kernel::sortIndexBatched<T, isAscending, 3>(val, idx); break;
+            case 0: kernel::sort0ByKey<T, uint, isAscending>(okey, oval); break;
+            case 1: kernel::sortByKeyBatched<T, uint, isAscending, 1>(okey, oval); break;
+            case 2: kernel::sortByKeyBatched<T, uint, isAscending, 2>(okey, oval); break;
+            case 3: kernel::sortByKeyBatched<T, uint, isAscending, 3>(okey, oval); break;
             default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
         }
 
         if(dim != 0) {
-            af::dim4 preorderDims = val.dims();
+            af::dim4 preorderDims = okey.dims();
             af::dim4 reorderDims(0, 1, 2, 3);
             reorderDims[dim] = 0;
-            preorderDims[0] = val.dims()[dim];
+            preorderDims[0] = okey.dims()[dim];
             for(int i = 1; i <= (int)dim; i++) {
                 reorderDims[i - 1] = i;
-                preorderDims[i] = val.dims()[i - 1];
+                preorderDims[i] = okey.dims()[i - 1];
             }
 
-            val.setDataDims(preorderDims);
-            idx.setDataDims(preorderDims);
+            okey.setDataDims(preorderDims);
+            oval.setDataDims(preorderDims);
 
-            val = reorder<T>(val, reorderDims);
-            idx = reorder<uint>(idx, reorderDims);
+            okey = reorder<T>(okey, reorderDims);
+            oval = reorder<uint>(oval, reorderDims);
         }
     }
 

From 363e86e5031f64689db50efb08b3323dea443dba Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 13:29:46 -0400
Subject: [PATCH 18/28] Combine sort_index and sort_by_key kernels in OpenCL

---
 src/backend/opencl/kernel/harris.hpp          |   4 +-
 src/backend/opencl/kernel/orb.hpp             |   4 +-
 src/backend/opencl/kernel/sift_nonfree.hpp    |   3 +-
 src/backend/opencl/kernel/sort_helper.hpp     |  26 +--
 src/backend/opencl/kernel/sort_index.hpp      | 199 ------------------
 .../{sort_make_pair.cl => sort_pair.cl}       |   0
 src/backend/opencl/sort_index.cpp             |  33 +--
 7 files changed, 36 insertions(+), 233 deletions(-)
 delete mode 100644 src/backend/opencl/kernel/sort_index.hpp
 rename src/backend/opencl/kernel/{sort_make_pair.cl => sort_pair.cl} (100%)

diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp
index 3c0e531d6d..4c203c9377 100644
--- a/src/backend/opencl/kernel/harris.hpp
+++ b/src/backend/opencl/kernel/harris.hpp
@@ -16,7 +16,7 @@
 #include <convolve_common.hpp>
 #include <kernel/convolve_separable.hpp>
 #include <kernel/gradient.hpp>
-#include <kernel/sort_index.hpp>
+#include <kernel/sort_by_key.hpp>
 #include <kernel_headers/harris.hpp>
 #include <memory.hpp>
 #include <map>
@@ -287,7 +287,7 @@ void harris(unsigned* corners_out,
             harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned));
 
             // Sort Harris responses
-            sort0Index<float, false>(harris_resp, harris_idx);
+            sort0ByKey<float, uint, false>(harris_resp, harris_idx);
 
             x_out.data = bufferAlloc(*corners_out * sizeof(float));
             y_out.data = bufferAlloc(*corners_out * sizeof(float));
diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp
index 317bb4e3d8..612edacfe0 100644
--- a/src/backend/opencl/kernel/orb.hpp
+++ b/src/backend/opencl/kernel/orb.hpp
@@ -16,7 +16,7 @@
 #include <kernel/convolve_separable.hpp>
 #include <kernel/fast.hpp>
 #include <kernel/resize.hpp>
-#include <kernel/sort_index.hpp>
+#include <kernel/sort_by_key.hpp>
 #include <kernel_headers/orb.hpp>
 #include <memory.hpp>
 #include <vector>
@@ -305,7 +305,7 @@ void orb(unsigned* out_feat,
             d_harris_sorted.data = d_score_harris;
             d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned));
 
-            sort0Index<float, false>(d_harris_sorted, d_harris_idx);
+            sort0ByKey<float, uint, false>(d_harris_sorted, d_harris_idx);
 
             cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float));
             cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float));
diff --git a/src/backend/opencl/kernel/sift_nonfree.hpp b/src/backend/opencl/kernel/sift_nonfree.hpp
index c28f432fce..0b78ef02f3 100644
--- a/src/backend/opencl/kernel/sift_nonfree.hpp
+++ b/src/backend/opencl/kernel/sift_nonfree.hpp
@@ -92,11 +92,12 @@
 #include <kernel/convolve_separable.hpp>
 #include <kernel/fast.hpp>
 #include <kernel/resize.hpp>
-#include <kernel/sort_index.hpp>
 #include <kernel_headers/sift_nonfree.hpp>
 #include <memory.hpp>
 #include <vector>
 
+namespace compute = boost::compute;
+
 using cl::Buffer;
 using cl::Program;
 using cl::Kernel;
diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp
index 6ba9eff0ae..7f500c6bb0 100644
--- a/src/backend/opencl/kernel/sort_helper.hpp
+++ b/src/backend/opencl/kernel/sort_helper.hpp
@@ -8,7 +8,7 @@
  ********************************************************/
 
 #pragma once
-#include <kernel_headers/sort_make_pair.hpp>
+#include <kernel_headers/sort_pair.hpp>
 #include <program.hpp>
 #include <traits.hpp>
 #include <string>
@@ -82,8 +82,8 @@ namespace opencl
         {
             try {
                 static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-                static std::map<int, Program*>   makePairProgs;
-                static std::map<int, Kernel*>  makePairKernels;
+                static std::map<int, Program*>   sortPairProgs;
+                static std::map<int, Kernel*>  sortPairKernels;
 
                 int device = getActiveDeviceId();
 
@@ -99,13 +99,13 @@ namespace opencl
                         options << " -D USE_DOUBLE";
                     }
                     Program prog;
-                    buildProgram(prog, sort_make_pair_cl, sort_make_pair_cl_len, options.str());
-                    makePairProgs[device]   = new Program(prog);
-                    makePairKernels[device] = new Kernel(*makePairProgs[device], "make_pair_kernel");
+                    buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str());
+                    sortPairProgs[device]   = new Program(prog);
+                    sortPairKernels[device] = new Kernel(*sortPairProgs[device], "make_pair_kernel");
                 });
 
                 auto makePairOp = make_kernel<Buffer, const Buffer, const Buffer, const unsigned>
-                                          (*makePairKernels[device]);
+                                          (*sortPairKernels[device]);
 
                 NDRange local(256, 1, 1);
                 NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1);
@@ -124,8 +124,8 @@ namespace opencl
         {
             try {
                 static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-                static std::map<int, Program*>   splitPairProgs;
-                static std::map<int, Kernel*>  splitPairKernels;
+                static std::map<int, Program*>   sortPairProgs;
+                static std::map<int, Kernel*>  sortPairKernels;
 
                 int device = getActiveDeviceId();
 
@@ -141,13 +141,13 @@ namespace opencl
                         options << " -D USE_DOUBLE";
                     }
                     Program prog;
-                    buildProgram(prog, sort_make_pair_cl, sort_make_pair_cl_len, options.str());
-                    splitPairProgs[device]   = new Program(prog);
-                    splitPairKernels[device] = new Kernel(*splitPairProgs[device], "split_pair_kernel");
+                    buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str());
+                    sortPairProgs[device]   = new Program(prog);
+                    sortPairKernels[device] = new Kernel(*sortPairProgs[device], "split_pair_kernel");
                 });
 
                 auto splitPairOp = make_kernel<Buffer, Buffer, const Buffer, const unsigned>
-                                          (*splitPairKernels[device]);
+                                          (*sortPairKernels[device]);
 
                 NDRange local(256, 1, 1);
                 NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1);
diff --git a/src/backend/opencl/kernel/sort_index.hpp b/src/backend/opencl/kernel/sort_index.hpp
deleted file mode 100644
index ef5faa612f..0000000000
--- a/src/backend/opencl/kernel/sort_index.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#pragma once
-#include <program.hpp>
-#include <traits.hpp>
-#include <string>
-#include <mutex>
-#include <dispatch.hpp>
-#include <Param.hpp>
-#include <debug_opencl.hpp>
-#include <kernel/sort_helper.hpp>
-#include <kernel/iota.hpp>
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-
-#include <boost/compute/core.hpp>
-#include <boost/compute/algorithm/sort_by_key.hpp>
-#include <boost/compute/functional/operator.hpp>
-#include <boost/compute/iterator/buffer_iterator.hpp>
-#include <boost/compute/algorithm/copy.hpp>
-#include <boost/compute/algorithm/transform.hpp>
-#include <boost/compute/container/vector.hpp>
-#include <boost/compute/functional/get.hpp>
-#include <boost/compute/functional/field.hpp>
-#include <boost/compute/types/pair.hpp>
-
-namespace compute = boost::compute;
-
-using cl::Buffer;
-using cl::Program;
-using cl::Kernel;
-using cl::make_kernel;
-using cl::EnqueueArgs;
-using cl::NDRange;
-using std::string;
-
-namespace opencl
-{
-    namespace kernel
-    {
-        template<typename T, bool isAscending>
-        void sort0IndexIterative(Param val, Param idx)
-        {
-            try {
-                compute::command_queue c_queue(getQueue()());
-
-                compute::buffer val_buf((*val.data)());
-                compute::buffer idx_buf((*idx.data)());
-
-                for(int w = 0; w < (int)val.info.dims[3]; w++) {
-                    int valW = w * (int)val.info.strides[3];
-                    int idxW = w * idx.info.strides[3];
-                    for(int z = 0; z < (int)val.info.dims[2]; z++) {
-                        int valWZ = valW + z * (int)val.info.strides[2];
-                        int idxWZ = idxW + z * idx.info.strides[2];
-                        for(int y = 0; y < (int)val.info.dims[1]; y++) {
-
-                            int valOffset = valWZ + y * val.info.strides[1];
-                            int idxOffset = idxWZ + y * idx.info.strides[1];
-
-                            if(isAscending) {
-                                compute::sort_by_key(
-                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
-                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
-                                        compute::make_buffer_iterator< type_t<uint> >(idx_buf, idxOffset),
-                                        compute::less< type_t<T> >(), c_queue);
-                            } else {
-                                compute::sort_by_key(
-                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset),
-                                        compute::make_buffer_iterator< type_t<T> >(val_buf, valOffset + val.info.dims[0]),
-                                        compute::make_buffer_iterator< type_t<uint> >(idx_buf, idxOffset),
-                                        compute::greater< type_t<T> >(), c_queue);
-                            }
-                        }
-                    }
-                }
-
-                CL_DEBUG_FINISH(getQueue());
-            } catch (cl::Error err) {
-                CL_TO_AF_ERROR(err);
-                throw;
-            }
-        }
-
-        template<typename T, bool isAscending, int dim>
-        void sortIndexBatched(Param pVal, Param pIdx)
-        {
-            typedef type_t<T> Tk;
-            typedef uint Tv;
-            typedef std::pair<Tk, Tv> IndexPair;
-
-            try {
-                af::dim4 inDims;
-                for(int i = 0; i < 4; i++)
-                    inDims[i] = pVal.info.dims[i];
-
-                // Sort dimension
-                // tileDims * seqDims = inDims
-                af::dim4 tileDims(1);
-                af::dim4 seqDims = inDims;
-                tileDims[dim] = inDims[dim];
-                seqDims[dim] = 1;
-
-                // Create/call iota
-                // Array<Tv> key = iota<Tv>(seqDims, tileDims);
-                dim4 keydims = inDims;
-                cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(Tv));
-                Param pSeq;
-                pSeq.data = key;
-                pSeq.info.offset = 0;
-                pSeq.info.dims[0] = keydims[0];
-                pSeq.info.strides[0] = 1;
-                for(int i = 1; i < 4; i++) {
-                    pSeq.info.dims[i] = keydims[i];
-                    pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1];
-                }
-                kernel::iota<Tv>(pSeq, seqDims, tileDims);
-
-                int elements = inDims.elements();
-
-                // Flat - Not required since inplace and both are continuous
-                //val.modDims(inDims.elements());
-                //key.modDims(inDims.elements());
-
-                // Sort indices
-                // sort_by_key<T, Tv, isAscending>(*resVal, *resKey, val, key, 0);
-                //kernel::sort0_by_key<T, Tv, isAscending>(pVal, pKey);
-                compute::command_queue c_queue(getQueue()());
-                compute::context c_context(getContext()());
-
-                // Create buffer iterators for seq
-                compute::buffer pSeq_buf((*pSeq.data)());
-                compute::buffer_iterator<Tv> seq0 = compute::make_buffer_iterator<Tv>(pSeq_buf, 0);
-                compute::buffer_iterator<Tv> seqN = compute::make_buffer_iterator<Tv>(pSeq_buf, elements);
-
-                // Copy val, idx into X pair
-                cl::Buffer* X = bufferAlloc(elements * sizeof(IndexPair));
-                // Use T here, not Tk
-                kernel::makePair<T, Tv>(X, pVal.data, pIdx.data, elements);
-                compute::buffer X_buf((*X)());
-                compute::buffer_iterator<IndexPair> X0 = compute::make_buffer_iterator<IndexPair>(X_buf, 0);
-                compute::buffer_iterator<IndexPair> XN = compute::make_buffer_iterator<IndexPair>(X_buf, elements);
-
-                // FIRST SORT CALL
-                compute::function<bool(IndexPair, IndexPair)> IPCompare =
-                    makeCompareFunction<Tk, Tv, isAscending>();
-
-                compute::sort_by_key(X0, XN, seq0, IPCompare, c_queue);
-                getQueue().finish();
-
-                // Needs to be ascending (true) in order to maintain the indices properly
-                //kernel::sort0_by_key<Tv, T, true>(pKey, pVal);
-                //
-                // Because we use a pair as values, we need to use a custom comparator
-                BOOST_COMPUTE_FUNCTION(bool, Compare_Tv, (const Tv lhs, const Tv rhs),
-                    {
-                        return lhs < rhs;
-                    }
-                );
-                compute::sort_by_key(seq0, seqN, X0, Compare_Tv, c_queue);
-                getQueue().finish();
-
-                kernel::splitPair<T, Tv>(pVal.data, pIdx.data, X, elements);
-
-                //// No need of doing moddims here because the original Array<T>
-                //// dimensions have not been changed
-                ////val.modDims(inDims);
-
-                CL_DEBUG_FINISH(getQueue());
-                bufferFree(key);
-                bufferFree(X);
-            } catch (cl::Error err) {
-                CL_TO_AF_ERROR(err);
-                throw;
-            }
-        }
-
-        template<typename T, bool isAscending>
-        void sort0Index(Param val, Param idx)
-        {
-            int higherDims =  val.info.dims[1] * val.info.dims[2] * val.info.dims[3];
-            // TODO Make a better heurisitic
-            if(higherDims > 5)
-              sortIndexBatched<T, isAscending, 0>(val, idx);
-            else
-                kernel::sort0IndexIterative<T, isAscending>(val, idx);
-        }
-    }
-}
-
-#pragma GCC diagnostic pop
diff --git a/src/backend/opencl/kernel/sort_make_pair.cl b/src/backend/opencl/kernel/sort_pair.cl
similarity index 100%
rename from src/backend/opencl/kernel/sort_make_pair.cl
rename to src/backend/opencl/kernel/sort_pair.cl
diff --git a/src/backend/opencl/sort_index.cpp b/src/backend/opencl/sort_index.cpp
index 49795c2eb5..bb5474909d 100644
--- a/src/backend/opencl/sort_index.cpp
+++ b/src/backend/opencl/sort_index.cpp
@@ -10,7 +10,7 @@
 #include <Array.hpp>
 #include <sort_index.hpp>
 #include <copy.hpp>
-#include <kernel/sort_index.hpp>
+#include <kernel/sort_by_key.hpp>
 #include <math.hpp>
 #include <stdexcept>
 #include <err_opencl.hpp>
@@ -20,36 +20,37 @@
 namespace opencl
 {
     template<typename T, bool isAscending>
-    void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
+    void sort_index(Array<T> &okey, Array<uint> &oval, const Array<T> &in, const uint dim)
     {
         try {
-            val = copyArray<T>(in);
-            idx = range<uint>(in.dims(), dim);
-            idx.eval();
+            // okey contains values, oval contains indices
+            okey = copyArray<T>(in);
+            oval = range<uint>(in.dims(), dim);
+            oval.eval();
 
             switch(dim) {
-                case 0: kernel::sort0Index<T, isAscending>(val, idx); break;
-                case 1: kernel::sortIndexBatched<T, isAscending, 1>(val, idx); break;
-                case 2: kernel::sortIndexBatched<T, isAscending, 2>(val, idx); break;
-                case 3: kernel::sortIndexBatched<T, isAscending, 3>(val, idx); break;
+                case 0: kernel::sort0ByKey<T, uint, isAscending>(okey, oval); break;
+                case 1: kernel::sortByKeyBatched<T, uint, isAscending, 1>(okey, oval); break;
+                case 2: kernel::sortByKeyBatched<T, uint, isAscending, 2>(okey, oval); break;
+                case 3: kernel::sortByKeyBatched<T, uint, isAscending, 3>(okey, oval); break;
                 default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
             }
 
             if(dim != 0) {
-                af::dim4 preorderDims = val.dims();
+                af::dim4 preorderDims = okey.dims();
                 af::dim4 reorderDims(0, 1, 2, 3);
                 reorderDims[dim] = 0;
-                preorderDims[0] = val.dims()[dim];
+                preorderDims[0] = okey.dims()[dim];
                 for(int i = 1; i <= (int)dim; i++) {
                     reorderDims[i - 1] = i;
-                    preorderDims[i] = val.dims()[i - 1];
+                    preorderDims[i] = okey.dims()[i - 1];
                 }
 
-                val.setDataDims(preorderDims);
-                idx.setDataDims(preorderDims);
+                okey.setDataDims(preorderDims);
+                oval.setDataDims(preorderDims);
 
-                val = reorder<T>(val, reorderDims);
-                idx = reorder<uint>(idx, reorderDims);
+                okey = reorder<T>(okey, reorderDims);
+                oval = reorder<uint>(oval, reorderDims);
             }
         } catch (std::exception &ex) {
             AF_ERROR(ex.what(), AF_ERR_INTERNAL);

From 87513e07d16958cceed4b896ad135c55e4316028 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 13:42:54 -0400
Subject: [PATCH 19/28] Fix sort calls from harris and orb in CUDA

---
 src/backend/cuda/kernel/harris.hpp |  3 +++
 src/backend/cuda/kernel/orb.hpp    |  5 ++++-
 src/backend/cuda/kernel/range.hpp  | 14 +++++++-------
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/backend/cuda/kernel/harris.hpp b/src/backend/cuda/kernel/harris.hpp
index c773ae45c5..9361b72e23 100644
--- a/src/backend/cuda/kernel/harris.hpp
+++ b/src/backend/cuda/kernel/harris.hpp
@@ -19,6 +19,7 @@
 #include "convolve.hpp"
 #include "gradient.hpp"
 #include "sort_by_key.hpp"
+#include "range.hpp"
 
 namespace cuda
 {
@@ -336,7 +337,9 @@ void harris(unsigned* corners_out,
 
         int sort_elem = harris_responses.strides[3] * harris_responses.dims[3];
         harris_responses.ptr = d_resp_corners;
+        // Create indices using range
         harris_idx.ptr = memAlloc<unsigned>(sort_elem);
+        kernel::range<uint>(harris_idx, 0);
 
         // Sort Harris responses
         sort0ByKey<float, uint, false>(harris_responses, harris_idx);
diff --git a/src/backend/cuda/kernel/orb.hpp b/src/backend/cuda/kernel/orb.hpp
index b5ed10340e..8a2b535cee 100644
--- a/src/backend/cuda/kernel/orb.hpp
+++ b/src/backend/cuda/kernel/orb.hpp
@@ -17,6 +17,7 @@
 #include "convolve.hpp"
 #include "orb_patch.hpp"
 #include "sort_by_key.hpp"
+#include "range.hpp"
 
 #include <boost/scoped_ptr.hpp>
 
@@ -394,10 +395,12 @@ void orb(unsigned* out_feat,
 
         int sort_elem = harris_sorted.strides[3] * harris_sorted.dims[3];
         harris_sorted.ptr = d_score_harris;
+        // Create indices using range
         harris_idx.ptr = memAlloc<unsigned>(sort_elem);
+        kernel::range<uint>(harris_idx, 0);
 
         // Sort features according to Harris responses
-        sort0ByKey<float, uint, false>(harris_sorted, harris_idx);
+        kernel::sort0ByKey<float, uint, false>(harris_sorted, harris_idx);
 
         feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]);
 
diff --git a/src/backend/cuda/kernel/range.hpp b/src/backend/cuda/kernel/range.hpp
index 9670b07bd6..6880ed566a 100644
--- a/src/backend/cuda/kernel/range.hpp
+++ b/src/backend/cuda/kernel/range.hpp
@@ -18,10 +18,10 @@ namespace cuda
     namespace kernel
     {
         // Kernel Launch Config Values
-        static const unsigned TX = 32;
-        static const unsigned TY = 8;
-        static const unsigned TILEX = 512;
-        static const unsigned TILEY = 32;
+        static const unsigned RANGE_TX = 32;
+        static const unsigned RANGE_TY = 8;
+        static const unsigned RANGE_TILEX = 512;
+        static const unsigned RANGE_TILEY = 32;
 
         template<typename T>
         __global__
@@ -74,10 +74,10 @@ namespace cuda
         template<typename T>
         void range(Param<T> out, const int dim)
         {
-            dim3 threads(TX, TY, 1);
+            dim3 threads(RANGE_TX, RANGE_TY, 1);
 
-            int blocksPerMatX = divup(out.dims[0], TILEX);
-            int blocksPerMatY = divup(out.dims[1], TILEY);
+            int blocksPerMatX = divup(out.dims[0], RANGE_TILEX);
+            int blocksPerMatY = divup(out.dims[1], RANGE_TILEY);
             dim3 blocks(blocksPerMatX * out.dims[2],
                         blocksPerMatY * out.dims[3],
                         1);

From c6e08d5f14ca6a3d8f51f19c98c2223e2a158251 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 13:46:51 -0400
Subject: [PATCH 20/28] Fix sort calls from harris and orb in OpenCL

---
 src/backend/opencl/kernel/harris.hpp |  5 ++++-
 src/backend/opencl/kernel/orb.hpp    |  5 ++++-
 src/backend/opencl/kernel/range.hpp  | 14 +++++++-------
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/backend/opencl/kernel/harris.hpp b/src/backend/opencl/kernel/harris.hpp
index 4c203c9377..442275d326 100644
--- a/src/backend/opencl/kernel/harris.hpp
+++ b/src/backend/opencl/kernel/harris.hpp
@@ -17,6 +17,7 @@
 #include <kernel/convolve_separable.hpp>
 #include <kernel/gradient.hpp>
 #include <kernel/sort_by_key.hpp>
+#include <kernel/range.hpp>
 #include <kernel_headers/harris.hpp>
 #include <memory.hpp>
 #include <map>
@@ -284,10 +285,12 @@ void harris(unsigned* corners_out,
 
             int sort_elem = harris_resp.info.strides[3] * harris_resp.info.dims[3];
             harris_resp.data = d_resp_corners;
+            // Create indices using range
             harris_idx.data = bufferAlloc(sort_elem * sizeof(unsigned));
+            kernel::range<uint>(harris_idx, 0);
 
             // Sort Harris responses
-            sort0ByKey<float, uint, false>(harris_resp, harris_idx);
+            kernel::sort0ByKey<float, uint, false>(harris_resp, harris_idx);
 
             x_out.data = bufferAlloc(*corners_out * sizeof(float));
             y_out.data = bufferAlloc(*corners_out * sizeof(float));
diff --git a/src/backend/opencl/kernel/orb.hpp b/src/backend/opencl/kernel/orb.hpp
index 612edacfe0..0c752d2c21 100644
--- a/src/backend/opencl/kernel/orb.hpp
+++ b/src/backend/opencl/kernel/orb.hpp
@@ -17,6 +17,7 @@
 #include <kernel/fast.hpp>
 #include <kernel/resize.hpp>
 #include <kernel/sort_by_key.hpp>
+#include <kernel/range.hpp>
 #include <kernel_headers/orb.hpp>
 #include <memory.hpp>
 #include <vector>
@@ -303,9 +304,11 @@ void orb(unsigned* out_feat,
             d_harris_sorted.info.offset = 0;
             d_harris_idx.info.offset = 0;
             d_harris_sorted.data = d_score_harris;
+            // Create indices using range
             d_harris_idx.data = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned));
+            kernel::range<uint>(d_harris_idx, 0);
 
-            sort0ByKey<float, uint, false>(d_harris_sorted, d_harris_idx);
+            kernel::sort0ByKey<float, uint, false>(d_harris_sorted, d_harris_idx);
 
             cl::Buffer* d_x_lvl = bufferAlloc(usable_feat * sizeof(float));
             cl::Buffer* d_y_lvl = bufferAlloc(usable_feat * sizeof(float));
diff --git a/src/backend/opencl/kernel/range.hpp b/src/backend/opencl/kernel/range.hpp
index 2f8be8cd4a..0299c030d4 100644
--- a/src/backend/opencl/kernel/range.hpp
+++ b/src/backend/opencl/kernel/range.hpp
@@ -31,10 +31,10 @@ namespace opencl
     namespace kernel
     {
         // Kernel Launch Config Values
-        static const int TX = 32;
-        static const int TY = 8;
-        static const int TILEX = 512;
-        static const int TILEY = 32;
+        static const int RANGE_TX = 32;
+        static const int RANGE_TY = 8;
+        static const int RANGE_TILEX = 512;
+        static const int RANGE_TILEY = 32;
 
         template<typename T>
         void range(Param out, const int dim)
@@ -62,10 +62,10 @@ namespace opencl
                 auto rangeOp = make_kernel<Buffer, const KParam, const int,
                                            const int, const int> (*rangeKernels[device]);
 
-                NDRange local(TX, TY, 1);
+                NDRange local(RANGE_TX, RANGE_TY, 1);
 
-                int blocksPerMatX = divup(out.info.dims[0], TILEX);
-                int blocksPerMatY = divup(out.info.dims[1], TILEY);
+                int blocksPerMatX = divup(out.info.dims[0], RANGE_TILEX);
+                int blocksPerMatY = divup(out.info.dims[1], RANGE_TILEY);
                 NDRange global(local[0] * blocksPerMatX * out.info.dims[2],
                                local[1] * blocksPerMatY * out.info.dims[3],
                                1);

From 8f48cdcd6a3cdc48ed050f0372a75c9a255b521d Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 15:02:26 -0400
Subject: [PATCH 21/28] Clean up sort tests

---
 test/sort.cpp        |  2 +-
 test/sort_by_key.cpp |  2 +-
 test/sort_index.cpp  | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/sort.cpp b/test/sort.cpp
index 977b54b5c7..9a496f3236 100644
--- a/test/sort.cpp
+++ b/test/sort.cpp
@@ -106,7 +106,7 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, bool
     SORT_INIT(Sort1000False,  sort_1000,  false, 2);
     SORT_INIT(SortMedTrue,    sort_med1,  true,  0);
     SORT_INIT(SortMedFalse,   sort_med1,  false, 2);
-    // Takes too much time in current implementation. Enable when everything is parallel
+
     SORT_INIT(SortMed5True,   sort_med,   true,  0);
     SORT_INIT(SortMed5False,  sort_med,   false, 2);
     SORT_INIT(SortLargeTrue,  sort_large, true,  0);
diff --git a/test/sort_by_key.cpp b/test/sort_by_key.cpp
index cbb13b8785..dae46bef54 100644
--- a/test/sort_by_key.cpp
+++ b/test/sort_by_key.cpp
@@ -118,7 +118,7 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     SORT_INIT(SortMedTrue,    sort_by_key_med,   true,  0, 1);
     SORT_INIT(Sort1000False,  sort_by_key_1000,  false, 2, 3);
     SORT_INIT(SortMedFalse,   sort_by_key_med,   false, 2, 3);
-    // Takes too much time in current implementation. Enable when everything is parallel
+
     SORT_INIT(SortLargeTrue,  sort_by_key_large, true,  0, 1);
     SORT_INIT(SortLargeFalse, sort_by_key_large, false, 2, 3);
 
diff --git a/test/sort_index.cpp b/test/sort_index.cpp
index eed85047bf..fe11462310 100644
--- a/test/sort_index.cpp
+++ b/test/sort_index.cpp
@@ -119,11 +119,11 @@ void sortTest(string pTestFile, const bool dir, const unsigned resultIdx0, const
     SORT_INIT(SortMedTrue,    sort_med1,  true,  0, 1);
     SORT_INIT(Sort1000False,  sort_1000,  false, 2, 3);
     SORT_INIT(SortMedFalse,   sort_med1,  false, 2, 3);
-    // Takes too much time in current implementation. Enable when everything is parallel
-    //SORT_INIT(SortMed5True,   sort_med,   true,  0, 1);
-    //SORT_INIT(SortMed5False,  sort_med,   false, 2, 3);
-    //SORT_INIT(SortLargeTrue,  sort_large, true,  0, 1);
-    //SORT_INIT(SortLargeFalse, sort_large, false, 2, 3);
+
+    SORT_INIT(SortMed5True,   sort_med,   true,  0, 1);
+    SORT_INIT(SortMed5False,  sort_med,   false, 2, 3);
+    SORT_INIT(SortLargeTrue,  sort_large, true,  0, 1);
+    SORT_INIT(SortLargeFalse, sort_large, false, 2, 3);
 
 
 //////////////////////////////////// CPP /////////////////////////////////

From b6a6a8732bb43e257c0744a6c183f7d602f8d56d Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 18:56:16 -0400
Subject: [PATCH 22/28] Fixed sort_by_key kernel for OpenCL

---
 src/backend/opencl/kernel/sort_by_key.hpp | 70 +++++++++++++----------
 src/backend/opencl/kernel/sort_helper.hpp | 33 +++++++++++
 test/sort_index.cpp                       |  2 +-
 3 files changed, 75 insertions(+), 30 deletions(-)

diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp
index 33a020712e..6984603882 100644
--- a/src/backend/opencl/kernel/sort_by_key.hpp
+++ b/src/backend/opencl/kernel/sort_by_key.hpp
@@ -15,6 +15,7 @@
 #include <dispatch.hpp>
 #include <Param.hpp>
 #include <debug_opencl.hpp>
+#include <math.hpp>
 #include <kernel/sort_helper.hpp>
 #include <kernel/iota.hpp>
 
@@ -91,7 +92,6 @@ namespace opencl
         {
             typedef type_t<Tk_> Tk;
             typedef type_t<Tv_> Tv;
-            typedef std::pair<Tk, Tv> IndexPair;
 
             try {
                 af::dim4 inDims;
@@ -136,35 +136,46 @@ namespace opencl
                 compute::buffer pSeq_buf((*pSeq.data)());
                 compute::buffer_iterator<unsigned> seq0 = compute::make_buffer_iterator<unsigned>(pSeq_buf, 0);
                 compute::buffer_iterator<unsigned> seqN = compute::make_buffer_iterator<unsigned>(pSeq_buf, elements);
-
-                // Copy key, val into X pair
-                cl::Buffer* X = bufferAlloc(elements * sizeof(IndexPair));
-                // Use Tk_ and Tv_ here, not Tk and Tv
-                kernel::makePair<Tk_, Tv_>(X, pKey.data, pVal.data, elements);
-                compute::buffer X_buf((*X)());
-                compute::buffer_iterator<IndexPair> X0 = compute::make_buffer_iterator<IndexPair>(X_buf, 0);
-                compute::buffer_iterator<IndexPair> XN = compute::make_buffer_iterator<IndexPair>(X_buf, elements);
-
-                // FIRST SORT CALL
-                compute::function<bool(IndexPair, IndexPair)> IPCompare =
-                    makeCompareFunction<Tk, Tv, isAscending>();
-
-                compute::sort_by_key(X0, XN, seq0, IPCompare, c_queue);
-                getQueue().finish();
-
+                // Create buffer iterators for key and val
+                compute::buffer pKey_buf((*pKey.data)());
+                compute::buffer pVal_buf((*pVal.data)());
+                compute::buffer_iterator<Tk> key0 = compute::make_buffer_iterator<Tk>(pKey_buf, 0);
+                compute::buffer_iterator<Tk> keyN = compute::make_buffer_iterator<Tk>(pKey_buf, elements);
+                compute::buffer_iterator<Tv> val0 = compute::make_buffer_iterator<Tv>(pVal_buf, 0);
+                compute::buffer_iterator<Tv> valN = compute::make_buffer_iterator<Tv>(pVal_buf, elements);
+
+                // Sort By Key for descending is stable in the reverse
+                // (greater) order. Sorting in ascending with negated values
+                // will give the right result
+                if(!isAscending) compute::transform(key0, keyN, key0, flipFunction<Tk>(), c_queue);
+
+                // Create a copy of the pKey buffer
+                cl::Buffer* cKey = bufferAlloc(elements * sizeof(Tk));
+                compute::buffer cKey_buf((*cKey)());
+                compute::buffer_iterator<Tk> cKey0 = compute::make_buffer_iterator<Tk>(cKey_buf, 0);
+                compute::buffer_iterator<Tk> cKeyN = compute::make_buffer_iterator<Tk>(cKey_buf, elements);
+                compute::copy(key0, keyN, cKey0, c_queue);
+
+                // FIRST SORT
+                compute::sort_by_key(key0, keyN, seq0, c_queue);
+                compute::sort_by_key(cKey0, cKeyN, val0, c_queue);
+
+                // Create a copy of the seq buffer after first sort
+                cl::Buffer* cSeq = bufferAlloc(elements * sizeof(unsigned));
+                compute::buffer cSeq_buf((*cSeq)());
+                compute::buffer_iterator<unsigned> cSeq0 = compute::make_buffer_iterator<unsigned>(cSeq_buf, 0);
+                compute::buffer_iterator<unsigned> cSeqN = compute::make_buffer_iterator<unsigned>(cSeq_buf, elements);
+                compute::copy(seq0, seqN, cSeq0, c_queue);
+
+                // SECOND SORT
+                // First call will sort key, second sort will sort val
                 // Needs to be ascending (true) in order to maintain the indices properly
                 //kernel::sort0_by_key<Tv, T, true>(pKey, pVal);
-                //
-                // Because we use a pair as values, we need to use a custom comparator
-                BOOST_COMPUTE_FUNCTION(bool, Compare_Seq, (const unsigned lhs, const unsigned rhs),
-                    {
-                        return lhs < rhs;
-                    }
-                );
-                compute::sort_by_key(seq0, seqN, X0, Compare_Seq, c_queue);
-                getQueue().finish();
+                compute::sort_by_key(seq0, seqN, key0, c_queue);
+                compute::sort_by_key(cSeq0, cSeqN, val0, c_queue);
 
-                kernel::splitPair<Tk_, Tv_>(pKey.data, pVal.data, X, elements);
+                // If descending, flip it back
+                if(!isAscending) compute::transform(key0, keyN, key0, flipFunction<Tk>(), c_queue);
 
                 //// No need of doing moddims here because the original Array<T>
                 //// dimensions have not been changed
@@ -172,7 +183,8 @@ namespace opencl
 
                 CL_DEBUG_FINISH(getQueue());
                 bufferFree(key);
-                bufferFree(X);
+                bufferFree(cSeq);
+                bufferFree(cKey);
             } catch (cl::Error err) {
                 CL_TO_AF_ERROR(err);
                 throw;
@@ -184,7 +196,7 @@ namespace opencl
         {
             int higherDims =  pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3];
             // TODO Make a better heurisitic
-            if(higherDims > 5)
+            if(higherDims > 0)
                 kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(pKey, pVal);
             else
                 kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(pKey, pVal);
diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp
index 7f500c6bb0..899e0873fa 100644
--- a/src/backend/opencl/kernel/sort_helper.hpp
+++ b/src/backend/opencl/kernel/sort_helper.hpp
@@ -44,6 +44,39 @@ makeCompareFunction()
     }
 }
 
+template<typename Tk>
+inline boost::compute::function<Tk(Tk)>
+flipFunction()
+{
+    BOOST_COMPUTE_FUNCTION(Tk, negateFn, (const Tk x),
+        {
+            return -x;
+        }
+    );
+
+    return negateFn;
+}
+
+#define INSTANTIATE_FLIP(TY, XMAX)                              \
+template<> inline boost::compute::function<TY(TY)>              \
+flipFunction<TY>()                                              \
+{                                                               \
+    BOOST_COMPUTE_FUNCTION(TY, negateFn, (const TY x),          \
+       {                                                        \
+            return XMAX - x;                                    \
+        }                                                       \
+    );                                                          \
+                                                                \
+    return negateFn;                                            \
+}
+
+INSTANTIATE_FLIP(unsigned, UINT_MAX)
+INSTANTIATE_FLIP(unsigned short, USHRT_MAX)
+INSTANTIATE_FLIP(unsigned char, UCHAR_MAX)
+INSTANTIATE_FLIP(cl_ulong, ULONG_MAX)
+
+#undef INSTANTIATE_FLIP
+
 namespace opencl
 {
     namespace kernel
diff --git a/test/sort_index.cpp b/test/sort_index.cpp
index fe11462310..0df4744c02 100644
--- a/test/sort_index.cpp
+++ b/test/sort_index.cpp
@@ -258,7 +258,7 @@ TEST(SortIndex, CPPDim2)
 
     // Compare result
     for (size_t elIter = 0; elIter < nElems; ++elIter) {
-        EXPECT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl;
+        ASSERT_EQ(tests[resultIdx1][elIter], ixData[elIter]) << "at: " << elIter << std::endl;
     }
 
     // Delete

From 75b1a6bc0f428fd43f3189239aa355cce9f50e1a Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 20:10:20 -0400
Subject: [PATCH 23/28] Instantiate sort_by_key kernels in separately

---
 src/backend/cuda/CMakeLists.txt               |   2 +-
 src/backend/cuda/kernel/iota.hpp              |   3 +-
 src/backend/cuda/kernel/sort_by_key.hpp       | 119 +---------
 .../cuda/{ => kernel}/sort_by_key/ascd_f32.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_f64.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_s16.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_s32.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_s64.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_s8.cu  |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_u16.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_u32.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_u64.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/ascd_u8.cu  |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_f32.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_f64.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_s16.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_s32.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_s64.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_s8.cu  |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_u16.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_u32.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_u64.cu |   5 +-
 .../cuda/{ => kernel}/sort_by_key/desc_u8.cu  |   5 +-
 src/backend/cuda/kernel/sort_by_key_impl.hpp  | 213 ++++++++++++++++++
 src/backend/cuda/kernel/sort_helper.hpp       |  66 ------
 .../{sort_by_key_impl.hpp => sort_by_key.cu}  |  48 ++--
 26 files changed, 331 insertions(+), 220 deletions(-)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_f32.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_f64.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_s16.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_s32.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_s64.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_s8.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_u16.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_u32.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_u64.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/ascd_u8.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_f32.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_f64.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_s16.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_s32.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_s64.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_s8.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_u16.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_u32.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_u64.cu (86%)
 rename src/backend/cuda/{ => kernel}/sort_by_key/desc_u8.cu (86%)
 create mode 100644 src/backend/cuda/kernel/sort_by_key_impl.hpp
 delete mode 100644 src/backend/cuda/kernel/sort_helper.hpp
 rename src/backend/cuda/{sort_by_key_impl.hpp => sort_by_key.cu} (63%)

diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
index b887a98d67..4efb42764a 100644
--- a/src/backend/cuda/CMakeLists.txt
+++ b/src/backend/cuda/CMakeLists.txt
@@ -158,7 +158,7 @@ FILE(GLOB cuda_headers
 FILE(GLOB cuda_sources
     "*.cu"
     "*.cpp"
-    "sort_by_key/*.cu"
+    "kernel/sort_by_key/*.cu"
     "kernel/*.cu")
 
 FILE(GLOB jit_sources
diff --git a/src/backend/cuda/kernel/iota.hpp b/src/backend/cuda/kernel/iota.hpp
index fc28c82882..e2f7e591fb 100644
--- a/src/backend/cuda/kernel/iota.hpp
+++ b/src/backend/cuda/kernel/iota.hpp
@@ -7,6 +7,7 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#include <af/dim4.hpp>
 #include <math.hpp>
 #include <dispatch.hpp>
 #include <Param.hpp>
@@ -69,7 +70,7 @@ namespace cuda
         // Wrapper functions
         ///////////////////////////////////////////////////////////////////////////
         template<typename T>
-        void iota(Param<T> out, const dim4 &sdims, const dim4 &tdims)
+        void iota(Param<T> out, const af::dim4 &sdims, const af::dim4 &tdims)
         {
             dim3 threads(IOTA_TX, IOTA_TY, 1);
 
diff --git a/src/backend/cuda/kernel/sort_by_key.hpp b/src/backend/cuda/kernel/sort_by_key.hpp
index 1536d1bf53..35250a8ad1 100644
--- a/src/backend/cuda/kernel/sort_by_key.hpp
+++ b/src/backend/cuda/kernel/sort_by_key.hpp
@@ -12,130 +12,19 @@
 #include <Param.hpp>
 #include <err_cuda.hpp>
 #include <debug_cuda.hpp>
-#include <kernel/sort_helper.hpp>
-#include <kernel/iota.hpp>
-
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-#include <thrust/sort.h>
 
 namespace cuda
 {
     namespace kernel
     {
-        ///////////////////////////////////////////////////////////////////////////
-        // Wrapper functions
-        ///////////////////////////////////////////////////////////////////////////
         template<typename Tk, typename Tv, bool isAscending>
-        void sort0ByKeyIterative(Param<Tk> okey, Param<Tv> oval)
-        {
-            thrust::device_ptr<Tk>       okey_ptr = thrust::device_pointer_cast(okey.ptr);
-            thrust::device_ptr<Tv>       oval_ptr = thrust::device_pointer_cast(oval.ptr);
-
-            for(int w = 0; w < okey.dims[3]; w++) {
-                int okeyW = w * okey.strides[3];
-                int ovalW = w * oval.strides[3];
-                for(int z = 0; z < okey.dims[2]; z++) {
-                    int okeyWZ = okeyW + z * okey.strides[2];
-                    int ovalWZ = ovalW + z * oval.strides[2];
-                    for(int y = 0; y < okey.dims[1]; y++) {
-
-                        int okeyOffset = okeyWZ + y * okey.strides[1];
-                        int ovalOffset = ovalWZ + y * oval.strides[1];
-
-                        if(isAscending) {
-                            THRUST_SELECT(thrust::stable_sort_by_key,
-                                          okey_ptr + okeyOffset,
-                                          okey_ptr + okeyOffset + okey.dims[0],
-                                          oval_ptr + ovalOffset);
-                        } else {
-                            THRUST_SELECT(thrust::stable_sort_by_key,
-                                          okey_ptr + okeyOffset,
-                                          okey_ptr + okeyOffset + okey.dims[0],
-                                          oval_ptr + ovalOffset, thrust::greater<Tk>());
-                        }
-                    }
-                }
-            }
-            POST_LAUNCH_CHECK();
-        }
+        void sort0ByKeyIterative(Param<Tk> okey, Param<Tv> oval);
 
         template<typename Tk, typename Tv, bool isAscending, int dim>
-        void sortByKeyBatched(Param<Tk> pKey, Param<Tv> pVal)
-        {
-            af::dim4 inDims;
-            for(int i = 0; i < 4; i++)
-                inDims[i] = pKey.dims[i];
-
-            // Sort dimension
-            // tileDims * seqDims = inDims
-            af::dim4 tileDims(1);
-            af::dim4 seqDims = inDims;
-            tileDims[dim] = inDims[dim];
-            seqDims[dim] = 1;
-
-            // Create/call iota
-            // Array<uint> key = iota<uint>(seqDims, tileDims);
-            dim4 keydims = inDims;
-            uint* key = memAlloc<uint>(keydims.elements());
-            Param<uint> pSeq;
-            pSeq.ptr = key;
-            pSeq.strides[0] = 1;
-            pSeq.dims[0] = keydims[0];
-            for(int i = 1; i < 4; i++) {
-                pSeq.dims[i] = keydims[i];
-                pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1];
-            }
-            cuda::kernel::iota<uint>(pSeq, seqDims, tileDims);
-
-            // Make pkey, pVal into a pair
-            thrust::device_vector<IndexPair<Tk, Tv> > X(inDims.elements());
-            IndexPair<Tk, Tv> *Xptr = thrust::raw_pointer_cast(X.data());
-
-            const int threads = 256;
-            int blocks = divup(inDims.elements(), threads * copyPairIter);
-            CUDA_LAUNCH((makeIndexPair<Tk, Tv>), blocks, threads,
-                        Xptr, pKey.ptr, pVal.ptr, inDims.elements());
-            POST_LAUNCH_CHECK();
-
-            // Sort indices
-            // Need to convert pSeq to thrust::device_ptr, otherwise thrust
-            // throws weird errors for all *64 data types (double, intl, uintl etc)
-            thrust::device_ptr<uint> dSeq = thrust::device_pointer_cast(pSeq.ptr);
-            THRUST_SELECT(thrust::stable_sort_by_key,
-                          X.begin(), X.end(),
-                          dSeq,
-                          IPCompare<Tk, Tv, isAscending>());
-            POST_LAUNCH_CHECK();
-
-            // Needs to be ascending (true) in order to maintain the indices properly
-            //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
-            THRUST_SELECT(thrust::stable_sort_by_key,
-                          dSeq,
-                          dSeq + inDims.elements(),
-                          X.begin());
-            POST_LAUNCH_CHECK();
-
-            CUDA_LAUNCH((splitIndexPair<Tk, Tv>), blocks, threads,
-                        pKey.ptr, pVal.ptr, Xptr, inDims.elements());
-            POST_LAUNCH_CHECK();
-
-            // No need of doing moddims here because the original Array<T>
-            // dimensions have not been changed
-            //val.modDims(inDims);
-
-            memFree(key);
-        }
+        void sortByKeyBatched(Param<Tk> pKey, Param<Tv> pVal);
 
         template<typename Tk, typename Tv, bool isAscending>
-        void sort0ByKey(Param<Tk> okey, Param<Tv> oval)
-        {
-            int higherDims =  okey.dims[1] * okey.dims[2] * okey.dims[3];
-            // TODO Make a better heurisitic
-            if(higherDims > 5)
-                kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
-            else
-                kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(okey, oval);
-        }
+        void sort0ByKey(Param<Tk> okey, Param<Tv> oval);
+
     }
 }
diff --git a/src/backend/cuda/sort_by_key/ascd_f32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_f32.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_f32.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_f32.cu
index 44b770402c..284e8b4938 100644
--- a/src/backend/cuda/sort_by_key/ascd_f32.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_f32.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(float, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_f64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_f64.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_f64.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_f64.cu
index 17b54a3903..ba19ec447c 100644
--- a/src/backend/cuda/sort_by_key/ascd_f64.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_f64.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(double, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_s16.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s16.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_s16.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_s16.cu
index d51e9ae671..1be6e540ca 100644
--- a/src/backend/cuda/sort_by_key/ascd_s16.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_s16.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(short, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_s32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s32.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_s32.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_s32.cu
index 75adbddc0b..8cee7c9b49 100644
--- a/src/backend/cuda/sort_by_key/ascd_s32.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_s32.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(int, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_s64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s64.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_s64.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_s64.cu
index 25a1e589f8..0e5a7c81a2 100644
--- a/src/backend/cuda/sort_by_key/ascd_s64.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_s64.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(intl, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_s8.cu b/src/backend/cuda/kernel/sort_by_key/ascd_s8.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_s8.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_s8.cu
index f47a397727..81ed32952f 100644
--- a/src/backend/cuda/sort_by_key/ascd_s8.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_s8.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(char, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_u16.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u16.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_u16.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_u16.cu
index e06036abc7..e232c08376 100644
--- a/src/backend/cuda/sort_by_key/ascd_u16.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_u16.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(ushort, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_u32.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u32.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_u32.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_u32.cu
index 6f7939aa12..34a4580936 100644
--- a/src/backend/cuda/sort_by_key/ascd_u32.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_u32.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(uint, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_u64.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u64.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_u64.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_u64.cu
index 63eec5fdd4..fc576e7f99 100644
--- a/src/backend/cuda/sort_by_key/ascd_u64.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_u64.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(uintl, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/ascd_u8.cu b/src/backend/cuda/kernel/sort_by_key/ascd_u8.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/ascd_u8.cu
rename to src/backend/cuda/kernel/sort_by_key/ascd_u8.cu
index a2e1dec887..ed8454d53e 100644
--- a/src/backend/cuda/sort_by_key/ascd_u8.cu
+++ b/src/backend/cuda/kernel/sort_by_key/ascd_u8.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(uchar, true)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_f32.cu b/src/backend/cuda/kernel/sort_by_key/desc_f32.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_f32.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_f32.cu
index 1bbb10bbba..73459ac033 100644
--- a/src/backend/cuda/sort_by_key/desc_f32.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_f32.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(float, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_f64.cu b/src/backend/cuda/kernel/sort_by_key/desc_f64.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_f64.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_f64.cu
index ecbed78878..be0536b1e3 100644
--- a/src/backend/cuda/sort_by_key/desc_f64.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_f64.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(double, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_s16.cu b/src/backend/cuda/kernel/sort_by_key/desc_s16.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_s16.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_s16.cu
index 63967b6117..0fc3b50827 100644
--- a/src/backend/cuda/sort_by_key/desc_s16.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_s16.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(short, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_s32.cu b/src/backend/cuda/kernel/sort_by_key/desc_s32.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_s32.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_s32.cu
index 49904437f4..cfda29c7de 100644
--- a/src/backend/cuda/sort_by_key/desc_s32.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_s32.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(int, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_s64.cu b/src/backend/cuda/kernel/sort_by_key/desc_s64.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_s64.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_s64.cu
index a10ee11475..b334a91a99 100644
--- a/src/backend/cuda/sort_by_key/desc_s64.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_s64.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(intl, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_s8.cu b/src/backend/cuda/kernel/sort_by_key/desc_s8.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_s8.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_s8.cu
index cad78dfc84..f02d5ce2fe 100644
--- a/src/backend/cuda/sort_by_key/desc_s8.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_s8.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(char, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_u16.cu b/src/backend/cuda/kernel/sort_by_key/desc_u16.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_u16.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_u16.cu
index 69dc01634b..9b0a77cb25 100644
--- a/src/backend/cuda/sort_by_key/desc_u16.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_u16.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(ushort, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_u32.cu b/src/backend/cuda/kernel/sort_by_key/desc_u32.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_u32.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_u32.cu
index ae2ad4bc84..1d02aec848 100644
--- a/src/backend/cuda/sort_by_key/desc_u32.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_u32.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(uint, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_u64.cu b/src/backend/cuda/kernel/sort_by_key/desc_u64.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_u64.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_u64.cu
index 43f60c075b..597bd2c1b4 100644
--- a/src/backend/cuda/sort_by_key/desc_u64.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_u64.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(uintl, false)
 }
+}
diff --git a/src/backend/cuda/sort_by_key/desc_u8.cu b/src/backend/cuda/kernel/sort_by_key/desc_u8.cu
similarity index 86%
rename from src/backend/cuda/sort_by_key/desc_u8.cu
rename to src/backend/cuda/kernel/sort_by_key/desc_u8.cu
index 51d8096620..4f55479604 100644
--- a/src/backend/cuda/sort_by_key/desc_u8.cu
+++ b/src/backend/cuda/kernel/sort_by_key/desc_u8.cu
@@ -7,9 +7,12 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <sort_by_key_impl.hpp>
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace cuda
+{
+namespace kernel
 {
     INSTANTIATE1(uchar, false)
 }
+}
diff --git a/src/backend/cuda/kernel/sort_by_key_impl.hpp b/src/backend/cuda/kernel/sort_by_key_impl.hpp
new file mode 100644
index 0000000000..66a6087401
--- /dev/null
+++ b/src/backend/cuda/kernel/sort_by_key_impl.hpp
@@ -0,0 +1,213 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key.hpp>
+#include <af/dim4.hpp>
+#include <memory.hpp>
+#include <math.hpp>
+#include <dispatch.hpp>
+#include <Param.hpp>
+#include <err_cuda.hpp>
+#include <debug_cuda.hpp>
+#include <kernel/iota.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+
+// This needs to be in global namespace as it is used by thrust
+template<typename Tk, typename Tv>
+struct IndexPair
+{
+    Tk first;
+    Tv second;
+};
+
+template <typename Tk, typename Tv, bool isAscending>
+struct IPCompare
+{
+    __host__ __device__
+    bool operator()(const IndexPair<Tk, Tv> &lhs, const IndexPair<Tk, Tv> &rhs) const
+    {
+        // Check stable sort condition
+        if(isAscending) return (lhs.first < rhs.first);
+        else return (lhs.first > rhs.first);
+    }
+};
+
+namespace cuda
+{
+    namespace kernel
+    {
+        static const int copyPairIter = 4;
+
+        template <typename Tk, typename Tv>
+        __global__
+        void makeIndexPair(IndexPair<Tk, Tv> *out, const Tk *first, const Tv *second, const int N)
+        {
+            int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x;
+
+            for(int i = tIdx; i < N; i += blockDim.x)
+            {
+                out[i].first = first[i];
+                out[i].second = second[i];
+            }
+        }
+
+        template <typename Tk, typename Tv>
+        __global__
+        void splitIndexPair(Tk *first, Tv *second, const IndexPair<Tk, Tv> *out, const int N)
+        {
+            int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x;
+
+            for(int i = tIdx; i < N; i += blockDim.x)
+            {
+                first[i]  = out[i].first;
+                second[i] = out[i].second;
+            }
+        }
+
+        ///////////////////////////////////////////////////////////////////////////
+        // Wrapper functions
+        ///////////////////////////////////////////////////////////////////////////
+        template<typename Tk, typename Tv, bool isAscending>
+        void sort0ByKeyIterative(Param<Tk> okey, Param<Tv> oval)
+        {
+            thrust::device_ptr<Tk>       okey_ptr = thrust::device_pointer_cast(okey.ptr);
+            thrust::device_ptr<Tv>       oval_ptr = thrust::device_pointer_cast(oval.ptr);
+
+            for(int w = 0; w < okey.dims[3]; w++) {
+                int okeyW = w * okey.strides[3];
+                int ovalW = w * oval.strides[3];
+                for(int z = 0; z < okey.dims[2]; z++) {
+                    int okeyWZ = okeyW + z * okey.strides[2];
+                    int ovalWZ = ovalW + z * oval.strides[2];
+                    for(int y = 0; y < okey.dims[1]; y++) {
+
+                        int okeyOffset = okeyWZ + y * okey.strides[1];
+                        int ovalOffset = ovalWZ + y * oval.strides[1];
+
+                        if(isAscending) {
+                            THRUST_SELECT(thrust::stable_sort_by_key,
+                                          okey_ptr + okeyOffset,
+                                          okey_ptr + okeyOffset + okey.dims[0],
+                                          oval_ptr + ovalOffset);
+                        } else {
+                            THRUST_SELECT(thrust::stable_sort_by_key,
+                                          okey_ptr + okeyOffset,
+                                          okey_ptr + okeyOffset + okey.dims[0],
+                                          oval_ptr + ovalOffset, thrust::greater<Tk>());
+                        }
+                    }
+                }
+            }
+            POST_LAUNCH_CHECK();
+        }
+
+        template<typename Tk, typename Tv, bool isAscending, int dim>
+        void sortByKeyBatched(Param<Tk> pKey, Param<Tv> pVal)
+        {
+            af::dim4 inDims;
+            for(int i = 0; i < 4; i++)
+                inDims[i] = pKey.dims[i];
+
+            // Sort dimension
+            // tileDims * seqDims = inDims
+            af::dim4 tileDims(1);
+            af::dim4 seqDims = inDims;
+            tileDims[dim] = inDims[dim];
+            seqDims[dim] = 1;
+
+            // Create/call iota
+            // Array<uint> key = iota<uint>(seqDims, tileDims);
+            af::dim4 keydims = inDims;
+            uint* key = memAlloc<uint>(keydims.elements());
+            Param<uint> pSeq;
+            pSeq.ptr = key;
+            pSeq.strides[0] = 1;
+            pSeq.dims[0] = keydims[0];
+            for(int i = 1; i < 4; i++) {
+                pSeq.dims[i] = keydims[i];
+                pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1];
+            }
+            cuda::kernel::iota<uint>(pSeq, seqDims, tileDims);
+
+            // Make pkey, pVal into a pair
+            thrust::device_vector<IndexPair<Tk, Tv> > X(inDims.elements());
+            IndexPair<Tk, Tv> *Xptr = thrust::raw_pointer_cast(X.data());
+
+            const int threads = 256;
+            int blocks = divup(inDims.elements(), threads * copyPairIter);
+            CUDA_LAUNCH((makeIndexPair<Tk, Tv>), blocks, threads,
+                        Xptr, pKey.ptr, pVal.ptr, inDims.elements());
+            POST_LAUNCH_CHECK();
+
+            // Sort indices
+            // Need to convert pSeq to thrust::device_ptr, otherwise thrust
+            // throws weird errors for all *64 data types (double, intl, uintl etc)
+            thrust::device_ptr<uint> dSeq = thrust::device_pointer_cast(pSeq.ptr);
+            THRUST_SELECT(thrust::stable_sort_by_key,
+                          X.begin(), X.end(),
+                          dSeq,
+                          IPCompare<Tk, Tv, isAscending>());
+            POST_LAUNCH_CHECK();
+
+            // Needs to be ascending (true) in order to maintain the indices properly
+            //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
+            THRUST_SELECT(thrust::stable_sort_by_key,
+                          dSeq,
+                          dSeq + inDims.elements(),
+                          X.begin());
+            POST_LAUNCH_CHECK();
+
+            CUDA_LAUNCH((splitIndexPair<Tk, Tv>), blocks, threads,
+                        pKey.ptr, pVal.ptr, Xptr, inDims.elements());
+            POST_LAUNCH_CHECK();
+
+            // No need of doing moddims here because the original Array<T>
+            // dimensions have not been changed
+            //val.modDims(inDims);
+
+            memFree(key);
+        }
+
+        template<typename Tk, typename Tv, bool isAscending>
+        void sort0ByKey(Param<Tk> okey, Param<Tv> oval)
+        {
+            int higherDims =  okey.dims[1] * okey.dims[2] * okey.dims[3];
+            // TODO Make a better heurisitic
+            if(higherDims > 5)
+                kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
+            else
+                kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(okey, oval);
+        }
+
+#define INSTANTIATE(Tk, Tv, dr)                                                         \
+    template void sort0ByKey<Tk, Tv, dr>(Param<Tk> okey, Param<Tv> oval);               \
+    template void sort0ByKeyIterative<Tk, Tv, dr>(Param<Tk> okey, Param<Tv> oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 0>(Param<Tk> okey, Param<Tv> oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 1>(Param<Tk> okey, Param<Tv> oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 2>(Param<Tk> okey, Param<Tv> oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 3>(Param<Tk> okey, Param<Tv> oval);      \
+
+#define INSTANTIATE1(Tk    , dr) \
+    INSTANTIATE(Tk, float  , dr) \
+    INSTANTIATE(Tk, double , dr) \
+    INSTANTIATE(Tk, cfloat , dr) \
+    INSTANTIATE(Tk, cdouble, dr) \
+    INSTANTIATE(Tk, int    , dr) \
+    INSTANTIATE(Tk, uint   , dr) \
+    INSTANTIATE(Tk, short  , dr) \
+    INSTANTIATE(Tk, ushort , dr) \
+    INSTANTIATE(Tk, char   , dr) \
+    INSTANTIATE(Tk, uchar  , dr) \
+    INSTANTIATE(Tk, intl   , dr) \
+    INSTANTIATE(Tk, uintl  , dr)
+    }
+}
diff --git a/src/backend/cuda/kernel/sort_helper.hpp b/src/backend/cuda/kernel/sort_helper.hpp
deleted file mode 100644
index 93fb33ac8f..0000000000
--- a/src/backend/cuda/kernel/sort_helper.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*******************************************************
- * Copyright (c) 2014, ArrayFire
- * All rights reserved.
- *
- * This file is distributed under 3-clause BSD license.
- * The complete license agreement can be obtained at:
- * http://arrayfire.com/licenses/BSD-3-Clause
- ********************************************************/
-
-#include <dispatch.hpp>
-#include <err_cuda.hpp>
-#include <debug_cuda.hpp>
-
-// This needs to be in global namespace as it is used by thrust
-template<typename Tk, typename Tv>
-struct IndexPair
-{
-    Tk first;
-    Tv second;
-};
-
-template <typename Tk, typename Tv, bool isAscending>
-struct IPCompare
-{
-    __host__ __device__
-    bool operator()(const IndexPair<Tk, Tv> &lhs, const IndexPair<Tk, Tv> &rhs) const
-    {
-        // Check stable sort condition
-        if(isAscending) return (lhs.first < rhs.first);
-        else return (lhs.first > rhs.first);
-    }
-};
-
-namespace cuda
-{
-    namespace kernel
-    {
-        static const int copyPairIter = 4;
-
-        template <typename Tk, typename Tv>
-        __global__
-        void makeIndexPair(IndexPair<Tk, Tv> *out, const Tk *first, const Tv *second, const int N)
-        {
-            int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x;
-
-            for(int i = tIdx; i < N; i += blockDim.x)
-            {
-                out[i].first = first[i];
-                out[i].second = second[i];
-            }
-        }
-
-        template <typename Tk, typename Tv>
-        __global__
-        void splitIndexPair(Tk *first, Tv *second, const IndexPair<Tk, Tv> *out, const int N)
-        {
-            int tIdx = blockIdx.x * blockDim.x * copyPairIter + threadIdx.x;
-
-            for(int i = tIdx; i < N; i += blockDim.x)
-            {
-                first[i]  = out[i].first;
-                second[i] = out[i].second;
-            }
-        }
-    }
-}
diff --git a/src/backend/cuda/sort_by_key_impl.hpp b/src/backend/cuda/sort_by_key.cu
similarity index 63%
rename from src/backend/cuda/sort_by_key_impl.hpp
rename to src/backend/cuda/sort_by_key.cu
index 8cc86b55db..2d5d68eef0 100644
--- a/src/backend/cuda/sort_by_key_impl.hpp
+++ b/src/backend/cuda/sort_by_key.cu
@@ -51,22 +51,36 @@ namespace cuda
         }
     }
 
-#define INSTANTIATE(Tk, Tv, dr)                                         \
-    template void                                                       \
-    sort_by_key<Tk, Tv, dr>(Array<Tk> &okey, Array<Tv> &oval,           \
-                            const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim); \
+#define INSTANTIATE(Tk, Tv)                                                         \
+    template void sort_by_key<Tk, Tv, true >(Array<Tk> &okey, Array<Tv> &oval,      \
+                const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim);      \
+    template void sort_by_key<Tk, Tv, false>(Array<Tk> &okey, Array<Tv> &oval,      \
+                const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim);      \
+
+#define INSTANTIATE1(Tk    ) \
+    INSTANTIATE(Tk, float  ) \
+    INSTANTIATE(Tk, double ) \
+    INSTANTIATE(Tk, cfloat ) \
+    INSTANTIATE(Tk, cdouble) \
+    INSTANTIATE(Tk, int    ) \
+    INSTANTIATE(Tk, uint   ) \
+    INSTANTIATE(Tk, short  ) \
+    INSTANTIATE(Tk, ushort ) \
+    INSTANTIATE(Tk, char   ) \
+    INSTANTIATE(Tk, uchar  ) \
+    INSTANTIATE(Tk, intl   ) \
+    INSTANTIATE(Tk, uintl  )
+
+
+INSTANTIATE1(float )
+INSTANTIATE1(double)
+INSTANTIATE1(int   )
+INSTANTIATE1(uint  )
+INSTANTIATE1(short )
+INSTANTIATE1(ushort)
+INSTANTIATE1(char  )
+INSTANTIATE1(uchar )
+INSTANTIATE1(intl  )
+INSTANTIATE1(uintl )
 
-#define INSTANTIATE1(Tk    , dr) \
-    INSTANTIATE(Tk, float  , dr) \
-    INSTANTIATE(Tk, double , dr) \
-    INSTANTIATE(Tk, cfloat , dr) \
-    INSTANTIATE(Tk, cdouble, dr) \
-    INSTANTIATE(Tk, int    , dr) \
-    INSTANTIATE(Tk, uint   , dr) \
-    INSTANTIATE(Tk, short  , dr) \
-    INSTANTIATE(Tk, ushort , dr) \
-    INSTANTIATE(Tk, char   , dr) \
-    INSTANTIATE(Tk, uchar  , dr) \
-    INSTANTIATE(Tk, intl   , dr) \
-    INSTANTIATE(Tk, uintl  , dr)
 }

From bbdae15c0875d633d18fa2b5ed63122a14b6c3a2 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 20:37:31 -0400
Subject: [PATCH 24/28] Instantiate sort_by_key kernels in separately in opencl

---
 src/backend/opencl/CMakeLists.txt             |   9 +-
 src/backend/opencl/kernel/iota.hpp            |   3 +-
 src/backend/opencl/kernel/sort_by_key.hpp     | 184 +--------
 .../opencl/{ => kernel}/sort_by_key/b8.cpp    |   5 +-
 .../opencl/{ => kernel}/sort_by_key/f32.cpp   |   5 +-
 .../opencl/{ => kernel}/sort_by_key/f64.cpp   |   5 +-
 .../opencl/{ => kernel}/sort_by_key/s16.cpp   |   5 +-
 .../opencl/{ => kernel}/sort_by_key/s32.cpp   |   5 +-
 .../opencl/{ => kernel}/sort_by_key/s64.cpp   |   5 +-
 .../opencl/{ => kernel}/sort_by_key/u16.cpp   |   5 +-
 .../opencl/{ => kernel}/sort_by_key/u32.cpp   |   5 +-
 .../opencl/{ => kernel}/sort_by_key/u64.cpp   |   5 +-
 .../opencl/{ => kernel}/sort_by_key/u8.cpp    |   5 +-
 .../opencl/kernel/sort_by_key_impl.hpp        | 373 ++++++++++++++++++
 src/backend/opencl/kernel/sort_helper.hpp     | 151 +------
 .../{sort_by_key/impl.hpp => sort_by_key.cpp} |  48 ++-
 src/backend/opencl/traits.hpp                 |   1 +
 17 files changed, 453 insertions(+), 366 deletions(-)
 rename src/backend/opencl/{ => kernel}/sort_by_key/b8.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/f32.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/f64.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/s16.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/s32.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/s64.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/u16.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/u32.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/u64.cpp (87%)
 rename src/backend/opencl/{ => kernel}/sort_by_key/u8.cpp (87%)
 create mode 100644 src/backend/opencl/kernel/sort_by_key_impl.hpp
 rename src/backend/opencl/{sort_by_key/impl.hpp => sort_by_key.cpp} (65%)

diff --git a/src/backend/opencl/CMakeLists.txt b/src/backend/opencl/CMakeLists.txt
index 2cb8ddfdf9..b7eb77ded5 100644
--- a/src/backend/opencl/CMakeLists.txt
+++ b/src/backend/opencl/CMakeLists.txt
@@ -111,12 +111,10 @@ ENDIF()
 
 FILE(GLOB opencl_headers
   "*.hpp"
-  "*.h"
-  "sort_by_key/*.hpp")
+  "*.h")
 
 FILE(GLOB opencl_sources
-    "*.cpp"
-    "sort_by_key/*.cpp")
+    "*.cpp")
 
 FILE(GLOB jit_sources
     "jit/*.hpp")
@@ -128,7 +126,8 @@ FILE(GLOB opencl_kernels
     "kernel/*.cl")
 
 FILE(GLOB kernel_sources
-     "kernel/*.cpp")
+     "kernel/*.cpp"
+     "kernel/sort_by_key/*.cpp")
 
 FILE(GLOB conv_ker_headers
     "kernel/convolve/*.hpp")
diff --git a/src/backend/opencl/kernel/iota.hpp b/src/backend/opencl/kernel/iota.hpp
index 210b6b202e..7cd8046d68 100644
--- a/src/backend/opencl/kernel/iota.hpp
+++ b/src/backend/opencl/kernel/iota.hpp
@@ -9,6 +9,7 @@
 
 #pragma once
 #include <kernel_headers/iota.hpp>
+#include <af/dim4.hpp>
 #include <program.hpp>
 #include <traits.hpp>
 #include <string>
@@ -37,7 +38,7 @@ namespace opencl
         static const int TILEY = 32;
 
         template<typename T>
-        void iota(Param out, const dim4 &sdims, const dim4 &tdims)
+        void iota(Param out, const af::dim4 &sdims, const af::dim4 &tdims)
         {
             try {
                 static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
diff --git a/src/backend/opencl/kernel/sort_by_key.hpp b/src/backend/opencl/kernel/sort_by_key.hpp
index 6984603882..224f6411ff 100644
--- a/src/backend/opencl/kernel/sort_by_key.hpp
+++ b/src/backend/opencl/kernel/sort_by_key.hpp
@@ -8,200 +8,22 @@
  ********************************************************/
 
 #pragma once
-#include <program.hpp>
 #include <traits.hpp>
-#include <string>
-#include <mutex>
 #include <dispatch.hpp>
 #include <Param.hpp>
 #include <debug_opencl.hpp>
-#include <math.hpp>
-#include <kernel/sort_helper.hpp>
-#include <kernel/iota.hpp>
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-
-#include <boost/compute/core.hpp>
-#include <boost/compute/algorithm/sort_by_key.hpp>
-#include <boost/compute/functional/operator.hpp>
-#include <boost/compute/iterator/buffer_iterator.hpp>
-#include <boost/compute/algorithm/copy.hpp>
-#include <boost/compute/algorithm/transform.hpp>
-#include <boost/compute/container/vector.hpp>
-#include <boost/compute/functional/get.hpp>
-#include <boost/compute/functional/field.hpp>
-#include <boost/compute/types/pair.hpp>
-
-namespace compute = boost::compute;
-
-using cl::Buffer;
-using cl::Program;
-using cl::Kernel;
-using cl::make_kernel;
-using cl::EnqueueArgs;
-using cl::NDRange;
-using std::string;
 
 namespace opencl
 {
     namespace kernel
     {
         template<typename Tk, typename Tv, bool isAscending>
-        void sort0ByKeyIterative(Param pKey, Param pVal)
-        {
-            try {
-                compute::command_queue c_queue(getQueue()());
-
-                compute::buffer pKey_buf((*pKey.data)());
-                compute::buffer pVal_buf((*pVal.data)());
-
-                for(int w = 0; w < pKey.info.dims[3]; w++) {
-                    int pKeyW = w * pKey.info.strides[3];
-                    int pValW = w * pVal.info.strides[3];
-                    for(int z = 0; z < pKey.info.dims[2]; z++) {
-                        int pKeyWZ = pKeyW + z * pKey.info.strides[2];
-                        int pValWZ = pValW + z * pVal.info.strides[2];
-                        for(int y = 0; y < pKey.info.dims[1]; y++) {
-
-                            int pKeyOffset = pKeyWZ + y * pKey.info.strides[1];
-                            int pValOffset = pValWZ + y * pVal.info.strides[1];
-
-                            compute::buffer_iterator< type_t<Tk> > start= compute::make_buffer_iterator< type_t<Tk> >(pKey_buf, pKeyOffset);
-                            compute::buffer_iterator< type_t<Tk> > end  = compute::make_buffer_iterator< type_t<Tk> >(pKey_buf, pKeyOffset + pKey.info.dims[0]);
-                            compute::buffer_iterator< type_t<Tv> > vals = compute::make_buffer_iterator< type_t<Tv> >(pVal_buf, pValOffset);
-                            if(isAscending) {
-                                compute::sort_by_key(start, end, vals, c_queue);
-                            } else {
-                                compute::sort_by_key(start, end, vals,
-                                                     compute::greater< type_t<Tk> >(), c_queue);
-                            }
-                        }
-                    }
-                }
-
-                CL_DEBUG_FINISH(getQueue());
-            } catch (cl::Error err) {
-                CL_TO_AF_ERROR(err);
-                throw;
-            }
-        }
+        void sort0ByKeyIterative(Param pKey, Param pVal);
 
         template<typename Tk_, typename Tv_, bool isAscending, int dim>
-        void sortByKeyBatched(Param pKey, Param pVal)
-        {
-            typedef type_t<Tk_> Tk;
-            typedef type_t<Tv_> Tv;
-
-            try {
-                af::dim4 inDims;
-                for(int i = 0; i < 4; i++)
-                    inDims[i] = pKey.info.dims[i];
-
-                // Sort dimension
-                // tileDims * seqDims = inDims
-                af::dim4 tileDims(1);
-                af::dim4 seqDims = inDims;
-                tileDims[dim] = inDims[dim];
-                seqDims[dim] = 1;
-
-                // Create/call iota
-                // Array<Tv> key = iota<Tv>(seqDims, tileDims);
-                dim4 keydims = inDims;
-                cl::Buffer* key = bufferAlloc(keydims.elements() * sizeof(unsigned));
-                Param pSeq;
-                pSeq.data = key;
-                pSeq.info.offset = 0;
-                pSeq.info.dims[0] = keydims[0];
-                pSeq.info.strides[0] = 1;
-                for(int i = 1; i < 4; i++) {
-                    pSeq.info.dims[i] = keydims[i];
-                    pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1];
-                }
-                kernel::iota<unsigned>(pSeq, seqDims, tileDims);
-
-                int elements = inDims.elements();
-
-                // Flat - Not required since inplace and both are continuous
-                //val.modDims(inDims.elements());
-                //key.modDims(inDims.elements());
-
-                // Sort indices
-                // sort_by_key<T, Tv, isAscending>(*resVal, *resKey, val, key, 0);
-                //kernel::sort0_by_key<T, Tv, isAscending>(pVal, pKey);
-                compute::command_queue c_queue(getQueue()());
-                compute::context c_context(getContext()());
-
-                // Create buffer iterators for seq
-                compute::buffer pSeq_buf((*pSeq.data)());
-                compute::buffer_iterator<unsigned> seq0 = compute::make_buffer_iterator<unsigned>(pSeq_buf, 0);
-                compute::buffer_iterator<unsigned> seqN = compute::make_buffer_iterator<unsigned>(pSeq_buf, elements);
-                // Create buffer iterators for key and val
-                compute::buffer pKey_buf((*pKey.data)());
-                compute::buffer pVal_buf((*pVal.data)());
-                compute::buffer_iterator<Tk> key0 = compute::make_buffer_iterator<Tk>(pKey_buf, 0);
-                compute::buffer_iterator<Tk> keyN = compute::make_buffer_iterator<Tk>(pKey_buf, elements);
-                compute::buffer_iterator<Tv> val0 = compute::make_buffer_iterator<Tv>(pVal_buf, 0);
-                compute::buffer_iterator<Tv> valN = compute::make_buffer_iterator<Tv>(pVal_buf, elements);
-
-                // Sort By Key for descending is stable in the reverse
-                // (greater) order. Sorting in ascending with negated values
-                // will give the right result
-                if(!isAscending) compute::transform(key0, keyN, key0, flipFunction<Tk>(), c_queue);
-
-                // Create a copy of the pKey buffer
-                cl::Buffer* cKey = bufferAlloc(elements * sizeof(Tk));
-                compute::buffer cKey_buf((*cKey)());
-                compute::buffer_iterator<Tk> cKey0 = compute::make_buffer_iterator<Tk>(cKey_buf, 0);
-                compute::buffer_iterator<Tk> cKeyN = compute::make_buffer_iterator<Tk>(cKey_buf, elements);
-                compute::copy(key0, keyN, cKey0, c_queue);
-
-                // FIRST SORT
-                compute::sort_by_key(key0, keyN, seq0, c_queue);
-                compute::sort_by_key(cKey0, cKeyN, val0, c_queue);
-
-                // Create a copy of the seq buffer after first sort
-                cl::Buffer* cSeq = bufferAlloc(elements * sizeof(unsigned));
-                compute::buffer cSeq_buf((*cSeq)());
-                compute::buffer_iterator<unsigned> cSeq0 = compute::make_buffer_iterator<unsigned>(cSeq_buf, 0);
-                compute::buffer_iterator<unsigned> cSeqN = compute::make_buffer_iterator<unsigned>(cSeq_buf, elements);
-                compute::copy(seq0, seqN, cSeq0, c_queue);
-
-                // SECOND SORT
-                // First call will sort key, second sort will sort val
-                // Needs to be ascending (true) in order to maintain the indices properly
-                //kernel::sort0_by_key<Tv, T, true>(pKey, pVal);
-                compute::sort_by_key(seq0, seqN, key0, c_queue);
-                compute::sort_by_key(cSeq0, cSeqN, val0, c_queue);
-
-                // If descending, flip it back
-                if(!isAscending) compute::transform(key0, keyN, key0, flipFunction<Tk>(), c_queue);
-
-                //// No need of doing moddims here because the original Array<T>
-                //// dimensions have not been changed
-                ////val.modDims(inDims);
-
-                CL_DEBUG_FINISH(getQueue());
-                bufferFree(key);
-                bufferFree(cSeq);
-                bufferFree(cKey);
-            } catch (cl::Error err) {
-                CL_TO_AF_ERROR(err);
-                throw;
-            }
-        }
+        void sortByKeyBatched(Param pKey, Param pVal);
 
         template<typename Tk, typename Tv, bool isAscending>
-        void sort0ByKey(Param pKey, Param pVal)
-        {
-            int higherDims =  pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3];
-            // TODO Make a better heurisitic
-            if(higherDims > 0)
-                kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(pKey, pVal);
-            else
-                kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(pKey, pVal);
-        }
+        void sort0ByKey(Param pKey, Param pVal);
     }
 }
-
-#pragma GCC diagnostic pop
diff --git a/src/backend/opencl/sort_by_key/b8.cpp b/src/backend/opencl/kernel/sort_by_key/b8.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/b8.cpp
rename to src/backend/opencl/kernel/sort_by_key/b8.cpp
index 118d20dc92..ad0d7f48ae 100644
--- a/src/backend/opencl/sort_by_key/b8.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/b8.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(char,true)
     INSTANTIATE1(char,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/f32.cpp b/src/backend/opencl/kernel/sort_by_key/f32.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/f32.cpp
rename to src/backend/opencl/kernel/sort_by_key/f32.cpp
index a7baf486f1..a1e9ae5f1f 100644
--- a/src/backend/opencl/sort_by_key/f32.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/f32.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(float,true)
     INSTANTIATE1(float,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/f64.cpp b/src/backend/opencl/kernel/sort_by_key/f64.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/f64.cpp
rename to src/backend/opencl/kernel/sort_by_key/f64.cpp
index 6971c90982..7fb7a79bd8 100644
--- a/src/backend/opencl/sort_by_key/f64.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/f64.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(double,true)
     INSTANTIATE1(double,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/s16.cpp b/src/backend/opencl/kernel/sort_by_key/s16.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/s16.cpp
rename to src/backend/opencl/kernel/sort_by_key/s16.cpp
index 44e17b5030..491ea0e3a2 100644
--- a/src/backend/opencl/sort_by_key/s16.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/s16.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(short,true)
     INSTANTIATE1(short,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/s32.cpp b/src/backend/opencl/kernel/sort_by_key/s32.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/s32.cpp
rename to src/backend/opencl/kernel/sort_by_key/s32.cpp
index 9fed1a53b3..67ba20e7dd 100644
--- a/src/backend/opencl/sort_by_key/s32.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/s32.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(int,true)
     INSTANTIATE1(int,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/s64.cpp b/src/backend/opencl/kernel/sort_by_key/s64.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/s64.cpp
rename to src/backend/opencl/kernel/sort_by_key/s64.cpp
index e2ed8d687b..a48f36ee47 100644
--- a/src/backend/opencl/sort_by_key/s64.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/s64.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(intl,true)
     INSTANTIATE1(intl,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/u16.cpp b/src/backend/opencl/kernel/sort_by_key/u16.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/u16.cpp
rename to src/backend/opencl/kernel/sort_by_key/u16.cpp
index c53b68fb53..36678d0a42 100644
--- a/src/backend/opencl/sort_by_key/u16.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/u16.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(ushort,true)
     INSTANTIATE1(ushort,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/u32.cpp b/src/backend/opencl/kernel/sort_by_key/u32.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/u32.cpp
rename to src/backend/opencl/kernel/sort_by_key/u32.cpp
index c2e3e62163..f1e4b5322f 100644
--- a/src/backend/opencl/sort_by_key/u32.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/u32.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(uint,true)
     INSTANTIATE1(uint,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/u64.cpp b/src/backend/opencl/kernel/sort_by_key/u64.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/u64.cpp
rename to src/backend/opencl/kernel/sort_by_key/u64.cpp
index 89649b1ba5..0a6f5b0c4f 100644
--- a/src/backend/opencl/sort_by_key/u64.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/u64.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(uintl,true)
     INSTANTIATE1(uintl,false)
 }
+}
diff --git a/src/backend/opencl/sort_by_key/u8.cpp b/src/backend/opencl/kernel/sort_by_key/u8.cpp
similarity index 87%
rename from src/backend/opencl/sort_by_key/u8.cpp
rename to src/backend/opencl/kernel/sort_by_key/u8.cpp
index 2dfb4c3a73..45af011a86 100644
--- a/src/backend/opencl/sort_by_key/u8.cpp
+++ b/src/backend/opencl/kernel/sort_by_key/u8.cpp
@@ -7,10 +7,13 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include "impl.hpp"
+#include <kernel/sort_by_key_impl.hpp>
 
 namespace opencl
+{
+namespace kernel
 {
     INSTANTIATE1(uchar,true)
     INSTANTIATE1(uchar,false)
 }
+}
diff --git a/src/backend/opencl/kernel/sort_by_key_impl.hpp b/src/backend/opencl/kernel/sort_by_key_impl.hpp
new file mode 100644
index 0000000000..dc1aa2735f
--- /dev/null
+++ b/src/backend/opencl/kernel/sort_by_key_impl.hpp
@@ -0,0 +1,373 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <kernel_headers/sort_pair.hpp>
+#include <kernel/sort_by_key.hpp>
+#include <kernel/sort_helper.hpp>
+#include <program.hpp>
+#include <memory.hpp>
+#include <traits.hpp>
+#include <string>
+#include <mutex>
+#include <dispatch.hpp>
+#include <Param.hpp>
+#include <debug_opencl.hpp>
+#include <math.hpp>
+#include <kernel/iota.hpp>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+#include <boost/compute/core.hpp>
+#include <boost/compute/algorithm/sort_by_key.hpp>
+#include <boost/compute/functional/operator.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/get.hpp>
+#include <boost/compute/functional/field.hpp>
+#include <boost/compute/types/pair.hpp>
+
+namespace compute = boost::compute;
+
+using cl::Buffer;
+using cl::Program;
+using cl::Kernel;
+using cl::make_kernel;
+using cl::EnqueueArgs;
+using cl::NDRange;
+using std::string;
+
+template<typename Tk, typename Tv, bool isAscending>
+inline
+boost::compute::function<bool(const std::pair<Tk, Tv>, const std::pair<Tk, Tv>)>
+makeCompareFunction()
+{
+    // Cannot use isAscending in BOOST_COMPUTE_FUNCTION
+    if(isAscending) {
+        BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair<Tk, Tv> lhs, std::pair<Tk, Tv> rhs),
+            {
+                return lhs.first < rhs.first;
+            }
+        );
+        return IPCompare;
+    } else {
+        BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair<Tk, Tv> lhs, std::pair<Tk, Tv> rhs),
+            {
+                return lhs.first > rhs.first;
+            }
+        );
+        return IPCompare;
+    }
+}
+
+template<typename Tk>
+inline boost::compute::function<Tk(Tk)>
+flipFunction()
+{
+    BOOST_COMPUTE_FUNCTION(Tk, negateFn, (const Tk x),
+        {
+            return -x;
+        }
+    );
+
+    return negateFn;
+}
+
+#define INSTANTIATE_FLIP(TY, XMAX)                              \
+template<> inline boost::compute::function<TY(TY)>              \
+flipFunction<TY>()                                              \
+{                                                               \
+    BOOST_COMPUTE_FUNCTION(TY, negateFn, (const TY x),          \
+       {                                                        \
+            return XMAX - x;                                    \
+        }                                                       \
+    );                                                          \
+                                                                \
+    return negateFn;                                            \
+}
+
+INSTANTIATE_FLIP(unsigned, UINT_MAX)
+INSTANTIATE_FLIP(unsigned short, USHRT_MAX)
+INSTANTIATE_FLIP(unsigned char, UCHAR_MAX)
+INSTANTIATE_FLIP(cl_ulong, ULONG_MAX)
+
+#undef INSTANTIATE_FLIP
+
+namespace opencl
+{
+    namespace kernel
+    {
+        static const int copyPairIter = 4;
+
+        template<typename Tk, typename Tv>
+        void makePair(cl::Buffer *out, const cl::Buffer *first, const cl::Buffer *second, const unsigned N)
+        {
+            try {
+                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+                static std::map<int, Program*>   sortPairProgs;
+                static std::map<int, Kernel*>  sortPairKernels;
+
+                int device = getActiveDeviceId();
+
+                std::call_once( compileFlags[device], [device] () {
+                    std::ostringstream options;
+                    options << " -D Tk="      << dtype_traits<Tk>::getName()
+                            << " -D Tv="      << dtype_traits<Tv>::getName()
+                            << " -D copyPairIter=" << copyPairIter;
+                    if (std::is_same<Tk, double >::value ||
+                        std::is_same<Tk, cdouble>::value ||
+                        std::is_same<Tv, double >::value ||
+                        std::is_same<Tv, cdouble>::value) {
+                        options << " -D USE_DOUBLE";
+                    }
+                    Program prog;
+                    buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str());
+                    sortPairProgs[device]   = new Program(prog);
+                    sortPairKernels[device] = new Kernel(*sortPairProgs[device], "make_pair_kernel");
+                });
+
+                auto makePairOp = make_kernel<Buffer, const Buffer, const Buffer, const unsigned>
+                                          (*sortPairKernels[device]);
+
+                NDRange local(256, 1, 1);
+                NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1);
+
+                makePairOp(EnqueueArgs(getQueue(), global, local), *out, *first, *second, N);
+
+                CL_DEBUG_FINISH(getQueue());
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
+
+        template<typename Tk, typename Tv>
+        void splitPair(cl::Buffer *first, cl::Buffer *second, const cl::Buffer *in, const unsigned N)
+        {
+            try {
+                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+                static std::map<int, Program*>   sortPairProgs;
+                static std::map<int, Kernel*>  sortPairKernels;
+
+                int device = getActiveDeviceId();
+
+                std::call_once( compileFlags[device], [device] () {
+                    std::ostringstream options;
+                    options << " -D Tk="      << dtype_traits<Tk>::getName()
+                            << " -D Tv="      << dtype_traits<Tv>::getName()
+                            << " -D copyPairIter=" << copyPairIter;
+                    if (std::is_same<Tk, double >::value ||
+                        std::is_same<Tk, cdouble>::value ||
+                        std::is_same<Tv, double >::value ||
+                        std::is_same<Tv, cdouble>::value) {
+                        options << " -D USE_DOUBLE";
+                    }
+                    Program prog;
+                    buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str());
+                    sortPairProgs[device]   = new Program(prog);
+                    sortPairKernels[device] = new Kernel(*sortPairProgs[device], "split_pair_kernel");
+                });
+
+                auto splitPairOp = make_kernel<Buffer, Buffer, const Buffer, const unsigned>
+                                          (*sortPairKernels[device]);
+
+                NDRange local(256, 1, 1);
+                NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1);
+
+                splitPairOp(EnqueueArgs(getQueue(), global, local), *first, *second, *in, N);
+
+                CL_DEBUG_FINISH(getQueue());
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
+
+        template<typename Tk, typename Tv, bool isAscending>
+        void sort0ByKeyIterative(Param pKey, Param pVal)
+        {
+            try {
+                compute::command_queue c_queue(getQueue()());
+
+                compute::buffer pKey_buf((*pKey.data)());
+                compute::buffer pVal_buf((*pVal.data)());
+
+                for(int w = 0; w < pKey.info.dims[3]; w++) {
+                    int pKeyW = w * pKey.info.strides[3];
+                    int pValW = w * pVal.info.strides[3];
+                    for(int z = 0; z < pKey.info.dims[2]; z++) {
+                        int pKeyWZ = pKeyW + z * pKey.info.strides[2];
+                        int pValWZ = pValW + z * pVal.info.strides[2];
+                        for(int y = 0; y < pKey.info.dims[1]; y++) {
+
+                            int pKeyOffset = pKeyWZ + y * pKey.info.strides[1];
+                            int pValOffset = pValWZ + y * pVal.info.strides[1];
+
+                            compute::buffer_iterator< type_t<Tk> > start= compute::make_buffer_iterator< type_t<Tk> >(pKey_buf, pKeyOffset);
+                            compute::buffer_iterator< type_t<Tk> > end  = compute::make_buffer_iterator< type_t<Tk> >(pKey_buf, pKeyOffset + pKey.info.dims[0]);
+                            compute::buffer_iterator< type_t<Tv> > vals = compute::make_buffer_iterator< type_t<Tv> >(pVal_buf, pValOffset);
+                            if(isAscending) {
+                                compute::sort_by_key(start, end, vals, c_queue);
+                            } else {
+                                compute::sort_by_key(start, end, vals,
+                                                     compute::greater< type_t<Tk> >(), c_queue);
+                            }
+                        }
+                    }
+                }
+
+                CL_DEBUG_FINISH(getQueue());
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
+
+        template<typename Tk_, typename Tv_, bool isAscending, int dim>
+        void sortByKeyBatched(Param pKey, Param pVal)
+        {
+            typedef type_t<Tk_> Tk;
+            typedef type_t<Tv_> Tv;
+
+            try {
+                af::dim4 inDims;
+                for(int i = 0; i < 4; i++)
+                    inDims[i] = pKey.info.dims[i];
+
+                // Sort dimension
+                // tileDims * seqDims = inDims
+                af::dim4 tileDims(1);
+                af::dim4 seqDims = inDims;
+                tileDims[dim] = inDims[dim];
+                seqDims[dim] = 1;
+
+                // Create/call iota
+                // Array<Tv> key = iota<Tv>(seqDims, tileDims);
+                cl::Buffer* key = bufferAlloc(inDims.elements() * sizeof(unsigned));
+                Param pSeq;
+                pSeq.data = key;
+                pSeq.info.offset = 0;
+                pSeq.info.dims[0] = inDims[0];
+                pSeq.info.strides[0] = 1;
+                for(int i = 1; i < 4; i++) {
+                    pSeq.info.dims[i] = inDims[i];
+                    pSeq.info.strides[i] = pSeq.info.strides[i - 1] * pSeq.info.dims[i - 1];
+                }
+                kernel::iota<unsigned>(pSeq, seqDims, tileDims);
+
+                int elements = inDims.elements();
+
+                // Flat - Not required since inplace and both are continuous
+                //val.modDims(inDims.elements());
+                //key.modDims(inDims.elements());
+
+                // Sort indices
+                // sort_by_key<T, Tv, isAscending>(*resVal, *resKey, val, key, 0);
+                //kernel::sort0_by_key<T, Tv, isAscending>(pVal, pKey);
+                compute::command_queue c_queue(getQueue()());
+                compute::context c_context(getContext()());
+
+                // Create buffer iterators for seq
+                compute::buffer pSeq_buf((*pSeq.data)());
+                compute::buffer_iterator<unsigned> seq0 = compute::make_buffer_iterator<unsigned>(pSeq_buf, 0);
+                compute::buffer_iterator<unsigned> seqN = compute::make_buffer_iterator<unsigned>(pSeq_buf, elements);
+                // Create buffer iterators for key and val
+                compute::buffer pKey_buf((*pKey.data)());
+                compute::buffer pVal_buf((*pVal.data)());
+                compute::buffer_iterator<Tk> key0 = compute::make_buffer_iterator<Tk>(pKey_buf, 0);
+                compute::buffer_iterator<Tk> keyN = compute::make_buffer_iterator<Tk>(pKey_buf, elements);
+                compute::buffer_iterator<Tv> val0 = compute::make_buffer_iterator<Tv>(pVal_buf, 0);
+                compute::buffer_iterator<Tv> valN = compute::make_buffer_iterator<Tv>(pVal_buf, elements);
+
+                // Sort By Key for descending is stable in the reverse
+                // (greater) order. Sorting in ascending with negated values
+                // will give the right result
+                if(!isAscending) compute::transform(key0, keyN, key0, flipFunction<Tk>(), c_queue);
+
+                // Create a copy of the pKey buffer
+                cl::Buffer* cKey = bufferAlloc(elements * sizeof(Tk));
+                compute::buffer cKey_buf((*cKey)());
+                compute::buffer_iterator<Tk> cKey0 = compute::make_buffer_iterator<Tk>(cKey_buf, 0);
+                compute::buffer_iterator<Tk> cKeyN = compute::make_buffer_iterator<Tk>(cKey_buf, elements);
+                compute::copy(key0, keyN, cKey0, c_queue);
+
+                // FIRST SORT
+                compute::sort_by_key(key0, keyN, seq0, c_queue);
+                compute::sort_by_key(cKey0, cKeyN, val0, c_queue);
+
+                // Create a copy of the seq buffer after first sort
+                cl::Buffer* cSeq = bufferAlloc(elements * sizeof(unsigned));
+                compute::buffer cSeq_buf((*cSeq)());
+                compute::buffer_iterator<unsigned> cSeq0 = compute::make_buffer_iterator<unsigned>(cSeq_buf, 0);
+                compute::buffer_iterator<unsigned> cSeqN = compute::make_buffer_iterator<unsigned>(cSeq_buf, elements);
+                compute::copy(seq0, seqN, cSeq0, c_queue);
+
+                // SECOND SORT
+                // First call will sort key, second sort will sort val
+                // Needs to be ascending (true) in order to maintain the indices properly
+                //kernel::sort0_by_key<Tv, T, true>(pKey, pVal);
+                compute::sort_by_key(seq0, seqN, key0, c_queue);
+                compute::sort_by_key(cSeq0, cSeqN, val0, c_queue);
+
+                // If descending, flip it back
+                if(!isAscending) compute::transform(key0, keyN, key0, flipFunction<Tk>(), c_queue);
+
+                //// No need of doing moddims here because the original Array<T>
+                //// dimensions have not been changed
+                ////val.modDims(inDims);
+
+                CL_DEBUG_FINISH(getQueue());
+                bufferFree(key);
+                bufferFree(cSeq);
+                bufferFree(cKey);
+            } catch (cl::Error err) {
+                CL_TO_AF_ERROR(err);
+                throw;
+            }
+        }
+
+        template<typename Tk, typename Tv, bool isAscending>
+        void sort0ByKey(Param pKey, Param pVal)
+        {
+            int higherDims =  pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3];
+            // TODO Make a better heurisitic
+            if(higherDims > 0)
+                kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(pKey, pVal);
+            else
+                kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(pKey, pVal);
+        }
+
+#define INSTANTIATE(Tk, Tv, dr)                                                 \
+    template void sort0ByKey<Tk, Tv, dr>(Param okey, Param oval);               \
+    template void sort0ByKeyIterative<Tk, Tv, dr>(Param okey, Param oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 0>(Param okey, Param oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 1>(Param okey, Param oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 2>(Param okey, Param oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 3>(Param okey, Param oval);      \
+
+#define INSTANTIATE1(Tk    , dr) \
+    INSTANTIATE(Tk, float  , dr) \
+    INSTANTIATE(Tk, double , dr) \
+    INSTANTIATE(Tk, cfloat , dr) \
+    INSTANTIATE(Tk, cdouble, dr) \
+    INSTANTIATE(Tk, int    , dr) \
+    INSTANTIATE(Tk, uint   , dr) \
+    INSTANTIATE(Tk, short  , dr) \
+    INSTANTIATE(Tk, ushort , dr) \
+    INSTANTIATE(Tk, char   , dr) \
+    INSTANTIATE(Tk, uchar  , dr) \
+    INSTANTIATE(Tk, intl   , dr) \
+    INSTANTIATE(Tk, uintl  , dr)
+    }
+}
+
+#pragma GCC diagnostic pop
diff --git a/src/backend/opencl/kernel/sort_helper.hpp b/src/backend/opencl/kernel/sort_helper.hpp
index 899e0873fa..b8031c2314 100644
--- a/src/backend/opencl/kernel/sort_helper.hpp
+++ b/src/backend/opencl/kernel/sort_helper.hpp
@@ -8,75 +8,11 @@
  ********************************************************/
 
 #pragma once
-#include <kernel_headers/sort_pair.hpp>
-#include <program.hpp>
 #include <traits.hpp>
-#include <string>
-#include <mutex>
-#include <map>
-#include <utility>
 #include <dispatch.hpp>
 #include <Param.hpp>
 #include <debug_opencl.hpp>
 
-#include <boost/compute/function.hpp>
-
-template<typename Tk, typename Tv, bool isAscending>
-inline
-boost::compute::function<bool(const std::pair<Tk, Tv>, const std::pair<Tk, Tv>)>
-makeCompareFunction()
-{
-    // Cannot use isAscending in BOOST_COMPUTE_FUNCTION
-    if(isAscending) {
-        BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair<Tk, Tv> lhs, std::pair<Tk, Tv> rhs),
-            {
-                return lhs.first < rhs.first;
-            }
-        );
-        return IPCompare;
-    } else {
-        BOOST_COMPUTE_FUNCTION(bool, IPCompare, (std::pair<Tk, Tv> lhs, std::pair<Tk, Tv> rhs),
-            {
-                return lhs.first > rhs.first;
-            }
-        );
-        return IPCompare;
-    }
-}
-
-template<typename Tk>
-inline boost::compute::function<Tk(Tk)>
-flipFunction()
-{
-    BOOST_COMPUTE_FUNCTION(Tk, negateFn, (const Tk x),
-        {
-            return -x;
-        }
-    );
-
-    return negateFn;
-}
-
-#define INSTANTIATE_FLIP(TY, XMAX)                              \
-template<> inline boost::compute::function<TY(TY)>              \
-flipFunction<TY>()                                              \
-{                                                               \
-    BOOST_COMPUTE_FUNCTION(TY, negateFn, (const TY x),          \
-       {                                                        \
-            return XMAX - x;                                    \
-        }                                                       \
-    );                                                          \
-                                                                \
-    return negateFn;                                            \
-}
-
-INSTANTIATE_FLIP(unsigned, UINT_MAX)
-INSTANTIATE_FLIP(unsigned short, USHRT_MAX)
-INSTANTIATE_FLIP(unsigned char, UCHAR_MAX)
-INSTANTIATE_FLIP(cl_ulong, ULONG_MAX)
-
-#undef INSTANTIATE_FLIP
-
 namespace opencl
 {
     namespace kernel
@@ -107,91 +43,6 @@ namespace opencl
         using type_t = typename conditional<is_same<T, uintl>::value,
                                             cl_ulong, ltype_t<T>
                                            >::type;
-
-        static const int copyPairIter = 4;
-
-        template<typename Tk, typename Tv>
-        void makePair(cl::Buffer *out, const cl::Buffer *first, const cl::Buffer *second, const unsigned N)
-        {
-            try {
-                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-                static std::map<int, Program*>   sortPairProgs;
-                static std::map<int, Kernel*>  sortPairKernels;
-
-                int device = getActiveDeviceId();
-
-                std::call_once( compileFlags[device], [device] () {
-                    std::ostringstream options;
-                    options << " -D Tk="      << dtype_traits<Tk>::getName()
-                            << " -D Tv="      << dtype_traits<Tv>::getName()
-                            << " -D copyPairIter=" << copyPairIter;
-                    if (std::is_same<Tk, double >::value ||
-                        std::is_same<Tk, cdouble>::value ||
-                        std::is_same<Tv, double >::value ||
-                        std::is_same<Tv, cdouble>::value) {
-                        options << " -D USE_DOUBLE";
-                    }
-                    Program prog;
-                    buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str());
-                    sortPairProgs[device]   = new Program(prog);
-                    sortPairKernels[device] = new Kernel(*sortPairProgs[device], "make_pair_kernel");
-                });
-
-                auto makePairOp = make_kernel<Buffer, const Buffer, const Buffer, const unsigned>
-                                          (*sortPairKernels[device]);
-
-                NDRange local(256, 1, 1);
-                NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1);
-
-                makePairOp(EnqueueArgs(getQueue(), global, local), *out, *first, *second, N);
-
-                CL_DEBUG_FINISH(getQueue());
-            } catch (cl::Error err) {
-                CL_TO_AF_ERROR(err);
-                throw;
-            }
-        }
-
-        template<typename Tk, typename Tv>
-        void splitPair(cl::Buffer *first, cl::Buffer *second, const cl::Buffer *in, const unsigned N)
-        {
-            try {
-                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
-                static std::map<int, Program*>   sortPairProgs;
-                static std::map<int, Kernel*>  sortPairKernels;
-
-                int device = getActiveDeviceId();
-
-                std::call_once( compileFlags[device], [device] () {
-                    std::ostringstream options;
-                    options << " -D Tk="      << dtype_traits<Tk>::getName()
-                            << " -D Tv="      << dtype_traits<Tv>::getName()
-                            << " -D copyPairIter=" << copyPairIter;
-                    if (std::is_same<Tk, double >::value ||
-                        std::is_same<Tk, cdouble>::value ||
-                        std::is_same<Tv, double >::value ||
-                        std::is_same<Tv, cdouble>::value) {
-                        options << " -D USE_DOUBLE";
-                    }
-                    Program prog;
-                    buildProgram(prog, sort_pair_cl, sort_pair_cl_len, options.str());
-                    sortPairProgs[device]   = new Program(prog);
-                    sortPairKernels[device] = new Kernel(*sortPairProgs[device], "split_pair_kernel");
-                });
-
-                auto splitPairOp = make_kernel<Buffer, Buffer, const Buffer, const unsigned>
-                                          (*sortPairKernels[device]);
-
-                NDRange local(256, 1, 1);
-                NDRange global(local[0] * divup(N, local[0] * copyPairIter), 1, 1);
-
-                splitPairOp(EnqueueArgs(getQueue(), global, local), *first, *second, *in, N);
-
-                CL_DEBUG_FINISH(getQueue());
-            } catch (cl::Error err) {
-                CL_TO_AF_ERROR(err);
-                throw;
-            }
-        }
     }
 }
+
diff --git a/src/backend/opencl/sort_by_key/impl.hpp b/src/backend/opencl/sort_by_key.cpp
similarity index 65%
rename from src/backend/opencl/sort_by_key/impl.hpp
rename to src/backend/opencl/sort_by_key.cpp
index f68fe91b3c..27c2dc2462 100644
--- a/src/backend/opencl/sort_by_key/impl.hpp
+++ b/src/backend/opencl/sort_by_key.cpp
@@ -55,26 +55,36 @@ namespace opencl
         }
     }
 
-#define INSTANTIATE(Tk, Tv, isAscending)                                \
-    template void                                                       \
-    sort_by_key<Tk, Tv, isAscending>(Array<Tk> &okey, Array<Tv> &oval,  \
-                                     const Array<Tk> &ikey,             \
-                                     const Array<Tv> &ival,             \
-                                     const unsigned dim);               \
+#define INSTANTIATE(Tk, Tv)                                                         \
+    template void sort_by_key<Tk, Tv, true >(Array<Tk> &okey, Array<Tv> &oval,      \
+                const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim);      \
+    template void sort_by_key<Tk, Tv, false>(Array<Tk> &okey, Array<Tv> &oval,      \
+                const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim);      \
 
+#define INSTANTIATE1(Tk    ) \
+    INSTANTIATE(Tk, float  ) \
+    INSTANTIATE(Tk, double ) \
+    INSTANTIATE(Tk, cfloat ) \
+    INSTANTIATE(Tk, cdouble) \
+    INSTANTIATE(Tk, int    ) \
+    INSTANTIATE(Tk, uint   ) \
+    INSTANTIATE(Tk, short  ) \
+    INSTANTIATE(Tk, ushort ) \
+    INSTANTIATE(Tk, char   ) \
+    INSTANTIATE(Tk, uchar  ) \
+    INSTANTIATE(Tk, intl   ) \
+    INSTANTIATE(Tk, uintl  )
 
-#define INSTANTIATE1(Tk, isAscending)           \
-    INSTANTIATE(Tk, float  , isAscending)       \
-    INSTANTIATE(Tk, double , isAscending)       \
-    INSTANTIATE(Tk, cfloat , isAscending)       \
-    INSTANTIATE(Tk, cdouble, isAscending)       \
-    INSTANTIATE(Tk, int    , isAscending)       \
-    INSTANTIATE(Tk, uint   , isAscending)       \
-    INSTANTIATE(Tk, char   , isAscending)       \
-    INSTANTIATE(Tk, uchar  , isAscending)       \
-    INSTANTIATE(Tk, short  , isAscending)       \
-    INSTANTIATE(Tk, ushort , isAscending)       \
-    INSTANTIATE(Tk, intl   , isAscending)       \
-    INSTANTIATE(Tk, uintl  , isAscending)       \
+
+INSTANTIATE1(float )
+INSTANTIATE1(double)
+INSTANTIATE1(int   )
+INSTANTIATE1(uint  )
+INSTANTIATE1(short )
+INSTANTIATE1(ushort)
+INSTANTIATE1(char  )
+INSTANTIATE1(uchar )
+INSTANTIATE1(intl  )
+INSTANTIATE1(uintl )
 
 }
diff --git a/src/backend/opencl/traits.hpp b/src/backend/opencl/traits.hpp
index 4e63095421..54ba158e8a 100644
--- a/src/backend/opencl/traits.hpp
+++ b/src/backend/opencl/traits.hpp
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <defines.hpp>
+#include <types.hpp>
 #include <af/traits.hpp>
 #include <string>
 #include <sstream>

From cbefc1f2952ed780b4c4f71b98f6bb9bdf54d646 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 21:22:48 -0400
Subject: [PATCH 25/28] Instantiate sort_by_key kernels in separately in cpu

---
 src/backend/cpu/CMakeLists.txt              |   3 +-
 src/backend/cpu/kernel/sort_by_key.hpp      | 120 +-------------
 src/backend/cpu/kernel/sort_by_key/b8.cpp   |  19 +++
 src/backend/cpu/kernel/sort_by_key/f32.cpp  |  19 +++
 src/backend/cpu/kernel/sort_by_key/f64.cpp  |  19 +++
 src/backend/cpu/kernel/sort_by_key/s16.cpp  |  19 +++
 src/backend/cpu/kernel/sort_by_key/s32.cpp  |  19 +++
 src/backend/cpu/kernel/sort_by_key/s64.cpp  |  19 +++
 src/backend/cpu/kernel/sort_by_key/u16.cpp  |  19 +++
 src/backend/cpu/kernel/sort_by_key/u32.cpp  |  19 +++
 src/backend/cpu/kernel/sort_by_key/u64.cpp  |  19 +++
 src/backend/cpu/kernel/sort_by_key/u8.cpp   |  19 +++
 src/backend/cpu/kernel/sort_by_key_impl.hpp | 166 ++++++++++++++++++++
 13 files changed, 361 insertions(+), 118 deletions(-)
 create mode 100644 src/backend/cpu/kernel/sort_by_key/b8.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/f32.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/f64.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/s16.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/s32.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/s64.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/u16.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/u32.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/u64.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key/u8.cpp
 create mode 100644 src/backend/cpu/kernel/sort_by_key_impl.hpp

diff --git a/src/backend/cpu/CMakeLists.txt b/src/backend/cpu/CMakeLists.txt
index f7857ec6d6..0ae74e8384 100644
--- a/src/backend/cpu/CMakeLists.txt
+++ b/src/backend/cpu/CMakeLists.txt
@@ -93,7 +93,8 @@ FILE(GLOB cpu_headers
     "*.h")
 
 FILE(GLOB cpu_sources
-    "*.cpp")
+    "*.cpp"
+    "kernel/sort_by_key/*.cpp")
 
 LIST(SORT cpu_headers)
 LIST(SORT cpu_sources)
diff --git a/src/backend/cpu/kernel/sort_by_key.hpp b/src/backend/cpu/kernel/sort_by_key.hpp
index 1be4a94d3a..55d5a89337 100644
--- a/src/backend/cpu/kernel/sort_by_key.hpp
+++ b/src/backend/cpu/kernel/sort_by_key.hpp
@@ -8,15 +8,8 @@
  ********************************************************/
 
 #pragma once
-#include <af/defines.h>
 #include <Array.hpp>
-#include <math.hpp>
-#include <algorithm>
-#include <numeric>
-#include <queue>
 #include <err_cpu.hpp>
-#include <functional>
-#include <kernel/sort_helper.hpp>
 
 namespace cpu
 {
@@ -24,120 +17,13 @@ namespace kernel
 {
 
 template<typename Tk, typename Tv, bool isAscending>
-void sort0ByKeyIterative(Array<Tk> okey, Array<Tv> oval)
-{
-    // Get pointers and initialize original index locations
-    Tk *okey_ptr = okey.get();
-    Tv *oval_ptr = oval.get();
-
-    std::vector<IndexPair<Tk, Tv> > X;
-    X.reserve(okey.dims()[0]);
-
-    for(dim_t w = 0; w < okey.dims()[3]; w++) {
-        dim_t okeyW = w * okey.strides()[3];
-        dim_t ovalW = w * oval.strides()[3];
-
-        for(dim_t z = 0; z < okey.dims()[2]; z++) {
-            dim_t okeyWZ = okeyW + z * okey.strides()[2];
-            dim_t ovalWZ = ovalW + z * oval.strides()[2];
-
-            for(dim_t y = 0; y < okey.dims()[1]; y++) {
-
-                dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
-                dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
-
-                X.clear();
-                std::transform(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims()[0],
-                               oval_ptr + ovalOffset,
-                               std::back_inserter(X),
-                               [](Tk v_, Tv i_) { return std::make_pair(v_, i_); }
-                               );
-
-                std::stable_sort(X.begin(), X.end(), IPCompare<Tk, Tv, isAscending>());
-
-                for(unsigned it = 0; it < X.size(); it++) {
-                    okey_ptr[okeyOffset + it] = X[it].first;
-                    oval_ptr[ovalOffset + it] = X[it].second;
-                }
-            }
-        }
-    }
-
-    return;
-}
+void sort0ByKeyIterative(Array<Tk> okey, Array<Tv> oval);
 
 template<typename Tk, typename Tv, bool isAscending, int dim>
-void sortByKeyBatched(Array<Tk> okey, Array<Tv> oval)
-{
-    af::dim4 inDims = okey.dims();
-
-    af::dim4 tileDims(1);
-    af::dim4 seqDims = inDims;
-    tileDims[dim] = inDims[dim];
-    seqDims[dim] = 1;
-
-    uint* key = memAlloc<uint>(inDims.elements());
-    // IOTA
-    {
-        af::dim4 dims    = inDims;
-        uint* out        = key;
-        af::dim4 strides(1);
-        for(int i = 1; i < 4; i++)
-            strides[i] = strides[i-1] * dims[i-1];
-
-        for(dim_t w = 0; w < dims[3]; w++) {
-            dim_t offW = w * strides[3];
-            uint okeyW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2];
-            for(dim_t z = 0; z < dims[2]; z++) {
-                dim_t offWZ = offW + z * strides[2];
-                uint okeyZ = okeyW + (z % seqDims[2]) * seqDims[0] * seqDims[1];
-                for(dim_t y = 0; y < dims[1]; y++) {
-                    dim_t offWZY = offWZ + y * strides[1];
-                    uint okeyY = okeyZ + (y % seqDims[1]) * seqDims[0];
-                    for(dim_t x = 0; x < dims[0]; x++) {
-                        dim_t id = offWZY + x;
-                        out[id] = okeyY + (x % seqDims[0]);
-                    }
-                }
-            }
-        }
-    }
-
-    // initialize original index locations
-    Tk *okey_ptr = okey.get();
-    Tv *oval_ptr = oval.get();
-
-    std::vector<KeyIndexPair<Tk, Tv> > X;
-    X.reserve(okey.elements());
-
-    for(unsigned i = 0; i < okey.elements(); i++) {
-        X.push_back(std::make_pair(std::make_pair(okey_ptr[i], oval_ptr[i]), key[i]));
-    }
-
-    memFree(key); // key is no longer required
-
-    std::stable_sort(X.begin(), X.end(), KIPCompareV<Tk, Tv, isAscending>());
-
-    std::stable_sort(X.begin(), X.end(), KIPCompareK<Tk, Tv, true>());
-
-    for(unsigned it = 0; it < okey.elements(); it++) {
-        okey_ptr[it] = X[it].first.first;
-        oval_ptr[it] = X[it].first.second;
-    }
-
-    return;
-}
+void sortByKeyBatched(Array<Tk> okey, Array<Tv> oval);
 
 template<typename Tk, typename Tv, bool isAscending>
-void sort0ByKey(Array<Tk> okey, Array<Tv> oval)
-{
-    int higherDims =  okey.dims()[1] * okey.dims()[2] * okey.dims()[3];
-    // TODO Make a better heurisitic
-    if(higherDims > 0)
-        kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
-    else
-        kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(okey, oval);
-}
+void sort0ByKey(Array<Tk> okey, Array<Tv> oval);
 
 }
 }
diff --git a/src/backend/cpu/kernel/sort_by_key/b8.cpp b/src/backend/cpu/kernel/sort_by_key/b8.cpp
new file mode 100644
index 0000000000..855e7a93ca
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/b8.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(char,true)
+    INSTANTIATE1(char,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/f32.cpp b/src/backend/cpu/kernel/sort_by_key/f32.cpp
new file mode 100644
index 0000000000..11d8139957
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/f32.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(float,true)
+    INSTANTIATE1(float,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/f64.cpp b/src/backend/cpu/kernel/sort_by_key/f64.cpp
new file mode 100644
index 0000000000..21746d773a
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/f64.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(double,true)
+    INSTANTIATE1(double,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/s16.cpp b/src/backend/cpu/kernel/sort_by_key/s16.cpp
new file mode 100644
index 0000000000..50b718d04c
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/s16.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(short,true)
+    INSTANTIATE1(short,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/s32.cpp b/src/backend/cpu/kernel/sort_by_key/s32.cpp
new file mode 100644
index 0000000000..c50efcdc26
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/s32.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(int,true)
+    INSTANTIATE1(int,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/s64.cpp b/src/backend/cpu/kernel/sort_by_key/s64.cpp
new file mode 100644
index 0000000000..82946f820e
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/s64.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(intl,true)
+    INSTANTIATE1(intl,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/u16.cpp b/src/backend/cpu/kernel/sort_by_key/u16.cpp
new file mode 100644
index 0000000000..feedd1d56e
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/u16.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(ushort,true)
+    INSTANTIATE1(ushort,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/u32.cpp b/src/backend/cpu/kernel/sort_by_key/u32.cpp
new file mode 100644
index 0000000000..cd514af19a
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/u32.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(uint,true)
+    INSTANTIATE1(uint,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/u64.cpp b/src/backend/cpu/kernel/sort_by_key/u64.cpp
new file mode 100644
index 0000000000..ec955b3de7
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/u64.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(uintl,true)
+    INSTANTIATE1(uintl,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key/u8.cpp b/src/backend/cpu/kernel/sort_by_key/u8.cpp
new file mode 100644
index 0000000000..fd58cbfaa1
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key/u8.cpp
@@ -0,0 +1,19 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <kernel/sort_by_key_impl.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+    INSTANTIATE1(uchar,true)
+    INSTANTIATE1(uchar,false)
+}
+}
diff --git a/src/backend/cpu/kernel/sort_by_key_impl.hpp b/src/backend/cpu/kernel/sort_by_key_impl.hpp
new file mode 100644
index 0000000000..fcd415c29c
--- /dev/null
+++ b/src/backend/cpu/kernel/sort_by_key_impl.hpp
@@ -0,0 +1,166 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <kernel/sort_by_key.hpp>
+#include <kernel/sort_helper.hpp>
+#include <Array.hpp>
+#include <math.hpp>
+#include <algorithm>
+#include <numeric>
+#include <queue>
+#include <err_cpu.hpp>
+#include <functional>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename Tk, typename Tv, bool isAscending>
+void sort0ByKeyIterative(Array<Tk> okey, Array<Tv> oval)
+{
+    // Get pointers and initialize original index locations
+    Tk *okey_ptr = okey.get();
+    Tv *oval_ptr = oval.get();
+
+    std::vector<IndexPair<Tk, Tv> > X;
+    X.reserve(okey.dims()[0]);
+
+    for(dim_t w = 0; w < okey.dims()[3]; w++) {
+        dim_t okeyW = w * okey.strides()[3];
+        dim_t ovalW = w * oval.strides()[3];
+
+        for(dim_t z = 0; z < okey.dims()[2]; z++) {
+            dim_t okeyWZ = okeyW + z * okey.strides()[2];
+            dim_t ovalWZ = ovalW + z * oval.strides()[2];
+
+            for(dim_t y = 0; y < okey.dims()[1]; y++) {
+
+                dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
+                dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
+
+                X.clear();
+                std::transform(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims()[0],
+                               oval_ptr + ovalOffset,
+                               std::back_inserter(X),
+                               [](Tk v_, Tv i_) { return std::make_pair(v_, i_); }
+                               );
+
+                std::stable_sort(X.begin(), X.end(), IPCompare<Tk, Tv, isAscending>());
+
+                for(unsigned it = 0; it < X.size(); it++) {
+                    okey_ptr[okeyOffset + it] = X[it].first;
+                    oval_ptr[ovalOffset + it] = X[it].second;
+                }
+            }
+        }
+    }
+
+    return;
+}
+
+template<typename Tk, typename Tv, bool isAscending, int dim>
+void sortByKeyBatched(Array<Tk> okey, Array<Tv> oval)
+{
+    af::dim4 inDims = okey.dims();
+
+    af::dim4 tileDims(1);
+    af::dim4 seqDims = inDims;
+    tileDims[dim] = inDims[dim];
+    seqDims[dim] = 1;
+
+    uint* key = memAlloc<uint>(inDims.elements());
+    // IOTA
+    {
+        af::dim4 dims    = inDims;
+        uint* out        = key;
+        af::dim4 strides(1);
+        for(int i = 1; i < 4; i++)
+            strides[i] = strides[i-1] * dims[i-1];
+
+        for(dim_t w = 0; w < dims[3]; w++) {
+            dim_t offW = w * strides[3];
+            uint okeyW = (w % seqDims[3]) * seqDims[0] * seqDims[1] * seqDims[2];
+            for(dim_t z = 0; z < dims[2]; z++) {
+                dim_t offWZ = offW + z * strides[2];
+                uint okeyZ = okeyW + (z % seqDims[2]) * seqDims[0] * seqDims[1];
+                for(dim_t y = 0; y < dims[1]; y++) {
+                    dim_t offWZY = offWZ + y * strides[1];
+                    uint okeyY = okeyZ + (y % seqDims[1]) * seqDims[0];
+                    for(dim_t x = 0; x < dims[0]; x++) {
+                        dim_t id = offWZY + x;
+                        out[id] = okeyY + (x % seqDims[0]);
+                    }
+                }
+            }
+        }
+    }
+
+    // initialize original index locations
+    Tk *okey_ptr = okey.get();
+    Tv *oval_ptr = oval.get();
+
+    std::vector<KeyIndexPair<Tk, Tv> > X;
+    X.reserve(okey.elements());
+
+    for(unsigned i = 0; i < okey.elements(); i++) {
+        X.push_back(std::make_pair(std::make_pair(okey_ptr[i], oval_ptr[i]), key[i]));
+    }
+
+    memFree(key); // key is no longer required
+
+    std::stable_sort(X.begin(), X.end(), KIPCompareV<Tk, Tv, isAscending>());
+
+    std::stable_sort(X.begin(), X.end(), KIPCompareK<Tk, Tv, true>());
+
+    for(unsigned it = 0; it < okey.elements(); it++) {
+        okey_ptr[it] = X[it].first.first;
+        oval_ptr[it] = X[it].first.second;
+    }
+
+    return;
+}
+
+template<typename Tk, typename Tv, bool isAscending>
+void sort0ByKey(Array<Tk> okey, Array<Tv> oval)
+{
+    int higherDims =  okey.dims()[1] * okey.dims()[2] * okey.dims()[3];
+    // TODO Make a better heurisitic
+    if(higherDims > 4)
+        kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
+    else
+        kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(okey, oval);
+}
+
+#define INSTANTIATE(Tk, Tv, dr)                                                         \
+    template void sort0ByKey<Tk, Tv, dr>(Array<Tk> okey, Array<Tv> oval);               \
+    template void sort0ByKeyIterative<Tk, Tv, dr>(Array<Tk> okey, Array<Tv> oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 0>(Array<Tk> okey, Array<Tv> oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 1>(Array<Tk> okey, Array<Tv> oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 2>(Array<Tk> okey, Array<Tv> oval);      \
+    template void sortByKeyBatched<Tk, Tv, dr, 3>(Array<Tk> okey, Array<Tv> oval);      \
+
+#define INSTANTIATE1(Tk    , dr) \
+    INSTANTIATE(Tk, float  , dr) \
+    INSTANTIATE(Tk, double , dr) \
+    INSTANTIATE(Tk, cfloat , dr) \
+    INSTANTIATE(Tk, cdouble, dr) \
+    INSTANTIATE(Tk, int    , dr) \
+    INSTANTIATE(Tk, uint   , dr) \
+    INSTANTIATE(Tk, short  , dr) \
+    INSTANTIATE(Tk, ushort , dr) \
+    INSTANTIATE(Tk, char   , dr) \
+    INSTANTIATE(Tk, uchar  , dr) \
+    INSTANTIATE(Tk, intl   , dr) \
+    INSTANTIATE(Tk, uintl  , dr)
+}
+}
+

From 45574db72c7bdc3470ec9d3f4f0fd212dd76dac1 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Thu, 21 Apr 2016 21:24:08 -0400
Subject: [PATCH 26/28] Sort by key cuda - create pair memory using memalloc,
 reasonable heuristic

---
 src/backend/cuda/kernel/sort_by_key_impl.hpp  | 30 ++++++++++---------
 .../opencl/kernel/sort_by_key_impl.hpp        |  2 +-
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/backend/cuda/kernel/sort_by_key_impl.hpp b/src/backend/cuda/kernel/sort_by_key_impl.hpp
index 66a6087401..bcc2fefeb4 100644
--- a/src/backend/cuda/kernel/sort_by_key_impl.hpp
+++ b/src/backend/cuda/kernel/sort_by_key_impl.hpp
@@ -117,6 +117,8 @@ namespace cuda
             for(int i = 0; i < 4; i++)
                 inDims[i] = pKey.dims[i];
 
+            const dim_t elements = inDims.elements();
+
             // Sort dimension
             // tileDims * seqDims = inDims
             af::dim4 tileDims(1);
@@ -126,34 +128,34 @@ namespace cuda
 
             // Create/call iota
             // Array<uint> key = iota<uint>(seqDims, tileDims);
-            af::dim4 keydims = inDims;
-            uint* key = memAlloc<uint>(keydims.elements());
+            uint* key = memAlloc<uint>(elements);
             Param<uint> pSeq;
             pSeq.ptr = key;
             pSeq.strides[0] = 1;
-            pSeq.dims[0] = keydims[0];
+            pSeq.dims[0] = inDims[0];
             for(int i = 1; i < 4; i++) {
-                pSeq.dims[i] = keydims[i];
+                pSeq.dims[i] = inDims[i];
                 pSeq.strides[i] = pSeq.strides[i - 1] * pSeq.dims[i - 1];
             }
             cuda::kernel::iota<uint>(pSeq, seqDims, tileDims);
 
             // Make pkey, pVal into a pair
-            thrust::device_vector<IndexPair<Tk, Tv> > X(inDims.elements());
-            IndexPair<Tk, Tv> *Xptr = thrust::raw_pointer_cast(X.data());
+            IndexPair<Tk, Tv> *Xptr = (IndexPair<Tk, Tv>*)memAlloc<char>(sizeof(IndexPair<Tk, Tv>) * elements);
 
             const int threads = 256;
-            int blocks = divup(inDims.elements(), threads * copyPairIter);
+            int blocks = divup(elements, threads * copyPairIter);
             CUDA_LAUNCH((makeIndexPair<Tk, Tv>), blocks, threads,
-                        Xptr, pKey.ptr, pVal.ptr, inDims.elements());
+                        Xptr, pKey.ptr, pVal.ptr, elements);
             POST_LAUNCH_CHECK();
 
+            thrust::device_ptr<IndexPair<Tk, Tv> > X = thrust::device_pointer_cast(Xptr);
+
             // Sort indices
             // Need to convert pSeq to thrust::device_ptr, otherwise thrust
             // throws weird errors for all *64 data types (double, intl, uintl etc)
             thrust::device_ptr<uint> dSeq = thrust::device_pointer_cast(pSeq.ptr);
             THRUST_SELECT(thrust::stable_sort_by_key,
-                          X.begin(), X.end(),
+                          X, X + elements,
                           dSeq,
                           IPCompare<Tk, Tv, isAscending>());
             POST_LAUNCH_CHECK();
@@ -161,13 +163,12 @@ namespace cuda
             // Needs to be ascending (true) in order to maintain the indices properly
             //kernel::sort0_by_key<uint, T, true>(pKey, pVal);
             THRUST_SELECT(thrust::stable_sort_by_key,
-                          dSeq,
-                          dSeq + inDims.elements(),
-                          X.begin());
+                          dSeq, dSeq + elements,
+                          X);
             POST_LAUNCH_CHECK();
 
             CUDA_LAUNCH((splitIndexPair<Tk, Tv>), blocks, threads,
-                        pKey.ptr, pVal.ptr, Xptr, inDims.elements());
+                        pKey.ptr, pVal.ptr, Xptr, elements);
             POST_LAUNCH_CHECK();
 
             // No need of doing moddims here because the original Array<T>
@@ -175,6 +176,7 @@ namespace cuda
             //val.modDims(inDims);
 
             memFree(key);
+            memFree((char*)Xptr);
         }
 
         template<typename Tk, typename Tv, bool isAscending>
@@ -182,7 +184,7 @@ namespace cuda
         {
             int higherDims =  okey.dims[1] * okey.dims[2] * okey.dims[3];
             // TODO Make a better heurisitic
-            if(higherDims > 5)
+            if(higherDims > 4)
                 kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(okey, oval);
             else
                 kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(okey, oval);
diff --git a/src/backend/opencl/kernel/sort_by_key_impl.hpp b/src/backend/opencl/kernel/sort_by_key_impl.hpp
index dc1aa2735f..243034541f 100644
--- a/src/backend/opencl/kernel/sort_by_key_impl.hpp
+++ b/src/backend/opencl/kernel/sort_by_key_impl.hpp
@@ -340,7 +340,7 @@ namespace opencl
         {
             int higherDims =  pKey.info.dims[1] * pKey.info.dims[2] * pKey.info.dims[3];
             // TODO Make a better heurisitic
-            if(higherDims > 0)
+            if(higherDims > 5)
                 kernel::sortByKeyBatched<Tk, Tv, isAscending, 0>(pKey, pVal);
             else
                 kernel::sort0ByKeyIterative<Tk, Tv, isAscending>(pKey, pVal);

From 840ea28d28fa0f63e860989008490f491e5e1fc3 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Fri, 22 Apr 2016 14:20:14 -0400
Subject: [PATCH 27/28] Remove sort 0 dim restriction note from documentation

---
 include/af/algorithm.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/include/af/algorithm.h b/include/af/algorithm.h
index 792d6e2f44..a25120ffe3 100644
--- a/include/af/algorithm.h
+++ b/include/af/algorithm.h
@@ -357,8 +357,6 @@ namespace af
        \return the sorted output
 
        \ingroup sort_func_sort
-
-       \note \p dim is currently restricted to 0.
     */
     AFAPI array sort(const array &in, const unsigned dim = 0, const bool isAscending = true);
 
@@ -372,8 +370,6 @@ namespace af
        \param[in] isAscending specifies the sorting order
 
        \ingroup sort_func_sort_index
-
-       \note \p dim is currently restricted to 0.
     */
     AFAPI void  sort(array &out, array &indices, const array &in, const unsigned dim = 0,
                      const bool isAscending = true);
@@ -388,8 +384,6 @@ namespace af
        \param[in] isAscending specifies the sorting order
 
        \ingroup sort_func_sort_keys
-
-       \note \p dim is currently restricted to 0.
     */
     AFAPI void  sort(array &out_keys, array &out_values, const array &keys, const array &values,
                      const unsigned dim = 0, const bool isAscending = true);
@@ -794,8 +788,6 @@ extern "C" {
        \return \ref AF_SUCCESS if the execution completes properly
 
        \ingroup sort_func_sort
-
-       \note \p dim is currently restricted to 0.
     */
     AFAPI af_err af_sort(af_array *out, const af_array in, const unsigned dim, const bool isAscending);
 
@@ -810,8 +802,6 @@ extern "C" {
        \return \ref AF_SUCCESS if the execution completes properly
 
        \ingroup sort_func_sort_index
-
-       \note \p dim is currently restricted to 0.
     */
     AFAPI af_err af_sort_index(af_array *out, af_array *indices, const af_array in,
                                const unsigned dim, const bool isAscending);
@@ -827,8 +817,6 @@ extern "C" {
        \return \ref AF_SUCCESS if the execution completes properly
 
        \ingroup sort_func_sort_keys
-
-       \note \p dim is currently restricted to 0.
     */
     AFAPI af_err af_sort_by_key(af_array *out_keys, af_array *out_values,
                                 const af_array keys, const af_array values,

From f6eae071675f8fe4f2c5672b104a3c1394de66b7 Mon Sep 17 00:00:00 2001
From: Shehzan Mohammed <shehzan@arrayfire.com>
Date: Fri, 22 Apr 2016 15:58:11 -0400
Subject: [PATCH 28/28] Add multi dimension support to median, tests

---
 src/api/c/median.cpp |  13 ++--
 test/median.cpp      | 156 ++++++++++++++++++++++++++++---------------
 2 files changed, 108 insertions(+), 61 deletions(-)

diff --git a/src/api/c/median.cpp b/src/api/c/median.cpp
index 50bcad25ee..b5c033f461 100644
--- a/src/api/c/median.cpp
+++ b/src/api/c/median.cpp
@@ -68,8 +68,8 @@ static af_array median(const af_array& in, const dim_t dim)
     const Array<T> input = getArray<T>(in);
     Array<T> sortedIn   = sort<T, true>(input, dim);
 
-    int nElems    = input.dims()[0];
-    double mid    = (nElems + 1) / 2;
+    int dimLength = input.dims()[dim];
+    double mid    = (dimLength + 1) / 2;
     af_array left = 0;
 
     af_seq slices[4] = {af_span, af_span, af_span, af_span};
@@ -78,7 +78,7 @@ static af_array median(const af_array& in, const dim_t dim)
     af_array sortedIn_handle = getHandle<T>(sortedIn);
     AF_CHECK(af_index(&left, sortedIn_handle, input.ndims(), slices));
 
-    if (nElems % 2 == 1) {
+    if (dimLength % 2 == 1) {
         // mid-1 is our guy
         if (input.isFloating()) return left;
 
@@ -90,7 +90,7 @@ static af_array median(const af_array& in, const dim_t dim)
         return out;
     } else {
         // ((mid-1)+mid)/2 is our guy
-        dim4  dims = input.dims();
+        dim4 dims = input.dims();
         af_array right = 0;
         slices[dim] = af_make_seq(mid, mid, 1.0);
 
@@ -100,7 +100,8 @@ static af_array median(const af_array& in, const dim_t dim)
         af_array carr   = 0;
         af_array result = 0;
 
-        dim4 cdims = dim4(1, dims[1], dims[2], dims[3]);
+        dim4 cdims = dims;
+        cdims[dim] = 1;
         AF_CHECK(af_constant(&carr, 0.5, cdims.ndims(), cdims.get(), input.isDouble() ? f64 : f32));
 
         if (!input.isFloating()) {
@@ -148,7 +149,7 @@ af_err af_median_all(double *realVal, double *imagVal, const af_array in)
 af_err af_median(af_array* out, const af_array in, const dim_t dim)
 {
     try {
-        ARG_ASSERT(2, (dim>=0 && dim<=0));
+        ARG_ASSERT(2, (dim >= 0 && dim <= 4));
 
         af_array output = 0;
         ArrayInfo info = getInfo(in);
diff --git a/test/median.cpp b/test/median.cpp
index e0b21ba281..5b26a44a97 100644
--- a/test/median.cpp
+++ b/test/median.cpp
@@ -37,96 +37,142 @@ af::array generateArray<unsigned int>(int nx, int ny, int nz, int nw)
     return a;
 }
 
-template<typename To, typename Ti, bool flat>
-void median0(int nx, int ny=1, int nz=1, int nw=1)
+template<typename To, typename Ti>
+void median_flat(int nx, int ny=1, int nz=1, int nw=1)
 {
     if (noDoubleTests<Ti>()) return;
     array a = generateArray<Ti>(nx, ny, nz, nw);
-    array sa = sort(a);
 
-    Ti *h_sa = sa.host<Ti>();
+    // Verification
+    array sa = sort(flat(a));
+    dim_t mid = (sa.dims(0) + 1) / 2;
 
-    To *h_b = NULL;
-    To val = 0;
+    To verify;
 
-    if (flat) {
-        val = median<To>(a);
-        h_b = &val;
+    To *h_sa = sa.as((af_dtype)af::dtype_traits<To>::af_type).host<To>();
+    if(sa.dims(0) % 2 == 1) {
+        verify = h_sa[mid - 1];
     } else {
-        array b = median(a);
-        h_b = b.host<To>();
+        verify = (h_sa[mid - 1] + h_sa[mid]) / (To)2;
     }
 
-    for (int w = 0; w < nw; w++) {
-        for (int z = 0; z < nz; z++) {
-            for (int y = 0; y < ny; y++) {
+    // Test Part
+    To val = median<To>(a);
 
-                int off = (y  + ny * (z + nz * w));
-                int id = nx / 2;
+    ASSERT_EQ(verify, val);
 
-                if (nx & 2) {
-                    ASSERT_EQ(h_sa[id + off * nx], h_b[off]);
-                } else {
-                    To left = h_sa[id + off * nx - 1];
-                    To right = h_sa[id + off * nx];
+    delete[] h_sa;
+}
+
+template<typename To, typename Ti, int dim>
+void median_test(int nx, int ny=1, int nz=1, int nw=1)
+{
+    if (noDoubleTests<Ti>()) return;
+
+    array a = generateArray<Ti>(nx, ny, nz, nw);
+
+    // If selected dim is higher than input ndims, then return
+    if(dim >= a.dims().ndims())
+        return;
+
+    array verify;
+
+    // Verification
+    array sa = sort(a, dim);
+
+    double mid = (a.dims(dim) + 1) / 2;
+    af::seq mSeq[4] = {span, span, span, span};
+    mSeq[dim] = af::seq(mid, mid, 1.0);
 
-                    ASSERT_NEAR((left + right) / 2, h_b[off], 1e-5);
-                }
-            }
-        }
+    if(sa.dims(dim) % 2 == 1) {
+        mSeq[dim] = mSeq[dim] - 1.0;
+        verify = sa(mSeq[0], mSeq[1], mSeq[2], mSeq[3]);
+    } else {
+        dim_t sdim[4] = {0};
+        sdim[dim] = 1;
+        sa = sa.as((af_dtype)af::dtype_traits<To>::af_type);
+        array sas = shift(sa, sdim[0], sdim[1], sdim[2], sdim[3]);
+        verify = ((sa + sas) / 2)(mSeq[0], mSeq[1], mSeq[2], mSeq[3]);
     }
 
-    delete[] h_sa;
-    if (!flat) delete[] h_b;
+    // Test Part
+    array out = median(a, dim);
+
+    ASSERT_EQ(out.dims() == verify.dims(), true);
+    ASSERT_NEAR(0, sum<double>(af::abs(out - verify)), 1e-5);
 }
 
-#define MEDIAN0(To, Ti)                         \
-    TEST(median0, Ti##_1D_even)                 \
+#define MEDIAN_FLAT(To, Ti)                     \
+    TEST(MedianFlat, Ti##_flat_even)            \
+    {                                           \
+        median_flat<To, Ti>(1000);              \
+    }                                           \
+    TEST(MedianFlat, Ti##_flat_odd)             \
     {                                           \
-        median0<To, Ti, false>(1000);           \
+        median_flat<To, Ti>(783);               \
     }                                           \
-    TEST(median0, Ti##_2D_even)                 \
+    TEST(MedianFlat, Ti##_flat_multi_even)      \
     {                                           \
-        median0<To, Ti, false>(1000, 100);      \
+        median_flat<To, Ti>(24, 11, 3);         \
     }                                           \
-    TEST(median0, Ti##_3D_even)                 \
+    TEST(MedianFlat, Ti##_flat_multi_odd)       \
     {                                           \
-        median0<To, Ti, false>(1000, 25, 4);    \
+        median_flat<To, Ti>(15, 21, 7);         \
     }                                           \
-    TEST(median0, Ti##_4D_even)                 \
+
+MEDIAN_FLAT(float, float)
+MEDIAN_FLAT(float, int)
+MEDIAN_FLAT(float, uint)
+MEDIAN_FLAT(float, uchar)
+MEDIAN_FLAT(float, short)
+MEDIAN_FLAT(float, ushort)
+MEDIAN_FLAT(double, double)
+
+#define MEDIAN_TEST(To, Ti, dim)                \
+    TEST(Median, Ti##_1D_##dim##_even)          \
     {                                           \
-        median0<To, Ti, false>(1000, 25, 2, 2); \
+        median_test<To, Ti, dim>(1000);         \
     }                                           \
-    TEST(median0, Ti##_flat_even)               \
+    TEST(Median, Ti##_2D_##dim##_even)          \
     {                                           \
-        median0<To, Ti, true>(1000);            \
+        median_test<To, Ti, dim>(1000, 25);     \
     }                                           \
-    TEST(median0, Ti##_1D_odd)                  \
+    TEST(Median, Ti##_3D_##dim##_even)          \
     {                                           \
-        median0<To, Ti, false>(783);            \
+        median_test<To, Ti, dim>(100, 25, 4);   \
     }                                           \
-    TEST(median0, Ti##_2D_odd)                  \
+    TEST(Median, Ti##_4D_##dim##_even)          \
     {                                           \
-        median0<To, Ti, false>(783, 100);       \
+        median_test<To, Ti, dim>(100, 25, 2, 2);\
     }                                           \
-    TEST(median0, Ti##_3D_odd)                  \
+    TEST(Median, Ti##_1D_##dim##_odd)           \
     {                                           \
-        median0<To, Ti, false>(783, 25, 4);     \
+        median_test<To, Ti, dim>(783);          \
     }                                           \
-    TEST(median0, Ti##_4D_odd)                  \
+    TEST(Median, Ti##_2D_##dim##_odd)           \
     {                                           \
-        median0<To, Ti, false>(783, 25, 2, 2);  \
+        median_test<To, Ti, dim>(783, 25);      \
     }                                           \
-    TEST(median0, Ti##_flat_odd)                \
+    TEST(Median, Ti##_3D_##dim##_odd)           \
     {                                           \
-        median0<To, Ti, true>(783);             \
+        median_test<To, Ti, dim>(123, 25, 3);   \
     }                                           \
+    TEST(Median, Ti##_4D_##dim##_odd)           \
+    {                                           \
+        median_test<To, Ti, dim>(123, 25, 3, 3);\
+    }                                           \
+
 
+#define MEDIAN(To, Ti)      \
+    MEDIAN_TEST(To, Ti, 0)  \
+    MEDIAN_TEST(To, Ti, 1)  \
+    MEDIAN_TEST(To, Ti, 2)  \
+    MEDIAN_TEST(To, Ti, 3)  \
 
-MEDIAN0(float, float)
-MEDIAN0(float, int)
-MEDIAN0(float, uint)
-MEDIAN0(float, uchar)
-MEDIAN0(float, short)
-MEDIAN0(float, ushort)
-MEDIAN0(double, double)
+MEDIAN(float, float)
+MEDIAN(float, int)
+MEDIAN(float, uint)
+MEDIAN(float, uchar)
+MEDIAN(float, short)
+MEDIAN(float, ushort)
+MEDIAN(double, double)