1 files changed, 36 insertions, 0 deletions
diff --git a/nerv/lib/matrix/cukernel.cu b/nerv/lib/matrix/cukernel.cu
index a19030a..210e6bf 100644
--- a/nerv/lib/matrix/cukernel.cu
+++ b/nerv/lib/matrix/cukernel.cu
@@ -1,5 +1,41 @@
 #define NERV_GENERIC_CUKERNEL
 
+#include "cumatrix.h"
+
+#ifdef __NERV_FUTURE_CUDA_7
+__device__ double atomicAdd_nvidia(double* address, double val) {
+    /* nvidia provided this implementation
+     atmoicAdd is not included in CUDA for double */
+    unsigned long long int* address_as_ull =
+                                          (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val +
+                        __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+
+__device__ float atomicAdd_nvidia(float* address, float val) {
+    /* nvidia provided this implementation
+     I tried the included atomocAdd, but the select_liner layer result seems
+     unreproduceable, but sadly, even if I used this implementation, the
+     select_linear layer result is still unreproduceable */
+    int* address_as_ull = (int*)address;
+    int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed,
+                        __float_as_int(val +
+                        __int_as_float(assumed)));
+    } while (assumed != old);
+    return __int_as_float(old);
+}
+#endif
+
+
 #define cudak_(NAME) cudak_float_ ## NAME
 #define MATRIX_USE_FLOAT
 #include "generic/elem_type.h"