aboutsummaryrefslogtreecommitdiff
path: root/nerv/lib/matrix/cukernel.cu
diff options
context:
space:
mode:
Diffstat (limited to 'nerv/lib/matrix/cukernel.cu')
-rw-r--r--nerv/lib/matrix/cukernel.cu18
1 files changed, 17 insertions, 1 deletions
diff --git a/nerv/lib/matrix/cukernel.cu b/nerv/lib/matrix/cukernel.cu
index 6fb78f0..1e856b9 100644
--- a/nerv/lib/matrix/cukernel.cu
+++ b/nerv/lib/matrix/cukernel.cu
@@ -2,7 +2,8 @@
#include "cumatrix.h"
-__device__ double atomicAdd(double* address, double val) {
+__device__ double atomicAdd_nvidia(double* address, double val) {
+ //nvidia provided this implementation on the net
//atomicAdd is not included in CUDA for double
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
@@ -16,6 +17,21 @@ __device__ double atomicAdd(double* address, double val) {
return __longlong_as_double(old);
}
+__device__ float atomicAdd_nvidia(float* address, float val) {
+ //nvidia provided this implementation on the net
+ //I tried the built-in atomicAdd, but the select_linear layer result seemed unreproducible; sadly, even with this implementation, the select_linear layer result is still unreproducible
+ int* address_as_ull = (int*)address;
+ int old = *address_as_ull, assumed;
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ull, assumed,
+ __float_as_int(val +
+ __int_as_float(assumed)));
+ } while (assumed != old);
+ return __int_as_float(old);
+}
+
+
#define cudak_(NAME) cudak_float_ ## NAME
#define MATRIX_USE_FLOAT
#include "generic/elem_type.h"