From fc4c5a71053b837ed6143659a6e7b45792ed9e51 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Tue, 17 Nov 2015 21:58:05 +0800
Subject: added atomicAdd for select_linear update, however, the result still
 seems unreproducable, I changed select_linear layer update back to
 line-by-line

---
 nerv/examples/lmptb/lmptb/layer/select_linear.lua | 13 ++++++-------
 nerv/examples/lmptb/tnn_ptb_main.lua              |  2 +-
 nerv/layer/affine.lua                             |  2 +-
 nerv/lib/matrix/cukernel.cu                       | 18 +++++++++++++++++-
 nerv/lib/matrix/generic/cukernel.cu               |  2 +-
 nerv/matrix/init.lua                              | 12 ++++++++++++
 6 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/nerv/examples/lmptb/lmptb/layer/select_linear.lua b/nerv/examples/lmptb/lmptb/layer/select_linear.lua
index 672b7e2..a0ae207 100644
--- a/nerv/examples/lmptb/lmptb/layer/select_linear.lua
+++ b/nerv/examples/lmptb/lmptb/layer/select_linear.lua
@@ -30,13 +30,12 @@ function SL:init(batch_size)
 end
 
 function SL:update(bp_err, input, output)
-    --for i = 1, input[1]:ncol(), 1 do
-    --    if (input[1][0][i - 1] ~= 0) then
-    --        local word_vec = self.ltp.trans[input[1][0][i - 1]]
-            --word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size)
-    --    end
-    --end 
-    self.ltp.trans:update_select_rows(bp_err[1], input[1]:trans(), - self.gconf.lrate / self.gconf.batch_size, 0)
+    for i = 1, input[1]:ncol(), 1 do
+        local word_vec = self.ltp.trans[input[1][0][i - 1]]
+        word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size)
+    end 
+    --I tried the update_select_rows kernel which uses atomicAdd, but it generates unreproducable result
+    --self.ltp.trans:update_select_rows(bp_err[1], input[1]:trans(), - self.gconf.lrate / self.gconf.batch_size, 0)
     self.ltp.trans:add(self.ltp.trans, self.ltp.trans, 1.0, - self.gconf.lrate * self.gconf.wcost / self.gconf.batch_size)
 end
 
diff --git a/nerv/examples/lmptb/tnn_ptb_main.lua b/nerv/examples/lmptb/tnn_ptb_main.lua
index c37b217..50286c9 100644
--- a/nerv/examples/lmptb/tnn_ptb_main.lua
+++ b/nerv/examples/lmptb/tnn_ptb_main.lua
@@ -228,7 +228,7 @@ vocab_fn = '/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/m-tests/s
 global_conf = {
     lrate = 1, wcost = 1e-5, momentum = 0,
     cumat_type = nerv.CuMatrixFloat,
-    mmat_type = nerv.CuMatrixFloat,
+    mmat_type = nerv.MMatrixFloat,
     nn_act_default = 0, 
 
     hidden_size = 20,
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
index 3ba9408..6a541e8 100644
--- a/nerv/layer/affine.lua
+++ b/nerv/layer/affine.lua
@@ -5,7 +5,7 @@ local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer')
 
 function MatrixParam:read(handle)
     self.trans = self.gconf.cumat_type.new_from_host(
-                    nerv.MMatrixFloat.load(handle))
+                    self.gconf.mmat_type.load(handle))
 end
 
 function MatrixParam:write(handle)
diff --git a/nerv/lib/matrix/cukernel.cu b/nerv/lib/matrix/cukernel.cu
index 6fb78f0..1e856b9 100644
--- a/nerv/lib/matrix/cukernel.cu
+++ b/nerv/lib/matrix/cukernel.cu
@@ -2,7 +2,8 @@
 
 #include "cumatrix.h"
 
-__device__ double atomicAdd(double* address, double val) { 
+__device__ double atomicAdd_nvidia(double* address, double val) { 
+    //nvidia provided this implementation on the net
     //atmoicAdd is not included in CUDA for double
     unsigned long long int* address_as_ull =
                                           (unsigned long long int*)address;
@@ -16,6 +17,21 @@ __device__ double atomicAdd(double* address, double val) {
     return __longlong_as_double(old);
 }
 
+__device__ float atomicAdd_nvidia(float* address, float val) { 
+    //nvidia provided this implementation on the net
+    //I tried the included atomocAdd, but the select_liner layer result seems unreproduceable, but sadly, even if I used this implementation, the select_linear layer result is still unreproduceable
+    int* address_as_ull = (int*)address;
+    int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, 
+                        __float_as_int(val + 
+                        __int_as_float(assumed)));
+    } while (assumed != old);
+    return __int_as_float(old);
+}
+
+
 #define cudak_(NAME) cudak_float_ ## NAME
 #define MATRIX_USE_FLOAT
 #include "generic/elem_type.h"
diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index 8885b41..f996fdd 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -232,7 +232,7 @@ __global__ void cudak_(update_select_rows)(MATRIX_ELEM *c, const MATRIX_ELEM *a,
     if (i >= nrow_a || j >= ncol_a) return;
     int i_c = lrintf(idx[i]);
     //c[j + i_c * stride_c] = c[j + i_c * stride_c] * (1 - beta * alpha) + a[j + i * stride_a] * alpha;
-    atomicAdd(c + j + i_c * stride_c, c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha);
+    atomicAdd_nvidia(&c[j + i_c * stride_c], c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha);
 }
 
 __global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua
index 1091d7e..14b83d4 100644
--- a/nerv/matrix/init.lua
+++ b/nerv/matrix/init.lua
@@ -80,3 +80,15 @@ function nerv.CuMatrixFloat:new_to_host()
     self:copy_toh(res)
     return res
 end
+
+function nerv.CuMatrixDouble.new_from_host(mat)
+    local res = nerv.CuMatrixDouble(mat:nrow(), mat:ncol())
+    res:copy_fromh(mat)
+    return res
+end
+
+function nerv.CuMatrixDouble:new_to_host()
+    local res = nerv.MMatrixDouble(self:nrow(), self:ncol())
+    self:copy_toh(res)
+    return res
+end
-- 
cgit v1.2.3-70-g09d2