-rw-r--r--  nerv/examples/lmptb/lmptb/layer/select_linear.lua  13
-rw-r--r--  nerv/examples/lmptb/tnn_ptb_main.lua                2
-rw-r--r--  nerv/layer/affine.lua                               2
-rw-r--r--  nerv/lib/matrix/cukernel.cu                        18
-rw-r--r--  nerv/lib/matrix/generic/cukernel.cu                 2
-rw-r--r--  nerv/matrix/init.lua                               12
6 files changed, 38 insertions, 11 deletions
diff --git a/nerv/examples/lmptb/lmptb/layer/select_linear.lua b/nerv/examples/lmptb/lmptb/layer/select_linear.lua
index 672b7e2..a0ae207 100644
--- a/nerv/examples/lmptb/lmptb/layer/select_linear.lua
+++ b/nerv/examples/lmptb/lmptb/layer/select_linear.lua
@@ -30,13 +30,12 @@ function SL:init(batch_size)
end
function SL:update(bp_err, input, output)
- --for i = 1, input[1]:ncol(), 1 do
- -- if (input[1][0][i - 1] ~= 0) then
- -- local word_vec = self.ltp.trans[input[1][0][i - 1]]
- --word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size)
- -- end
- --end
- self.ltp.trans:update_select_rows(bp_err[1], input[1]:trans(), - self.gconf.lrate / self.gconf.batch_size, 0)
+ for i = 1, input[1]:ncol(), 1 do
+ local word_vec = self.ltp.trans[input[1][0][i - 1]]
+ word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size)
+ end
+    --I tried the update_select_rows kernel, which uses atomicAdd, but it produces unreproducible results
+ --self.ltp.trans:update_select_rows(bp_err[1], input[1]:trans(), - self.gconf.lrate / self.gconf.batch_size, 0)
self.ltp.trans:add(self.ltp.trans, self.ltp.trans, 1.0, - self.gconf.lrate * self.gconf.wcost / self.gconf.batch_size)
end
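A quick way to confirm that the loop-based update above is reproducible is to run it twice from the same starting point and compare. A minimal Lua sketch, assuming an already-constructed SL layer `sl` and fixed `bp_err`/`input`/`output` batches (hypothetical names), plus a `copy_fromh` on CuMatrixFloat analogous to the CuMatrixDouble helpers added in nerv/matrix/init.lua below:

local saved = sl.ltp.trans:new_to_host()   -- snapshot the embedding table on the host
for run = 1, 2 do
    sl.ltp.trans:copy_fromh(saved)         -- restore the same starting point
    sl:update(bp_err, input, output)
    local h = sl.ltp.trans:new_to_host()
    print(run, h[0][0])                    -- the loop-based update should print identical values
end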
diff --git a/nerv/examples/lmptb/tnn_ptb_main.lua b/nerv/examples/lmptb/tnn_ptb_main.lua
index c37b217..50286c9 100644
--- a/nerv/examples/lmptb/tnn_ptb_main.lua
+++ b/nerv/examples/lmptb/tnn_ptb_main.lua
@@ -228,7 +228,7 @@ vocab_fn = '/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/m-tests/s
global_conf = {
lrate = 1, wcost = 1e-5, momentum = 0,
cumat_type = nerv.CuMatrixFloat,
- mmat_type = nerv.CuMatrixFloat,
+ mmat_type = nerv.MMatrixFloat,
nn_act_default = 0,
hidden_size = 20,
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
index 3ba9408..6a541e8 100644
--- a/nerv/layer/affine.lua
+++ b/nerv/layer/affine.lua
@@ -5,7 +5,7 @@ local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer')
function MatrixParam:read(handle)
self.trans = self.gconf.cumat_type.new_from_host(
- nerv.MMatrixFloat.load(handle))
+ self.gconf.mmat_type.load(handle))
end
function MatrixParam:write(handle)
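The mmat_type fix in tnn_ptb_main.lua and the change above belong together: gconf.mmat_type must name a host (main-memory) matrix class, because new_from_host copies from host memory to the GPU. A short Lua sketch of the read path, with `handle` standing in for an already-opened chunk handle:

local gconf = {
    cumat_type = nerv.CuMatrixFloat,
    mmat_type  = nerv.MMatrixFloat,  -- a CuMatrix class here would break the load/new_from_host pair
}
-- what MatrixParam:read now does, spelled out:
local host_mat = gconf.mmat_type.load(handle)              -- load the parameter into host memory
local dev_mat  = gconf.cumat_type.new_from_host(host_mat)  -- copy it onto the device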
diff --git a/nerv/lib/matrix/cukernel.cu b/nerv/lib/matrix/cukernel.cu
index 6fb78f0..1e856b9 100644
--- a/nerv/lib/matrix/cukernel.cu
+++ b/nerv/lib/matrix/cukernel.cu
@@ -2,7 +2,8 @@
#include "cumatrix.h"
-__device__ double atomicAdd(double* address, double val) {
+__device__ double atomicAdd_nvidia(double* address, double val) {
+    //NVIDIA provides this implementation online
//atomicAdd is not included in CUDA for double
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
@@ -16,6 +17,21 @@ __device__ double atomicAdd(double* address, double val) {
return __longlong_as_double(old);
}
+__device__ float atomicAdd_nvidia(float* address, float val) {
+    //NVIDIA provides this implementation online
+    //I tried the built-in atomicAdd, but the select_linear layer result was not reproducible; sadly, even with this implementation, the select_linear layer result is still not reproducible
+ int* address_as_ull = (int*)address;
+ int old = *address_as_ull, assumed;
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ull, assumed,
+ __float_as_int(val +
+ __int_as_float(assumed)));
+ } while (assumed != old);
+ return __int_as_float(old);
+}
+
+
#define cudak_(NAME) cudak_float_ ## NAME
#define MATRIX_USE_FLOAT
#include "generic/elem_type.h"
diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index 8885b41..f996fdd 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -232,7 +232,7 @@ __global__ void cudak_(update_select_rows)(MATRIX_ELEM *c, const MATRIX_ELEM *a,
if (i >= nrow_a || j >= ncol_a) return;
int i_c = lrintf(idx[i]);
//c[j + i_c * stride_c] = c[j + i_c * stride_c] * (1 - beta * alpha) + a[j + i * stride_a] * alpha;
- atomicAdd(c + j + i_c * stride_c, c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha);
+ atomicAdd_nvidia(&c[j + i_c * stride_c], c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha);
}
__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
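Why even the atomic version of update_select_rows above is not reproducible: several rows of `a` can map to the same target row `i_c`, atomicAdd_nvidia applies their contributions in whatever order the threads happen to be scheduled, and floating-point addition is not associative, so different orders give slightly different sums. A one-line Lua illustration of the order dependence:

print((0.1 + 0.2) + 0.3 == 0.1 + (0.2 + 0.3))  -- false: the same terms summed in two orders differ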
diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua
index 1091d7e..14b83d4 100644
--- a/nerv/matrix/init.lua
+++ b/nerv/matrix/init.lua
@@ -80,3 +80,15 @@ function nerv.CuMatrixFloat:new_to_host()
self:copy_toh(res)
return res
end
+
+function nerv.CuMatrixDouble.new_from_host(mat)
+ local res = nerv.CuMatrixDouble(mat:nrow(), mat:ncol())
+ res:copy_fromh(mat)
+ return res
+end
+
+function nerv.CuMatrixDouble:new_to_host()
+ local res = nerv.MMatrixDouble(self:nrow(), self:ncol())
+ self:copy_toh(res)
+ return res
+end
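A usage sketch for the new double-precision transfer helpers; the 3-by-4 shape is only for illustration:

local hm = nerv.MMatrixDouble(3, 4)               -- host matrix
-- ... fill hm ...
local dm = nerv.CuMatrixDouble.new_from_host(hm)  -- host -> device copy
local back = dm:new_to_host()                     -- device -> host copy, same shape and values as hm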