From fc4c5a71053b837ed6143659a6e7b45792ed9e51 Mon Sep 17 00:00:00 2001 From: txh18 Date: Tue, 17 Nov 2015 21:58:05 +0800 Subject: added atomicAdd for select_linear update, however, the result still seems unreproducable, I changed select_linear layer update back to line-by-line --- nerv/examples/lmptb/lmptb/layer/select_linear.lua | 13 ++++++------- nerv/examples/lmptb/tnn_ptb_main.lua | 2 +- nerv/layer/affine.lua | 2 +- nerv/lib/matrix/cukernel.cu | 18 +++++++++++++++++- nerv/lib/matrix/generic/cukernel.cu | 2 +- nerv/matrix/init.lua | 12 ++++++++++++ 6 files changed, 38 insertions(+), 11 deletions(-) diff --git a/nerv/examples/lmptb/lmptb/layer/select_linear.lua b/nerv/examples/lmptb/lmptb/layer/select_linear.lua index 672b7e2..a0ae207 100644 --- a/nerv/examples/lmptb/lmptb/layer/select_linear.lua +++ b/nerv/examples/lmptb/lmptb/layer/select_linear.lua @@ -30,13 +30,12 @@ function SL:init(batch_size) end function SL:update(bp_err, input, output) - --for i = 1, input[1]:ncol(), 1 do - -- if (input[1][0][i - 1] ~= 0) then - -- local word_vec = self.ltp.trans[input[1][0][i - 1]] - --word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size) - -- end - --end - self.ltp.trans:update_select_rows(bp_err[1], input[1]:trans(), - self.gconf.lrate / self.gconf.batch_size, 0) + for i = 1, input[1]:ncol(), 1 do + local word_vec = self.ltp.trans[input[1][0][i - 1]] + word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size) + end + --I tried the update_select_rows kernel which uses atomicAdd, but it generates unreproducable result + --self.ltp.trans:update_select_rows(bp_err[1], input[1]:trans(), - self.gconf.lrate / self.gconf.batch_size, 0) self.ltp.trans:add(self.ltp.trans, self.ltp.trans, 1.0, - self.gconf.lrate * self.gconf.wcost / self.gconf.batch_size) end diff --git a/nerv/examples/lmptb/tnn_ptb_main.lua b/nerv/examples/lmptb/tnn_ptb_main.lua index c37b217..50286c9 100644 --- a/nerv/examples/lmptb/tnn_ptb_main.lua +++ b/nerv/examples/lmptb/tnn_ptb_main.lua @@ -228,7 +228,7 @@ vocab_fn = '/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/m-tests/s global_conf = { lrate = 1, wcost = 1e-5, momentum = 0, cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, nn_act_default = 0, hidden_size = 20, diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua index 3ba9408..6a541e8 100644 --- a/nerv/layer/affine.lua +++ b/nerv/layer/affine.lua @@ -5,7 +5,7 @@ local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer') function MatrixParam:read(handle) self.trans = self.gconf.cumat_type.new_from_host( - nerv.MMatrixFloat.load(handle)) + self.gconf.mmat_type.load(handle)) end function MatrixParam:write(handle) diff --git a/nerv/lib/matrix/cukernel.cu b/nerv/lib/matrix/cukernel.cu index 6fb78f0..1e856b9 100644 --- a/nerv/lib/matrix/cukernel.cu +++ b/nerv/lib/matrix/cukernel.cu @@ -2,7 +2,8 @@ #include "cumatrix.h" -__device__ double atomicAdd(double* address, double val) { +__device__ double atomicAdd_nvidia(double* address, double val) { + //nvidia provided this implementation on the net //atmoicAdd is not included in CUDA for double unsigned long long int* address_as_ull = (unsigned long long int*)address; @@ -16,6 +17,21 @@ __device__ double atomicAdd(double* address, double val) { return __longlong_as_double(old); } +__device__ float atomicAdd_nvidia(float* address, float val) { + //nvidia provided this implementation on the net + //I tried the included atomocAdd, but the select_liner layer result seems unreproduceable, but sadly, even if I used this implementation, the select_linear layer result is still unreproduceable + int* address_as_ull = (int*)address; + int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(val + + __int_as_float(assumed))); + } while (assumed != old); + return __int_as_float(old); +} + + #define cudak_(NAME) cudak_float_ ## NAME #define MATRIX_USE_FLOAT #include "generic/elem_type.h" diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu index 8885b41..f996fdd 100644 --- a/nerv/lib/matrix/generic/cukernel.cu +++ b/nerv/lib/matrix/generic/cukernel.cu @@ -232,7 +232,7 @@ __global__ void cudak_(update_select_rows)(MATRIX_ELEM *c, const MATRIX_ELEM *a, if (i >= nrow_a || j >= ncol_a) return; int i_c = lrintf(idx[i]); //c[j + i_c * stride_c] = c[j + i_c * stride_c] * (1 - beta * alpha) + a[j + i * stride_a] * alpha; - atomicAdd(c + j + i_c * stride_c, c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha); + atomicAdd_nvidia(&c[j + i_c * stride_c], c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha); } __global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b, diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua index 1091d7e..14b83d4 100644 --- a/nerv/matrix/init.lua +++ b/nerv/matrix/init.lua @@ -80,3 +80,15 @@ function nerv.CuMatrixFloat:new_to_host() self:copy_toh(res) return res end + +function nerv.CuMatrixDouble.new_from_host(mat) + local res = nerv.CuMatrixDouble(mat:nrow(), mat:ncol()) + res:copy_fromh(mat) + return res +end + +function nerv.CuMatrixDouble:new_to_host() + local res = nerv.MMatrixDouble(self:nrow(), self:ncol()) + self:copy_toh(res) + return res +end -- cgit v1.2.3