#ifdef NERV_GENERIC_CUKERNEL
#include <assert.h>
#include <stdio.h>
#include "../matrix.h"
#include "cuda.h"
#include "float.h"
#define CUDA_THREADS_N 16
#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N))
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
int nrow, int ncol, int stride) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
long idx;
MATRIX_ELEM tmp;
if (i >= nrow || j >= ncol) return;
idx = j + i * stride;
tmp = a[idx];
if(tmp < FLT_MIN) tmp = FLT_MIN;
b[idx] = log(tmp);
}
__global__ void cudak_(thres_mask)(MATRIX_ELEM *a, MATRIX_ELEM *b, double thres, double low, double high,
int nrow, int ncol, int stride) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
long idx;
if (i >= nrow || j >= ncol) return;
idx = j + i * stride;
if (b[idx] < thres)
a[idx] = low;
else
a[idx] = high;
}
__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b,
MATRIX_ELEM *c,
int nrow, int ncol, int stride) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
long idx;
if (i >= nrow || j >= ncol) return;
idx = j + i * stride;
c[idx] = a[idx] * b[idx];
}
__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
int nrow, int ncol, int stride) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
long idx;
if (i >= nrow || j >= ncol) return;
idx = j + i * stride;
b[idx] = 1.0 / (1.0 + exp(-a[idx]));
}
__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output,
const MATRIX_ELEM *err,
MATRIX_ELEM *nerr,
int nrow, int ncol, int stride) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
long idx;
if (i >= nrow || j >= ncol) return;
idx = j + i * stride;
nerr[idx] = output[idx] * (1.0 - output[idx]) * err[idx];
}
__global__ void cudak_(tanh)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
int nrow, int ncol, int stride) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
long idx;
if (i >= nrow || j >= ncol) return;
idx = j + i * stride;
//b[idx] = (exp(a[idx]) - exp(-a[idx])) / (exp(a[idx]) + exp(-a[idx])); //could cause nan
b[idx] = tanh(a[idx]);
}
__global__ void cudak_(tanh_grad)(const MATRIX_ELEM *output,
const MATRIX_ELEM *err,
MATRIX_ELEM *nerr,
int nrow, int ncol, int stride) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
long idx;
if (i >= nrow || j >= ncol) return;
idx = j + i * stride;
nerr[idx] = (1.0 - output[idx] * output[idx]) * err[idx];
}
__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
const MATRIX_ELEM *max, const MATRIX_ELEM *deno,
int nrow, int ncol, int stride, int mstride) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
long idx;
if (i >= nrow || j >= ncol) return;
idx = j + i *