diff options
-rw-r--r-- | Makefile | 3 | ||||
-rw-r--r-- | matrix/cukernel.cu | 7 |
2 files changed, 6 insertions, 4 deletions
@@ -11,6 +11,7 @@ CFLAGS := -Wall -Wextra OBJ_DIR := build/objs LUA_DIR := build/lua NVCC := $(CUDA_BASE)/bin/nvcc +NVCC_FLAGS := -Xcompiler -fPIC,-Wall,-Wextra OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS)) LIBS := $(addprefix $(OBJ_DIR)/,$(LIBS)) @@ -30,7 +31,7 @@ $(OBJ_DIR)/%.o: %.c $(OBJ_DIR)/matrix/%.o: matrix/%.c gcc -c -o $@ $< $(INCLUDE) -fPIC $(CFLAGS) $(OBJ_DIR)/matrix/cukernel.o: matrix/cukernel.cu - $(NVCC) -c -o $@ $< -Xcompiler -fPIC $(INCLUDE) + $(NVCC) -c -o $@ $< $(INCLUDE) $(NVCC_FLAGS) $(LUA_DIR)/%.lua: %.lua cp $< $@ $(OBJ_DIR)/luaT.o: diff --git a/matrix/cukernel.cu b/matrix/cukernel.cu index dd1ebfc..ee6d871 100644 --- a/matrix/cukernel.cu +++ b/matrix/cukernel.cu @@ -84,6 +84,7 @@ __global__ void block_reduce_max(const float *input, float *output, } extern "C" { +#include "cukernel.h" void cuda_sigmoid(const Matrix *a, Matrix *b) { dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); @@ -106,7 +107,7 @@ extern "C" { a->stride / sizeof(float), stride / sizeof(float), ncol); ncol = blocks_per_row; - assert(ncol <= block.x); + assert((unsigned long)ncol <= block.x); grid.x = 1; block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \ (res, b->data.f, @@ -143,7 +144,7 @@ extern "C" { max->stride / sizeof(float), ncol); ncol = blocks_per_row; - assert(ncol <= block.x); + assert((unsigned long)ncol <= block.x); grid.x = 1; block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \ (res, b->data.f, @@ -165,7 +166,7 @@ extern "C" { a->stride / sizeof(float), stride / sizeof(float), ncol); ncol = blocks_per_row; - assert(ncol <= block.x); + assert((unsigned long)ncol <= block.x); grid.x = 1; block_reduce_max<<<grid, block, block.x * sizeof(float)>>> \ (res, b->data.f, |