aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoruphantom <dengwei.2011@gmail.com>2015-08-28 17:41:14 +0800
committeruphantom <dengwei.2011@gmail.com>2015-08-28 17:41:14 +0800
commita68d3c982ed0dd4ef5bbc9e0c22b9ecf9565b924 (patch)
treebc59ef1a69b32276cc97454fbc3c881fc8c518cc
parent1a9f63e351582f54fec7817927168cb1dbb0c1d6 (diff)
fastnn version 1.0
-rw-r--r--Makefile7
-rw-r--r--fastnn/Makefile58
-rw-r--r--fastnn/device/Device.cpp496
-rw-r--r--fastnn/device/Device.h51
-rw-r--r--fastnn/device/device.c178
-rw-r--r--fastnn/device/device.lua6
-rw-r--r--fastnn/example/asgd_data_trainer.lua405
-rw-r--r--fastnn/example/asgd_sds_trainer.lua328
-rw-r--r--fastnn/fastnn-scm-1.rockspec36
-rw-r--r--fastnn/fastnn_baseline.lua236
-rw-r--r--fastnn/init.c43
-rw-r--r--fastnn/init.lua11
-rw-r--r--fastnn/io/Example.cpp186
-rw-r--r--fastnn/io/Example.h49
-rw-r--r--fastnn/io/example.c249
-rw-r--r--fastnn/io/example.lua38
-rw-r--r--fastnn/lib/ModelSync.h119
-rw-r--r--fastnn/lib/modelsync.c532
-rw-r--r--fastnn/lib/modelsync.lua107
-rw-r--r--fastnn/test.lua57
-rw-r--r--fastnn/threads/Makefile42
-rw-r--r--fastnn/threads/init.lua14
-rw-r--r--fastnn/threads/lib/THThread.c349
-rw-r--r--fastnn/threads/lib/THThread.h36
-rw-r--r--fastnn/threads/lib/init.c27
-rw-r--r--fastnn/threads/lib/luaTHRD.h84
-rw-r--r--fastnn/threads/lib/threads.c355
-rw-r--r--fastnn/threads/test/test-low-level.lua39
-rw-r--r--fastnn/threads/test/test-threads-async.lua66
-rw-r--r--fastnn/threads/test/test-threads-multiple.lua15
-rw-r--r--fastnn/threads/test/test-threads-shared.lua111
-rw-r--r--fastnn/threads/test/test-threads.lua20
-rw-r--r--fastnn/threads/threads-scm-1.rockspec36
-rw-r--r--fastnn/threads/threads.lua14
-rw-r--r--nerv/init.lua6
-rw-r--r--nerv/layer/affine.lua37
-rw-r--r--nerv/layer/combiner.lua6
-rw-r--r--nerv/layer/sigmoid.lua8
-rw-r--r--nerv/layer/softmax.lua9
-rw-r--r--nerv/layer/softmax_ce.lua8
-rw-r--r--nerv/lib/common.c1
-rw-r--r--nerv/lib/common.h2
-rw-r--r--nerv/lib/io/chunk_file.c4
-rw-r--r--nerv/lib/matrix/cukernel.h2
-rw-r--r--nerv/lib/matrix/cumatrix.c8
-rw-r--r--nerv/lib/matrix/cumatrix.h14
-rw-r--r--nerv/lib/matrix/generic/cukernel.cu12
-rw-r--r--nerv/lib/matrix/generic/cumatrix.c31
-rw-r--r--nerv/lib/matrix/generic/cumatrix.h8
-rw-r--r--nerv/lib/matrix/generic/matrix.c30
-rw-r--r--nerv/lib/matrix/generic/mmatrix.c44
-rw-r--r--nerv/lib/matrix/generic/mmatrix.h6
-rw-r--r--nerv/lib/matrix/matrix.h1
-rw-r--r--nerv/lib/matrix/mmatrix.c18
-rw-r--r--nerv/matrix/cumatrix.c1
-rw-r--r--nerv/matrix/generic/cumatrix.c27
-rw-r--r--nerv/matrix/generic/matrix.c7
-rw-r--r--nerv/matrix/generic/mmatrix.c34
-rw-r--r--nerv/matrix/init.lua6
-rw-r--r--nerv/nn/layer_dag.lua23
60 files changed, 4696 insertions, 57 deletions
diff --git a/Makefile b/Makefile
index 664a83b..4595be5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
-.PHONY: all clean install luajit luarocks speech
+.PHONY: all clean install luajit luarocks speech fastnn
SHELL := /bin/bash
PREFIX := $(CURDIR)/install/
-all: luajit luarocks install
+all: luajit luarocks install fastnn
luajit:
PREFIX=$(PREFIX) ./tools/build_luajit.sh
luarocks:
@@ -11,5 +11,8 @@ install:
speech:
cd speech/speech_utils; $(PREFIX)/bin/luarocks make
cd speech/htk_io; $(PREFIX)/bin/luarocks make
+fastnn:
+ cd fastnn/threads; $(PREFIX)/bin/luarocks make
+ cd fastnn; $(PREFIX)/bin/luarocks make
clean:
cd nerv && make clean
diff --git a/fastnn/Makefile b/fastnn/Makefile
new file mode 100644
index 0000000..1f1ba1e
--- /dev/null
+++ b/fastnn/Makefile
@@ -0,0 +1,58 @@
+.PHONY: build install clean
+SHELL := /bin/bash
+BUILD_DIR := $(CURDIR)/build
+LUA_DIR = $(INST_LUADIR)/fastnn
+CUDA_BASE := /usr/local/cuda-6.5
+
+INC_PATH := -I $(LUA_BINDIR)/../include/nerv
+CUDA_INCLUDE := -I $(CUDA_BASE)/include/
+LIB_PATH := $(LUA_BINDIR)/../lib
+
+OBJ_DIR := $(BUILD_DIR)/objs
+SUBDIR := lib io threads device
+
+
+OBJS := init.o lib/modelsync.o lib/ModelSync.o io/example.o io/Example.o device/device.o device/Device.o
+LIBS := $(INST_LIBDIR)/libfastnn.so
+LUA_LIBS := init.lua lib/modelsync.lua io/example.lua device/device.lua
+INCLUDE := -I $(LUA_INCDIR) $(INC_PATH) $(CUDA_INCLUDE) -DLUA_USE_APICHECK -DUSE_PTHREAD_THREADS
+LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas
+
+
+OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS))
+OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR))
+LUA_SUBDIR := $(addprefix $(LUA_DIR)/,$(SUBDIR))
+LUA_LIBS := $(addprefix $(LUA_DIR)/,$(LUA_LIBS))
+
+CFLAGS := -Wall -Wextra -g -O0
+
+build: $(OBJ_DIR) $(OBJ_SUBDIR) $(OBJS)
+install: $(LIBS) $(LUA_DIR) $(LUA_SUBDIR) $(LUA_LIBS)
+
+#$(SUBMODULE): ECHO
+# cd threads; $(LUA_BINDIR)/luarocks make
+
+$(OBJ_DIR) $(LUA_DIR) $(OBJ_SUBDIR) $(LUA_SUBDIR):
+ -mkdir -p $@
+
+$(LUA_DIR)/%.lua: %.lua
+ cp $< $@
+
+$(OBJ_DIR)/io/Example.o: io/Example.cpp
+ gcc -c -o $@ $< $(INCLUDE) -fPIC $(CFLAGS)
+$(OBJ_DIR)/device/Device.o: device/Device.cpp
+ gcc -c -o $@ $< $(INCLUDE) -fPIC $(CFLAGS)
+
+$(OBJ_DIR)/%.o: %.c $(patsubst /%.o,/%.c,$@)
+ gcc -c -o $@ $< $(INCLUDE) -fPIC $(CFLAGS)
+
+$(LIBS): $(OBJS)
+ gcc -shared -o $@ $^ $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lthreads -lnervcore -lluaT -lpthread -lstdc++
+
+ECHO:
+ @echo $(SUBMODULE)
+
+clean:
+ -rm -rf $(OBJ_DIR)
+
+
diff --git a/fastnn/device/Device.cpp b/fastnn/device/Device.cpp
new file mode 100644
index 0000000..3b69086
--- /dev/null
+++ b/fastnn/device/Device.cpp
@@ -0,0 +1,496 @@
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include "../../nerv/lib/matrix/cuda_helper.h"
+#include <dlfcn.h>
+
+
+extern "C" {
+
+
+#include "Device.h"
+#include "stdlib.h"
+#include "string.h"
+
+
+struct GPUInfo
+{
+ int gpuid;
+ float d2h_bandwidth;
+ float h2d_bandwidth;
+ size_t mem_free;
+ size_t mem_total;
+ float mem_ratio;
+ bool used;
+};
+
+struct MPIGPUInfo
+{
+ char hostname[STRLEN];
+ int gpuid;
+ int myid;
+};
+
+struct Device
+{
+ std::vector<GPUInfo> gpuinfo_;
+ int refcount;
+};
+
+///////////////////////////////////////////
+
+extern cublasHandle_t* nerv_get_cublas_handle();
+
+CuContext* CuContext_new()
+{
+ CuContext *context = new CuContext;
+ cublasCreate(&context->cublas_handle);
+ cudaEventCreate(&context->profile_start);
+ cudaEventCreate(&context->profile_stop);
+ context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
+ context->refcount = 1;
+ context->pid = pthread_self();
+ return context;
+}
+
+CuContext* CuContext_newWithId(long id)
+{
+ CuContext *context = (CuContext*)id;
+ __sync_fetch_and_add(&context->refcount, 1);
+ return context;
+}
+
+long CuContext_id(CuContext *context)
+{
+ return (long)context;
+}
+
+void CuContext_destroy(CuContext *context)
+{
+ if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1)
+ {
+ cublasDestroy(context->cublas_handle);
+ hashmap_clear(context->profile);
+ delete context;
+ context = NULL;
+ }
+}
+
+Device* Device_new()
+{
+ Device* device = new Device;
+ device->refcount = 1;
+ return device;
+}
+
+Device* Device_newWithId(long id)
+{
+ Device *device = (Device*)id;
+ __sync_fetch_and_add(&device->refcount, 1);
+ return device;
+}
+
+long Device_id(Device *device)
+{
+ return (long)device;
+}
+
+void Device_destroy(Device *device)
+{
+ if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1)
+ {
+ delete device;
+ device = NULL;
+ }
+}
+
+void GetFreeMemory(size_t* free, size_t* total, Status *status)
+{
+ // WARNING! the CUDA API is inconsistent accross versions!
+ #if (CUDA_VERSION >= 3020)
+ //define the function signature type
+ size_t mem_free, mem_total;
+#else
+ unsigned int mem_free, mem_total;
+#endif
+ {
+ //we will load the cuMemGetInfo dynamically from libcuda.so
+ //cuMemGetInfo(&mem_free, &mem_total);
+ //pre-fill ``safe'' values that will not cause problems
+ mem_free = 1; mem_total = 1;
+ //open libcuda.so
+ void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
+ if(NULL == libcuda)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
+ }
+ else
+ {
+ //define the function signature type
+ //and get the symbol
+#if (CUDA_VERSION >= 3020)
+ typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
+ cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2");
+#else
+ typedef CUresult (*cu_fun_ptr)(int*, int*);
+ cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo");
+#endif
+ if(NULL == dl_cuMemGetInfo)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so");
+ }
+ else
+ {
+ //call the function
+ dl_cuMemGetInfo(&mem_free, &mem_total);
+ }
+ //close the library
+ dlclose(libcuda);
+ }
+ }
+ // copy the output values outside
+ if(NULL != free) *free = mem_free;
+ if(NULL != total) *total = mem_total;
+}
+
+void DeviceGetName(char* name, int len, int dev, Status *status)
+{
+ //prefill with something reasonable
+ strncpy(name,"Unknown GPU",len);
+ //open libcuda.so
+ void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
+ if(NULL == libcuda)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
+ }
+ else
+ {
+ //define the function signature type
+ typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
+ //get the symbol
+ cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName");
+ if(NULL == cuDeviceGetName_ptr) {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so");
+ }
+ else {
+ //call the function
+ cuDeviceGetName_ptr(name, len, dev);
+ }
+ //close the library
+ dlclose(libcuda);
+ }
+}
+
+void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status)
+{
+ int idx = gpu_idx;
+ int memSize = 64*1024*1024;
+ float elapsedTimeInMs = 0.0f;
+ float bandwidthInMBs = 0.0f;
+ unsigned char *h_idata = NULL;
+ unsigned char *h_odata = NULL;
+ cudaEvent_t start, stop;
+ bool PINNED = true;
+ int MEMCOPY_ITERATIONS = 5;
+
+ CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status);
+ CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status);
+
+ //allocate host memory
+ if (PINNED)
+ {
+ #if CUDART_VERSION >= 2020
+ CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status);
+ CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status);
+ #else
+ CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status);
+ CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status);
+ #endif
+ }
+ else
+ {
+ //pageable memory mode - use malloc
+ h_odata = (unsigned char *)malloc(memSize);
+
+ if (h_odata == 0)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!");
+ }
+ }
+
+ //initialize the memory
+ for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
+ {
+ h_idata[i] = (unsigned char)(i & 0xff);
+ }
+
+ // allocate device memory
+ unsigned char *d_idata;
+ CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status);
+
+ //initialize the device memory
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status);
+
+ //copy data from GPU to Host
+
+ CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
+
+ if (PINNED)
+ {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status);
+ }
+ }
+ else
+ {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status);
+ }
+ }
+
+ CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
+
+ // make sure GPU has finished copying
+ CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
+ //get the the total elapsed time in ms
+
+ CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);
+
+
+ //calculate bandwidth in MB/s
+ bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
+ *d2h = bandwidthInMBs;
+
+ /////////////////////////////////////////////////////
+ //copy data from Host to GPU
+ CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
+
+ if (PINNED)
+ {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status);
+ }
+ }
+ else
+ {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status);
+ }
+ }
+
+ CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
+
+ // make sure GPU has finished copying
+ CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
+ //get the the total elapsed time in ms
+
+ CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);
+
+
+ //calculate bandwidth in MB/s
+ bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
+ *h2d = bandwidthInMBs;
+
+ //clean up memory
+ CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status);
+ CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status);
+
+ if (PINNED)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status);
+ CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status);
+ }
+ else
+ {
+ free(h_idata);
+ free(h_odata);
+ }
+
+ CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status);
+}
+
+int AutoSelectGPU(Device *device, Status *status)
+{
+ //GPU selection is based on largest proportion of free memory.
+
+ int max_id = 0, n_gpu = 0;
+ std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+
+ cudaGetDeviceCount(&n_gpu);
+
+ if(n_gpu == 0)
+ {
+ NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");
+ return -1;
+ }
+
+ if (n_gpu > 0)
+ {
+ for (int i = 0; i < gpuinfo_.size(); i++)
+ {
+ if (!gpuinfo_[i].used)
+ {
+ max_id = i;
+ break;
+ }
+ }
+ //find GPU with max free memory
+ for (int n = 1; n < gpuinfo_.size(); n++)
+ {
+ if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio)
+ max_id = n;
+ }
+
+
+ std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl;
+
+ std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, "
+ << "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, "
+ << "ratio: "<< gpuinfo_[max_id].mem_ratio << " "
+ << "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, "
+ << "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl;
+
+ cudaSetDevice(max_id);
+ //initialize the CUBLAS
+ //cublasInit();
+ //cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
+ //std::cerr << "cublasHandle_t: " << cublas_handle << std::endl;
+ //cublasCreate(cublas_handle);
+
+ //create the context
+ cudaError_t e;
+ e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+ if(e != cudaSuccess) {
+ std::cerr << "Failed to create CUDA context on a GPU." << std::endl;
+ }
+ }
+
+ gpuinfo_[max_id].used = true;
+ NERV_SET_STATUS(status, NERV_NORMAL, 0);
+ return max_id;
+}
+
+void SelectGPU(Device *device, int gpu_id, Status *status)
+{
+
+ std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+ int n_gpu = 0;
+ cudaGetDeviceCount(&n_gpu);
+ if(gpu_id >= n_gpu)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!");
+ std::cerr << "Cannot select GPU " << gpu_id
+ << ", detected " << n_gpu << " CUDA capable cards!" << std::endl;
+ }
+
+
+ std::cerr << "Selected device: " << gpu_id << " (manually)";
+
+ std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, "
+ << "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, "
+ << "ratio: "<< gpuinfo_[gpu_id].mem_ratio << " "
+ << "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, "
+ << "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl;
+
+ CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status);
+ //initialize the CUBLAS
+ //CUBLAS_SAFE_SYNC_CALL(cublasInit(), status);
+ //cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
+ //cublasCreate(cublas_handle);
+
+ //create the context
+ cudaError_t e;
+ e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+ if(e != cudaSuccess) {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU.");
+ std::cerr << "Failed to create CUDA context on a GPU.";
+ }
+
+ gpuinfo_[gpu_id].used = true;
+
+ NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
+void Initialize(Device *device, Status *status)
+{
+ // Check that we have at least one gpu
+ int n_gpu = 0;
+ cudaGetDeviceCount(&n_gpu);
+ if(n_gpu == 0)
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");
+
+ std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+ gpuinfo_.resize(n_gpu);
+
+ std::cerr << "gpu information ..." << std::endl;
+
+ // Get ratios of memory use, if possible
+ for(int n = 0; n < n_gpu; n++)
+ {
+ int ret = cudaSetDevice(n);
+ switch(ret)
+ {
+ case cudaSuccess : {
+ //create the CUDA context for the thread
+ //cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+ cudaDeviceSynchronize();
+ //get GPU name
+ char name[STRLEN];
+ DeviceGetName(name,STRLEN,n, status);
+ //get GPU memory stats
+ size_t free, total;
+ float d2h, h2d;
+
+ GetFreeMemory(&free, &total, status);
+ GetBandwidth(n, &d2h, &h2d, status);
+
+ gpuinfo_[n].gpuid = n;
+ gpuinfo_[n].d2h_bandwidth = d2h;
+ gpuinfo_[n].h2d_bandwidth = h2d;
+ gpuinfo_[n].mem_free = free;
+ gpuinfo_[n].mem_total = total;
+ gpuinfo_[n].mem_ratio = free/(float)total;
+ gpuinfo_[n].used = false;
+
+ std::cerr << "gpu: " << n << " ==> "
+ << "free: " << free/1024/1024 << "M, "
+ << "total: "<< total/1024/1024 << "M, "
+ << "ratio: "<< free/(float)total << " "
+ << "d2h bandwidth: " << d2h << "MB/s, "
+ << "h2d bandwidth: "<< h2d << "MB/s" << std::endl;
+
+ //destroy the CUDA context for the thread
+ //cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset
+ } break;
+
+ #if (CUDA_VERSION > 3020)
+ case cudaErrorDeviceAlreadyInUse :
+ std::cerr << "cudaSetDevice(" << n << "): "
+ << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl;
+ break;
+ #endif
+ case cudaErrorInvalidDevice :
+ std::cerr << "cudaSetDevice(" << n << "): "
+ << "Device cannot be accessed, not a VALID CUDA device!" << std::endl;
+ break;
+ default :
+ std::cerr << "cudaSetDevice(" << n << "): "
+ << "returned " << ret << ", "
+ << cudaGetErrorString((cudaError_t)ret) << std::endl;
+ }
+ }
+ cudaDeviceReset();
+ printf("Result = PASS\n");
+ NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
+
+
+
+
+} // extern
diff --git a/fastnn/device/Device.h b/fastnn/device/Device.h
new file mode 100644
index 0000000..9a80128
--- /dev/null
+++ b/fastnn/device/Device.h
@@ -0,0 +1,51 @@
+
+#ifndef NERV_FASTNN_EXAMPLE_H
+#define NERV_FASTNN_EXAMPLE_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "matrix/matrix.h"
+#include "stdbool.h"
+#include "../../nerv/lib/matrix/cuda_helper.h"
+#include "../../nerv/lib/common.h"
+#include "../../nerv/lib/matrix/cumatrix.h"
+#define STRLEN 1024
+
+