From a68d3c982ed0dd4ef5bbc9e0c22b9ecf9565b924 Mon Sep 17 00:00:00 2001 From: uphantom Date: Fri, 28 Aug 2015 17:41:14 +0800 Subject: fastnn version 1.0 --- fastnn/device/Device.cpp | 496 +++++++++++++++++++++++++++++++++++++++++++++++ fastnn/device/Device.h | 51 +++++ fastnn/device/device.c | 178 +++++++++++++++++ fastnn/device/device.lua | 6 + 4 files changed, 731 insertions(+) create mode 100644 fastnn/device/Device.cpp create mode 100644 fastnn/device/Device.h create mode 100644 fastnn/device/device.c create mode 100644 fastnn/device/device.lua (limited to 'fastnn/device') diff --git a/fastnn/device/Device.cpp b/fastnn/device/Device.cpp new file mode 100644 index 0000000..3b69086 --- /dev/null +++ b/fastnn/device/Device.cpp @@ -0,0 +1,496 @@ + +#include +#include +#include +#include "../../nerv/lib/matrix/cuda_helper.h" +#include + + +extern "C" { + + +#include "Device.h" +#include "stdlib.h" +#include "string.h" + + +struct GPUInfo +{ + int gpuid; + float d2h_bandwidth; + float h2d_bandwidth; + size_t mem_free; + size_t mem_total; + float mem_ratio; + bool used; +}; + +struct MPIGPUInfo +{ + char hostname[STRLEN]; + int gpuid; + int myid; +}; + +struct Device +{ + std::vector gpuinfo_; + int refcount; +}; + +/////////////////////////////////////////// + +extern cublasHandle_t* nerv_get_cublas_handle(); + +CuContext* CuContext_new() +{ + CuContext *context = new CuContext; + cublasCreate(&context->cublas_handle); + cudaEventCreate(&context->profile_start); + cudaEventCreate(&context->profile_stop); + context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); + context->refcount = 1; + context->pid = pthread_self(); + return context; +} + +CuContext* CuContext_newWithId(long id) +{ + CuContext *context = (CuContext*)id; + __sync_fetch_and_add(&context->refcount, 1); + return context; +} + +long CuContext_id(CuContext *context) +{ + return (long)context; +} + +void CuContext_destroy(CuContext *context) +{ + if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1) + { + cublasDestroy(context->cublas_handle); + hashmap_clear(context->profile); + delete context; + context = NULL; + } +} + +Device* Device_new() +{ + Device* device = new Device; + device->refcount = 1; + return device; +} + +Device* Device_newWithId(long id) +{ + Device *device = (Device*)id; + __sync_fetch_and_add(&device->refcount, 1); + return device; +} + +long Device_id(Device *device) +{ + return (long)device; +} + +void Device_destroy(Device *device) +{ + if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1) + { + delete device; + device = NULL; + } +} + +void GetFreeMemory(size_t* free, size_t* total, Status *status) +{ + // WARNING! the CUDA API is inconsistent accross versions! + #if (CUDA_VERSION >= 3020) + //define the function signature type + size_t mem_free, mem_total; +#else + unsigned int mem_free, mem_total; +#endif + { + //we will load the cuMemGetInfo dynamically from libcuda.so + //cuMemGetInfo(&mem_free, &mem_total); + //pre-fill ``safe'' values that will not cause problems + mem_free = 1; mem_total = 1; + //open libcuda.so + void* libcuda = dlopen("libcuda.so",RTLD_LAZY); + if(NULL == libcuda) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so"); + } + else + { + //define the function signature type + //and get the symbol +#if (CUDA_VERSION >= 3020) + typedef CUresult (*cu_fun_ptr)(size_t*, size_t*); + cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); +#else + typedef CUresult (*cu_fun_ptr)(int*, int*); + cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); +#endif + if(NULL == dl_cuMemGetInfo) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so"); + } + else + { + //call the function + dl_cuMemGetInfo(&mem_free, &mem_total); + } + //close the library + dlclose(libcuda); + } + } + // copy the output values outside + if(NULL != free) *free = mem_free; + if(NULL != total) *total = mem_total; +} + +void DeviceGetName(char* name, int len, int dev, Status *status) +{ + //prefill with something reasonable + strncpy(name,"Unknown GPU",len); + //open libcuda.so + void* libcuda = dlopen("libcuda.so",RTLD_LAZY); + if(NULL == libcuda) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so"); + } + else + { + //define the function signature type + typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice); + //get the symbol + cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); + if(NULL == cuDeviceGetName_ptr) { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so"); + } + else { + //call the function + cuDeviceGetName_ptr(name, len, dev); + } + //close the library + dlclose(libcuda); + } +} + +void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status) +{ + int idx = gpu_idx; + int memSize = 64*1024*1024; + float elapsedTimeInMs = 0.0f; + float bandwidthInMBs = 0.0f; + unsigned char *h_idata = NULL; + unsigned char *h_odata = NULL; + cudaEvent_t start, stop; + bool PINNED = true; + int MEMCOPY_ITERATIONS = 5; + + CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status); + CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status); + + //allocate host memory + if (PINNED) + { + #if CUDART_VERSION >= 2020 + CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status); + CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status); + #else + CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status); + CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status); + #endif + } + else + { + //pageable memory mode - use malloc + h_odata = (unsigned char *)malloc(memSize); + + if (h_odata == 0) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!"); + } + } + + //initialize the memory + for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++) + { + h_idata[i] = (unsigned char)(i & 0xff); + } + + // allocate device memory + unsigned char *d_idata; + CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status); + + //initialize the device memory + CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status); + + //copy data from GPU to Host + + CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status); + + if (PINNED) + { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) + { + CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status); + } + } + else + { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) + { + CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status); + } + } + + CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status); + + // make sure GPU has finished copying + CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status); + //get the the total elapsed time in ms + + CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status); + + + //calculate bandwidth in MB/s + bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20)); + *d2h = bandwidthInMBs; + + ///////////////////////////////////////////////////// + //copy data from Host to GPU + CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status); + + if (PINNED) + { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) + { + CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status); + } + } + else + { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) + { + CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status); + } + } + + CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status); + + // make sure GPU has finished copying + CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status); + //get the the total elapsed time in ms + + CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status); + + + //calculate bandwidth in MB/s + bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20)); + *h2d = bandwidthInMBs; + + //clean up memory + CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status); + CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status); + + if (PINNED) + { + CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status); + CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status); + } + else + { + free(h_idata); + free(h_odata); + } + + CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status); +} + +int AutoSelectGPU(Device *device, Status *status) +{ + //GPU selection is based on largest proportion of free memory. + + int max_id = 0, n_gpu = 0; + std::vector &gpuinfo_ = device->gpuinfo_; + + cudaGetDeviceCount(&n_gpu); + + if(n_gpu == 0) + { + NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found"); + return -1; + } + + if (n_gpu > 0) + { + for (int i = 0; i < gpuinfo_.size(); i++) + { + if (!gpuinfo_[i].used) + { + max_id = i; + break; + } + } + //find GPU with max free memory + for (int n = 1; n < gpuinfo_.size(); n++) + { + if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio) + max_id = n; + } + + + std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl; + + std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, " + << "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, " + << "ratio: "<< gpuinfo_[max_id].mem_ratio << " " + << "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, " + << "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl; + + cudaSetDevice(max_id); + //initialize the CUBLAS + //cublasInit(); + //cublasHandle_t *cublas_handle = nerv_get_cublas_handle(); + //std::cerr << "cublasHandle_t: " << cublas_handle << std::endl; + //cublasCreate(cublas_handle); + + //create the context + cudaError_t e; + e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize + if(e != cudaSuccess) { + std::cerr << "Failed to create CUDA context on a GPU." << std::endl; + } + } + + gpuinfo_[max_id].used = true; + NERV_SET_STATUS(status, NERV_NORMAL, 0); + return max_id; +} + +void SelectGPU(Device *device, int gpu_id, Status *status) +{ + + std::vector &gpuinfo_ = device->gpuinfo_; + int n_gpu = 0; + cudaGetDeviceCount(&n_gpu); + if(gpu_id >= n_gpu) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!"); + std::cerr << "Cannot select GPU " << gpu_id + << ", detected " << n_gpu << " CUDA capable cards!" << std::endl; + } + + + std::cerr << "Selected device: " << gpu_id << " (manually)"; + + std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, " + << "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, " + << "ratio: "<< gpuinfo_[gpu_id].mem_ratio << " " + << "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, " + << "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl; + + CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status); + //initialize the CUBLAS + //CUBLAS_SAFE_SYNC_CALL(cublasInit(), status); + //cublasHandle_t *cublas_handle = nerv_get_cublas_handle(); + //cublasCreate(cublas_handle); + + //create the context + cudaError_t e; + e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize + if(e != cudaSuccess) { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU."); + std::cerr << "Failed to create CUDA context on a GPU."; + } + + gpuinfo_[gpu_id].used = true; + + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + +void Initialize(Device *device, Status *status) +{ + // Check that we have at least one gpu + int n_gpu = 0; + cudaGetDeviceCount(&n_gpu); + if(n_gpu == 0) + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found"); + + std::vector &gpuinfo_ = device->gpuinfo_; + gpuinfo_.resize(n_gpu); + + std::cerr << "gpu information ..." << std::endl; + + // Get ratios of memory use, if possible + for(int n = 0; n < n_gpu; n++) + { + int ret = cudaSetDevice(n); + switch(ret) + { + case cudaSuccess : { + //create the CUDA context for the thread + //cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize + cudaDeviceSynchronize(); + //get GPU name + char name[STRLEN]; + DeviceGetName(name,STRLEN,n, status); + //get GPU memory stats + size_t free, total; + float d2h, h2d; + + GetFreeMemory(&free, &total, status); + GetBandwidth(n, &d2h, &h2d, status); + + gpuinfo_[n].gpuid = n; + gpuinfo_[n].d2h_bandwidth = d2h; + gpuinfo_[n].h2d_bandwidth = h2d; + gpuinfo_[n].mem_free = free; + gpuinfo_[n].mem_total = total; + gpuinfo_[n].mem_ratio = free/(float)total; + gpuinfo_[n].used = false; + + std::cerr << "gpu: " << n << " ==> " + << "free: " << free/1024/1024 << "M, " + << "total: "<< total/1024/1024 << "M, " + << "ratio: "<< free/(float)total << " " + << "d2h bandwidth: " << d2h << "MB/s, " + << "h2d bandwidth: "<< h2d << "MB/s" << std::endl; + + //destroy the CUDA context for the thread + //cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset + } break; + + #if (CUDA_VERSION > 3020) + case cudaErrorDeviceAlreadyInUse : + std::cerr << "cudaSetDevice(" << n << "): " + << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl; + break; + #endif + case cudaErrorInvalidDevice : + std::cerr << "cudaSetDevice(" << n << "): " + << "Device cannot be accessed, not a VALID CUDA device!" << std::endl; + break; + default : + std::cerr << "cudaSetDevice(" << n << "): " + << "returned " << ret << ", " + << cudaGetErrorString((cudaError_t)ret) << std::endl; + } + } + cudaDeviceReset(); + printf("Result = PASS\n"); + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + + + + + +} // extern diff --git a/fastnn/device/Device.h b/fastnn/device/Device.h new file mode 100644 index 0000000..9a80128 --- /dev/null +++ b/fastnn/device/Device.h @@ -0,0 +1,51 @@ + +#ifndef NERV_FASTNN_EXAMPLE_H +#define NERV_FASTNN_EXAMPLE_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#include "matrix/matrix.h" +#include "stdbool.h" +#include "../../nerv/lib/matrix/cuda_helper.h" +#include "../../nerv/lib/common.h" +#include "../../nerv/lib/matrix/cumatrix.h" +#define STRLEN 1024 + + + +typedef struct GPUInfo GPUInfo; + +typedef struct MPIGPUInfo MPIGPUInfo; + +typedef struct Device Device; + +CuContext* CuContext_new(); +CuContext* CuContext_newWithId(long id); +long CuContext_id(CuContext *context); +void CuContext_destroy(CuContext *context); + + +Device* Device_new(); +Device* Device_newWithId(long); +long Device_id(Device *device); +void Device_detroy(Device *device); +void Initialize(Device *device, Status *status); +void GetFreeMemory(size_t* free, size_t* total, Status *status); +void DeviceGetName(char* name, int len, int dev, Status *status); +void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status); +int AutoSelectGPU(Device *device, Status *status); +void SelectGPU(Device *device, int gpu_id, Status *status); + + + + +#ifdef __cplusplus +} // closing brace for extern "C" + +#endif // end Device + +#endif + diff --git a/fastnn/device/device.c b/fastnn/device/device.c new file mode 100644 index 0000000..71d6ec1 --- /dev/null +++ b/fastnn/device/device.c @@ -0,0 +1,178 @@ + +#include +#include + +#include +#include +#include + +#include "../threads/lib/luaTHRD.h" +#include "Device.h" + + +const char *fastnn_device_tname = "fastnn.CDevice"; + + +static int device_new(lua_State *L) +{ + Device *device = NULL; + if(lua_gettop(L) == 0) + { + device = Device_new(); + } + else if(lua_gettop(L) == 1) + { + long id = luaL_checkinteger(L, 1); + device = Device_newWithId(id); + } + else + luaL_error(L, "device: device new invalid arguments"); + if (!device) + luaL_error(L, "device: device failed"); + + luaTHRD_pushudata(L, device, fastnn_device_tname); + + return 1; +} + +static int device_init(lua_State *L) +{ + Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname); + Status status; + Initialize(device, &status); + NERV_LUA_CHECK_STATUS(L, status); + return 0; +} + +static int device_select_gpu(lua_State *L) +{ + Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname); + + Status status; + if(lua_gettop(L) == 2) + { + int gpuid = luaL_checkinteger(L, 2); + SelectGPU(device, gpuid, &status); + NERV_LUA_CHECK_STATUS(L, status); + return 0; + } + else if(lua_gettop(L) == 1) + { + int gpuid = AutoSelectGPU(device, &status); + NERV_LUA_CHECK_STATUS(L, status); + lua_pushinteger(L, gpuid); + return 1; + } + else + luaL_error(L, "device: device select gpu failed"); +} + +static int device_id(lua_State *L) +{ + Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname); + lua_pushinteger(L, Device_id(device)); + return 1; +} + +static int device_tostring(lua_State *L) +{ + char str[STRLEN]; + Device* device = luaTHRD_checkudata(L, 1, fastnn_device_tname); + snprintf(str, STRLEN, "%s <%lx>", fastnn_device_tname, Device_id(device)); + lua_pushstring(L, str); + return 1; +} + +static int device_destroy(lua_State *L) +{ + Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname); + Device_destroy(device); + //printf("device_destroy ... end\n"); + return 0; +} + + +////////////////////////////////////////////// + +static int context_new(lua_State *L) +{ + CuContext *context = NULL; + if(lua_gettop(L) == 0) + { + context = CuContext_new(); + } + else if(lua_gettop(L) == 1) + { + long id = luaL_checkinteger(L, 1); + context = CuContext_newWithId(id); + } + else + luaL_error(L, "device: context new invalid arguments"); + if (!context) + luaL_error(L, "device: context failed"); + + luaTHRD_pushudata(L, context, nerv_context_tname); + + return 1; +} + + +static int context_id(lua_State *L) +{ + CuContext *context = luaTHRD_checkudata(L, 1, nerv_context_tname); + lua_pushinteger(L, CuContext_id(context)); + return 1; +} + +static int context_tostring(lua_State *L) +{ + char str[STRLEN]; + CuContext* context = luaTHRD_checkudata(L, 1, nerv_context_tname); + snprintf(str, STRLEN, "%s <%lx>", nerv_context_tname, CuContext_id(context)); + lua_pushstring(L, str); + return 1; +} + +static int context_destroy(lua_State *L) +{ + CuContext* context = luaTHRD_checkudata(L, 1, nerv_context_tname); + CuContext_destroy(context); + return 0; +} + + +static const struct luaL_Reg device__ [] = { + {"new", device_new}, + {"__tostring", device_tostring}, + {"id", device_id}, + {"init", device_init}, + {"select_gpu", device_select_gpu}, + {"free", device_destroy}, + {NULL, NULL} +}; + +static const struct luaL_Reg context__ [] = { + {"new", context_new}, + {"__tostring", context_tostring}, + {"id", context_id}, + {"free", context_destroy}, + {NULL, NULL} +}; + + +void fastnn_init_device(lua_State *L) +{ + luaT_newmetatable(L, fastnn_device_tname, NULL, device_new, device_destroy, NULL); + luaL_register(L, NULL, device__); + lua_pop(L, 1); +} + +void fastnn_init_context(lua_State *L) +{ + luaT_newmetatable(L, nerv_context_tname, NULL, context_new, context_destroy, NULL); + luaL_register(L, NULL, context__); + lua_pop(L, 1); +} + + + diff --git a/fastnn/device/device.lua b/fastnn/device/device.lua new file mode 100644 index 0000000..d3dea73 --- /dev/null +++ b/fastnn/device/device.lua @@ -0,0 +1,6 @@ + +local C = require 'libfastnn' + +fastnn.CDevice = C.CDevice + + -- cgit v1.2.3-70-g09d2