From a68d3c982ed0dd4ef5bbc9e0c22b9ecf9565b924 Mon Sep 17 00:00:00 2001 From: uphantom Date: Fri, 28 Aug 2015 17:41:14 +0800 Subject: fastnn version 1.0 --- fastnn/device/Device.cpp | 496 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 496 insertions(+) create mode 100644 fastnn/device/Device.cpp (limited to 'fastnn/device/Device.cpp') diff --git a/fastnn/device/Device.cpp b/fastnn/device/Device.cpp new file mode 100644 index 0000000..3b69086 --- /dev/null +++ b/fastnn/device/Device.cpp @@ -0,0 +1,496 @@ + +#include +#include +#include +#include "../../nerv/lib/matrix/cuda_helper.h" +#include + + +extern "C" { + + +#include "Device.h" +#include "stdlib.h" +#include "string.h" + + +struct GPUInfo +{ + int gpuid; + float d2h_bandwidth; + float h2d_bandwidth; + size_t mem_free; + size_t mem_total; + float mem_ratio; + bool used; +}; + +struct MPIGPUInfo +{ + char hostname[STRLEN]; + int gpuid; + int myid; +}; + +struct Device +{ + std::vector gpuinfo_; + int refcount; +}; + +/////////////////////////////////////////// + +extern cublasHandle_t* nerv_get_cublas_handle(); + +CuContext* CuContext_new() +{ + CuContext *context = new CuContext; + cublasCreate(&context->cublas_handle); + cudaEventCreate(&context->profile_start); + cudaEventCreate(&context->profile_stop); + context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); + context->refcount = 1; + context->pid = pthread_self(); + return context; +} + +CuContext* CuContext_newWithId(long id) +{ + CuContext *context = (CuContext*)id; + __sync_fetch_and_add(&context->refcount, 1); + return context; +} + +long CuContext_id(CuContext *context) +{ + return (long)context; +} + +void CuContext_destroy(CuContext *context) +{ + if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1) + { + cublasDestroy(context->cublas_handle); + hashmap_clear(context->profile); + delete context; + context = NULL; + } +} + +Device* Device_new() +{ + Device* device = new Device; + device->refcount = 1; + return device; +} + +Device* Device_newWithId(long id) +{ + Device *device = (Device*)id; + __sync_fetch_and_add(&device->refcount, 1); + return device; +} + +long Device_id(Device *device) +{ + return (long)device; +} + +void Device_destroy(Device *device) +{ + if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1) + { + delete device; + device = NULL; + } +} + +void GetFreeMemory(size_t* free, size_t* total, Status *status) +{ + // WARNING! the CUDA API is inconsistent accross versions! + #if (CUDA_VERSION >= 3020) + //define the function signature type + size_t mem_free, mem_total; +#else + unsigned int mem_free, mem_total; +#endif + { + //we will load the cuMemGetInfo dynamically from libcuda.so + //cuMemGetInfo(&mem_free, &mem_total); + //pre-fill ``safe'' values that will not cause problems + mem_free = 1; mem_total = 1; + //open libcuda.so + void* libcuda = dlopen("libcuda.so",RTLD_LAZY); + if(NULL == libcuda) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so"); + } + else + { + //define the function signature type + //and get the symbol +#if (CUDA_VERSION >= 3020) + typedef CUresult (*cu_fun_ptr)(size_t*, size_t*); + cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); +#else + typedef CUresult (*cu_fun_ptr)(int*, int*); + cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); +#endif + if(NULL == dl_cuMemGetInfo) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so"); + } + else + { + //call the function + dl_cuMemGetInfo(&mem_free, &mem_total); + } + //close the library + dlclose(libcuda); + } + } + // copy the output values outside + if(NULL != free) *free = mem_free; + if(NULL != total) *total = mem_total; +} + +void DeviceGetName(char* name, int len, int dev, Status *status) +{ + //prefill with something reasonable + strncpy(name,"Unknown GPU",len); + //open libcuda.so + void* libcuda = dlopen("libcuda.so",RTLD_LAZY); + if(NULL == libcuda) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so"); + } + else + { + //define the function signature type + typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice); + //get the symbol + cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); + if(NULL == cuDeviceGetName_ptr) { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so"); + } + else { + //call the function + cuDeviceGetName_ptr(name, len, dev); + } + //close the library + dlclose(libcuda); + } +} + +void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status) +{ + int idx = gpu_idx; + int memSize = 64*1024*1024; + float elapsedTimeInMs = 0.0f; + float bandwidthInMBs = 0.0f; + unsigned char *h_idata = NULL; + unsigned char *h_odata = NULL; + cudaEvent_t start, stop; + bool PINNED = true; + int MEMCOPY_ITERATIONS = 5; + + CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status); + CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status); + + //allocate host memory + if (PINNED) + { + #if CUDART_VERSION >= 2020 + CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status); + CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status); + #else + CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status); + CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status); + #endif + } + else + { + //pageable memory mode - use malloc + h_odata = (unsigned char *)malloc(memSize); + + if (h_odata == 0) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!"); + } + } + + //initialize the memory + for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++) + { + h_idata[i] = (unsigned char)(i & 0xff); + } + + // allocate device memory + unsigned char *d_idata; + CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status); + + //initialize the device memory + CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status); + + //copy data from GPU to Host + + CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status); + + if (PINNED) + { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) + { + CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status); + } + } + else + { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) + { + CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status); + } + } + + CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status); + + // make sure GPU has finished copying + CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status); + //get the the total elapsed time in ms + + CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status); + + + //calculate bandwidth in MB/s + bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20)); + *d2h = bandwidthInMBs; + + ///////////////////////////////////////////////////// + //copy data from Host to GPU + CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status); + + if (PINNED) + { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) + { + CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status); + } + } + else + { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) + { + CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status); + } + } + + CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status); + + // make sure GPU has finished copying + CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status); + //get the the total elapsed time in ms + + CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status); + + + //calculate bandwidth in MB/s + bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20)); + *h2d = bandwidthInMBs; + + //clean up memory + CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status); + CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status); + + if (PINNED) + { + CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status); + CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status); + } + else + { + free(h_idata); + free(h_odata); + } + + CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status); +} + +int AutoSelectGPU(Device *device, Status *status) +{ + //GPU selection is based on largest proportion of free memory. + + int max_id = 0, n_gpu = 0; + std::vector &gpuinfo_ = device->gpuinfo_; + + cudaGetDeviceCount(&n_gpu); + + if(n_gpu == 0) + { + NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found"); + return -1; + } + + if (n_gpu > 0) + { + for (int i = 0; i < gpuinfo_.size(); i++) + { + if (!gpuinfo_[i].used) + { + max_id = i; + break; + } + } + //find GPU with max free memory + for (int n = 1; n < gpuinfo_.size(); n++) + { + if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio) + max_id = n; + } + + + std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl; + + std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, " + << "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, " + << "ratio: "<< gpuinfo_[max_id].mem_ratio << " " + << "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, " + << "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl; + + cudaSetDevice(max_id); + //initialize the CUBLAS + //cublasInit(); + //cublasHandle_t *cublas_handle = nerv_get_cublas_handle(); + //std::cerr << "cublasHandle_t: " << cublas_handle << std::endl; + //cublasCreate(cublas_handle); + + //create the context + cudaError_t e; + e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize + if(e != cudaSuccess) { + std::cerr << "Failed to create CUDA context on a GPU." << std::endl; + } + } + + gpuinfo_[max_id].used = true; + NERV_SET_STATUS(status, NERV_NORMAL, 0); + return max_id; +} + +void SelectGPU(Device *device, int gpu_id, Status *status) +{ + + std::vector &gpuinfo_ = device->gpuinfo_; + int n_gpu = 0; + cudaGetDeviceCount(&n_gpu); + if(gpu_id >= n_gpu) + { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!"); + std::cerr << "Cannot select GPU " << gpu_id + << ", detected " << n_gpu << " CUDA capable cards!" << std::endl; + } + + + std::cerr << "Selected device: " << gpu_id << " (manually)"; + + std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, " + << "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, " + << "ratio: "<< gpuinfo_[gpu_id].mem_ratio << " " + << "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, " + << "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl; + + CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status); + //initialize the CUBLAS + //CUBLAS_SAFE_SYNC_CALL(cublasInit(), status); + //cublasHandle_t *cublas_handle = nerv_get_cublas_handle(); + //cublasCreate(cublas_handle); + + //create the context + cudaError_t e; + e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize + if(e != cudaSuccess) { + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU."); + std::cerr << "Failed to create CUDA context on a GPU."; + } + + gpuinfo_[gpu_id].used = true; + + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + +void Initialize(Device *device, Status *status) +{ + // Check that we have at least one gpu + int n_gpu = 0; + cudaGetDeviceCount(&n_gpu); + if(n_gpu == 0) + NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found"); + + std::vector &gpuinfo_ = device->gpuinfo_; + gpuinfo_.resize(n_gpu); + + std::cerr << "gpu information ..." << std::endl; + + // Get ratios of memory use, if possible + for(int n = 0; n < n_gpu; n++) + { + int ret = cudaSetDevice(n); + switch(ret) + { + case cudaSuccess : { + //create the CUDA context for the thread + //cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize + cudaDeviceSynchronize(); + //get GPU name + char name[STRLEN]; + DeviceGetName(name,STRLEN,n, status); + //get GPU memory stats + size_t free, total; + float d2h, h2d; + + GetFreeMemory(&free, &total, status); + GetBandwidth(n, &d2h, &h2d, status); + + gpuinfo_[n].gpuid = n; + gpuinfo_[n].d2h_bandwidth = d2h; + gpuinfo_[n].h2d_bandwidth = h2d; + gpuinfo_[n].mem_free = free; + gpuinfo_[n].mem_total = total; + gpuinfo_[n].mem_ratio = free/(float)total; + gpuinfo_[n].used = false; + + std::cerr << "gpu: " << n << " ==> " + << "free: " << free/1024/1024 << "M, " + << "total: "<< total/1024/1024 << "M, " + << "ratio: "<< free/(float)total << " " + << "d2h bandwidth: " << d2h << "MB/s, " + << "h2d bandwidth: "<< h2d << "MB/s" << std::endl; + + //destroy the CUDA context for the thread + //cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset + } break; + + #if (CUDA_VERSION > 3020) + case cudaErrorDeviceAlreadyInUse : + std::cerr << "cudaSetDevice(" << n << "): " + << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl; + break; + #endif + case cudaErrorInvalidDevice : + std::cerr << "cudaSetDevice(" << n << "): " + << "Device cannot be accessed, not a VALID CUDA device!" << std::endl; + break; + default : + std::cerr << "cudaSetDevice(" << n << "): " + << "returned " << ret << ", " + << cudaGetErrorString((cudaError_t)ret) << std::endl; + } + } + cudaDeviceReset(); + printf("Result = PASS\n"); + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + + + + + +} // extern -- cgit v1.2.3-70-g09d2