#include #include #include #include "../../nerv/lib/matrix/cuda_helper.h" #include extern "C" { #include "Device.h" #include "stdlib.h" #include "string.h" struct GPUInfo { int gpuid; float d2h_bandwidth; float h2d_bandwidth; size_t mem_free; size_t mem_total; float mem_ratio; bool used; }; struct MPIGPUInfo { char hostname[STRLEN]; int gpuid; int myid; }; struct Device { std::vector gpuinfo_; int refcount; }; /////////////////////////////////////////// extern cublasHandle_t* nerv_get_cublas_handle(); CuContext* CuContext_new() { CuContext *context = new CuContext; cublasCreate(&context->cublas_handle); cudaEventCreate(&context->profile_start); cudaEventCreate(&context->profile_stop); context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); context->refcount = 1; context->pid = pthread_self(); return context; } CuContext* CuContext_newWithId(long id) { CuContext *context = (CuContext*)id; __sync_fetch_and_add(&context->refcount, 1); return context; } long CuContext_id(CuContext *context) { return (long)context; } void CuContext_destroy(CuContext *context) { if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1) { cublasDestroy(context->cublas_handle); hashmap_clear(context->profile); delete context; context = NULL; } } Device* Device_new() { Device* device = new Device; device->refcount = 1; return device; } Device* Device_newWithId(long id) { Device *device = (Device*)id; __sync_fetch_and_add(&device->refcount, 1); return device; } long Device_id(Device *device) { return (long)device; } void Device_destroy(Device *device) { if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1) { delete device; device = NULL; } } void GetFreeMemory(size_t* free, size_t* total, Status *status) { // WARNING! the CUDA API is inconsistent accross versions! #if (CUDA_VERSION >= 3020) //define the function signature type size_t mem_free, mem_total; #else unsigned int mem_free, mem_total; #endif { //we will load the cuMemGetInfo dynamically from libcuda.so //cuMemGetInfo(&mem_free, &mem_total); //pre-fill ``safe'' values that will not cause problems mem_free = 1; mem_total = 1; //open libcuda.so void* libcuda = dlopen("libcuda.so",RTLD_LAZY); if(NULL == libcuda) { NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so"); } else { //define the function signature type //and get the symbol #if (CUDA_VERSION >= 3020) typedef CUresult (*cu_fun_ptr)(size_t*, size_t*); cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); #else typedef CUresult (*cu_fun_ptr)(int*, int*); cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); #endif if(NULL == dl_cuMemGetInfo) { NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so"); } else { //call the function dl_cuMemGetInfo(&mem_free, &mem_total); } //close the library dlclose(libcuda); } } // copy the output values outside if(NULL != free) *free = mem_free; if(NULL != total) *total = mem_total; } void DeviceGetName(char* name, int len, int dev, Status *status) { //prefill with something reasonable strncpy(name,"Unknown GPU",len); //open libcuda.so void* libcuda = dlopen("libcuda.so",RTLD_LAZY); if(NULL == libcuda) { NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so"); } else { //define the function signature type typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice); //get the symbol cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); if(NULL == cuDeviceGetName_ptr) { NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so"); } else { //call the function cuDeviceGetName_ptr(name, len, dev); } //close the library dlclose(libcuda); } } void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status) { int idx = gpu_idx; int memSize = 64*1024*1024; float elapsedTimeInMs = 0.0f; float bandwidthInMBs = 0.0f; unsigned char *h_idata = NULL; unsigned char *h_odata = NULL; cudaEvent_t start, stop; bool PINNED = true; int MEMCOPY_ITERATIONS = 5; CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status); CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status); //allocate host memory if (PINNED) { #if CUDART_VERSION >= 2020 CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status); CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status); #else CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status); CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status); #endif } else { //pageable memory mode - use malloc h_odata = (unsigned char *)malloc(memSize); if (h_odata == 0) { NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!"); } } //initialize the memory for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++) { h_idata[i] = (unsigned char)(i & 0xff); } // allocate device memory unsigned char *d_idata; CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status); //initialize the device memory CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status); //copy data from GPU to Host CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status); if (PINNED) { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status); } } else { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status); } } CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status); // make sure GPU has finished copying CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status); //get the the total elapsed time in ms CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status); //calculate bandwidth in MB/s bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20)); *d2h = bandwidthInMBs; ///////////////////////////////////////////////////// //copy data from Host to GPU CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status); if (PINNED) { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status); } } else { for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status); } } CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status); // make sure GPU has finished copying CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status); //get the the total elapsed time in ms CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status); //calculate bandwidth in MB/s bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20)); *h2d = bandwidthInMBs; //clean up memory CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status); CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status); if (PINNED) { CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status); CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status); } else { free(h_idata); free(h_odata); } CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status); } int AutoSelectGPU(Device *device, Status *status) { //GPU selection is based on largest proportion of free memory. int max_id = 0, n_gpu = 0; std::vector &gpuinfo_ = device->gpuinfo_; cudaGetDeviceCount(&n_gpu); if(n_gpu == 0) { NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found"); return -1; } if (n_gpu > 0) { for (int i = 0; i < gpuinfo_.size(); i++) { if (!gpuinfo_[i].used) { max_id = i; break; } } //find GPU with max free memory for (int n = 1; n < gpuinfo_.size(); n++) { if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio) max_id = n; } std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl; std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, " << "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, " << "ratio: "<< gpuinfo_[max_id].mem_ratio << " " << "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, " << "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl; cudaSetDevice(max_id); //initialize the CUBLAS //cublasInit(); //cublasHandle_t *cublas_handle = nerv_get_cublas_handle(); //std::cerr << "cublasHandle_t: " << cublas_handle << std::endl; //cublasCreate(cublas_handle); //create the context cudaError_t e; e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize if(e != cudaSuccess) { std::cerr << "Failed to create CUDA context on a GPU." << std::endl; } } gpuinfo_[max_id].used = true; NERV_SET_STATUS(status, NERV_NORMAL, 0); return max_id; } void SelectGPU(Device *device, int gpu_id, Status *status) { std::vector &gpuinfo_ = device->gpuinfo_; int n_gpu = 0; cudaGetDeviceCount(&n_gpu); if(gpu_id >= n_gpu) { NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!"); std::cerr << "Cannot select GPU " << gpu_id << ", detected " << n_gpu << " CUDA capable cards!" << std::endl; } std::cerr << "Selected device: " << gpu_id << " (manually)"; std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, " << "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, " << "ratio: "<< gpuinfo_[gpu_id].mem_ratio << " " << "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, " << "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl; CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status); //initialize the CUBLAS //CUBLAS_SAFE_SYNC_CALL(cublasInit(), status); //cublasHandle_t *cublas_handle = nerv_get_cublas_handle(); //cublasCreate(cublas_handle); //create the context cudaError_t e; e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize if(e != cudaSuccess) { NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU."); std::cerr << "Failed to create CUDA context on a GPU."; } gpuinfo_[gpu_id].used = true; NERV_SET_STATUS(status, NERV_NORMAL, 0); } void Initialize(Device *device, Status *status) { // Check that we have at least one gpu int n_gpu = 0; cudaGetDeviceCount(&n_gpu); if(n_gpu == 0) NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found"); std::vector &gpuinfo_ = device->gpuinfo_; gpuinfo_.resize(n_gpu); std::cerr << "gpu information ..." << std::endl; // Get ratios of memory use, if possible for(int n = 0; n < n_gpu; n++) { int ret = cudaSetDevice(n); switch(ret) { case cudaSuccess : { //create the CUDA context for the thread //cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize cudaDeviceSynchronize(); //get GPU name char name[STRLEN]; DeviceGetName(name,STRLEN,n, status); //get GPU memory stats size_t free, total; float d2h, h2d; GetFreeMemory(&free, &total, status); GetBandwidth(n, &d2h, &h2d, status); gpuinfo_[n].gpuid = n; gpuinfo_[n].d2h_bandwidth = d2h; gpuinfo_[n].h2d_bandwidth = h2d; gpuinfo_[n].mem_free = free; gpuinfo_[n].mem_total = total; gpuinfo_[n].mem_ratio = free/(float)total; gpuinfo_[n].used = false; std::cerr << "gpu: " << n << " ==> " << "free: " << free/1024/1024 << "M, " << "total: "<< total/1024/1024 << "M, " << "ratio: "<< free/(float)total << " " << "d2h bandwidth: " << d2h << "MB/s, " << "h2d bandwidth: "<< h2d << "MB/s" << std::endl; //destroy the CUDA context for the thread //cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset } break; #if (CUDA_VERSION > 3020) case cudaErrorDeviceAlreadyInUse : std::cerr << "cudaSetDevice(" << n << "): " << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl; break; #endif case cudaErrorInvalidDevice : std::cerr << "cudaSetDevice(" << n << "): " << "Device cannot be accessed, not a VALID CUDA device!" << std::endl; break; default : std::cerr << "cudaSetDevice(" << n << "): " << "returned " << ret << ", " << cudaGetErrorString((cudaError_t)ret) << std::endl; } } cudaDeviceReset(); printf("Result = PASS\n"); NERV_SET_STATUS(status, NERV_NORMAL, 0); } } // extern