#include <vector>
#include <string>
#include <iostream>
#include "../../nerv/lib/matrix/cuda_helper.h"
#include <dlfcn.h>
extern "C" {
#include "Device.h"
#include "stdlib.h"
#include "string.h"
struct GPUInfo
{
int gpuid;
float d2h_bandwidth;
float h2d_bandwidth;
size_t mem_free;
size_t mem_total;
float mem_ratio;
bool used;
};
struct MPIGPUInfo
{
char hostname[STRLEN];
int gpuid;
int myid;
};
struct Device
{
std::vector<GPUInfo> gpuinfo_;
int refcount;
};
///////////////////////////////////////////
extern cublasHandle_t* nerv_get_cublas_handle();
CuContext* CuContext_new()
{
CuContext *context = new CuContext;
cublasCreate(&context->cublas_handle);
cudaEventCreate(&context->profile_start);
cudaEventCreate(&context->profile_stop);
context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
context->refcount = 1;
context->pid = pthread_self();
return context;
}
CuContext* CuContext_newWithId(long id)
{
CuContext *context = (CuContext*)id;
__sync_fetch_and_add(&context->refcount, 1);
return context;
}
long CuContext_id(CuContext *context)
{
return (long)context;
}
void CuContext_destroy(CuContext *context)
{
if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1)
{
cublasDestroy(context->cublas_handle);
hashmap_clear(context->profile);
delete context;
context = NULL;
}
}
Device* Device_new()
{
Device* device = new Device;
device->refcount = 1;
return device;
}
Device* Device_newWithId(long id)
{
Device *device = (Device*)id;
__sync_fetch_and_add(&device->refcount, 1);
return device;
}
long Device_id(Device *device)
{
return (long)device;
}
void Device_destroy(Device *device)
{
if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1)
{
delete device;
device = NULL;
}
}
void GetFreeMemory(size_t* free, size_t* total, Status *status)
{
// WARNING! the CUDA API is inconsistent accross versions!
#if (CUDA_VERSION >= 3020)
//define the function signature type
size_t mem_free, mem_total;
#else
unsigned int mem_free, mem_total;
#endif
{
//we will load the cuMemGetInfo dynamically from libcuda.so
//cuMemGetInfo(&mem_free, &mem_total);
//pre-fill ``safe'' values that will not cause problems
mem_free = 1; mem_total = 1;
//open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
}
else
{
//define the function signature type
//and get the symbol
#if (CUDA_VERSION >= 3020)
typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2");
#else
typedef CUresult (*cu_fun_ptr)(int*, int*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo");
#endif
if(NULL == dl_cuMemGetInfo)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so");
}
else
{
//call the function
dl_cuMemGetInfo(&mem_free, &mem_total);
}
//close the library
dlclose(libcuda);
}
}
// copy the output values outside
if(NULL != free) *free = mem_free;
if(NULL != total) *total = mem_total;
}
void DeviceGetName(char* name, int len, int dev, Status *status)
{
//prefill with something reasonable
strncpy(name,"Unknown GPU",len);
//open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
}
else
{
//define the function signature type
typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
//get the symbol
cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName");
if(NULL == cuDeviceGetName_ptr) {
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so");
}
else {
//call the function
cuDeviceGetName_ptr(name, len, dev);
}
//close the library
dlclose(libcuda);
}
}
void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status)
{
int idx = gpu_idx;
int memSize = 64*1024*1024;
float elapsedTimeInMs = 0.0f;
float bandwidthInMBs = 0.0f;
unsigned char *h_idata = NULL;
unsigned char *h_odata = NULL;
cudaEvent_t start, stop;
bool PINNED = true;
int MEMCOPY_ITERATIONS = 5;
CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status);
CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status);
//allocate host memory
if (PINNED)
{
#if CUDART_VERSION >= 2020
CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status);
CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status);
#else
CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status);
CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status);
#endif
}
else
{
//pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize);
if (h_odata == 0)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!");
}
}
//initialize the memory
for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_idata[i] = (unsigned char)(i & 0xff);
}
// allocate device memory
unsigned char *d_idata;
CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status);
//initialize the device memory
CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status);
//copy data from GPU to Host
CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
if (PINNED)
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status);
}
}
else
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status);
}
}
CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
// make sure GPU has finished copying
CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
//get the the total elapsed time in ms
CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);
//calculate bandwidth in MB/s
bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
*d2h = bandwidthInMBs;
/////////////////////////////////////////////////////
//copy data from Host to GPU
CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
if (PINNED)
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status);
}
}
else
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status);
}
}
CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
// make sure GPU has finished copying
CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
//get the the total elapsed time in ms
CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);
//calculate bandwidth in MB/s
bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
*h2d = bandwidthInMBs;
//clean up memory
CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status);
CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status);
if (PINNED)
{
CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status);
CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status);
}
else
{
free(h_idata);
free(h_odata);
}
CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status);
}
int AutoSelectGPU(Device *device, Status *status)
{
//GPU selection is based on largest proportion of free memory.
int max_id = 0, n_gpu = 0;
std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0)
{
NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");
return -1;
}
if (n_gpu > 0)
{
for (int i = 0; i < gpuinfo_.size(); i++)
{
if (!gpuinfo_[i].used)
{
max_id = i;
break;
}
}
//find GPU with max free memory
for (int n = 1; n < gpuinfo_.size(); n++)
{
if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio)
max_id = n;
}
std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl;
std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, "
<< "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, "
<< "ratio: "<< gpuinfo_[max_id].mem_ratio << " "
<< "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, "
<< "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl;
cudaSetDevice(max_id);
//initialize the CUBLAS
//cublasInit();
//cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
//std::cerr << "cublasHandle_t: " << cublas_handle << std::endl;
//cublasCreate(cublas_handle);
//create the context
cudaError_t e;
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
if(e != cudaSuccess) {
std::cerr << "Failed to create CUDA context on a GPU." << std::endl;
}
}
gpuinfo_[max_id].used = true;
NERV_SET_STATUS(status, NERV_NORMAL, 0);
return max_id;
}
void SelectGPU(Device *device, int gpu_id, Status *status)
{
std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
int n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
if(gpu_id >= n_gpu)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!");
std::cerr << "Cannot select GPU " << gpu_id
<< ", detected " << n_gpu << " CUDA capable cards!" << std::endl;
}
std::cerr << "Selected device: " << gpu_id << " (manually)";
std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, "
<< "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, "
<< "ratio: "<< gpuinfo_[gpu_id].mem_ratio << " "
<< "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, "
<< "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl;
CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status);
//initialize the CUBLAS
//CUBLAS_SAFE_SYNC_CALL(cublasInit(), status);
//cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
//cublasCreate(cublas_handle);
//create the context
cudaError_t e;
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
if(e != cudaSuccess) {
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU.");
std::cerr << "Failed to create CUDA context on a GPU.";
}
gpuinfo_[gpu_id].used = true;
NERV_SET_STATUS(status, NERV_NORMAL, 0);
}
void Initialize(Device *device, Status *status)
{
// Check that we have at least one gpu
int n_gpu = 0;
cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0)
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");
std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
gpuinfo_.resize(n_gpu);
std::cerr << "gpu information ..." << std::endl;
// Get ratios of memory use, if possible
for(int n = 0; n < n_gpu; n++)
{
int ret = cudaSetDevice(n);
switch(ret)
{
case cudaSuccess : {
//create the CUDA context for the thread
//cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
cudaDeviceSynchronize();
//get GPU name
char name[STRLEN];
DeviceGetName(name,STRLEN,n, status);
//get GPU memory stats
size_t free, total;
float d2h, h2d;
GetFreeMemory(&free, &total, status);
GetBandwidth(n, &d2h, &h2d, status);
gpuinfo_[n].gpuid = n;
gpuinfo_[n].d2h_bandwidth = d2h;
gpuinfo_[n].h2d_bandwidth = h2d;
gpuinfo_[n].mem_free = free;
gpuinfo_[n].mem_total = total;
gpuinfo_[n].mem_ratio = free/(float)total;
gpuinfo_[n].used = false;
std::cerr << "gpu: " << n << " ==> "
<< "free: " << free/1024/1024 << "M, "
<< "total: "<< total/1024/1024 << "M, "
<< "ratio: "<< free/(float)total << " "
<< "d2h bandwidth: " << d2h << "MB/s, "
<< "h2d bandwidth: "<< h2d << "MB/s" << std::endl;
//destroy the CUDA context for the thread
//cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset
} break;
#if (CUDA_VERSION > 3020)
case cudaErrorDeviceAlreadyInUse :
std::cerr << "cudaSetDevice(" << n << "): "
<< "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl;
break;
#endif
case cudaErrorInvalidDevice :
std::cerr << "cudaSetDevice(" << n << "): "
<< "Device cannot be accessed, not a VALID CUDA device!" << std::endl;
break;
default :
std::cerr << "cudaSetDevice(" << n << "): "
<< "returned " << ret << ", "
<< cudaGetErrorString((cudaError_t)ret) << std::endl;
}
}
cudaDeviceReset();
printf("Result = PASS\n");
NERV_SET_STATUS(status, NERV_NORMAL, 0);
}
} // extern