#include <vector>
#include <string>
#include <iostream>
#include "../../nerv/lib/matrix/cuda_helper.h"
#include <dlfcn.h>
extern "C" {
#include "Device.h"
#include "stdlib.h"
#include "string.h"
// Per-GPU measurement record gathered during device selection.
// NOTE: field order is part of the layout; do not reorder.
struct GPUInfo
{
int gpuid; // CUDA device index this record describes
float d2h_bandwidth; // measured device-to-host copy bandwidth (presumably MB/s — see GetBandwidth; confirm units)
float h2d_bandwidth; // measured host-to-device copy bandwidth (same units as d2h_bandwidth)
size_t mem_free; // free device memory in bytes at query time
size_t mem_total; // total device memory in bytes
float mem_ratio; // presumably mem_free / mem_total — TODO confirm against the code that fills it
bool used; // whether this GPU has already been claimed
};
// GPU assignment record exchanged between MPI ranks:
// which host/process owns which GPU.
struct MPIGPUInfo
{
char hostname[STRLEN]; // fixed-size host name buffer (STRLEN from Device.h)
int gpuid; // CUDA device index assigned on that host
int myid; // MPI rank of the owning process
};
// Reference-counted container of per-GPU info, shared across threads
// via Device_newWithId / Device_destroy (see atomic refcount ops below).
struct Device
{
std::vector<GPUInfo> gpuinfo_; // one entry per visible GPU
int refcount; // manipulated with __sync_* atomics; freed when it drops to 0
};
///////////////////////////////////////////
extern cublasHandle_t* nerv_get_cublas_handle();
// Allocate a fresh CuContext owned by the calling thread: a cuBLAS handle,
// a pair of CUDA events for profiling, and a profile hashmap. The caller
// holds the initial (single) reference; release with CuContext_destroy.
CuContext* CuContext_new()
{
CuContext *ctx = new CuContext;
ctx->refcount = 1;
ctx->pid = pthread_self(); // remember the creating thread
cublasCreate(&ctx->cublas_handle);
cudaEventCreate(&ctx->profile_start);
cudaEventCreate(&ctx->profile_stop);
ctx->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
return ctx;
}
// Re-open an existing context from its opaque id (the pointer value)
// and take an additional reference on it.
CuContext* CuContext_newWithId(long id)
{
CuContext *ctx = (CuContext*)id;
__sync_add_and_fetch(&ctx->refcount, 1); // atomic bump; result unused
return ctx;
}
// The context's address doubles as its opaque id (inverse of CuContext_newWithId).
long CuContext_id(CuContext *context)
{
return reinterpret_cast<long>(context);
}
// Drop one reference to the context; when the last reference is released,
// tear down every resource acquired in CuContext_new. Safe to call with NULL.
void CuContext_destroy(CuContext *context)
{
// __sync_fetch_and_add returns the PREVIOUS value, so == 1 means
// this caller just released the final reference.
if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1)
{
cublasDestroy(context->cublas_handle);
// fix: the profiling events created in CuContext_new were previously
// leaked here — release them symmetrically with their creation.
cudaEventDestroy(context->profile_start);
cudaEventDestroy(context->profile_stop);
hashmap_clear(context->profile); // NOTE(review): assumes this also frees the map itself — confirm hashmap API
delete context;
context = NULL; // local only; documents that the pointer is dead
}
}
// Create an empty Device record; the caller holds the single initial reference.
Device* Device_new()
{
Device *dev = new Device;
dev->refcount = 1;
return dev;
}
// Re-open an existing Device from its opaque id (the pointer value)
// and take an additional reference on it.
Device* Device_newWithId(long id)
{
Device *dev = (Device*)id;
__sync_add_and_fetch(&dev->refcount, 1); // atomic bump; result unused
return dev;
}
// The device's address doubles as its opaque id (inverse of Device_newWithId).
long Device_id(Device *device)
{
return reinterpret_cast<long>(device);
}
// Drop one reference to the device record and free it once the last
// holder releases it. Safe to call with NULL.
void Device_destroy(Device *device)
{
// fetch-and-add returns the previous count: 1 means we released the last ref
if (device != NULL && __sync_fetch_and_add(&device->refcount, -1) == 1)
delete device;
}
// Query free/total device memory by loading cuMemGetInfo from libcuda.so at
// runtime via dlopen, avoiding a link-time dependency on the driver API.
// Results are written only through non-NULL out-pointers; on failure the
// NERV_EXIT_STATUS macro reports through *status.
void GetFreeMemory(size_t* free, size_t* total, Status *status)
{
// WARNING! the CUDA API is inconsistent across versions!
#if (CUDA_VERSION >= 3020)
//define the function signature type
size_t mem_free, mem_total;
#else
unsigned int mem_free, mem_total;
#endif
{
//we will load the cuMemGetInfo dynamically from libcuda.so
//cuMemGetInfo(&mem_free, &mem_total);
//pre-fill ``safe'' values that will not cause problems
mem_free = 1; mem_total = 1;
//open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
}
else
{
//define the function signature type
//and get the symbol
//(CUDA >= 3.2 renamed the size_t-based entry point to cuMemGetInfo_v2)
#if (CUDA_VERSION >= 3020)
typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2");
#else
typedef CUresult (*cu_fun_ptr)(int*, int*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo");
#endif
if(NULL == dl_cuMemGetInfo)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so");
}
else
{
//call the function
//NOTE(review): its CUresult return value is ignored; on failure the
//pre-filled (1, 1) values above are what callers will observe
dl_cuMemGetInfo(&mem_free, &mem_total);
}
//close the library
dlclose(libcuda);
}
}
// copy the output values outside
if(NULL != free) *free = mem_free;
if(NULL != total) *total = mem_total;
}
// Fetch the human-readable name of CUDA device `dev` into `name` (capacity
// `len`), loading cuDeviceGetName from libcuda.so at runtime so no link-time
// driver dependency is needed. On any failure the buffer holds the fallback
// "Unknown GPU" (possibly truncated) and *status is set via NERV_EXIT_STATUS.
void DeviceGetName(char* name, int len, int dev, Status *status)
{
//nothing to write into — avoid out-of-bounds terminator below
if (len <= 0) return;
//prefill with something reasonable
strncpy(name,"Unknown GPU",len);
//fix: strncpy does not NUL-terminate when the source does not fit
name[len - 1] = '\0';
//open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
}
else
{
//define the function signature type
typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
//get the symbol
cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName");
if(NULL == cuDeviceGetName_ptr) {
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so");
}
else {
//call the function (return value ignored, matching GetFreeMemory)
cuDeviceGetName_ptr(name, len, dev);
//fix: guarantee termination even if the driver truncated the name
name[len - 1] = '\0';
}
//close the library
dlclose(libcuda);
}
}
void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status)
{
int idx = gpu_idx;
int memSize = 64*1024*1024;
float elapsedTimeInMs = 0.0f;
float bandwidthInMBs = 0.0f;
unsigned char *h_idata = NULL;
unsigned char *h_odata = NULL;
cudaEvent_t start, stop;
bool PINNED = true;
int MEMCOPY_ITERATIONS = 5;
CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status);
CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status);
//allocate host memory
if (PINNED)
{
#if CUDART_VERSION >= 2020
CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status);
CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status);
#else
CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status);
CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status);
#endif
}
else
{
//pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize);
if (h_odata == 0)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!");
}
}
//initialize the memory
for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_idata[i] = (unsigned char)(i & 0xff);
}
// allocate device memory
unsigned char *d_idata;
CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status);
//initialize the device memory
CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status);
//copy data from GPU to Host
CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
if (PINNED)
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status);
}
}
else
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status);
}
}
CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
// make sure GPU has finished copying
CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
//get the the total elapsed time in ms
CUDA_SAFE_SYNC_CALL<