#include <vector>
#include <string>
#include <iostream>
#include "../../nerv/lib/matrix/cuda_helper.h"
#include <dlfcn.h>
extern "C" {
#include "Device.h"
#include "stdlib.h"
#include "string.h"
// Per-GPU measurement record gathered during device selection.
// NOTE: field order is part of the layout; do not reorder.
struct GPUInfo
{
int gpuid; // CUDA device index this record describes
float d2h_bandwidth; // measured device-to-host copy bandwidth (presumably MB/s — see GetBandwidth; confirm units)
float h2d_bandwidth; // measured host-to-device copy bandwidth (same units as d2h_bandwidth)
size_t mem_free; // free device memory in bytes at query time
size_t mem_total; // total device memory in bytes
float mem_ratio; // presumably mem_free / mem_total — TODO confirm against the code that fills it
bool used; // whether this GPU has already been claimed
};
// GPU assignment record exchanged between MPI ranks:
// which host/process owns which GPU.
struct MPIGPUInfo
{
char hostname[STRLEN]; // fixed-size host name buffer (STRLEN from Device.h)
int gpuid; // CUDA device index assigned on that host
int myid; // MPI rank of the owning process
};
// Reference-counted container of per-GPU info, shared across threads
// via Device_newWithId / Device_destroy (see atomic refcount ops below).
struct Device
{
std::vector<GPUInfo> gpuinfo_; // one entry per visible GPU
int refcount; // manipulated with __sync_* atomics; freed when it drops to 0
};
///////////////////////////////////////////
extern cublasHandle_t* nerv_get_cublas_handle();
// Allocate a fresh CuContext owned by the calling thread: a cuBLAS handle,
// a pair of CUDA events for profiling, and a profile hashmap. The caller
// holds the initial (single) reference; release with CuContext_destroy.
CuContext* CuContext_new()
{
CuContext *ctx = new CuContext;
ctx->refcount = 1;
ctx->pid = pthread_self(); // remember the creating thread
cublasCreate(&ctx->cublas_handle);
cudaEventCreate(&ctx->profile_start);
cudaEventCreate(&ctx->profile_stop);
ctx->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
return ctx;
}
// Re-open an existing context from its opaque id (the pointer value)
// and take an additional reference on it.
CuContext* CuContext_newWithId(long id)
{
CuContext *ctx = (CuContext*)id;
__sync_add_and_fetch(&ctx->refcount, 1); // atomic bump; result unused
return ctx;
}
// The context's address doubles as its opaque id (inverse of CuContext_newWithId).
long CuContext_id(CuContext *context)
{
return reinterpret_cast<long>(context);
}
// Drop one reference to the context; when the last reference is released,
// tear down every resource acquired in CuContext_new. Safe to call with NULL.
void CuContext_destroy(CuContext *context)
{
// __sync_fetch_and_add returns the PREVIOUS value, so == 1 means
// this caller just released the final reference.
if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1)
{
cublasDestroy(context->cublas_handle);
// fix: the profiling events created in CuContext_new were previously
// leaked here — release them symmetrically with their creation.
cudaEventDestroy(context->profile_start);
cudaEventDestroy(context->profile_stop);
hashmap_clear(context->profile); // NOTE(review): assumes this also frees the map itself — confirm hashmap API
delete context;
context = NULL; // local only; documents that the pointer is dead
}
}
// Create an empty Device record; the caller holds the single initial reference.
Device* Device_new()
{
Device *dev = new Device;
dev->refcount = 1;
return dev;
}
// Re-open an existing Device from its opaque id (the pointer value)
// and take an additional reference on it.
Device* Device_newWithId(long id)
{
Device *dev = (Device*)id;
__sync_add_and_fetch(&dev->refcount, 1); // atomic bump; result unused
return dev;
}
// The device's address doubles as its opaque id (inverse of Device_newWithId).
long Device_id(Device *device)
{
return reinterpret_cast<long>(device);
}
// Drop one reference to the device record and free it once the last
// holder releases it. Safe to call with NULL.
void Device_destroy(Device *device)
{
// fetch-and-add returns the previous count: 1 means we released the last ref
if (device != NULL && __sync_fetch_and_add(&device->refcount, -1) == 1)
delete device;
}
// Query free/total device memory by loading cuMemGetInfo from libcuda.so at
// runtime via dlopen, avoiding a link-time dependency on the driver API.
// Results are written only through non-NULL out-pointers; on failure the
// NERV_EXIT_STATUS macro reports through *status.
void GetFreeMemory(size_t* free, size_t* total, Status *status)
{
// WARNING! the CUDA API is inconsistent across versions!
#if (CUDA_VERSION >= 3020)
//define the function signature type
size_t mem_free, mem_total;
#else
unsigned int mem_free, mem_total;
#endif
{
//we will load the cuMemGetInfo dynamically from libcuda.so
//cuMemGetInfo(&mem_free, &mem_total);
//pre-fill ``safe'' values that will not cause problems
mem_free = 1; mem_total = 1;
//open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
}
else
{
//define the function signature type
//and get the symbol
//(CUDA >= 3.2 renamed the size_t-based entry point to cuMemGetInfo_v2)
#if (CUDA_VERSION >= 3020)
typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2");
#else
typedef CUresult (*cu_fun_ptr)(int*, int*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo");
#endif
if(NULL == dl_cuMemGetInfo)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so");
}
else
{
//call the function
//NOTE(review): its CUresult return value is ignored; on failure the
//pre-filled (1, 1) values above are what callers will observe
dl_cuMemGetInfo(&mem_free, &mem_total);
}
//close the library
dlclose(libcuda);
}
}
// copy the output values outside
if(NULL != free) *free = mem_free;
if(NULL != total) *total = mem_total;
}
// Fetch the human-readable name of CUDA device `dev` into `name` (capacity
// `len`), loading cuDeviceGetName from libcuda.so at runtime so no link-time
// driver dependency is needed. On any failure the buffer holds the fallback
// "Unknown GPU" (possibly truncated) and *status is set via NERV_EXIT_STATUS.
void DeviceGetName(char* name, int len, int dev, Status *status)
{
//nothing to write into — avoid out-of-bounds terminator below
if (len <= 0) return;
//prefill with something reasonable
strncpy(name,"Unknown GPU",len);
//fix: strncpy does not NUL-terminate when the source does not fit
name[len - 1] = '\0';
//open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
}
else
{
//define the function signature type
typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
//get the symbol
cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName");
if(NULL == cuDeviceGetName_ptr) {
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so");
}
else {
//call the function (return value ignored, matching GetFreeMemory)
cuDeviceGetName_ptr(name, len, dev);
//fix: guarantee termination even if the driver truncated the name
name[len - 1] = '\0';
}
//close the library
dlclose(libcuda);
}
}
void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status)
{
int idx = gpu_idx;
int memSize = 64*1024*1024;
float elapsedTimeInMs = 0.0f;
float bandwidthInMBs = 0.0f;
unsigned char *h_idata = NULL;
unsigned char *h_odata = NULL;
cudaEvent_t start, stop;
bool PINNED = true;
int MEMCOPY_ITERATIONS = 5;
CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status);
CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status);
//allocate host memory
if (PINNED)
{
#if CUDART_VERSION >= 2020
CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status);
CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status);
#else
CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status);
CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status);
#endif
}
else
{
//pageable memory mode - use malloc
h_odata = (unsigned char *)malloc(memSize);
if (h_odata == 0)
{
NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!");
}
}
//initialize the memory
for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
{
h_idata[i] = (unsigned char)(i & 0xff);
}
// allocate device memory
unsigned char *d_idata;
CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status);
//initialize the device memory
CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status);
//copy data from GPU to Host
CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
if (PINNED)
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status);
}
}
else
{
for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
{
CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status);
}
}
CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
// make sure GPU has finished copying
CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
//get the the total elapsed time in ms
CUDA_SAFE_SYNC_CALL<