path: root/fastnn/device/Device.cpp



#include <vector>
#include <string>
#include <iostream>
#include "../../nerv/lib/matrix/cuda_helper.h"
#include <dlfcn.h>


extern "C" {


#include "Device.h"
#include "stdlib.h"
#include "string.h"


struct GPUInfo 
{
        int     gpuid;
        float   d2h_bandwidth;
        float   h2d_bandwidth;
        size_t  mem_free;
        size_t  mem_total;
        float   mem_ratio;
        bool    used;
};

struct MPIGPUInfo 
{
        char    hostname[STRLEN];
        int     gpuid;
        int     myid;
};

struct Device
{
        std::vector<GPUInfo> gpuinfo_;
	int	refcount;
};

///////////////////////////////////////////

extern cublasHandle_t* nerv_get_cublas_handle();

CuContext* CuContext_new()
{
	CuContext *context = new CuContext;
	cublasCreate(&context->cublas_handle);
	cudaEventCreate(&context->profile_start);
    	cudaEventCreate(&context->profile_stop);
	context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
	context->refcount = 1;
	context->pid = pthread_self();
	return context;
}

CuContext* CuContext_newWithId(long id)
{
	CuContext *context = (CuContext*)id;
	__sync_fetch_and_add(&context->refcount, 1);
	return context;
}

long	CuContext_id(CuContext *context)
{
	return (long)context;
}

void CuContext_destroy(CuContext *context)
{
	if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1)
        {   
		cublasDestroy(context->cublas_handle);
		hashmap_clear(context->profile);
                delete context;
                context = NULL;
        }   	
}

Device* Device_new()
{
	Device* device = new Device;
	device->refcount = 1;
	return device;
}

Device* Device_newWithId(long id)
{
	Device *device = (Device*)id;
	__sync_fetch_and_add(&device->refcount, 1);	
	return device;
}

long    Device_id(Device *device)
{
	return (long)device;
}

void	Device_destroy(Device *device)
{
	if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1)
	{
		delete device;
		device = NULL;
	}
}

void GetFreeMemory(size_t* free, size_t* total, Status *status) 
{
	// WARNING! the CUDA API is inconsistent accross versions!
	#if (CUDA_VERSION >= 3020)
  	//define the function signature type
  	size_t mem_free, mem_total;
#else
  	unsigned int mem_free, mem_total;
#endif
  { 
    	//we will load the cuMemGetInfo dynamically from libcuda.so
    	//cuMemGetInfo(&mem_free, &mem_total);
    	//pre-fill ``safe'' values that will not cause problems
    	mem_free = 1; mem_total = 1;
    	//open libcuda.so
    	void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
    	if(NULL == libcuda) 
	{ 
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
    	} 
	else 
	{
      		//define the function signature type
      		//and get the symbol
#if (CUDA_VERSION >= 3020)
      		typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
      		cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); 
#else
      		typedef CUresult (*cu_fun_ptr)(int*, int*);
      		cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); 
#endif
      		if(NULL == dl_cuMemGetInfo) 
		{
			NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so");
      		} 
		else 
		{
        		//call the function
        		dl_cuMemGetInfo(&mem_free, &mem_total);
      		}
      		//close the library
      		dlclose(libcuda);
    	}
  }
  	// copy the output values outside
  	if(NULL != free) *free = mem_free;
  	if(NULL != total) *total = mem_total;
}

void DeviceGetName(char* name, int len, int dev, Status *status) 
{
  	//prefill with something reasonable
  	strncpy(name,"Unknown GPU",len);
  	//open libcuda.so
  	void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
  	if(NULL == libcuda) 
	{
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
  	} 
	else 
	{
    		//define the function signature type
    		typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
    		//get the symbol
    		cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); 
    		if(NULL == cuDeviceGetName_ptr) {
			NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so");
    		} 
		else {
      			//call the function
      			cuDeviceGetName_ptr(name, len, dev);
    		}
    		//close the library
    		dlclose(libcuda);
  	}
}

void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status)
{
	  int idx = gpu_idx;
	  int memSize = 64*1024*1024;
	  		float elapsedTimeInMs = 0.0f;
	  	    float bandwidthInMBs = 0.0f;
	  	    unsigned char *h_idata = NULL;
	  	    unsigned char *h_odata = NULL;
	  	    cudaEvent_t start, stop;
	  	    bool PINNED = true;
	  	    int MEMCOPY_ITERATIONS = 5;

	  	  CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status);
	  	  CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status);

	  	    //allocate host memory
	  	    if (PINNED)
	  	        {
	  	#if CUDART_VERSION >= 2020
	  	    	CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status);
	  	    	CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status);
	  	#else
	  	    	CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status);
	  	    	CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status);
	  	#endif
	  	        }
	  	        else
	  	        {
	  	                //pageable memory mode - use malloc
	  	            h_odata = (unsigned char *)malloc(memSize);

	  	            if (h_odata == 0)
	  	            {
				NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!");
	  	            }
	  	         }

	  	    //initialize the memory
	  	    for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
	  	    {
	  	        h_idata[i] = (unsigned char)(i & 0xff);
	  	    }

	  	    // allocate device memory
	  	    unsigned char *d_idata;
	  	    CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status);

	  	    //initialize the device memory
	  	    CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status);

	  	    //copy data from GPU to Host

	  	    CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);

	  	    if (PINNED)
	  	    {
	  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
	  	        {
	  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status);
	  	        }
	  	    }
	  	    else
	  	    {
	  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
	  	        {
	  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status);
	  	        }
	  	    }

	  	  CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);

	  	    // make sure GPU has finished copying
	  	CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
	  	    //get the the total elapsed time in ms

	  	CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);


	  	//calculate bandwidth in MB/s
	  	bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
	  	*d2h = bandwidthInMBs;

	  	/////////////////////////////////////////////////////
	  	//copy data from Host to GPU
	  	CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);

	  		  	    if (PINNED)
	  		  	    {
	  		  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
	  		  	        {
	  		  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status);
	  		  	        }
	  		  	    }
	  		  	    else
	  		  	    {
	  		  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
	  		  	        {
	  		  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status);
	  		  	        }
	  		  	    }

	  		  	  CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);

	  		  	    // make sure GPU has finished copying
	  		  	CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
	  		  	    //get the the total elapsed time in ms

	  		  	CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);


	  		  	//calculate bandwidth in MB/s
	  	bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
	  	*h2d = bandwidthInMBs;

	  	//clean up memory
	  	CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status);
	  	CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status);

	  	    if (PINNED)
	  	    {
	  	    	CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status);
	  	    	CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status);
	  	    }
	  	    else
	  	    {
	  	        free(h_idata);
	  	        free(h_odata);
	  	    }

	  	  CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status);
}

int AutoSelectGPU(Device *device, Status *status)
{
	//GPU selection is based on largest proportion of free memory.

	int max_id = 0, n_gpu = 0;
	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
	
	cudaGetDeviceCount(&n_gpu);

	if(n_gpu == 0) 
	{
		NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");	
		return -1;
	}

	if (n_gpu > 0)
	{
		for (int i = 0; i < gpuinfo_.size(); i++)
		{
			if (!gpuinfo_[i].used)
			{
				max_id = i;
				break;
			}
		}
		//find GPU with max free memory
		for (int n = 1; n < gpuinfo_.size(); n++)
		{
			if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio)
				max_id = n;
		}


        std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl;

        std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, "
                  << "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, "
                  << "ratio: "<< gpuinfo_[max_id].mem_ratio << "   "
        	  << "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, "
                  << "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl;

		cudaSetDevice(max_id);
		//initialize the CUBLAS
		//cublasInit();
		//cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
		//std::cerr << "cublasHandle_t: " << cublas_handle << std::endl;
       	 	//cublasCreate(cublas_handle);

		//create the context
		cudaError_t e;
		e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
		if(e != cudaSuccess) {
			std::cerr << "Failed to create CUDA context on a GPU." << std::endl;
		}
	}

	gpuinfo_[max_id].used = true;
	NERV_SET_STATUS(status, NERV_NORMAL, 0);
	return max_id;
}

void SelectGPU(Device *device, int gpu_id, Status *status)
{

	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
	int n_gpu = 0;
	cudaGetDeviceCount(&n_gpu);
	if(gpu_id >= n_gpu) 
	{
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!");
    		std::cerr << "Cannot select GPU " << gpu_id
              		<< ", detected " << n_gpu << " CUDA capable cards!" << std::endl;
	}


    	std::cerr << "Selected device: " << gpu_id << " (manually)";

    	std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, "
              << "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, "
              << "ratio: "<< gpuinfo_[gpu_id].mem_ratio << "   "
    	      << "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, "
              << "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl;

	CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status);
	//initialize the CUBLAS
	//CUBLAS_SAFE_SYNC_CALL(cublasInit(), status);
	//cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
	//cublasCreate(cublas_handle);

	//create the context
	cudaError_t e;
	e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
	if(e != cudaSuccess) {
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU.");
		std::cerr << "Failed to create CUDA context on a GPU.";
	}

	gpuinfo_[gpu_id].used = true;

	NERV_SET_STATUS(status, NERV_NORMAL, 0);
}

void Initialize(Device *device, Status *status)
{
	// Check that we have at least one gpu
	int n_gpu = 0;
	cudaGetDeviceCount(&n_gpu);
	if(n_gpu == 0) 
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");

	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
	gpuinfo_.resize(n_gpu);

	std::cerr << "gpu information ..." << std::endl;

	// Get ratios of memory use, if possible
	for(int n = 0; n < n_gpu; n++) 
	{
		int ret = cudaSetDevice(n);
	    	switch(ret) 
		{
	      	case cudaSuccess : {
	        //create the CUDA context for the thread
	        //cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
		cudaDeviceSynchronize();
	        //get GPU name
	        char name[STRLEN];
	        DeviceGetName(name,STRLEN,n, status);
	        //get GPU memory stats
	        size_t free, total;
		float d2h, h2d;

	        GetFreeMemory(&free, &total, status);
	        GetBandwidth(n, &d2h, &h2d, status);

	        gpuinfo_[n].gpuid = n;
	        gpuinfo_[n].d2h_bandwidth = d2h;
	        gpuinfo_[n].h2d_bandwidth = h2d;
	        gpuinfo_[n].mem_free = free;
	        gpuinfo_[n].mem_total = total;
	        gpuinfo_[n].mem_ratio = free/(float)total;
	        gpuinfo_[n].used = false;

		std::cerr << "gpu: "  << n << " ==> "
	        	  << "free: " << free/1024/1024 << "M, "
	                  << "total: "<< total/1024/1024 << "M, "
	                  << "ratio: "<< free/(float)total << "   "
	        	  << "d2h bandwidth: " << d2h << "MB/s, "
	                  << "h2d bandwidth: "<< h2d << "MB/s" << std::endl;

	        //destroy the CUDA context for the thread
	        //cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset
	      	} break;

	#if (CUDA_VERSION > 3020)
	      case cudaErrorDeviceAlreadyInUse :
	        std::cerr << "cudaSetDevice(" << n << "): "
	                  << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl;
	        break;
	#endif
	      case cudaErrorInvalidDevice :
	        std::cerr << "cudaSetDevice(" << n << "): "
	                  << "Device cannot be accessed, not a VALID CUDA device!" << std::endl;
	        break;
	      default :
	        std::cerr << "cudaSetDevice(" << n << "): "
	                  << "returned " << ret << ", "
	                  << cudaGetErrorString((cudaError_t)ret) << std::endl;
	    	}
	}
	cudaDeviceReset();
	printf("Result = PASS\n");
	NERV_SET_STATUS(status, NERV_NORMAL, 0);
}


} // extern
#include <vector>
#include <string>
#include <iostream>
#include "../../nerv/lib/matrix/cuda_helper.h"
#include <dlfcn.h>


extern "C" {


#include "Device.h"
#include "stdlib.h"
#include "string.h"


struct GPUInfo 
{
        int     gpuid;
        float   d2h_bandwidth;
        float   h2d_bandwidth;
        size_t  mem_free;
        size_t  mem_total;
        float   mem_ratio;
        bool    used;
};

struct MPIGPUInfo 
{
        char    hostname[STRLEN];
        int     gpuid;
        int     myid;
};

struct Device
{
        std::vector<GPUInfo> gpuinfo_;
	int	refcount;
};

///////////////////////////////////////////

extern cublasHandle_t* nerv_get_cublas_handle();

CuContext* CuContext_new()
{
	CuContext *context = new CuContext;
	cublasCreate(&context->cublas_handle);
	cudaEventCreate(&context->profile_start);
    	cudaEventCreate(&context->profile_stop);
	context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
	context->refcount = 1;
	context->pid = pthread_self();
	return context;
}

CuContext* CuContext_newWithId(long id)
{
	CuContext *context = (CuContext*)id;
	__sync_fetch_and_add(&context->refcount, 1);
	return context;
}

long	CuContext_id(CuContext *context)
{
	return (long)context;
}

void CuContext_destroy(CuContext *context)
{
	if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1)
        {   
		cublasDestroy(context->cublas_handle);
		hashmap_clear(context->profile);
                delete context;
                context = NULL;
        }   	
}

Device* Device_new()
{
	Device* device = new Device;
	device->refcount = 1;
	return device;
}

Device* Device_newWithId(long id)
{
	Device *device = (Device*)id;
	__sync_fetch_and_add(&device->refcount, 1);	
	return device;
}

long    Device_id(Device *device)
{
	return (long)device;
}

void	Device_destroy(Device *device)
{
	if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1)
	{
		delete device;
		device = NULL;
	}
}

void GetFreeMemory(size_t* free, size_t* total, Status *status) 
{
	// WARNING! the CUDA API is inconsistent accross versions!
	#if (CUDA_VERSION >= 3020)
  	//define the function signature type
  	size_t mem_free, mem_total;
#else
  	unsigned int mem_free, mem_total;
#endif
  { 
    	//we will load the cuMemGetInfo dynamically from libcuda.so
    	//cuMemGetInfo(&mem_free, &mem_total);
    	//pre-fill ``safe'' values that will not cause problems
    	mem_free = 1; mem_total = 1;
    	//open libcuda.so
    	void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
    	if(NULL == libcuda) 
	{ 
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
    	} 
	else 
	{
      		//define the function signature type
      		//and get the symbol
#if (CUDA_VERSION >= 3020)
      		typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
      		cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); 
#else
      		typedef CUresult (*cu_fun_ptr)(int*, int*);
      		cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); 
#endif
      		if(NULL == dl_cuMemGetInfo) 
		{
			NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so");
      		} 
		else 
		{
        		//call the function
        		dl_cuMemGetInfo(&mem_free, &mem_total);
      		}
      		//close the library
      		dlclose(libcuda);
    	}
  }
  	// copy the output values outside
  	if(NULL != free) *free = mem_free;
  	if(NULL != total) *total = mem_total;
}

void DeviceGetName(char* name, int len, int dev, Status *status) 
{
  	//prefill with something reasonable
  	strncpy(name,"Unknown GPU",len);
  	//open libcuda.so
  	void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
  	if(NULL == libcuda) 
	{
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
  	} 
	else 
	{
    		//define the function signature type
    		typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
    		//get the symbol
    		cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); 
    		if(NULL == cuDeviceGetName_ptr) {
			NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so");
    		} 
		else {
      			//call the function
      			cuDeviceGetName_ptr(name, len, dev);
    		}
    		//close the library
    		dlclose(libcuda);
  	}
}

void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status)
{
	  int idx = gpu_idx;
	  int memSize = 64*1024*1024;
	  		float elapsedTimeInMs = 0.0f;
	  	    float bandwidthInMBs = 0.0f;
	  	    unsigned char *h_idata = NULL;
	  	    unsigned char *h_odata = NULL;
	  	    cudaEvent_t start, stop;
	  	    bool PINNED = true;
	  	    int MEMCOPY_ITERATIONS = 5;

	  	  CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status);
	  	  CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status);

	  	    //allocate host memory
	  	    if (PINNED)
	  	        {
	  	#if CUDART_VERSION >= 2020
	  	    	CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status);
	  	    	CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status);
	  	#else
	  	    	CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status);
	  	    	CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status);
	  	#endif
	  	        }
	  	        else
	  	        {
	  	                //pageable memory mode - use malloc
	  	            h_odata = (unsigned char *)malloc(memSize);

	  	            if (h_odata == 0)
	  	            {
				NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!");
	  	            }
	  	         }

	  	    //initialize the memory
	  	    for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
	  	    {
	  	        h_idata[i] = (unsigned char)(i & 0xff);
	  	    }

	  	    // allocate device memory
	  	    unsigned char *d_idata;
	  	    CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status);

	  	    //initialize the device memory
	  	    CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status);

	  	    //copy data from GPU to Host

	  	    CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);

	  	    if (PINNED)
	  	    {
	  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
	  	        {
	  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status);
	  	        }
	  	    }
	  	    else
	  	    {
	  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
	  	        {
	  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status);
	  	        }
	  	    }

	  	  CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);

	  	    // make sure GPU has finished copying
	  	CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
	  	    //get the the total elapsed time in ms

	  	CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);


	  	//calculate bandwidth in MB/s
	  	bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
	  	*d2h = bandwidthInMBs;

	  	/////////////////////////////////////////////////////
	  	//copy data from Host to GPU
	  	CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);

	  		  	    if (PINNED)
	  		  	    {
	  		  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
	  		  	        {
	  		  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status);
	  		  	        }
	  		  	    }
	  		  	    else
	  		  	    {
	  		  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
	  		  	        {
	  		  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status);
	  		  	        }
	  		  	    }

	  		  	  CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);

	  		  	    // make sure GPU has finished copying
	  		  	CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
	  		  	    //get the the total elapsed time in ms

	  		  	CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);


	  		  	//calculate bandwidth in MB/s
	  	bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
	  	*h2d = bandwidthInMBs;

	  	//clean up memory
	  	CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status);
	  	CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status);

	  	    if (PINNED)
	  	    {
	  	    	CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status);
	  	    	CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status);
	  	    }
	  	    else
	  	    {
	  	        free(h_idata);
	  	        free(h_odata);
	  	    }

	  	  CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status);
}

int AutoSelectGPU(Device *device, Status *status)
{
	//GPU selection is based on largest proportion of free memory.

	int max_id = 0, n_gpu = 0;
	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
	
	cudaGetDeviceCount(&n_gpu);

	if(n_gpu == 0) 
	{
		NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");	
		return -1;
	}

	if (n_gpu > 0)
	{
		for (int i = 0; i < gpuinfo_.size(); i++)
		{
			if (!gpuinfo_[i].used)
			{
				max_id = i;
				break;
			}
		}
		//find GPU with max free memory
		for (int n = 1; n < gpuinfo_.size(); n++)
		{
			if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio)
				max_id = n;
		}


        std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl;

        std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, "
                  << "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, "
                  << "ratio: "<< gpuinfo_[max_id].mem_ratio << "   "
        	  << "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, "
                  << "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl;

		cudaSetDevice(max_id);
		//initialize the CUBLAS
		//cublasInit();
		//cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
		//std::cerr << "cublasHandle_t: " << cublas_handle << std::endl;
       	 	//cublasCreate(cublas_handle);

		//create the context
		cudaError_t e;
		e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
		if(e != cudaSuccess) {
			std::cerr << "Failed to create CUDA context on a GPU." << std::endl;
		}
	}

	gpuinfo_[max_id].used = true;
	NERV_SET_STATUS(status, NERV_NORMAL, 0);
	return max_id;
}

void SelectGPU(Device *device, int gpu_id, Status *status)
{

	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
	int n_gpu = 0;
	cudaGetDeviceCount(&n_gpu);
	if(gpu_id >= n_gpu) 
	{
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!");
    		std::cerr << "Cannot select GPU " << gpu_id
              		<< ", detected " << n_gpu << " CUDA capable cards!" << std::endl;
	}


    	std::cerr << "Selected device: " << gpu_id << " (manually)";

    	std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, "
              << "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, "
              << "ratio: "<< gpuinfo_[gpu_id].mem_ratio << "   "
    	      << "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, "
              << "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl;

	CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status);
	//initialize the CUBLAS
	//CUBLAS_SAFE_SYNC_CALL(cublasInit(), status);
	//cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
	//cublasCreate(cublas_handle);

	//create the context
	cudaError_t e;
	e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
	if(e != cudaSuccess) {
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU.");
		std::cerr << "Failed to create CUDA context on a GPU.";
	}

	gpuinfo_[gpu_id].used = true;

	NERV_SET_STATUS(status, NERV_NORMAL, 0);
}

void Initialize(Device *device, Status *status)
{
	// Check that we have at least one gpu
	int n_gpu = 0;
	cudaGetDeviceCount(&n_gpu);
	if(n_gpu == 0) 
		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");

	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
	gpuinfo_.resize(n_gpu);

	std::cerr << "gpu information ..." << std::endl;

	// Get ratios of memory use, if possible
	for(int n = 0; n < n_gpu; n++) 
	{
		int ret = cudaSetDevice(n);
	    	switch(ret) 
		{
	      	case cudaSuccess : {
	        //create the CUDA context for the thread
	        //cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
		cudaDeviceSynchronize();
	        //get GPU name
	        char name[STRLEN];
	        DeviceGetName(name,STRLEN,n, status);
	        //get GPU memory stats
	        size_t free, total;
		float d2h, h2d;

	        GetFreeMemory(&free, &total, status);
	        GetBandwidth(n, &d2h, &h2d, status);

	        gpuinfo_[n].gpuid = n;
	        gpuinfo_[n].d2h_bandwidth = d2h;
	        gpuinfo_[n].h2d_bandwidth = h2d;
	        gpuinfo_[n].mem_free = free;
	        gpuinfo_[n].mem_total = total;
	        gpuinfo_[n].mem_ratio = free/(float)total;
	        gpuinfo_[n].used = false;

		std::cerr << "gpu: "  << n << " ==> "
	        	  << "free: " << free/1024/1024 << "M, "
	                  << "total: "<< total/1024/1024 << "M, "
	                  << "ratio: "<< free/(float)total << "   "
	        	  << "d2h bandwidth: " << d2h << "MB/s, "
	                  << "h2d bandwidth: "<< h2d << "MB/s" << std::endl;

	        //destroy the CUDA context for the thread
	        //cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset
	      	} break;

	#if (CUDA_VERSION > 3020)
	      case cudaErrorDeviceAlreadyInUse :
	        std::cerr << "cudaSetDevice(" << n << "): "
	                  << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl;
	        break;
	#endif
	      case cudaErrorInvalidDevice :
	        std::cerr << "cudaSetDevice(" << n << "): "
	                  << "Device cannot be accessed, not a VALID CUDA device!" << std::endl;
	        break;
	      default :
	        std::cerr << "cudaSetDevice(" << n << "): "
	                  << "returned " << ret << ", "
	                  << cudaGetErrorString((cudaError_t)ret) << std::endl;
	    	}
	}
	cudaDeviceReset();
	printf("Result = PASS\n");
	NERV_SET_STATUS(status, NERV_NORMAL, 0);
}


} // extern