4 files changed, 731 insertions, 0 deletions
diff --git a/fastnn/device/Device.cpp b/fastnn/device/Device.cpp
new file mode 100644
index 0000000..3b69086
--- /dev/null
+++ b/fastnn/device/Device.cpp
@@ -0,0 +1,496 @@
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include "../../nerv/lib/matrix/cuda_helper.h"
+#include <dlfcn.h>
+
+
+extern "C" {
+
+
+#include "Device.h"
+#include "stdlib.h"
+#include "string.h"
+
+
+struct GPUInfo 
+{
+        int     gpuid;
+        float   d2h_bandwidth;
+        float   h2d_bandwidth;
+        size_t  mem_free;
+        size_t  mem_total;
+        float   mem_ratio;
+        bool    used;
+};
+
+struct MPIGPUInfo 
+{
+        char    hostname[STRLEN];
+        int     gpuid;
+        int     myid;
+};
+
+struct Device
+{
+        std::vector<GPUInfo> gpuinfo_;
+	int	refcount;
+};
+
+///////////////////////////////////////////
+
+extern cublasHandle_t* nerv_get_cublas_handle();
+
+CuContext* CuContext_new()
+{
+	CuContext *context = new CuContext;
+	cublasCreate(&context->cublas_handle);
+	cudaEventCreate(&context->profile_start);
+    	cudaEventCreate(&context->profile_stop);
+	context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
+	context->refcount = 1;
+	context->pid = pthread_self();
+	return context;
+}
+
+CuContext* CuContext_newWithId(long id)
+{
+	CuContext *context = (CuContext*)id;
+	__sync_fetch_and_add(&context->refcount, 1);
+	return context;
+}
+
+long	CuContext_id(CuContext *context)
+{
+	return (long)context;
+}
+
+void CuContext_destroy(CuContext *context)
+{
+	if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1)
+        {   
+		cublasDestroy(context->cublas_handle);
+		hashmap_clear(context->profile);
+                delete context;
+                context = NULL;
+        }   	
+}
+
+Device* Device_new()
+{
+	Device* device = new Device;
+	device->refcount = 1;
+	return device;
+}
+
+Device* Device_newWithId(long id)
+{
+	Device *device = (Device*)id;
+	__sync_fetch_and_add(&device->refcount, 1);	
+	return device;
+}
+
+long    Device_id(Device *device)
+{
+	return (long)device;
+}
+
+void	Device_destroy(Device *device)
+{
+	if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1)
+	{
+		delete device;
+		device = NULL;
+	}
+}
+
+void GetFreeMemory(size_t* free, size_t* total, Status *status) 
+{
+	// WARNING! the CUDA API is inconsistent accross versions!
+	#if (CUDA_VERSION >= 3020)
+  	//define the function signature type
+  	size_t mem_free, mem_total;
+#else
+  	unsigned int mem_free, mem_total;
+#endif
+  { 
+    	//we will load the cuMemGetInfo dynamically from libcuda.so
+    	//cuMemGetInfo(&mem_free, &mem_total);
+    	//pre-fill ``safe'' values that will not cause problems
+    	mem_free = 1; mem_total = 1;
+    	//open libcuda.so
+    	void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
+    	if(NULL == libcuda) 
+	{ 
+		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
+    	} 
+	else 
+	{
+      		//define the function signature type
+      		//and get the symbol
+#if (CUDA_VERSION >= 3020)
+      		typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
+      		cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); 
+#else
+      		typedef CUresult (*cu_fun_ptr)(int*, int*);
+      		cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); 
+#endif
+      		if(NULL == dl_cuMemGetInfo) 
+		{
+			NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so");
+      		} 
+		else 
+		{
+        		//call the function
+        		dl_cuMemGetInfo(&mem_free, &mem_total);
+      		}
+      		//close the library
+      		dlclose(libcuda);
+    	}
+  }
+  	// copy the output values outside
+  	if(NULL != free) *free = mem_free;
+  	if(NULL != total) *total = mem_total;
+}
+
+void DeviceGetName(char* name, int len, int dev, Status *status) 
+{
+  	//prefill with something reasonable
+  	strncpy(name,"Unknown GPU",len);
+  	//open libcuda.so
+  	void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
+  	if(NULL == libcuda) 
+	{
+		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
+  	} 
+	else 
+	{
+    		//define the function signature type
+    		typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
+    		//get the symbol
+    		cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); 
+    		if(NULL == cuDeviceGetName_ptr) {
+			NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so");
+    		} 
+		else {
+      			//call the function
+      			cuDeviceGetName_ptr(name, len, dev);
+    		}
+    		//close the library
+    		dlclose(libcuda);
+  	}
+}
+
+void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status)
+{
+	  int idx = gpu_idx;
+	  int memSize = 64*1024*1024;
+	  		float elapsedTimeInMs = 0.0f;
+	  	    float bandwidthInMBs = 0.0f;
+	  	    unsigned char *h_idata = NULL;
+	  	    unsigned char *h_odata = NULL;
+	  	    cudaEvent_t start, stop;
+	  	    bool PINNED = true;
+	  	    int MEMCOPY_ITERATIONS = 5;
+
+	  	  CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status);
+	  	  CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status);
+
+	  	    //allocate host memory
+	  	    if (PINNED)
+	  	        {
+	  	#if CUDART_VERSION >= 2020
+	  	    	CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status);
+	  	    	CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status);
+	  	#else
+	  	    	CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status);
+	  	    	CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status);
+	  	#endif
+	  	        }
+	  	        else
+	  	        {
+	  	                //pageable memory mode - use malloc
+	  	            h_odata = (unsigned char *)malloc(memSize);
+
+	  	            if (h_odata == 0)
+	  	            {
+				NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!");
+	  	            }
+	  	         }
+
+	  	    //initialize the memory
+	  	    for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
+	  	    {
+	  	        h_idata[i] = (unsigned char)(i & 0xff);
+	  	    }
+
+	  	    // allocate device memory
+	  	    unsigned char *d_idata;
+	  	    CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status);
+
+	  	    //initialize the device memory
+	  	    CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status);
+
+	  	    //copy data from GPU to Host
+
+	  	    CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
+
+	  	    if (PINNED)
+	  	    {
+	  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+	  	        {
+	  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status);
+	  	        }
+	  	    }
+	  	    else
+	  	    {
+	  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+	  	        {
+	  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status);
+	  	        }
+	  	    }
+
+	  	  CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
+
+	  	    // make sure GPU has finished copying
+	  	CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
+	  	    //get the the total elapsed time in ms
+
+	  	CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);
+
+
+	  	//calculate bandwidth in MB/s
+	  	bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
+	  	*d2h = bandwidthInMBs;
+
+	  	/////////////////////////////////////////////////////
+	  	//copy data from Host to GPU
+	  	CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
+
+	  		  	    if (PINNED)
+	  		  	    {
+	  		  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+	  		  	        {
+	  		  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status);
+	  		  	        }
+	  		  	    }
+	  		  	    else
+	  		  	    {
+	  		  	        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+	  		  	        {
+	  		  	        	CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status);
+	  		  	        }
+	  		  	    }
+
+	  		  	  CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
+
+	  		  	    // make sure GPU has finished copying
+	  		  	CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
+	  		  	    //get the the total elapsed time in ms
+
+	  		  	CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);
+
+
+	  		  	//calculate bandwidth in MB/s
+	  	bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
+	  	*h2d = bandwidthInMBs;
+
+	  	//clean up memory
+	  	CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status);
+	  	CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status);
+
+	  	    if (PINNED)
+	  	    {
+	  	    	CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status);
+	  	    	CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status);
+	  	    }
+	  	    else
+	  	    {
+	  	        free(h_idata);
+	  	        free(h_odata);
+	  	    }
+
+	  	  CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status);
+}
+
+int AutoSelectGPU(Device *device, Status *status)
+{
+	//GPU selection is based on largest proportion of free memory.
+
+	int max_id = 0, n_gpu = 0;
+	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+	
+	cudaGetDeviceCount(&n_gpu);
+
+	if(n_gpu == 0) 
+	{
+		NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");	
+		return -1;
+	}
+
+	if (n_gpu > 0)
+	{
+		for (int i = 0; i < gpuinfo_.size(); i++)
+		{
+			if (!gpuinfo_[i].used)
+			{
+				max_id = i;
+				break;
+			}
+		}
+		//find GPU with max free memory
+		for (int n = 1; n < gpuinfo_.size(); n++)
+		{
+			if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio)
+				max_id = n;
+		}
+
+
+        std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl;
+
+        std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, "
+                  << "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, "
+                  << "ratio: "<< gpuinfo_[max_id].mem_ratio << "   "
+        	  << "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, "
+                  << "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl;
+
+		cudaSetDevice(max_id);
+		//initialize the CUBLAS
+		//cublasInit();
+		//cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
+		//std::cerr << "cublasHandle_t: " << cublas_handle << std::endl;
+       	 	//cublasCreate(cublas_handle);
+
+		//create the context
+		cudaError_t e;
+		e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+		if(e != cudaSuccess) {
+			std::cerr << "Failed to create CUDA context on a GPU." << std::endl;
+		}
+	}
+
+	gpuinfo_[max_id].used = true;
+	NERV_SET_STATUS(status, NERV_NORMAL, 0);
+	return max_id;
+}
+
+void SelectGPU(Device *device, int gpu_id, Status *status)
+{
+
+	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+	int n_gpu = 0;
+	cudaGetDeviceCount(&n_gpu);
+	if(gpu_id >= n_gpu) 
+	{
+		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!");
+    		std::cerr << "Cannot select GPU " << gpu_id
+              		<< ", detected " << n_gpu << " CUDA capable cards!" << std::endl;
+	}
+
+
+    	std::cerr << "Selected device: " << gpu_id << " (manually)";
+
+    	std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, "
+              << "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, "
+              << "ratio: "<< gpuinfo_[gpu_id].mem_ratio << "   "
+    	      << "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, "
+              << "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl;
+
+	CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status);
+	//initialize the CUBLAS
+	//CUBLAS_SAFE_SYNC_CALL(cublasInit(), status);
+	//cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
+	//cublasCreate(cublas_handle);
+
+	//create the context
+	cudaError_t e;
+	e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+	if(e != cudaSuccess) {
+		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU.");
+		std::cerr << "Failed to create CUDA context on a GPU.";
+	}
+
+	gpuinfo_[gpu_id].used = true;
+
+	NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
+void Initialize(Device *device, Status *status)
+{
+	// Check that we have at least one gpu
+	int n_gpu = 0;
+	cudaGetDeviceCount(&n_gpu);
+	if(n_gpu == 0) 
+		NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");
+
+	std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+	gpuinfo_.resize(n_gpu);
+
+	std::cerr << "gpu information ..." << std::endl;
+
+	// Get ratios of memory use, if possible
+	for(int n = 0; n < n_gpu; n++) 
+	{
+		int ret = cudaSetDevice(n);
+	    	switch(ret) 
+		{
+	      	case cudaSuccess : {
+	        //create the CUDA context for the thread
+	        //cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+		cudaDeviceSynchronize();
+	        //get GPU name
+	        char name[STRLEN];
+	        DeviceGetName(name,STRLEN,n, status);
+	        //get GPU memory stats
+	        size_t free, total;
+		float d2h, h2d;
+
+	        GetFreeMemory(&free, &total, status);
+	        GetBandwidth(n, &d2h, &h2d, status);
+
+	        gpuinfo_[n].gpuid = n;
+	        gpuinfo_[n].d2h_bandwidth = d2h;
+	        gpuinfo_[n].h2d_bandwidth = h2d;
+	        gpuinfo_[n].mem_free = free;
+	        gpuinfo_[n].mem_total = total;
+	        gpuinfo_[n].mem_ratio = free/(float)total;
+	        gpuinfo_[n].used = false;
+
+		std::cerr << "gpu: "  << n << " ==> "
+	        	  << "free: " << free/1024/1024 << "M, "
+	                  << "total: "<< total/1024/1024 << "M, "
+	                  << "ratio: "<< free/(float)total << "   "
+	        	  << "d2h bandwidth: " << d2h << "MB/s, "
+	                  << "h2d bandwidth: "<< h2d << "MB/s" << std::endl;
+
+	        //destroy the CUDA context for the thread
+	        //cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset
+	      	} break;
+
+	#if (CUDA_VERSION > 3020)
+	      case cudaErrorDeviceAlreadyInUse :
+	        std::cerr << "cudaSetDevice(" << n << "): "
+	                  << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl;
+	        break;
+	#endif
+	      case cudaErrorInvalidDevice :
+	        std::cerr << "cudaSetDevice(" << n << "): "
+	                  << "Device cannot be accessed, not a VALID CUDA device!" << std::endl;
+	        break;
+	      default :
+	        std::cerr << "cudaSetDevice(" << n << "): "
+	                  << "returned " << ret << ", "
+	                  << cudaGetErrorString((cudaError_t)ret) << std::endl;
+	    	}
+	}
+	cudaDeviceReset();
+	printf("Result = PASS\n");
+	NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
+
+
+
+
+} // extern
diff --git a/fastnn/device/Device.h b/fastnn/device/Device.h
new file mode 100644
index 0000000..9a80128
--- /dev/null
+++ b/fastnn/device/Device.h
@@ -0,0 +1,51 @@
+
+#ifndef NERV_FASTNN_EXAMPLE_H
+#define NERV_FASTNN_EXAMPLE_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "matrix/matrix.h"
+#include "stdbool.h"
+#include "../../nerv/lib/matrix/cuda_helper.h"
+#include "../../nerv/lib/common.h"
+#include "../../nerv/lib/matrix/cumatrix.h"
+#define  STRLEN 1024
+
+
+
+typedef struct GPUInfo GPUInfo; 
+
+typedef struct MPIGPUInfo MPIGPUInfo; 
+
+typedef struct Device Device;
+
+CuContext* CuContext_new();
+CuContext* CuContext_newWithId(long id);
+long    CuContext_id(CuContext *context);
+void CuContext_destroy(CuContext *context);
+
+
+Device* Device_new();
+Device* Device_newWithId(long);
+long	Device_id(Device *device);
+void	Device_detroy(Device *device);
+void 	Initialize(Device *device, Status *status);
+void 	GetFreeMemory(size_t* free, size_t* total, Status *status);
+void 	DeviceGetName(char* name, int len, int dev, Status *status);
+void 	GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status);
+int 	AutoSelectGPU(Device *device, Status *status);
+void 	SelectGPU(Device *device, int gpu_id, Status *status);
+
+
+
+
+#ifdef __cplusplus
+} // closing brace for extern "C"
+
+#endif // end Device
+
+#endif
+        
diff --git a/fastnn/device/device.c b/fastnn/device/device.c
new file mode 100644
index 0000000..71d6ec1
--- /dev/null
+++ b/fastnn/device/device.c
@@ -0,0 +1,178 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <lua.h>
+#include <lualib.h>
+#include <luaT/luaT.h>
+
+#include "../threads/lib/luaTHRD.h"
+#include "Device.h"
+
+
+const char *fastnn_device_tname = "fastnn.CDevice";
+
+
+static int device_new(lua_State *L)
+{
+	Device *device = NULL;
+	if(lua_gettop(L) == 0)
+        {
+                device = Device_new();
+        }
+        else if(lua_gettop(L) == 1)
+        {
+                long id = luaL_checkinteger(L, 1);
+                device = Device_newWithId(id);
+        }
+        else
+                luaL_error(L, "device: device new invalid arguments");
+        if (!device)
+                luaL_error(L, "device: device failed");
+
+        luaTHRD_pushudata(L, device, fastnn_device_tname);
+
+        return 1;
+}
+
+static int device_init(lua_State *L)
+{
+	Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+	Status status;
+	Initialize(device, &status);
+	NERV_LUA_CHECK_STATUS(L, status);
+	return 0;
+}	
+
+static int device_select_gpu(lua_State *L)
+{
+	Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+	
+	Status status;
+	if(lua_gettop(L) == 2)
+	{
+		int gpuid = luaL_checkinteger(L, 2);
+		SelectGPU(device, gpuid, &status);
+		NERV_LUA_CHECK_STATUS(L, status);
+		return 0;
+	}
+	else if(lua_gettop(L) == 1)
+	{
+		int gpuid = AutoSelectGPU(device, &status);
+		NERV_LUA_CHECK_STATUS(L, status);
+		lua_pushinteger(L, gpuid);
+		return 1;
+	}
+	else
+		luaL_error(L, "device: device select gpu failed");
+}	
+
+static int device_id(lua_State *L)
+{
+	Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+	lua_pushinteger(L, Device_id(device));
+	return 1;
+}
+
+static int device_tostring(lua_State *L)
+{
+        char str[STRLEN];
+        Device* device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+        snprintf(str, STRLEN, "%s <%lx>", fastnn_device_tname, Device_id(device));
+        lua_pushstring(L, str);
+        return 1;
+}
+
+static int device_destroy(lua_State *L)
+{
+        Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+        Device_destroy(device);
+        //printf("device_destroy ... end\n");
+        return 0;
+}
+
+
+//////////////////////////////////////////////
+
+static int context_new(lua_State *L) 
+{
+        CuContext *context = NULL;
+        if(lua_gettop(L) == 0)
+        {   
+                context = CuContext_new();
+        }   
+        else if(lua_gettop(L) == 1)
+        {   
+                long id = luaL_checkinteger(L, 1); 
+                context = CuContext_newWithId(id);
+        }   
+        else
+                luaL_error(L, "device: context new invalid arguments");
+        if (!context)
+                luaL_error(L, "device: context failed");
+
+        luaTHRD_pushudata(L, context, nerv_context_tname);
+
+        return 1;
+}
+
+
+static int context_id(lua_State *L)
+{
+        CuContext *context = luaTHRD_checkudata(L, 1, nerv_context_tname);
+        lua_pushinteger(L, CuContext_id(context));
+        return 1;
+}
+
+static int context_tostring(lua_State *L)
+{
+        char str[STRLEN];
+        CuContext* context = luaTHRD_checkudata(L, 1, nerv_context_tname);
+        snprintf(str, STRLEN, "%s <%lx>", nerv_context_tname, CuContext_id(context));
+        lua_pushstring(L, str);
+        return 1;
+}
+
+static int context_destroy(lua_State *L)
+{
+	CuContext* context = luaTHRD_checkudata(L, 1, nerv_context_tname);
+        CuContext_destroy(context);
+        return 0;
+}
+
+
+static const struct luaL_Reg device__ [] = {
+  {"new", device_new},
+  {"__tostring", device_tostring},
+  {"id", device_id},
+  {"init", device_init},
+  {"select_gpu", device_select_gpu},
+  {"free", device_destroy},
+  {NULL, NULL}
+};
+
+static const struct luaL_Reg context__ [] = {
+  {"new", context_new},
+  {"__tostring", context_tostring},
+  {"id", context_id},
+  {"free", context_destroy},
+  {NULL, NULL}
+};
+
+
+void fastnn_init_device(lua_State *L)
+{
+        luaT_newmetatable(L, fastnn_device_tname, NULL, device_new, device_destroy, NULL);
+        luaL_register(L, NULL, device__);
+        lua_pop(L, 1);
+}
+
+void fastnn_init_context(lua_State *L)
+{
+	luaT_newmetatable(L, nerv_context_tname, NULL, context_new, context_destroy, NULL);
+        luaL_register(L, NULL, context__);
+        lua_pop(L, 1);
+}
+
+
+
diff --git a/fastnn/device/device.lua b/fastnn/device/device.lua
new file mode 100644
index 0000000..d3dea73
--- /dev/null
+++ b/fastnn/device/device.lua
@@ -0,0 +1,6 @@
+
+local C =  require 'libfastnn'
+
+fastnn.CDevice = C.CDevice
+
+