aboutsummaryrefslogtreecommitdiff
path: root/fastnn/device
diff options
context:
space:
mode:
authorTed Yin <[email protected]>2015-08-31 12:03:15 +0800
committerTed Yin <[email protected]>2015-08-31 12:03:15 +0800
commit447bd1ec6b7be07f22653874fc9db84c9b6a9f9a (patch)
tree0268d85a8f75783daaa6b182bee1338dcd504f48 /fastnn/device
parentcad144243b898a7bed91c18572bf42944e9db3b3 (diff)
parent3463789202b7ededf5074b199d5122ca85d328ea (diff)
Merge pull request #4 from uphantom/master
fastnn first version, include follow submodular
Diffstat (limited to 'fastnn/device')
-rw-r--r--fastnn/device/Device.cpp496
-rw-r--r--fastnn/device/Device.h51
-rw-r--r--fastnn/device/device.c178
-rw-r--r--fastnn/device/device.lua6
4 files changed, 731 insertions, 0 deletions
diff --git a/fastnn/device/Device.cpp b/fastnn/device/Device.cpp
new file mode 100644
index 0000000..3b69086
--- /dev/null
+++ b/fastnn/device/Device.cpp
@@ -0,0 +1,496 @@
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include "../../nerv/lib/matrix/cuda_helper.h"
+#include <dlfcn.h>
+
+
+extern "C" {
+
+
+#include "Device.h"
+#include "stdlib.h"
+#include "string.h"
+
+
+struct GPUInfo
+{
+ int gpuid;
+ float d2h_bandwidth;
+ float h2d_bandwidth;
+ size_t mem_free;
+ size_t mem_total;
+ float mem_ratio;
+ bool used;
+};
+
+struct MPIGPUInfo
+{
+ char hostname[STRLEN];
+ int gpuid;
+ int myid;
+};
+
+struct Device
+{
+ std::vector<GPUInfo> gpuinfo_;
+ int refcount;
+};
+
+///////////////////////////////////////////
+
+extern cublasHandle_t* nerv_get_cublas_handle();
+
+CuContext* CuContext_new()
+{
+ CuContext *context = new CuContext;
+ cublasCreate(&context->cublas_handle);
+ cudaEventCreate(&context->profile_start);
+ cudaEventCreate(&context->profile_stop);
+ context->profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
+ context->refcount = 1;
+ context->pid = pthread_self();
+ return context;
+}
+
+CuContext* CuContext_newWithId(long id)
+{
+ CuContext *context = (CuContext*)id;
+ __sync_fetch_and_add(&context->refcount, 1);
+ return context;
+}
+
+long CuContext_id(CuContext *context)
+{
+ return (long)context;
+}
+
+void CuContext_destroy(CuContext *context)
+{
+ if (NULL != context && __sync_fetch_and_add(&context->refcount, -1) == 1)
+ {
+ cublasDestroy(context->cublas_handle);
+ hashmap_clear(context->profile);
+ delete context;
+ context = NULL;
+ }
+}
+
+Device* Device_new()
+{
+ Device* device = new Device;
+ device->refcount = 1;
+ return device;
+}
+
+Device* Device_newWithId(long id)
+{
+ Device *device = (Device*)id;
+ __sync_fetch_and_add(&device->refcount, 1);
+ return device;
+}
+
+long Device_id(Device *device)
+{
+ return (long)device;
+}
+
+void Device_destroy(Device *device)
+{
+ if (NULL != device && __sync_fetch_and_add(&device->refcount, -1) == 1)
+ {
+ delete device;
+ device = NULL;
+ }
+}
+
+void GetFreeMemory(size_t* free, size_t* total, Status *status)
+{
+ // WARNING! the CUDA API is inconsistent accross versions!
+ #if (CUDA_VERSION >= 3020)
+ //define the function signature type
+ size_t mem_free, mem_total;
+#else
+ unsigned int mem_free, mem_total;
+#endif
+ {
+ //we will load the cuMemGetInfo dynamically from libcuda.so
+ //cuMemGetInfo(&mem_free, &mem_total);
+ //pre-fill ``safe'' values that will not cause problems
+ mem_free = 1; mem_total = 1;
+ //open libcuda.so
+ void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
+ if(NULL == libcuda)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
+ }
+ else
+ {
+ //define the function signature type
+ //and get the symbol
+#if (CUDA_VERSION >= 3020)
+ typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
+ cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2");
+#else
+ typedef CUresult (*cu_fun_ptr)(int*, int*);
+ cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo");
+#endif
+ if(NULL == dl_cuMemGetInfo)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuMemGetInfo from libcuda.so");
+ }
+ else
+ {
+ //call the function
+ dl_cuMemGetInfo(&mem_free, &mem_total);
+ }
+ //close the library
+ dlclose(libcuda);
+ }
+ }
+ // copy the output values outside
+ if(NULL != free) *free = mem_free;
+ if(NULL != total) *total = mem_total;
+}
+
+void DeviceGetName(char* name, int len, int dev, Status *status)
+{
+ //prefill with something reasonable
+ strncpy(name,"Unknown GPU",len);
+ //open libcuda.so
+ void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
+ if(NULL == libcuda)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot open libcuda.so");
+ }
+ else
+ {
+ //define the function signature type
+ typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
+ //get the symbol
+ cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName");
+ if(NULL == cuDeviceGetName_ptr) {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot load cuDeviceGetName from libcuda.so");
+ }
+ else {
+ //call the function
+ cuDeviceGetName_ptr(name, len, dev);
+ }
+ //close the library
+ dlclose(libcuda);
+ }
+}
+
+void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status)
+{
+ int idx = gpu_idx;
+ int memSize = 64*1024*1024;
+ float elapsedTimeInMs = 0.0f;
+ float bandwidthInMBs = 0.0f;
+ unsigned char *h_idata = NULL;
+ unsigned char *h_odata = NULL;
+ cudaEvent_t start, stop;
+ bool PINNED = true;
+ int MEMCOPY_ITERATIONS = 5;
+
+ CUDA_SAFE_SYNC_CALL(cudaEventCreate(&start), status);
+ CUDA_SAFE_SYNC_CALL(cudaEventCreate(&stop), status);
+
+ //allocate host memory
+ if (PINNED)
+ {
+ #if CUDART_VERSION >= 2020
+ CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_idata, memSize, cudaHostAllocPortable), status);
+ CUDA_SAFE_SYNC_CALL(cudaHostAlloc((void **)&h_odata, memSize, cudaHostAllocPortable), status);
+ #else
+ CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_idata, memSize), status);
+ CUDA_SAFE_SYNC_CALL(cudaMallocHost((void **)&h_odata, memSize), status);
+ #endif
+ }
+ else
+ {
+ //pageable memory mode - use malloc
+ h_odata = (unsigned char *)malloc(memSize);
+
+ if (h_odata == 0)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Not enough memory available on host to run test!");
+ }
+ }
+
+ //initialize the memory
+ for (unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
+ {
+ h_idata[i] = (unsigned char)(i & 0xff);
+ }
+
+ // allocate device memory
+ unsigned char *d_idata;
+ CUDA_SAFE_SYNC_CALL(cudaMalloc((void **) &d_idata, memSize), status);
+
+ //initialize the device memory
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_idata, memSize,cudaMemcpyHostToDevice), status);
+
+ //copy data from GPU to Host
+
+ CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
+
+ if (PINNED)
+ {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost, 0), status);
+ }
+ }
+ else
+ {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(h_odata, d_idata, memSize,cudaMemcpyDeviceToHost), status);
+ }
+ }
+
+ CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
+
+ // make sure GPU has finished copying
+ CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
+ //get the the total elapsed time in ms
+
+ CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);
+
+
+ //calculate bandwidth in MB/s
+ bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
+ *d2h = bandwidthInMBs;
+
+ /////////////////////////////////////////////////////
+ //copy data from Host to GPU
+ CUDA_SAFE_SYNC_CALL(cudaEventRecord(start, 0), status);
+
+ if (PINNED)
+ {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpyAsync(d_idata, h_odata, memSize,cudaMemcpyHostToDevice, 0), status);
+ }
+ }
+ else
+ {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(d_idata, h_odata, memSize,cudaMemcpyHostToDevice), status);
+ }
+ }
+
+ CUDA_SAFE_SYNC_CALL(cudaEventRecord(stop, 0), status);
+
+ // make sure GPU has finished copying
+ CUDA_SAFE_SYNC_CALL(cudaDeviceSynchronize(), status);
+ //get the the total elapsed time in ms
+
+ CUDA_SAFE_SYNC_CALL(cudaEventElapsedTime(&elapsedTimeInMs, start, stop), status);
+
+
+ //calculate bandwidth in MB/s
+ bandwidthInMBs = (1e3f * memSize * (float)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (float)(1 << 20));
+ *h2d = bandwidthInMBs;
+
+ //clean up memory
+ CUDA_SAFE_SYNC_CALL(cudaEventDestroy(stop), status);
+ CUDA_SAFE_SYNC_CALL(cudaEventDestroy(start), status);
+
+ if (PINNED)
+ {
+ CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_idata), status);
+ CUDA_SAFE_SYNC_CALL(cudaFreeHost(h_odata), status);
+ }
+ else
+ {
+ free(h_idata);
+ free(h_odata);
+ }
+
+ CUDA_SAFE_SYNC_CALL(cudaFree(d_idata), status);
+}
+
+int AutoSelectGPU(Device *device, Status *status)
+{
+ //GPU selection is based on largest proportion of free memory.
+
+ int max_id = 0, n_gpu = 0;
+ std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+
+ cudaGetDeviceCount(&n_gpu);
+
+ if(n_gpu == 0)
+ {
+ NERV_SET_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");
+ return -1;
+ }
+
+ if (n_gpu > 0)
+ {
+ for (int i = 0; i < gpuinfo_.size(); i++)
+ {
+ if (!gpuinfo_[i].used)
+ {
+ max_id = i;
+ break;
+ }
+ }
+ //find GPU with max free memory
+ for (int n = 1; n < gpuinfo_.size(); n++)
+ {
+ if (!gpuinfo_[n].used && gpuinfo_[n].mem_ratio > gpuinfo_[max_id].mem_ratio)
+ max_id = n;
+ }
+
+
+ std::cerr << "Selected device: " << max_id << " (automatically)" << std::endl;
+
+ std::cerr << "free: " << gpuinfo_[max_id].mem_free/1024/1024 << "M, "
+ << "total: "<< gpuinfo_[max_id].mem_total/1024/1024 << "M, "
+ << "ratio: "<< gpuinfo_[max_id].mem_ratio << " "
+ << "d2h bandwidth: " << gpuinfo_[max_id].d2h_bandwidth << "MB/s, "
+ << "h2d bandwidth: "<< gpuinfo_[max_id].h2d_bandwidth << "MB/s" << std::endl;
+
+ cudaSetDevice(max_id);
+ //initialize the CUBLAS
+ //cublasInit();
+ //cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
+ //std::cerr << "cublasHandle_t: " << cublas_handle << std::endl;
+ //cublasCreate(cublas_handle);
+
+ //create the context
+ cudaError_t e;
+ e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+ if(e != cudaSuccess) {
+ std::cerr << "Failed to create CUDA context on a GPU." << std::endl;
+ }
+ }
+
+ gpuinfo_[max_id].used = true;
+ NERV_SET_STATUS(status, NERV_NORMAL, 0);
+ return max_id;
+}
+
+void SelectGPU(Device *device, int gpu_id, Status *status)
+{
+
+ std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+ int n_gpu = 0;
+ cudaGetDeviceCount(&n_gpu);
+ if(gpu_id >= n_gpu)
+ {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Cannot select GPU CUDA capable cards!");
+ std::cerr << "Cannot select GPU " << gpu_id
+ << ", detected " << n_gpu << " CUDA capable cards!" << std::endl;
+ }
+
+
+ std::cerr << "Selected device: " << gpu_id << " (manually)";
+
+ std::cerr << "free: " << gpuinfo_[gpu_id].mem_free/1024/1024 << "M, "
+ << "total: "<< gpuinfo_[gpu_id].mem_total/1024/1024 << "M, "
+ << "ratio: "<< gpuinfo_[gpu_id].mem_ratio << " "
+ << "d2h bandwidth: " << gpuinfo_[gpu_id].d2h_bandwidth << "MB/s, "
+ << "h2d bandwidth: "<< gpuinfo_[gpu_id].h2d_bandwidth << "MB/s" << std::endl;
+
+ CUDA_SAFE_SYNC_CALL(cudaSetDevice(gpu_id), status);
+ //initialize the CUBLAS
+ //CUBLAS_SAFE_SYNC_CALL(cublasInit(), status);
+ //cublasHandle_t *cublas_handle = nerv_get_cublas_handle();
+ //cublasCreate(cublas_handle);
+
+ //create the context
+ cudaError_t e;
+ e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+ if(e != cudaSuccess) {
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "Failed to create CUDA context on a GPU.");
+ std::cerr << "Failed to create CUDA context on a GPU.";
+ }
+
+ gpuinfo_[gpu_id].used = true;
+
+ NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
+void Initialize(Device *device, Status *status)
+{
+ // Check that we have at least one gpu
+ int n_gpu = 0;
+ cudaGetDeviceCount(&n_gpu);
+ if(n_gpu == 0)
+ NERV_EXIT_STATUS(status, MAT_CUDA_ERR, "No CUDA devices found");
+
+ std::vector<GPUInfo> &gpuinfo_ = device->gpuinfo_;
+ gpuinfo_.resize(n_gpu);
+
+ std::cerr << "gpu information ..." << std::endl;
+
+ // Get ratios of memory use, if possible
+ for(int n = 0; n < n_gpu; n++)
+ {
+ int ret = cudaSetDevice(n);
+ switch(ret)
+ {
+ case cudaSuccess : {
+ //create the CUDA context for the thread
+ //cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
+ cudaDeviceSynchronize();
+ //get GPU name
+ char name[STRLEN];
+ DeviceGetName(name,STRLEN,n, status);
+ //get GPU memory stats
+ size_t free, total;
+ float d2h, h2d;
+
+ GetFreeMemory(&free, &total, status);
+ GetBandwidth(n, &d2h, &h2d, status);
+
+ gpuinfo_[n].gpuid = n;
+ gpuinfo_[n].d2h_bandwidth = d2h;
+ gpuinfo_[n].h2d_bandwidth = h2d;
+ gpuinfo_[n].mem_free = free;
+ gpuinfo_[n].mem_total = total;
+ gpuinfo_[n].mem_ratio = free/(float)total;
+ gpuinfo_[n].used = false;
+
+ std::cerr << "gpu: " << n << " ==> "
+ << "free: " << free/1024/1024 << "M, "
+ << "total: "<< total/1024/1024 << "M, "
+ << "ratio: "<< free/(float)total << " "
+ << "d2h bandwidth: " << d2h << "MB/s, "
+ << "h2d bandwidth: "<< h2d << "MB/s" << std::endl;
+
+ //destroy the CUDA context for the thread
+ //cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset
+ } break;
+
+ #if (CUDA_VERSION > 3020)
+ case cudaErrorDeviceAlreadyInUse :
+ std::cerr << "cudaSetDevice(" << n << "): "
+ << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..." << std::endl;
+ break;
+ #endif
+ case cudaErrorInvalidDevice :
+ std::cerr << "cudaSetDevice(" << n << "): "
+ << "Device cannot be accessed, not a VALID CUDA device!" << std::endl;
+ break;
+ default :
+ std::cerr << "cudaSetDevice(" << n << "): "
+ << "returned " << ret << ", "
+ << cudaGetErrorString((cudaError_t)ret) << std::endl;
+ }
+ }
+ cudaDeviceReset();
+ printf("Result = PASS\n");
+ NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
+
+
+
+
+} // extern
diff --git a/fastnn/device/Device.h b/fastnn/device/Device.h
new file mode 100644
index 0000000..9a80128
--- /dev/null
+++ b/fastnn/device/Device.h
@@ -0,0 +1,51 @@
+
+#ifndef NERV_FASTNN_EXAMPLE_H
+#define NERV_FASTNN_EXAMPLE_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "matrix/matrix.h"
+#include "stdbool.h"
+#include "../../nerv/lib/matrix/cuda_helper.h"
+#include "../../nerv/lib/common.h"
+#include "../../nerv/lib/matrix/cumatrix.h"
+#define STRLEN 1024
+
+
+
+typedef struct GPUInfo GPUInfo;
+
+typedef struct MPIGPUInfo MPIGPUInfo;
+
+typedef struct Device Device;
+
+CuContext* CuContext_new();
+CuContext* CuContext_newWithId(long id);
+long CuContext_id(CuContext *context);
+void CuContext_destroy(CuContext *context);
+
+
+Device* Device_new();
+Device* Device_newWithId(long);
+long Device_id(Device *device);
+void Device_detroy(Device *device);
+void Initialize(Device *device, Status *status);
+void GetFreeMemory(size_t* free, size_t* total, Status *status);
+void DeviceGetName(char* name, int len, int dev, Status *status);
+void GetBandwidth(int gpu_idx, float *d2h, float *h2d, Status *status);
+int AutoSelectGPU(Device *device, Status *status);
+void SelectGPU(Device *device, int gpu_id, Status *status);
+
+
+
+
+#ifdef __cplusplus
+} // closing brace for extern "C"
+
+#endif // end Device
+
+#endif
+
diff --git a/fastnn/device/device.c b/fastnn/device/device.c
new file mode 100644
index 0000000..71d6ec1
--- /dev/null
+++ b/fastnn/device/device.c
@@ -0,0 +1,178 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <lua.h>
+#include <lualib.h>
+#include <luaT/luaT.h>
+
+#include "../threads/lib/luaTHRD.h"
+#include "Device.h"
+
+
+const char *fastnn_device_tname = "fastnn.CDevice";
+
+
+static int device_new(lua_State *L)
+{
+ Device *device = NULL;
+ if(lua_gettop(L) == 0)
+ {
+ device = Device_new();
+ }
+ else if(lua_gettop(L) == 1)
+ {
+ long id = luaL_checkinteger(L, 1);
+ device = Device_newWithId(id);
+ }
+ else
+ luaL_error(L, "device: device new invalid arguments");
+ if (!device)
+ luaL_error(L, "device: device failed");
+
+ luaTHRD_pushudata(L, device, fastnn_device_tname);
+
+ return 1;
+}
+
+static int device_init(lua_State *L)
+{
+ Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+ Status status;
+ Initialize(device, &status);
+ NERV_LUA_CHECK_STATUS(L, status);
+ return 0;
+}
+
+static int device_select_gpu(lua_State *L)
+{
+ Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+
+ Status status;
+ if(lua_gettop(L) == 2)
+ {
+ int gpuid = luaL_checkinteger(L, 2);
+ SelectGPU(device, gpuid, &status);
+ NERV_LUA_CHECK_STATUS(L, status);
+ return 0;
+ }
+ else if(lua_gettop(L) == 1)
+ {
+ int gpuid = AutoSelectGPU(device, &status);
+ NERV_LUA_CHECK_STATUS(L, status);
+ lua_pushinteger(L, gpuid);
+ return 1;
+ }
+ else
+ luaL_error(L, "device: device select gpu failed");
+}
+
+static int device_id(lua_State *L)
+{
+ Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+ lua_pushinteger(L, Device_id(device));
+ return 1;
+}
+
+static int device_tostring(lua_State *L)
+{
+ char str[STRLEN];
+ Device* device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+ snprintf(str, STRLEN, "%s <%lx>", fastnn_device_tname, Device_id(device));
+ lua_pushstring(L, str);
+ return 1;
+}
+
+static int device_destroy(lua_State *L)
+{
+ Device *device = luaTHRD_checkudata(L, 1, fastnn_device_tname);
+ Device_destroy(device);
+ //printf("device_destroy ... end\n");
+ return 0;
+}
+
+
+//////////////////////////////////////////////
+
+static int context_new(lua_State *L)
+{
+ CuContext *context = NULL;
+ if(lua_gettop(L) == 0)
+ {
+ context = CuContext_new();
+ }
+ else if(lua_gettop(L) == 1)
+ {
+ long id = luaL_checkinteger(L, 1);
+ context = CuContext_newWithId(id);
+ }
+ else
+ luaL_error(L, "device: context new invalid arguments");
+ if (!context)
+ luaL_error(L, "device: context failed");
+
+ luaTHRD_pushudata(L, context, nerv_context_tname);
+
+ return 1;
+}
+
+
+static int context_id(lua_State *L)
+{
+ CuContext *context = luaTHRD_checkudata(L, 1, nerv_context_tname);
+ lua_pushinteger(L, CuContext_id(context));
+ return 1;
+}
+
+static int context_tostring(lua_State *L)
+{
+ char str[STRLEN];
+ CuContext* context = luaTHRD_checkudata(L, 1, nerv_context_tname);
+ snprintf(str, STRLEN, "%s <%lx>", nerv_context_tname, CuContext_id(context));
+ lua_pushstring(L, str);
+ return 1;
+}
+
+static int context_destroy(lua_State *L)
+{
+ CuContext* context = luaTHRD_checkudata(L, 1, nerv_context_tname);
+ CuContext_destroy(context);
+ return 0;
+}
+
+
+static const struct luaL_Reg device__ [] = {
+ {"new", device_new},
+ {"__tostring", device_tostring},
+ {"id", device_id},
+ {"init", device_init},
+ {"select_gpu", device_select_gpu},
+ {"free", device_destroy},
+ {NULL, NULL}
+};
+
+static const struct luaL_Reg context__ [] = {
+ {"new", context_new},
+ {"__tostring", context_tostring},
+ {"id", context_id},
+ {"free", context_destroy},
+ {NULL, NULL}
+};
+
+
+void fastnn_init_device(lua_State *L)
+{
+ luaT_newmetatable(L, fastnn_device_tname, NULL, device_new, device_destroy, NULL);
+ luaL_register(L, NULL, device__);
+ lua_pop(L, 1);
+}
+
+void fastnn_init_context(lua_State *L)
+{
+ luaT_newmetatable(L, nerv_context_tname, NULL, context_new, context_destroy, NULL);
+ luaL_register(L, NULL, context__);
+ lua_pop(L, 1);
+}
+
+
+
diff --git a/fastnn/device/device.lua b/fastnn/device/device.lua
new file mode 100644
index 0000000..d3dea73
--- /dev/null
+++ b/fastnn/device/device.lua
@@ -0,0 +1,6 @@
+
+local C = require 'libfastnn'
+
+fastnn.CDevice = C.CDevice
+
+