5 files changed, 56 insertions, 13 deletions
diff --git a/nerv/Makefile b/nerv/Makefile
index 9ef3212..46e79a0 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -42,7 +42,7 @@ INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
 CUDA_INCLUDE := -I $(CUDA_BASE)/include/
 INCLUDE += $(CUDA_INCLUDE)
 
-CUDA_LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas -lcurand
+CUDA_LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcuda -lcublas -lcurand
 
 NVCC := $(CUDA_BASE)/bin/nvcc
 EMPTY :=
diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c
index 537fabb..aec4d60 100644
--- a/nerv/lib/matrix/cumatrix.c
+++ b/nerv/lib/matrix/cumatrix.c
@@ -37,7 +37,9 @@ void nerv_cuda_context_accu_profile(CuContext *context,
     *val += delta;
 }
 
-static void new_cuda_handles(CuContext *context, Status *status) {
+static void new_cuda_handles(CuContext *context, int dev, Status *status) {
+    if (context->has_handle) return;
+    CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status);
     CUBLAS_SAFE_SYNC_CALL(cublasCreate(&(context->cublas_handle)), status);
     CURAND_SAFE_SYNC_CALL(curandCreateGenerator(&(context->curand_gen),
                             CURAND_RNG_PSEUDO_DEFAULT), status);
@@ -47,9 +49,12 @@ static void new_cuda_handles(CuContext *context, Status *status) {
     CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_start)), status);
     CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_stop)), status);
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
+    context->has_handle = 1;
 }
 
 static void free_cuda_handles(CuContext *context, Status *status) {
+    if (!context->has_handle) return;
+    context->has_handle = 0;
     CUBLAS_SAFE_SYNC_CALL(cublasDestroy(context->cublas_handle), status);
     CURAND_SAFE_SYNC_CALL(curandDestroyGenerator(context->curand_gen), status);
     CUDA_SAFE_SYNC_CALL(cudaEventDestroy(context->profile_start), status);
@@ -57,9 +62,41 @@ static void free_cuda_handles(CuContext *context, Status *status) {
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
 }
 
-CuContext *nerv_cuda_context_create(Status *status) {
+static int choose_best_gpu(Status *status) {
+    int i, n, dev = 0;
+    float best_ratio = 0;
+    fprintf(stderr, "*** select a GPU based on available space\n");
+    CUDA_SAFE_CALL_RET(cudaGetDeviceCount(&n), status);
+    for (i = 0; i < n; i++)
+    {
+        size_t avail, total;
+        float ratio;
+        CUDA_SAFE_SYNC_CALL_RET(cudaSetDevice(i), status);
+        CUDA_SAFE_SYNC_CALL_RET(cuMemGetInfo(&avail, &total), status);
+        ratio = (float)avail/total * 100;
+        fprintf(stderr, "* card %d: %.2f%%\n", i, ratio);
+        if (ratio > best_ratio)
+        {
+            best_ratio = ratio;
+            dev = i;
+        }
+        CUDA_SAFE_SYNC_CALL_RET(cudaDeviceReset(), status);
+    }
+    fprintf(stderr, "*** final decision: GPU %d\n", dev);
+    NERV_SET_STATUS(status, NERV_NORMAL, 0);
+    return dev;
+}
+
+CuContext *nerv_cuda_context_create(int dev, Status *status) {
     CuContext *context = (CuContext *)malloc(sizeof(CuContext));
-    new_cuda_handles(context, status);
+    context->has_handle = 0; /* this line must come first */
+    if (dev == -1)
+    {
+        dev = choose_best_gpu(status);
+        if (status->err_code != NERV_NORMAL)
+            return NULL;
+    }
+    new_cuda_handles(context, dev, status);
     if (status->err_code != NERV_NORMAL)
         return NULL;
     context->profile = nerv_hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
@@ -78,11 +115,14 @@ void nerv_cuda_context_destroy(CuContext *context, Status *status) {
 
 void nerv_cuda_context_select_gpu(CuContext *context,
                                     int dev, Status *status) {
-    CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status);
-    free_cuda_handles(context, status);
+    /* free_cuda_handles(context, status);
     if (status->err_code != NERV_NORMAL)
         return;
-    new_cuda_handles(context, status);
+    */
+    /* because of cudaDeviceReset */
+    context->has_handle = 0;
+    CUDA_SAFE_SYNC_CALL(cudaDeviceReset(), status);
+    new_cuda_handles(context, dev, status);
     if (status->err_code != NERV_NORMAL)
         return;
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h
index 280035b..fd2a5ce 100644
--- a/nerv/lib/matrix/cumatrix.h
+++ b/nerv/lib/matrix/cumatrix.h
@@ -5,6 +5,7 @@
 #include "cuda_helper.h"
 
 typedef struct CuContext {
+    int has_handle;
     cublasHandle_t cublas_handle;
     cudaEvent_t profile_start, profile_stop;
     curandGenerator_t curand_gen;
@@ -15,6 +16,6 @@ void nerv_cuda_context_print_profile(CuContext *context);
 void nerv_cuda_context_clear_profile(CuContext *context);
 void nerv_cuda_context_accu_profile(CuContext *context, const char *name, float delta);
 void nerv_cuda_context_select_gpu(CuContext *context, int dev, Status *status);
-CuContext *nerv_cuda_context_create(Status *status);
+CuContext *nerv_cuda_context_create(int dev, Status *status);
 void nerv_cuda_context_destroy(CuContext *contex, Status *status);
 #endif
diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c
index 7d10895..b8eef9c 100644
--- a/nerv/matrix/cumatrix.c
+++ b/nerv/matrix/cumatrix.c
@@ -26,7 +26,8 @@ int nerv_cuda_context_lua_clear_profile(lua_State *L) {
 
 int nerv_cuda_context_lua_new(lua_State *L) {
     Status status;
-    CuContext *self = nerv_cuda_context_create(&status);
+    int dev = lua_gettop(L) > 0 ? luaL_checkinteger(L, 1) : -1;
+    CuContext *self = nerv_cuda_context_create(dev, &status);
     NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, self, nerv_cuda_context_tname);
     return 1;
diff --git a/nerv/nerv b/nerv/nerv
index 9295290..0b75a9b 100644
--- a/nerv/nerv
+++ b/nerv/nerv
@@ -24,15 +24,16 @@ local function _add_profile_method(cls)
 end
 
 if not opts["use-cpu"].val then
+    local dev = -1
+    if opts["select-gpu"].val then
+        dev = opts["select-gpu"].val
+    end
     nerv.info("automatically initialize a default CuContext...")
-    nerv.CuMatrix._default_context = nerv.CuContext()
+    nerv.CuMatrix._default_context = nerv.CuContext(dev)
     nerv.info("the default CuContext is ok")
     _add_profile_method(nerv.CuMatrix)
     nerv.CuMatrix.select_gpu =
             function (dev) nerv.CuMatrix._default_context:select_gpu(dev) end
-    if opts["select-gpu"].val then
-        nerv.CuMatrix.select_gpu(opts["select-gpu"].val)
-    end
 end
 
 nerv.info("automatically initialize a default MContext...")