add GPU auto selection

author: Determinant <ted.sybil@gmail.com> 2016-03-02 16:49:33 +0800
committer: Determinant <ted.sybil@gmail.com> 2016-03-02 16:49:33 +0800
commit: d3abc6459a776ff7fa3777f4f561bc4f5d5e2075 (patch)
tree: a0fffbdb4960ba463f720436ca5f050f0d747504 /nerv/lib/matrix/cumatrix.c
parent: 75108e2e000f4382129d453c0fa9073b14d32f97 (diff)
1 files changed, 46 insertions, 6 deletions
diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c
index 537fabb..aec4d60 100644
--- a/nerv/lib/matrix/cumatrix.c
+++ b/nerv/lib/matrix/cumatrix.c
@@ -37,7 +37,9 @@ void nerv_cuda_context_accu_profile(CuContext *context,
     *val += delta;
 }
 
-static void new_cuda_handles(CuContext *context, Status *status) {
+static void new_cuda_handles(CuContext *context, int dev, Status *status) {
+    if (context->has_handle) return;
+    CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status);
     CUBLAS_SAFE_SYNC_CALL(cublasCreate(&(context->cublas_handle)), status);
     CURAND_SAFE_SYNC_CALL(curandCreateGenerator(&(context->curand_gen),
                             CURAND_RNG_PSEUDO_DEFAULT), status);
@@ -47,9 +49,12 @@ static void new_cuda_handles(CuContext *context, Status *status) {
     CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_start)), status);
     CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_stop)), status);
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
+    context->has_handle = 1;
 }
 
 static void free_cuda_handles(CuContext *context, Status *status) {
+    if (!context->has_handle) return;
+    context->has_handle = 0;
     CUBLAS_SAFE_SYNC_CALL(cublasDestroy(context->cublas_handle), status);
     CURAND_SAFE_SYNC_CALL(curandDestroyGenerator(context->curand_gen), status);
     CUDA_SAFE_SYNC_CALL(cudaEventDestroy(context->profile_start), status);
@@ -57,9 +62,41 @@ static void free_cuda_handles(CuContext *context, Status *status) {
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
 }
 
-CuContext *nerv_cuda_context_create(Status *status) {
+static int choose_best_gpu(Status *status) {
+    int i, n, dev = 0;
+    float best_ratio = 0;
+    fprintf(stderr, "*** select a GPU based on available space\n");
+    CUDA_SAFE_CALL_RET(cudaGetDeviceCount(&n), status);
+    for (i = 0; i < n; i++)
+    {
+        size_t avail, total;
+        float ratio;
+        CUDA_SAFE_SYNC_CALL_RET(cudaSetDevice(i), status);
+        CUDA_SAFE_SYNC_CALL_RET(cuMemGetInfo(&avail, &total), status);
+        ratio = (float)avail/total * 100;
+        fprintf(stderr, "* card %d: %.2f%%\n", i, ratio);
+        if (ratio > best_ratio)
+        {
+            best_ratio = ratio;
+            dev = i;
+        }
+        CUDA_SAFE_SYNC_CALL_RET(cudaDeviceReset(), status);
+    }
+    fprintf(stderr, "*** final decision: GPU %d\n", dev);
+    NERV_SET_STATUS(status, NERV_NORMAL, 0);
+    return dev;
+}
+
+CuContext *nerv_cuda_context_create(int dev, Status *status) {
     CuContext *context = (CuContext *)malloc(sizeof(CuContext));
-    new_cuda_handles(context, status);
+    context->has_handle = 0; /* this line must come first */
+    if (dev == -1)
+    {
+        dev = choose_best_gpu(status);
+        if (status->err_code != NERV_NORMAL)
+            return NULL;
+    }
+    new_cuda_handles(context, dev, status);
     if (status->err_code != NERV_NORMAL)
         return NULL;
     context->profile = nerv_hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
@@ -78,11 +115,14 @@ void nerv_cuda_context_destroy(CuContext *context, Status *status) {
 
 void nerv_cuda_context_select_gpu(CuContext *context,
                                     int dev, Status *status) {
-    CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status);
-    free_cuda_handles(context, status);
+    /* free_cuda_handles(context, status);
     if (status->err_code != NERV_NORMAL)
         return;
-    new_cuda_handles(context, status);
+    */
+    /* because of cudaDeviceReset */
+    context->has_handle = 0;
+    CUDA_SAFE_SYNC_CALL(cudaDeviceReset(), status);
+    new_cuda_handles(context, dev, status);
     if (status->err_code != NERV_NORMAL)
         return;
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
author	Determinant <ted.sybil@gmail.com>	2016-03-02 16:49:33 +0800
committer	Determinant <ted.sybil@gmail.com>	2016-03-02 16:49:33 +0800
commit	d3abc6459a776ff7fa3777f4f561bc4f5d5e2075 (patch)
tree	a0fffbdb4960ba463f720436ca5f050f0d747504 /nerv/lib/matrix/cumatrix.c
parent	75108e2e000f4382129d453c0fa9073b14d32f97 (diff)