nerv/lib/matrix/cumatrix.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

#define NERV_GENERIC_CUMATRIX
#define MATRIX_CONTEXT CuContext
#include <string.h>
#include <time.h>
#include "../common.h"
#include "cumatrix.h"
#include "cuda_helper.h"

/*
 * Print every entry of the context's profile table to stderr, followed by
 * the sum of all printed values.
 *
 * The table is traversed bucket by bucket; each bucket is a singly linked
 * chain of nodes whose value is read as a float and shown next to its key.
 */
void nerv_cuda_context_print_profile(CuContext *context)
{
    HashMap *table = context->profile;
    float total = 0;
    size_t slot = 0;

    fprintf(stderr, "*** [nerv cumatrix profile] **\n");
    while (slot < table->size)
    {
        HashNode *node = table->bucket[slot];
        while (node)
        {
            float elapsed = *(float *)node->val;
            fprintf(stderr, "%s:\t%.6f\n", node->key, elapsed);
            total += elapsed;
            node = node->next;
        }
        slot++;
    }
    fprintf(stderr, "Total time:\t%.6f\n", total);
}

/*
 * Reset the profiling data of `context` by clearing its profile hash map.
 */
void nerv_cuda_context_clear_profile(CuContext *context)
{
    nerv_hashmap_clear(context->profile);
}

/*
 * Add `delta` to the profile entry stored under `name`.
 *
 * The first time a name is seen, a float accumulator is allocated on the
 * heap, initialised to zero and stored in the context's profile hash map;
 * this function never frees it.
 *
 * If that allocation fails the sample is dropped: profiling is best-effort
 * and must not crash the caller by dereferencing a NULL pointer.
 */
void nerv_cuda_context_accu_profile(CuContext *context,
                                    const char *name, float delta) {
    HashMap *profile = context->profile;
    float *val = (float *)nerv_hashmap_getval(profile, name);
    if (!val)
    {
        val = (float *)malloc(sizeof(float));
        if (!val)
            return; /* out of memory: skip this sample */
        *val = 0;
        nerv_hashmap_setval(profile, name, val);
    }
    *val += delta;
}

/*
 * Create the CUDA resources owned by `context` on device `dev`:
 * a cuBLAS handle, a cuRAND pseudo-random generator seeded with the current
 * wall-clock time, and the two events `profile_start` / `profile_stop`.
 *
 * Does nothing when the context already has its handles.  On every path
 * that completes normally `status` is set to NERV_NORMAL, so callers can
 * always inspect `status->err_code` afterwards.
 *
 * NOTE(review): the *_SAFE_SYNC_CALL macros are defined outside this file;
 * they presumably record the failure in `status` and return from this
 * function -- confirm in cuda_helper.h.  If so, a failure half-way through
 * leaves the already-created resources allocated and `has_handle` at 0.
 */
static void new_cuda_handles(CuContext *context, int dev, Status *status) {
    if (context->has_handle)
    {
        /* Nothing to create, but still report success: callers read
         * status->err_code right after this call. */
        NERV_SET_STATUS(status, NERV_NORMAL, 0);
        return;
    }
    CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status);
    CUBLAS_SAFE_SYNC_CALL(cublasCreate(&(context->cublas_handle)), status);
    CURAND_SAFE_SYNC_CALL(curandCreateGenerator(&(context->curand_gen),
                            CURAND_RNG_PSEUDO_DEFAULT), status);
    CURAND_SAFE_SYNC_CALL(
            curandSetPseudoRandomGeneratorSeed(context->curand_gen, time(NULL)),
            status);
    CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_start)), status);
    CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_stop)), status);
    NERV_SET_STATUS(status, NERV_NORMAL, 0);
    /* Only mark the context as initialised once everything succeeded. */
    context->has_handle = 1;
}

/*
 * Release the CUDA resources owned by `context`: the cuBLAS handle, the
 * cuRAND generator and the two profiling events.
 *
 * Does nothing when the context holds no handles.  On every path that
 * completes normally `status` is set to NERV_NORMAL; without that, a caller
 * checking `status->err_code` after a no-op call would read whatever value
 * happened to be there before.
 *
 * NOTE(review): the *_SAFE_SYNC_CALL macros are defined outside this file;
 * they presumably record the failure in `status` and return early --
 * confirm in cuda_helper.h.
 */
static void free_cuda_handles(CuContext *context, Status *status) {
    if (!context->has_handle)
    {
        /* Nothing to release; report success explicitly. */
        NERV_SET_STATUS(status, NERV_NORMAL, 0);
        return;
    }
    /* Cleared first, so the handles are never released twice even if one of
     * the calls below fails. */
    context->has_handle = 0;
    CUBLAS_SAFE_SYNC_CALL(cublasDestroy(context->cublas_handle), status);
    CURAND_SAFE_SYNC_CALL(curandDestroyGenerator(context->curand_gen), status);
    CUDA_SAFE_SYNC_CALL(cudaEventDestroy(context->profile_start), status);
    CUDA_SAFE_SYNC_CALL(cudaEventDestroy(context->profile_stop), status);
    NERV_SET_STATUS(status, NERV_NORMAL, 0);
}

/*
 * Pick the GPU with the largest percentage of free memory.
 *
 * Every device reported by cudaGetDeviceCount() is selected in turn, its
 * free/total memory is queried, and the percentage is logged to stderr.
 * The device with the strictly highest percentage wins; device 0 is the
 * answer when no device beats 0%.  Each probed device is reset afterwards
 * so the probe does not leave state behind on it.
 *
 * The memory query uses the runtime call cudaMemGetInfo(), which returns
 * cudaError_t like the other calls checked by the CUDA_SAFE_* macros here.
 * The driver-API cuMemGetInfo() returns a CUresult instead, which does not
 * belong in a macro written for runtime-API results.
 *
 * Returns the chosen device index and sets `status` to NERV_NORMAL.
 * NOTE(review): the value returned when one of the *_RET macros bails out
 * is decided by those macros (defined outside this file); callers should
 * rely on `status`, not on the return value, to detect failure.
 */
static int choose_best_gpu(Status *status) {
    int i, n, dev = 0;
    float best_ratio = 0;
    fprintf(stderr, "*** select a GPU based on available space\n");
    CUDA_SAFE_CALL_RET(cudaGetDeviceCount(&n), status);
    for (i = 0; i < n; i++)
    {
        size_t avail, total;
        float ratio;
        CUDA_SAFE_SYNC_CALL_RET(cudaSetDevice(i), status);
        CUDA_SAFE_SYNC_CALL_RET(cudaMemGetInfo(&avail, &total), status);
        ratio = (float)avail/total * 100;
        fprintf(stderr, "* card %d: %.2f%%\n", i, ratio);
        if (ratio > best_ratio)
        {
            best_ratio = ratio;
            dev = i;
        }
        /* undo whatever selecting and querying this device set up */
        CUDA_SAFE_SYNC_CALL_RET(cudaDeviceReset(), status);
    }
    fprintf(stderr, "*** final decision: GPU %d\n", dev);
    NERV_SET_STATUS(status, NERV_NORMAL, 0);
    return dev;
}

/*
 * Allocate a CuContext and create its CUDA handles.
 *
 * dev     device index to use, or -1 to let choose_best_gpu() pick one.
 * status  receives NERV_NORMAL on success, or the error reported by the
 *         failing step.
 *
 * Returns the new context, or NULL on failure.  On failure the context
 * structure itself is freed here, so the caller has nothing to release.
 *
 * NOTE(review): a failed malloc() is still not handled -- no error code
 * suitable for reporting it is visible from this file.
 */
CuContext *nerv_cuda_context_create(int dev, Status *status) {
    CuContext *context = (CuContext *)malloc(sizeof(CuContext));
    context->has_handle = 0; /* this line must come first */
    if (dev == -1)
    {
        dev = choose_best_gpu(status);
        if (status->err_code != NERV_NORMAL)
        {
            free(context); /* don't leak the half-built context */
            return NULL;
        }
    }
    new_cuda_handles(context, dev, status);
    if (status->err_code != NERV_NORMAL)
    {
        free(context); /* don't leak the half-built context */
        return NULL;
    }
    context->profile = nerv_hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
    NERV_SET_STATUS(status, NERV_NORMAL, 0);
    return context;
}

/*
 * Release the CUDA handles of `context`, destroy its profile hash map and
 * free the context structure.
 *
 * A NULL `context` is accepted and treated as a no-op (like free(NULL)),
 * which matters because nerv_cuda_context_create() returns NULL on failure.
 *
 * `status` is set to NERV_NORMAL on success.  If releasing the handles
 * fails, the error stays in `status` and the context is left allocated.
 */
void nerv_cuda_context_destroy(CuContext *context, Status *status) {
    /* Start from a known state so the check below never reads a stale or
     * uninitialised err_code when free_cuda_handles() has nothing to do. */
    NERV_SET_STATUS(status, NERV_NORMAL, 0);
    if (!context)
        return;
    free_cuda_handles(context, status);
    if (status->err_code != NERV_NORMAL)
        return;
    nerv_hashmap_destroy(context->profile);
    free(context);
    NERV_SET_STATUS(status, NERV_NORMAL, 0);
}

/*
 * Switch `context` over to GPU `dev`: reset the current device and then
 * create a fresh set of handles on the requested one.
 *
 * `status` carries the outcome; it ends up as NERV_NORMAL only when the
 * new handles were created successfully.
 */
void nerv_cuda_context_select_gpu(CuContext *context,
                                    int dev, Status *status)
{
    /*
     * The existing handles are deliberately NOT passed to
     * free_cuda_handles(): because of the cudaDeviceReset() below, the
     * context is simply marked as owning no handles.
     */
    context->has_handle = 0;
    CUDA_SAFE_SYNC_CALL(cudaDeviceReset(), status);

    new_cuda_handles(context, dev, status);
    if (status->err_code == NERV_NORMAL)
    {
        NERV_SET_STATUS(status, NERV_NORMAL, 0);
    }
}

/*
 * Single-precision instantiation of the generic implementation.
 * The name-building macros below make cuda_matrix_(X), nerv_matrix_(X),
 * cudak_(X) and NERV_CUBLAS_(X) expand to cuda_matrix_float_X,
 * nerv_matrix_cuda_float_X, cudak_float_X and cublasSX respectively while
 * generic/cumatrix.c is being included.
 */
#define MATRIX_USE_FLOAT
#define cuda_matrix_(NAME) cuda_matrix_float_##NAME
#define nerv_matrix_(NAME) nerv_matrix_cuda_float_##NAME
#define cudak_(NAME) cudak_float_ ## NAME
#define NERV_CUBLAS_(NAME) cublasS##NAME
#include "generic/cumatrix.c"

/*
 * Drop every float-specific macro so the same generic file can be included
 * again.  The MATRIX_ELEM* macros are not defined in this file; presumably
 * they are set up by the generic code according to MATRIX_USE_FLOAT --
 * TODO confirm.
 */
#undef NERV_CUBLAS_
#undef cudak_
#undef nerv_matrix_
#undef cuda_matrix_
#undef MATRIX_USE_FLOAT
#undef MATRIX_ELEM
#undef MATRIX_ELEM_PTR
#undef MATRIX_ELEM_PTR_BASE
#undef MATRIX_ELEM_FMT
#undef MATRIX_ELEM_WRITE_FMT

/*
 * Double-precision instantiation: same generic file, this time with the
 * *_double_* names and the cublasD* family.
 */
#define MATRIX_USE_DOUBLE
#define cuda_matrix_(NAME) cuda_matrix_double_##NAME
#define nerv_matrix_(NAME) nerv_matrix_cuda_double_##NAME
#define cudak_(NAME) cudak_double_ ## NAME
#define NERV_CUBLAS_(NAME) cublasD##NAME
#include "generic/cumatrix.c"