add network config for timit baseline

author: Determinant <ted.sybil@gmail.com> 2016-02-29 19:49:18 +0800
committer: Determinant <ted.sybil@gmail.com> 2016-02-29 19:49:18 +0800
commit: accadc2c0c9a12a07ff39df3ee2e02f399054d8f (patch)
tree: 39e2ffb1e14527eecc69fa2855e29772969c2e3a
parent: 550680eacd00555817df19d2b59a20a92df77c42 (diff)
9 files changed, 1047 insertions, 14 deletions
diff --git a/Makefile b/Makefile
index 0982295..d8612dd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,28 @@
 .PHONY: all clean install luajit luarocks speech
 SHELL := /bin/bash
 PREFIX := $(CURDIR)/install/
+#CUDA_BASE := /usr/local/cuda-7.0
+CUDA_BASE := /usr/local/cuda
+BLAS_BASE := /usr/lib/
+BLAS_LDFLAGS := -L$(BLAS_BASE) -Wl,-rpath=$(BLAS_BASE)
+BLAS_TYPE := atlas
+KALDI_BASE := /speechlab/tools/KALDI/kaldi-master/
+
+MKL_LDFLAGS := -lmkl_rt
+ATLAS_LDFLAGS := -lcblas -llapack_atlas
+ifeq ($(BLAS_TYPE), mkl)
+BLAS_LDFLAGS += $(MKL_LDFLAGS)
+else ifeq ($(BLAS_TYPE), atlas)
+BLAS_LDFLAGS += $(ATLAS_LDFLAGS)
+else
+$(error Invalid blas type)
+endif
+export CUDA_BASE
+export KALDI_BASE
+export BLAS_LDFLAGS
+
+.PHONY: htk_io kaldi_io kaldi_decode
+
 all: luajit luarocks install
 luajit:
 	PREFIX=$(PREFIX) ./tools/build_luajit.sh
@@ -8,12 +30,12 @@ luarocks:
 	PREFIX=$(PREFIX) ./tools/build_luarocks.sh
 install:
 	cd nerv; $(PREFIX)/bin/luarocks make CFLAGS=$(CFLAGS)
-speech:
-	cd speech/speech_utils; $(PREFIX)/bin/luarocks make
-	cd speech/htk_io; $(PREFIX)/bin/luarocks make
-	cd speech/kaldi_io; $(PREFIX)/bin/luarocks make
+speech: htk_io kaldi_io kaldi_decode
+htk_io kaldi_io kaldi_decode:
+	cd speech/$@; $(PREFIX)/bin/luarocks make
 clean:
 	cd nerv && make clean
 	cd speech/speech_utils && make clean
 	cd speech/htk_io && make clean
 	cd speech/kaldi_io && make clean
+	cd speech/kaldi_decode && make clean
diff --git a/nerv/Makefile b/nerv/Makefile
index a2155b9..9ef3212 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -39,12 +39,11 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			tnn/init.lua tnn/sutil.lua tnn/tnn.lua
 
 INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
-#CUDA_BASE := /usr/local/cuda-7.0
-CUDA_BASE := /usr/local/cuda
 CUDA_INCLUDE := -I $(CUDA_BASE)/include/
 INCLUDE += $(CUDA_INCLUDE)
 
-LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas -lcurand
+CUDA_LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas -lcurand
+
 NVCC := $(CUDA_BASE)/bin/nvcc
 EMPTY :=
 SPACE := $(EMPTY) $(EMPTY)
@@ -65,11 +64,11 @@ $(LUA_DIR)/%.lua: %.lua
 	cp $< $@
 
 $(LIB_PATH)/libnervcore.so: $(CORE_OBJS)
-	gcc -shared -o $@ $^ $(LDFLAGS) -lcblas
+	gcc -shared -o $@ $^ $(LDFLAGS) $(CUDA_LDFLAGS) $(BLAS_LDFLAGS)
 $(LIB_PATH)/libluaT.so: $(LUAT_OBJS)
-	gcc -shared -o $@ $^ $(LDFLAGS)
+	gcc -shared -o $@ $^
 $(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
-	gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT
+	gcc -shared -o $@ $(NERV_OBJS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT
 
 $(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c
 $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c
diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
index 51052ba..cacc401 100644
--- a/nerv/examples/swb_baseline.lua
+++ b/nerv/examples/swb_baseline.lua
@@ -9,7 +9,7 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
         cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
         htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
         initialized_param = {"/slfs1/users/mfy43/swb_init.nerv",
-                "/slfs1/users/mfy43/swb_global_transf.nerv"},
+                            "/slfs1/users/mfy43/swb_global_transf.nerv"},
         debug = false}
 
 function make_layer_repo(param_repo)
diff --git a/nerv/examples/swb_baseline2.lua b/nerv/examples/swb_baseline2.lua
new file mode 100644
index 0000000..0e2a6e0
--- /dev/null
+++ b/nerv/examples/swb_baseline2.lua
@@ -0,0 +1,201 @@
+require 'htk_io'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+        cumat_type = nerv.CuMatrixFloat,
+        mmat_type = nerv.MMatrixFloat,
+        rearrange = true, -- just to make the context order consistent with old results, deprecated
+        frm_ext = 5,
+        frm_trim = 5, -- trim the first and last 5 frames, TNet just does this, deprecated
+        tr_scp = "/speechlab/users/mfy43/swb50/train_bp.scp",
+        cv_scp = "/speechlab/users/mfy43/swb50/train_cv.scp",
+        htk_conf = "/speechlab/users/mfy43/swb50/plp_0_d_a.conf",
+        initialized_param = {"/speechlab/users/mfy43/swb50/swb_init.nerv",
+                            "/speechlab/users/mfy43/swb50/swb_global_transf.nerv"},
+        debug = false}
+
+function make_layer_repo(param_repo)
+    local layer_repo = nerv.LayerRepo(
+    {
+        -- global transf
+        ["nerv.BiasLayer"] =
+        {
+            blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
+            blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
+        },
+        ["nerv.WindowLayer"] =
+        {
+            wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
+            wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
+        },
+        -- biased linearity
+        ["nerv.AffineLayer"] =
+        {
+            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+            {dim_in = {429}, dim_out = {2048}}},
+            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+            {dim_in = {2048}, dim_out = {3001}}}
+        },
+        ["nerv.SigmoidLayer"] =
+        {
+            sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+        },
+        ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output
+        {
+            ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}}
+        },
+        ["nerv.SoftmaxLayer"] = -- softmax for decode output
+        {
+            softmax = {{}, {dim_in = {3001}, dim_out = {3001}}}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            global_transf = {{}, {
+                dim_in = {429}, dim_out = {429},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "blayer1[1]",
+                    ["blayer1[1]"] = "wlayer1[1]",
+                    ["wlayer1[1]"] = "blayer2[1]",
+                    ["blayer2[1]"] = "wlayer2[1]",
+                    ["wlayer2[1]"] = "<output>[1]"
+                }
+            }},
+            main = {{}, {
+                dim_in = {429}, dim_out = {3001},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "affine0[1]",
+                    ["affine0[1]"] = "sigmoid0[1]",
+                    ["sigmoid0[1]"] = "affine1[1]",
+                    ["affine1[1]"] = "sigmoid1[1]",
+                    ["sigmoid1[1]"] = "affine2[1]",
+                    ["affine2[1]"] = "sigmoid2[1]",
+                    ["sigmoid2[1]"] = "affine3[1]",
+                    ["affine3[1]"] = "sigmoid3[1]",
+                    ["sigmoid3[1]"] = "affine4[1]",
+                    ["affine4[1]"] = "sigmoid4[1]",
+                    ["sigmoid4[1]"] = "affine5[1]",
+                    ["affine5[1]"] = "sigmoid5[1]",
+                    ["sigmoid5[1]"] = "affine6[1]",
+                    ["affine6[1]"] = "sigmoid6[1]",
+                    ["sigmoid6[1]"] = "affine7[1]",
+                    ["affine7[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            ce_output = {{}, {
+                dim_in = {429, 1}, dim_out = {1},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "ce_crit[1]",
+                    ["<input>[2]"] = "ce_crit[2]",
+                    ["ce_crit[1]"] = "<output>[1]"
+                }
+            }},
+            softmax_output = {{}, {
+                dim_in = {429}, dim_out = {3001},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "softmax[1]",
+                    ["softmax[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    return layer_repo
+end
+
+function get_network(layer_repo)
+    return layer_repo:get_layer("ce_output")
+end
+
+function get_decode_network(layer_repo)
+    return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo)
+    return layer_repo:get_layer("global_transf")
+end
+
+function make_readers(scp_file, layer_repo)
+    return {
+                {reader = nerv.TNetReader(gconf,
+                    {
+                        id = "main_scp",
+                        scp_file = scp_file,
+                        conf_file = gconf.htk_conf,
+                        frm_ext = gconf.frm_ext,
+                        mlfs = {
+                            phone_state = {
+                                file = "/speechlab/users/mfy43/swb50/ref.mlf",
+                                format = "map",
+                                format_arg = "/speechlab/users/mfy43/swb50/dict",
+                                dir = "*/",
+                                ext = "lab"
+                            }
+                        }
+                    }),
+                data = {main_scp = 429, phone_state = 1}}
+            }
+end
+
+function make_buffer(readers)
+    return nerv.SGDBuffer(gconf,
+        {
+            buffer_size = gconf.buffer_size,
+            randomize = gconf.randomize,
+            readers = readers,
+            use_gpu = true
+        })
+end
+
+function get_input_order()
+    return {{id = "main_scp", global_transf = true},
+            {id = "phone_state"}}
+end
+
+function get_accuracy(layer_repo)
+    local ce_crit = layer_repo:get_layer("ce_crit")
+    return ce_crit.total_correct / ce_crit.total_frames * 100
+end
+
+function print_stat(layer_repo)
+    local ce_crit = layer_repo:get_layer("ce_crit")
+    nerv.info("*** training stat begin ***")
+    nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce)
+    nerv.printf("correct:\t\t%d\n", ce_crit.total_correct)
+    nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames)
+    nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames)
+    nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
+    nerv.info("*** training stat end ***")
+end
diff --git a/nerv/examples/timit_baseline2.lua b/nerv/examples/timit_baseline2.lua
new file mode 100644
index 0000000..174b9e7
--- /dev/null
+++ b/nerv/examples/timit_baseline2.lua
@@ -0,0 +1,215 @@
+require 'kaldi_io'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+        cumat_type = nerv.CuMatrixFloat,
+        mmat_type = nerv.MMatrixFloat,
+        frm_ext = 5,
+        tr_scp = "ark:/speechlab/tools/KALDI/kaldi-master/src/featbin/copy-feats " ..
+                    "scp:/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/train.scp ark:- |",
+        cv_scp = "ark:/speechlab/tools/KALDI/kaldi-master/src/featbin/copy-feats " ..
+                    "scp:/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/cv.scp ark:- |",
+        initialized_param = {"/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_init.nerv",
+                            "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_output.nerv",
+                            "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_trans.nerv"},
+        decode_param = {"/speechlab/users/mfy43/timit/nnet_init_20160229015745_iter_13_lr0.013437_tr72.434_cv58.729.nerv",
+                        "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_trans.nerv"},
+        debug = false}
+
+function make_layer_repo(param_repo)
+    local layer_repo = nerv.LayerRepo(
+    {
+        -- global transf
+        ["nerv.BiasLayer"] =
+        {
+            blayer1 = {{bias = "bias0"}, {dim_in = {440}, dim_out = {440}}}
+        },
+        ["nerv.WindowLayer"] =
+        {
+            wlayer1 = {{window = "window0"}, {dim_in = {440}, dim_out = {440}}}
+        },
+        -- biased linearity
+        ["nerv.AffineLayer"] =
+        {
+            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+            {dim_in = {440}, dim_out = {1024}}},
+            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+            {dim_in = {1024}, dim_out = {1024}}},
+            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+            {dim_in = {1024}, dim_out = {1024}}},
+            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+            {dim_in = {1024}, dim_out = {1024}}},
+            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+            {dim_in = {1024}, dim_out = {1024}}},
+            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+            {dim_in = {1024}, dim_out = {1024}}},
+            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+            {dim_in = {1024}, dim_out = {1959}}}
+        },
+        ["nerv.SigmoidLayer"] =
+        {
+            sigmoid0 = {{}, {dim_in = {1024}, dim_out = {1024}}},
+            sigmoid1 = {{}, {dim_in = {1024}, dim_out = {1024}}},
+            sigmoid2 = {{}, {dim_in = {1024}, dim_out = {1024}}},
+            sigmoid3 = {{}, {dim_in = {1024}, dim_out = {1024}}},
+            sigmoid4 = {{}, {dim_in = {1024}, dim_out = {1024}}},
+            sigmoid5 = {{}, {dim_in = {1024}, dim_out = {1024}}}
+        },
+        ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output
+        {
+            ce_crit = {{}, {dim_in = {1959, 1}, dim_out = {1}, compressed = true}}
+        },
+        ["nerv.SoftmaxLayer"] = -- softmax for decode output
+        {
+            softmax = {{}, {dim_in = {1959}, dim_out = {1959}}}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            global_transf = {{}, {
+                dim_in = {440}, dim_out = {440},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "blayer1[1]",
+                    ["blayer1[1]"] = "wlayer1[1]",
+                    ["wlayer1[1]"] = "<output>[1]"
+                }
+            }},
+            main = {{}, {
+                dim_in = {440}, dim_out = {1959},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "affine0[1]",
+                    ["affine0[1]"] = "sigmoid0[1]",
+                    ["sigmoid0[1]"] = "affine1[1]",
+                    ["affine1[1]"] = "sigmoid1[1]",
+                    ["sigmoid1[1]"] = "affine2[1]",
+                    ["affine2[1]"] = "sigmoid2[1]",
+                    ["sigmoid2[1]"] = "affine3[1]",
+                    ["affine3[1]"] = "sigmoid3[1]",
+                    ["sigmoid3[1]"] = "affine4[1]",
+                    ["affine4[1]"] = "sigmoid4[1]",
+                    ["sigmoid4[1]"] = "affine5[1]",
+                    ["affine5[1]"] = "sigmoid5[1]",
+                    ["sigmoid5[1]"] = "affine6[1]",
+                    ["affine6[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            ce_output = {{}, {
+                dim_in = {440, 1}, dim_out = {1},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "ce_crit[1]",
+                    ["<input>[2]"] = "ce_crit[2]",
+                    ["ce_crit[1]"] = "<output>[1]"
+                }
+            }},
+            softmax_output = {{}, {
+                dim_in = {440}, dim_out = {1959},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "softmax[1]",
+                    ["softmax[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    return layer_repo
+end
+
+function get_network(layer_repo)
+    return layer_repo:get_layer("ce_output")
+end
+
+function get_decode_network(layer_repo)
+    return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo)
+    return layer_repo:get_layer("global_transf")
+end
+
+function make_readers(scp_file, layer_repo)
+    return {
+                {reader = nerv.KaldiReader(gconf,
+                    {
+                        id = "main_scp",
+                        feature_rspecifier = scp_file,
+                        conf_file = gconf.htk_conf,
+                        frm_ext = gconf.frm_ext,
+                        mlfs = {
+                            phone_state = {
+                                targets_rspecifier = "ark:/speechlab/tools/KALDI/kaldi-master/src/bin/ali-to-pdf " ..
+                                                        "/speechlab/users/mfy43/timit/s5/exp/tri3_ali/final.mdl " ..
+                                                        "\"ark:gunzip -c /speechlab/users/mfy43/timit/s5/exp/tri3_ali/ali.*.gz |\" " ..
+                                                        "ark:- | " ..
+                                                    "/speechlab/tools/KALDI/kaldi-master/src/bin/ali-to-post " ..
+                                                        "ark:- ark:- |",
+                                format = "map"
+                            }
+                        }
+                    }),
+                data = {main_scp = 440, phone_state = 1}}
+            }
+end
+
+function make_decode_readers(scp_file, layer_repo)
+    return {
+                {reader = nerv.KaldiReader(gconf,
+                    {
+                        id = "main_scp",
+                        feature_rspecifier = scp_file,
+                        conf_file = gconf.htk_conf,
+                        frm_ext = gconf.frm_ext,
+                        mlfs = {},
+                        need_key = true
+                    }),
+                data = {main_scp = 440, phone_state = 1}}
+            }
+end
+
+function make_buffer(readers)
+    return nerv.SGDBuffer(gconf,
+        {
+            buffer_size = gconf.buffer_size,
+            randomize = gconf.randomize,
+            readers = readers,
+            use_gpu = true
+        })
+end
+
+function get_input_order()
+    return {{id = "main_scp", global_transf = true},
+            {id = "phone_state"}}
+end
+
+function get_decode_input_order()
+    return {{id = "main_scp", global_transf = true}}
+end
+
+function get_accuracy(layer_repo)
+    local ce_crit = layer_repo:get_layer("ce_crit")
+    return ce_crit.total_correct / ce_crit.total_frames * 100
+end
+
+function print_stat(layer_repo)
+    local ce_crit = layer_repo:get_layer("ce_crit")
+    nerv.info("*** training stat begin ***")
+    nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce)
+    nerv.printf("correct:\t\t%d\n", ce_crit.total_correct)
+    nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames)
+    nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames)
+    nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
+    nerv.info("*** training stat end ***")
+end
diff --git a/nerv/init.lua b/nerv/init.lua
index e7d668c..4aa7a45 100644
--- a/nerv/init.lua
+++ b/nerv/init.lua
@@ -25,7 +25,7 @@ end
 -- @param fmt the format string
 -- @param ... args, the data to be formatted
 function nerv.printf(fmt, ...)
-    io.write(nerv.sprintf(fmt, ...))
+    io.stderr:write(nerv.sprintf(fmt, ...))
 end
 
 --- Raise an global error with the formatted message.
diff --git a/nerv/lib/cblas.h b/nerv/lib/cblas.h
new file mode 100644
index 0000000..4087ffb
--- /dev/null
+++ b/nerv/lib/cblas.h
@@ -0,0 +1,596 @@
+#ifndef CBLAS_H
+
+#ifndef CBLAS_ENUM_DEFINED_H
+   #define CBLAS_ENUM_DEFINED_H
+   enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 };
+   enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113,
+                         AtlasConj=114};
+   enum CBLAS_UPLO  {CblasUpper=121, CblasLower=122};
+   enum CBLAS_DIAG  {CblasNonUnit=131, CblasUnit=132};
+   enum CBLAS_SIDE  {CblasLeft=141, CblasRight=142};
+#endif
+
+#ifndef CBLAS_ENUM_ONLY
+#define CBLAS_H
+#define CBLAS_INDEX int
+
+int cblas_errprn(int ierr, int info, char *form, ...);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 1 BLAS functions (complex are recast as routines)
+ * ===========================================================================
+ */
+float  cblas_sdsdot(const int N, const float alpha, const float *X,
+                    const int incX, const float *Y, const int incY);
+double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
+                   const int incY);
+float  cblas_sdot(const int N, const float  *X, const int incX,
+                  const float  *Y, const int incY);
+double cblas_ddot(const int N, const double *X, const int incX,
+                  const double *Y, const int incY);
+/*
+ * Functions having prefixes Z and C only
+ */
+void   cblas_cdotu_sub(const int N, const void *X, const int incX,
+                       const void *Y, const int incY, void *dotu);
+void   cblas_cdotc_sub(const int N, const void *X, const int incX,
+                       const void *Y, const int incY, void *dotc);
+
+void   cblas_zdotu_sub(const int N, const void *X, const int incX,
+                       const void *Y, const int incY, void *dotu);
+void   cblas_zdotc_sub(const int N, const void *X, const int incX,
+                       const void *Y, const int incY, void *dotc);
+
+
+/*
+ * Functions having prefixes S D SC DZ
+ */
+float  cblas_snrm2(const int N, const float *X, const int incX);
+float  cblas_sasum(const int N, const float *X, const int incX);
+
+double cblas_dnrm2(const int N, const double *X, const int incX);
+double cblas_dasum(const int N, const double *X, const int incX);
+
+float  cblas_scnrm2(const int N, const void *X, const int incX);
+float  cblas_scasum(const int N, const void *X, const int incX);
+
+double cblas_dznrm2(const int N, const void *X, const int incX);
+double cblas_dzasum(const int N, const void *X, const int incX);
+
+
+/*
+ * Functions having standard 4 prefixes (S D C Z)
+ */
+CBLAS_INDEX cblas_isamax(const int N, const float  *X, const int incX);
+CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
+CBLAS_INDEX cblas_icamax(const int N, const void   *X, const int incX);
+CBLAS_INDEX cblas_izamax(const int N, const void   *X, const int incX);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 1 BLAS routines
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (s, d, c, z)
+ */
+void cblas_sswap(const int N, float *X, const int incX,
+                 float *Y, const int incY);
+void cblas_scopy(const int N, const float *X, const int incX,
+                 float *Y, const int incY);
+void cblas_saxpy(const int N, const float alpha, const float *X,
+                 const int incX, float *Y, const int incY);
+void catlas_saxpby(const int N, const float alpha, const float *X,
+                  const int incX, const float beta, float *Y, const int incY);
+void catlas_sset
+   (const int N, const float alpha, float *X, const int incX);
+
+void cblas_dswap(const int N, double *X, const int incX,
+                 double *Y, const int incY);
+void cblas_dcopy(const int N, const double *X, const int incX,
+                 double *Y, const int incY);
+void cblas_daxpy(const int N, const double alpha, const double *X,
+                 const int incX, double *Y, const int incY);
+void catlas_daxpby(const int N, const double alpha, const double *X,
+                  const int incX, const double beta, double *Y, const int incY);
+void catlas_dset
+   (const int N, const double alpha, double *X, const int incX);
+
+void cblas_cswap(const int N, void *X, const int incX,
+                 void *Y, const int incY);
+void cblas_ccopy(const int N, const void *X, const int incX,
+                 void *Y, const int incY);
+void cblas_caxpy(const int N, const void *alpha, const void *X,
+                 const int incX, void *Y, const int incY);
+void catlas_caxpby(const int N, const void *alpha, const void *X,
+                  const int incX, const void *beta, void *Y, const int incY);
+void catlas_cset
+   (const int N, const void *alpha, void *X, const int incX);
+
+void cblas_zswap(const int N, void *X, const int incX,
+                 void *Y, const int incY);
+void cblas_zcopy(const int N, const void *X, const int incX,
+                 void *Y, const int incY);
+void cblas_zaxpy(const int N, const void *alpha, const void *X,
+                 const int incX, void *Y, const int incY);
+void catlas_zaxpby(const int N, const void *alpha, const void *X,
+                  const int incX, const void *beta, void *Y, const int incY);
+void catlas_zset
+   (const int N, const void *alpha, void *X, const int incX);
+
+
+/*
+ * Routines with S and D prefix only
+ */
+void cblas_srotg(float *a, float *b, float *c, float *s);
+void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
+void cblas_srot(const int N, float *X, const int incX,
+                float *Y, const int incY, const float c, const float s);
+void cblas_srotm(const int N, float *X, const int incX,
+                float *Y, const int incY, const float *P);
+
+void cblas_drotg(double *a, double *b, double *c, double *s);
+void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
+void cblas_drot(const int N, double *X, const int incX,
+                double *Y, const int incY, const double c, const double s);
+void cblas_drotm(const int N, double *X, const int incX,
+                double *Y, const int incY, const double *P);
+
+
+/*
+ * Routines with S D C Z CS and ZD prefixes
+ */
+void cblas_sscal(const int N, const float alpha, float *X, const int incX);
+void cblas_dscal(const int N, const double alpha, double *X, const int incX);
+void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
+void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
+void cblas_csscal(const int N, const float alpha, void *X, const int incX);
+void cblas_zdscal(const int N, const double alpha, void *X, const int incX);
+
+/*
+ * Extra reference routines provided by ATLAS, but not mandated by the standard
+ */
+void cblas_crotg(void *a, void *b, void *c, void *s);
+void cblas_zrotg(void *a, void *b, void *c, void *s);
+void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY,
+                 const float c, const float s);
+void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY,
+                 const double c, const double s);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 2 BLAS
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void cblas_sgemv(const enum CBLAS_ORDER Order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 const float *X, const int incX, const float beta,
+                 float *Y, const int incY);
+void cblas_sgbmv(const enum CBLAS_ORDER Order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const int KL, const int KU, const float alpha,
+                 const float *A, const int lda, const float *X,
+                 const int incX, const float beta, float *Y, const int incY);
+void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const float *A, const int lda,
+                 float *X, const int incX);
+void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const int K, const float *A, const int lda,
+                 float *X, const int incX);
+void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const float *Ap, float *X, const int incX);
+void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const float *A, const int lda, float *X,
+                 const int incX);
+void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const int K, const float *A, const int lda,
+                 float *X, const int incX);
+void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const float *Ap, float *X, const int incX);
+
+void cblas_dgemv(const enum CBLAS_ORDER Order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const double alpha, const double *A, const int lda,
+                 const double *X, const int incX, const double beta,
+                 double *Y, const int incY);
+void cblas_dgbmv(const enum CBLAS_ORDER Order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const int KL, const int KU, const double alpha,
+                 const double *A, const int lda, const double *X,
+                 const int incX, const double beta, double *Y, const int incY);
+void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const double *A, const int lda,
+                 double *X, const int incX);
+void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const int K, const double *A, const int lda,
+                 double *X, const int incX);
+void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const double *Ap, double *X, const int incX);
+void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const double *A, const int lda, double *X,
+                 const int incX);
+void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const int K, const double *A, const int lda,
+                 double *X, const int incX);
+void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const double *Ap, double *X, const int incX);
+
+void cblas_cgemv(const enum CBLAS_ORDER Order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 const void *X, const int incX, const void *beta,
+                 void *Y, const int incY);
+void cblas_cgbmv(const enum CBLAS_ORDER Order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const int KL, const int KU, const void *alpha,
+                 const void *A, const int lda, const void *X,
+                 const int incX, const void *beta, void *Y, const int incY);
+void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const void *A, const int lda,
+                 void *X, const int incX);
+void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const int K, const void *A, const int lda,
+                 void *X, const int incX);
+void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const void *Ap, void *X, const int incX);
+void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const void *A, const int lda, void *X,
+                 const int incX);
+void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const int K, const void *A, const int lda,
+                 void *X, const int incX);
+void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const void *Ap, void *X, const int incX);
+
+void cblas_zgemv(const enum CBLAS_ORDER Order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 const void *X, const int incX, const void *beta,
+                 void *Y, const int incY);
+void cblas_zgbmv(const enum CBLAS_ORDER Order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const int KL, const int KU, const void *alpha,
+                 const void *A, const int lda, const void *X,
+                 const int incX, const void *beta, void *Y, const int incY);
+void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const void *A, const int lda,
+                 void *X, const int incX);
+void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const int K, const void *A, const int lda,
+                 void *X, const int incX);
+void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const void *Ap, void *X, const int incX);
+void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const void *A, const int lda, void *X,
+                 const int incX);
+void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const int K, const void *A, const int lda,
+                 void *X, const int incX);
+void cblas_ztpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+                 const int N, const void *Ap, void *X, const int incX);
+
+
+/*
+ * Routines with S and D prefixes only
+ */
+void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const float alpha, const float *A,
+                 const int lda, const float *X, const int incX,
+                 const float beta, float *Y, const int incY);
+void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const int K, const float alpha, const float *A,
+                 const int lda, const float *X, const int incX,
+                 const float beta, float *Y, const int incY);
+void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const float alpha, const float *Ap,
+                 const float *X, const int incX,
+                 const float beta, float *Y, const int incY);
+void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N,
+                const float alpha, const float *X, const int incX,
+                const float *Y, const int incY, float *A, const int lda);
+void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const float alpha, const float *X,
+                const int incX, float *A, const int lda);
+void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const float alpha, const float *X,
+                const int incX, float *Ap);
+void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const float alpha, const float *X,
+                const int incX, const float *Y, const int incY, float *A,
+                const int lda);
+void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const float alpha, const float *X,
+                const int incX, const float *Y, const int incY, float *A);
+
+void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const double alpha, const double *A,
+                 const int lda, const double *X, const int incX,
+                 const double beta, double *Y, const int incY);
+void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const int K, const double alpha, const double *A,
+                 const int lda, const double *X, const int incX,
+                 const double beta, double *Y, const int incY);
+void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const double alpha, const double *Ap,
+                 const double *X, const int incX,
+                 const double beta, double *Y, const int incY);
+void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
+                const double alpha, const double *X, const int incX,
+                const double *Y, const int incY, double *A, const int lda);
+void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const double alpha, const double *X,
+                const int incX, double *A, const int lda);
+void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const double alpha, const double *X,
+                const int incX, double *Ap);
+void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const double alpha, const double *X,
+                const int incX, const double *Y, const int incY, double *A,
+                const int lda);
+void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const double alpha, const double *X,
+                const int incX, const double *Y, const int incY, double *A);
+
+
+/*
+ * Routines with C and Z prefixes only
+ */
+void cblas_chemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const void *alpha, const void *A,
+                 const int lda, const void *X, const int incX,
+                 const void *beta, void *Y, const int incY);
+void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const int K, const void *alpha, const void *A,
+                 const int lda, const void *X, const int incX,
+                 const void *beta, void *Y, const int incY);
+void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const void *alpha, const void *Ap,
+                 const void *X, const int incX,
+                 const void *beta, void *Y, const int incY);
+void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N,
+                 const void *alpha, const void *X, const int incX,
+                 const void *Y, const int incY, void *A, const int lda);
+void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N,
+                 const void *alpha, const void *X, const int incX,
+                 const void *Y, const int incY, void *A, const int lda);
+void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const float alpha, const void *X, const int incX,
+                void *A, const int lda);
+void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const float alpha, const void *X,
+                const int incX, void *A);
+void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+                const void *alpha, const void *X, const int incX,
+                const void *Y, const int incY, void *A, const int lda);
+void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+                const void *alpha, const void *X, const int incX,
+                const void *Y, const int incY, void *Ap);
+
+void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const void *alpha, const void *A,
+                 const int lda, const void *X, const int incX,
+                 const void *beta, void *Y, const int incY);
+void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const int K, const void *alpha, const void *A,
+                 const int lda, const void *X, const int incX,
+                 const void *beta, void *Y, const int incY);
+void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const int N, const void *alpha, const void *Ap,
+                 const void *X, const int incX,
+                 const void *beta, void *Y, const int incY);
+void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
+                 const void *alpha, const void *X, const int incX,
+                 const void *Y, const int incY, void *A, const int lda);
+void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
+                 const void *alpha, const void *X, const int incX,
+                 const void *Y, const int incY, void *A, const int lda);
+void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const double alpha, const void *X, const int incX,
+                void *A, const int lda);
+void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                const int N, const double alpha, const void *X,
+                const int incX, void *A);
+void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+                const void *alpha, const void *X, const int incX,
+                const void *Y, const int incY, void *A, const int lda);
+void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+                const void *alpha, const void *X, const int incX,
+                const void *Y, const int incY, void *Ap);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 3 BLAS
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const float alpha, const float *A,
+                 const int lda, const float *B, const int ldb,
+                 const float beta, float *C, const int ldc);
+void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 const float *B, const int ldb, const float beta,
+                 float *C, const int ldc);
+void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                 const float alpha, const float *A, const int lda,
+                 const float beta, float *C, const int ldc);
+void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                  const float alpha, const float *A, const int lda,
+                  const float *B, const int ldb, const float beta,
+                  float *C, const int ldc);
+void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_DIAG Diag, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb);
+void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_DIAG Diag, const int M, const int N,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb);
+
+void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const double alpha, const double *A,
+                 const int lda, const double *B, const int ldb,
+                 const double beta, double *C, const int ldc);
+void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const int M, const int N,
+                 const double alpha, const double *A, const int lda,
+                 const double *B, const int ldb, const double beta,
+                 double *C, const int ldc);
+void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                 const double alpha, const double *A, const int lda,
+                 const double beta, double *C, const int ldc);
+void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                  const double alpha, const double *A, const int lda,
+                  const double *B, const int ldb, const double beta,
+                  double *C, const int ldc);
+void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_DIAG Diag, const int M, const int N,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb);
+void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_DIAG Diag, const int M, const int N,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb);
+
+void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const void *alpha, const void *A,
+                 const int lda, const void *B, const int ldb,
+                 const void *beta, void *C, const int ldc);
+void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 const void *B, const int ldb, const void *beta,
+                 void *C, const int ldc);
+void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                 const void *alpha, const void *A, const int lda,
+                 const void *beta, void *C, const int ldc);
+void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                  const void *alpha, const void *A, const int lda,
+                  const void *B, const int ldb, const void *beta,
+                  void *C, const int ldc);
+void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_DIAG Diag, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 void *B, const int ldb);
+void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_DIAG Diag, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 void *B, const int ldb);
+
+void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const void *alpha, const void *A,
+                 const int lda, const void *B, const int ldb,
+                 const void *beta, void *C, const int ldc);
+void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 const void *B, const int ldb, const void *beta,
+                 void *C, const int ldc);
+void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                 const void *alpha, const void *A, const int lda,
+                 const void *beta, void *C, const int ldc);
+void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                  const void *alpha, const void *A, const int lda,
+                  const void *B, const int ldb, const void *beta,
+                  void *C, const int ldc);
+void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_DIAG Diag, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 void *B, const int ldb);
+void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+                 const enum CBLAS_DIAG Diag, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 void *B, const int ldb);
+
+
+/*
+ * Routines with prefixes C and Z only
+ */
+void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 const void *B, const int ldb, const void *beta,
+                 void *C, const int ldc);
+void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                 const float alpha, const void *A, const int lda,
+                 const float beta, void *C, const int ldc);
+void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                  const void *alpha, const void *A, const int lda,
+                  const void *B, const int ldb, const float beta,
+                  void *C, const int ldc);
+void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+                 const enum CBLAS_UPLO Uplo, const int M, const int N,
+                 const void *alpha, const void *A, const int lda,
+                 const void *B, const int ldb, const void *beta,
+                 void *C, const int ldc);
+void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                 const double alpha, const void *A, const int lda,
+                 const double beta, void *C, const int ldc);
+void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+                  const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+                  const void *alpha, const void *A, const int lda,
+                  const void *B, const int ldb, const double beta,
+                  void *C, const int ldc);
+
+int cblas_errprn(int ierr, int info, char *form, ...);
+
+#endif  /* end #ifdef CBLAS_ENUM_ONLY */
+#endif
diff --git a/nerv/lib/matrix/generic/mmatrix.c b/nerv/lib/matrix/generic/mmatrix.c
index 485d778..fb99b53 100644
--- a/nerv/lib/matrix/generic/mmatrix.c
+++ b/nerv/lib/matrix/generic/mmatrix.c
@@ -8,10 +8,10 @@
                                                 context, status)
 #define NERV_GENERIC_MATRIX
 #include "../../common.h"
+#include "../../cblas.h"
 #include "../../io/chunk_file.h"
 #include <string.h>
 #include <math.h>
-#include <cblas.h>
 #include <float.h>
 
 Matrix *nerv_matrix_(colsum)(Matrix *a, MContext *context, Status *status) {
diff --git a/nerv/matrix/generic/mmatrix.c b/nerv/matrix/generic/mmatrix.c
index 69000b7..1f37173 100644
--- a/nerv/matrix/generic/mmatrix.c
+++ b/nerv/matrix/generic/mmatrix.c
@@ -8,10 +8,10 @@
 #define MATRIX_BASE_TNAME nerv_matrix_host_tname
 #define NERV_GENERIC_MATRIX
 #include "../../lib/common.h"
+#include "../../lib/cblas.h"
 #include "../../lib/matrix/generic/mmatrix.h"
 #include "../../io/chunk_file.h"
 #include <string.h>
-#include <cblas.h>
 
 #define BLAS_OP_N CblasNoTrans
 static int nerv_matrix_(lua_get_blas_op)(char ch) {
author	Determinant <ted.sybil@gmail.com>	2016-02-29 19:49:18 +0800
committer	Determinant <ted.sybil@gmail.com>	2016-02-29 19:49:18 +0800
commit	accadc2c0c9a12a07ff39df3ee2e02f399054d8f (patch)
tree	39e2ffb1e14527eecc69fa2855e29772969c2e3a
parent	550680eacd00555817df19d2b59a20a92df77c42 (diff)