summaryrefslogtreecommitdiff
path: root/kaldi_io/src/tools/ATLAS/include
diff options
context:
space:
mode:
author Determinant <[email protected]> 2015-08-14 11:51:42 +0800
committer Determinant <[email protected]> 2015-08-14 11:51:42 +0800
commit 96a32415ab43377cf1575bd3f4f2980f58028209 (patch)
tree 30a2d92d73e8f40ac87b79f6f56e227bfc4eea6e /kaldi_io/src/tools/ATLAS/include
parent c177a7549bd90670af4b29fa813ddea32cfe0f78 (diff)
add implementation for kaldi io (by ymz)
Diffstat (limited to 'kaldi_io/src/tools/ATLAS/include')
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_altivec.h27
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_asm.h411
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_aux.h785
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_cblascalias.h40
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_cblasdalias.h39
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_cblassalias.h39
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_cblastypealias.h9
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_cblaszalias.h40
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_enum.h55
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_f77.h83
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_f77blas.h849
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_f77wrap.h1088
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_fopen.h40
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_kern3.h110
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_kernel2.h5408
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_kernel3.h1393
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_lapack.h239
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_level1.h127
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_level2.h267
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_level3.h181
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_lvl2.h294
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_lvl3.h512
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_misc.h416
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_mv.h45
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_pkblas.h569
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_prefetch.h197
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_ptalias1.h60
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_ptalias2.h80
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_ptalias3.h43
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_ptlevel3.h284
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_ptlvl3.h389
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_ptmisc.h410
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_r1.h39
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_rblas3.h474
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_refalias1.h59
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_refalias2.h79
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_refalias3.h43
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_reflevel1.h421
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_reflevel2.h788
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_reflevel3.h374
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_reflvl2.h3184
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_reflvl3.h2292
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_refmisc.h367
-rw-r--r--kaldi_io/src/tools/ATLAS/include/atlas_tst.h909
-rw-r--r--kaldi_io/src/tools/ATLAS/include/cblas.h596
-rw-r--r--kaldi_io/src/tools/ATLAS/include/cblas_test.h542
-rw-r--r--kaldi_io/src/tools/ATLAS/include/clapack.h149
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h188
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/Make.ext39
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h709
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h1626
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h295
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h215
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h2982
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h331
-rw-r--r--kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h508
-rw-r--r--kaldi_io/src/tools/ATLAS/include/f77wrap_lapack.h91
57 files changed, 31829 insertions, 0 deletions
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_altivec.h b/kaldi_io/src/tools/ATLAS/include/atlas_altivec.h
new file mode 100644
index 0000000..a772448
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_altivec.h
@@ -0,0 +1,27 @@
+#ifndef ATLAS_ALTIVEC_H
+ #define ATLAS_ALTIVEC_H
+/* AltiVec helpers: vector initializer macros and vec_dst / vec_dstst wrappers. */
+#ifdef ATL_AltiVec /* AltiVec build: real definitions follow */
+ #ifdef ATL_AVgcc /* includes <altivec.h>; brace-style vector literals */
+ #include <altivec.h>
+ /* NOTE(review): VECTOR_INITL is only defined in the #else branch below. */
+ #define VECTOR_INIT(v0_,v1_,v2_,v3_) (vector float) {v0_,v1_,v2_,v3_}
+ #define VECTOR_INITI(v0_,v1_,v2_,v3_) (vector int) {v0_,v1_,v2_,v3_}
+ #else /* parenthesized vector literals */
+ #define VECTOR_INIT(v0_,v1_,v2_,v3_) (vector float)(v0_,v1_,v2_,v3_)
+ #define VECTOR_INITI(v0_,v1_,v2_,v3_) (vector int)(v0_,v1_,v2_,v3_)
+ #define VECTOR_INITL(v0_,v1_,v2_,v3_) (vector long)(v0_,v1_,v2_,v3_)
+ #endif /* ATL_AVgcc */
+ #define ATL_GetCtrl(stride, count, size) \
+ (int)((stride) | ((count)<<16) | ((size)<<24)) /* three fields OR-ed into one int */
+ #define ATL_pfavR(ptr, cwrd, stream) \
+ vec_dst((vector float *)(ptr), (cwrd), (stream))
+ #define ATL_pfavW(ptr, cwrd, stream) \
+ vec_dstst((vector float *)(ptr), (cwrd), (stream))
+#else /* no AltiVec: the three macros below expand to nothing */
+ #define ATL_GetCtrl(stride, count, size)
+ #define ATL_pfavR(ptr, cwrd, stream)
+ #define ATL_pfavW(ptr, cwrd, stream)
+#endif /* ATL_AltiVec */
+
+#endif /* ATLAS_ALTIVEC_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_asm.h b/kaldi_io/src/tools/ATLAS/include/atlas_asm.h
new file mode 100644
index 0000000..4c4fa86
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_asm.h
@@ -0,0 +1,411 @@
+#ifndef ATLAS_ASM_H
+ #define ATLAS_ASM_H
+
+#ifndef Mjoin /* token pasting helper, defined only if not already provided */
+ #define Mjoin(pre, nam) my_join(pre, nam)
+ #define my_join(pre, nam) pre ## nam
+#endif /* Mjoin: forwards to my_join so macro arguments are expanded before ## pastes them */
+
+#if defined(ATL_OS_WinNT) || defined(ATL_OS_Win9x) || defined(ATL_OS_OSX)
+ #define ATL_asmdecor(nam) Mjoin(_,nam) /* name prefixed with an underscore */
+#elif defined(ATL_OS_AIX) && defined(ATL_GAS_PPC)
+ #define ATL_asmdecor(nam) Mjoin(.,nam) /* name prefixed with a dot */
+#elif !defined(ATL_OS_OSX) && defined(ATL_GAS_PPC) && defined(ATL_USE64BITS) /* OSX test is redundant: OSX already matched the first branch */
+ #define ATL_asmdecor(nam) Mjoin(.,nam) /* name prefixed with a dot */
+#else /* every other platform: name used unchanged */
+ #define ATL_asmdecor(nam) nam
+#endif /* ATL_asmdecor(nam): nam with a platform-selected prefix */
+
+#ifdef ATL_GAS_PARISC /* split the PARISC case by OS */
+ #ifdef ATL_OS_HPUX
+ #define ATL_HPUX_PARISC
+ #else /* any non-HPUX OS is treated as Linux */
+ #define ATL_LINUX_PARISC
+ #endif /* ATL_OS_HPUX */
+#endif /* ATL_GAS_PARISC */
+
+#ifdef ATL_GAS_PPC /* split the PPC case by OS: exactly one of the three macros below is defined */
+ #ifdef ATL_OS_OSX
+ #define ATL_AS_OSX_PPC
+ #elif defined(ATL_OS_AIX)
+ #define ATL_AS_AIX_PPC
+ #else /* neither OSX nor AIX is treated as Linux */
+ #define ATL_GAS_LINUX_PPC
+ #endif /* ATL_OS_OSX */
+#endif /* ATL_GAS_PPC */
+
+#if defined(ATL_GAS_LINUX_PPC) || defined(ATL_AS_AIX_PPC)
+ /* Map the symbolic names r0-r31, f0-f31 and cr0-cr7 to their bare numbers. */
+ #define r0 0
+ #define f0 0
+ #define r1 1
+ #define f1 1
+ #define r2 2
+ #define f2 2
+ #define r3 3
+ #define f3 3
+ #define r4 4
+ #define f4 4
+ #define r5 5
+ #define f5 5
+ #define r6 6
+ #define f6 6
+ #define r7 7
+ #define f7 7
+ #define r8 8
+ #define f8 8
+ #define r9 9
+ #define f9 9
+ #define r10 10
+ #define f10 10
+ #define r11 11
+ #define f11 11
+ #define r12 12
+ #define f12 12
+ #define r13 13
+ #define f13 13
+ #define r14 14
+ #define f14 14
+ #define r15 15
+ #define f15 15
+ #define r16 16
+ #define f16 16
+ #define r17 17
+ #define f17 17
+ #define r18 18
+ #define f18 18
+ #define r19 19
+ #define f19 19
+ #define r20 20
+ #define f20 20
+ #define r21 21
+ #define f21 21
+ #define r22 22
+ #define f22 22
+ #define r23 23
+ #define f23 23
+ #define r24 24
+ #define f24 24
+ #define r25 25
+ #define f25 25
+ #define r26 26
+ #define f26 26
+ #define r27 27
+ #define f27 27
+ #define r28 28
+ #define f28 28
+ #define r29 29
+ #define f29 29
+ #define r30 30
+ #define f30 30
+ #define r31 31
+ #define f31 31
+ #define cr0 0
+ #define cr1 1
+ #define cr2 2
+ #define cr3 3
+ #define cr4 4
+ #define cr5 5
+ #define cr6 6
+ #define cr7 7
+
+#endif /* ATL_GAS_LINUX_PPC || ATL_AS_AIX_PPC */
+
+#ifdef ATL_OS_OSX /* here the .align operand is log2 of the N in ALIGNN */
+ #define ALIGN2 .align 1
+ #define ALIGN4 .align 2
+ #define ALIGN8 .align 3
+ #define ALIGN16 .align 4
+ #define ALIGN32 .align 5
+ #define ALIGN64 .align 6
+ #define ALIGN128 .align 7
+ #define global globl
+#else /* here the .align operand equals the N in ALIGNN; global is not remapped */
+ #define ALIGN2 .align 2
+ #define ALIGN4 .align 4
+ #define ALIGN8 .align 8
+ #define ALIGN16 .align 16
+ #define ALIGN32 .align 32
+ #define ALIGN64 .align 64
+ #define ALIGN128 .align 128
+#endif /* ATL_OS_OSX */
+
+#if defined(ATL_SSE1) && !defined(ATL_3DNow)
+ #define prefetchw prefetchnta
+#endif /* with ATL_SSE1 but without ATL_3DNow, every prefetchw token becomes prefetchnta */
+/*
+ * Solaris does not allow division in integer expressions in assembly, but
+ * many x86 kernels need $MB/mu; as a workaround the preprocessor selects the
+ * literal quotient.  Quotients outside 0..128 leave ATL_DivAns undefined.
+ */
+#if defined(ATL_DIV_NUM) && defined(ATL_DIV_DEN)
+ #if (ATL_DIV_NUM/ATL_DIV_DEN) == 0
+ #define ATL_DivAns 0
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 1
+ #define ATL_DivAns 1
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 2
+ #define ATL_DivAns 2
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 3
+ #define ATL_DivAns 3
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 4
+ #define ATL_DivAns 4
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 5
+ #define ATL_DivAns 5
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 6
+ #define ATL_DivAns 6
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 7
+ #define ATL_DivAns 7
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 8
+ #define ATL_DivAns 8
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 9
+ #define ATL_DivAns 9
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 10
+ #define ATL_DivAns 10
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 11
+ #define ATL_DivAns 11
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 12
+ #define ATL_DivAns 12
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 13
+ #define ATL_DivAns 13
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 14
+ #define ATL_DivAns 14
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 15
+ #define ATL_DivAns 15
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 16
+ #define ATL_DivAns 16
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 17
+ #define ATL_DivAns 17
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 18
+ #define ATL_DivAns 18
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 19
+ #define ATL_DivAns 19
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 20
+ #define ATL_DivAns 20
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 21
+ #define ATL_DivAns 21
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 22
+ #define ATL_DivAns 22
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 23
+ #define ATL_DivAns 23
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 24
+ #define ATL_DivAns 24
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 25
+ #define ATL_DivAns 25
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 26
+ #define ATL_DivAns 26
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 27
+ #define ATL_DivAns 27
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 28
+ #define ATL_DivAns 28
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 29
+ #define ATL_DivAns 29
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 30
+ #define ATL_DivAns 30
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 31
+ #define ATL_DivAns 31
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 32
+ #define ATL_DivAns 32
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 33
+ #define ATL_DivAns 33
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 34
+ #define ATL_DivAns 34
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 35
+ #define ATL_DivAns 35
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 36
+ #define ATL_DivAns 36
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 37
+ #define ATL_DivAns 37
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 38
+ #define ATL_DivAns 38
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 39
+ #define ATL_DivAns 39
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 40
+ #define ATL_DivAns 40
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 41
+ #define ATL_DivAns 41
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 42
+ #define ATL_DivAns 42
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 43
+ #define ATL_DivAns 43
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 44
+ #define ATL_DivAns 44
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 45
+ #define ATL_DivAns 45
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 46
+ #define ATL_DivAns 46
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 47
+ #define ATL_DivAns 47
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 48
+ #define ATL_DivAns 48
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 49
+ #define ATL_DivAns 49
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 50
+ #define ATL_DivAns 50
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 51
+ #define ATL_DivAns 51
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 52
+ #define ATL_DivAns 52
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 53
+ #define ATL_DivAns 53
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 54
+ #define ATL_DivAns 54
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 55
+ #define ATL_DivAns 55
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 56
+ #define ATL_DivAns 56
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 57
+ #define ATL_DivAns 57
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 58
+ #define ATL_DivAns 58
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 59
+ #define ATL_DivAns 59
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 60
+ #define ATL_DivAns 60
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 61
+ #define ATL_DivAns 61
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 62
+ #define ATL_DivAns 62
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 63
+ #define ATL_DivAns 63
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 64
+ #define ATL_DivAns 64
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 65
+ #define ATL_DivAns 65
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 66
+ #define ATL_DivAns 66
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 67
+ #define ATL_DivAns 67
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 68
+ #define ATL_DivAns 68
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 69
+ #define ATL_DivAns 69
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 70
+ #define ATL_DivAns 70
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 71
+ #define ATL_DivAns 71
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 72
+ #define ATL_DivAns 72
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 73
+ #define ATL_DivAns 73
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 74
+ #define ATL_DivAns 74
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 75
+ #define ATL_DivAns 75
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 76
+ #define ATL_DivAns 76
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 77
+ #define ATL_DivAns 77
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 78
+ #define ATL_DivAns 78
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 79
+ #define ATL_DivAns 79
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 80
+ #define ATL_DivAns 80
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 81
+ #define ATL_DivAns 81
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 82
+ #define ATL_DivAns 82
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 83
+ #define ATL_DivAns 83
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 84
+ #define ATL_DivAns 84
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 85
+ #define ATL_DivAns 85
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 86
+ #define ATL_DivAns 86
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 87
+ #define ATL_DivAns 87
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 88
+ #define ATL_DivAns 88
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 89
+ #define ATL_DivAns 89
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 90
+ #define ATL_DivAns 90
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 91
+ #define ATL_DivAns 91
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 92
+ #define ATL_DivAns 92
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 93
+ #define ATL_DivAns 93
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 94
+ #define ATL_DivAns 94
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 95
+ #define ATL_DivAns 95
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 96
+ #define ATL_DivAns 96
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 97
+ #define ATL_DivAns 97
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 98
+ #define ATL_DivAns 98
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 99
+ #define ATL_DivAns 99
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 100
+ #define ATL_DivAns 100
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 101
+ #define ATL_DivAns 101
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 102
+ #define ATL_DivAns 102
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 103
+ #define ATL_DivAns 103
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 104
+ #define ATL_DivAns 104
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 105
+ #define ATL_DivAns 105
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 106
+ #define ATL_DivAns 106
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 107
+ #define ATL_DivAns 107
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 108
+ #define ATL_DivAns 108
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 109
+ #define ATL_DivAns 109
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 110
+ #define ATL_DivAns 110
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 111
+ #define ATL_DivAns 111
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 112
+ #define ATL_DivAns 112
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 113
+ #define ATL_DivAns 113
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 114
+ #define ATL_DivAns 114
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 115
+ #define ATL_DivAns 115
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 116
+ #define ATL_DivAns 116
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 117
+ #define ATL_DivAns 117
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 118
+ #define ATL_DivAns 118
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 119
+ #define ATL_DivAns 119
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 120
+ #define ATL_DivAns 120
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 121
+ #define ATL_DivAns 121
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 122
+ #define ATL_DivAns 122
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 123
+ #define ATL_DivAns 123
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 124
+ #define ATL_DivAns 124
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 125
+ #define ATL_DivAns 125
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 126
+ #define ATL_DivAns 126
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 127
+ #define ATL_DivAns 127
+ #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 128
+ #define ATL_DivAns 128
+ #endif /* no #else: a quotient matching no branch defines nothing */
+#endif /* ATL_DIV_NUM && ATL_DIV_DEN */
+
+/*
+ * For GNU/Linux (ELF, GNU C, ATL_SSE1 defined), set no-execute bit for all ATLAS assembly
+ */
+#if defined(ATL_OS_Linux) && defined(__ELF__) && defined(__GNUC__) && \
+ defined(ATL_SSE1)
+.section .note.GNU-stack,"",%progbits
+#endif /* emits the .note.GNU-stack section directive into every file that includes this header */
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_aux.h b/kaldi_io/src/tools/ATLAS/include/atlas_aux.h
new file mode 100644
index 0000000..ce31eee
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_aux.h
@@ -0,0 +1,785 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/*
+ * Header file for ATLAS's auxiliary routines
+ */
+#ifndef ATLAS_AUX_H
+#define ATLAS_AUX_H
+#include "atlas_misc.h"
+
+void ATL_xerbla(int p, char *rout, char *form, ...); /* variadic; NOTE(review): form looks like a printf-style format -- confirm */
+int ATL_lcm(const int M, const int N); /* presumably the least common multiple of M and N -- confirm against the definition */
+double ATL_walltime(void); /* takes no arguments; (void) gives a checked prototype */
+double ATL_cputime(void); /* takes no arguments; (void) gives a checked prototype */
+
+/*
+ * Auxiliary routines provided for all four data types (s, d, c and z prefixes)
+ */
+void ATL_sgeadd(const int M, const int N, const float alpha,
+ const float *A, const int lda, const float beta,
+ float *C, const int ldc);
+void ATL_sgemove(const int M, const int N, const float alpha,
+ const float *A, const int lda, float *C, const int ldc);
+void ATL_sgemoveT(const int N, const int M, const float alpha,
+ const float *A, const int lda, float *C, const int ldc);
+void ATL_ssyreflect(const enum ATLAS_UPLO Uplo, const int N,
+ float *C, const int ldc);
+void ATL_sgecopy(const int M, const int N, const float *A, const int lda,
+ float *C, const int ldc);
+
+void ATL_sgescal(const int M, const int N, const float beta,
+ float *C, const int ldc);
+void ATL_strscal
+ (const enum ATLAS_UPLO Uplo, const int M, const int N, const float alpha,
+ float *A, const int lda);
+void ATL_shescal
+ (const enum ATLAS_UPLO Uplo, const int M, const int N, const float alpha,
+ float *A, const int lda);
+
+void ATL_sgezero(const int M, const int N, float *C, const int ldc);
+
+void ATL_szero(const int N, float *X, const int incX);
+void ATL_sset(const int N, const float alpha, float *X, const int incX);
+void ATL_sscal(const int N, const float alpha, float *X, const int incX);
+void ATL_scopy(const int N, const float *X, const int incX,
+ float *Y, const int incY);
+void ATL_scpsc(const int N, const float alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void ATL_saxpy(const int N, const float alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void ATL_saxpy_x1_y1(const int N, const float alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void ATL_saxpby(const int N, const float alpha, const float *X,
+ const int incX, const float beta, float *Y, const int incY);
+
+void ATL_sgeadd_a1_b1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_a1_b1
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sgeadd_a0_b1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_a0_b1
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sgeadd_aX_b1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_aX_b1
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sgeadd_a1_b0
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_a1_b0
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sgeadd_a0_b0
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_a0_b0
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sgeadd_aX_b0
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_aX_b0
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sgeadd_a1_bX
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_a1_bX
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sgeadd_a0_bX
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_a0_bX
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sgeadd_aX_bX
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float beta, float *C, const int ldc);
+void ATL_saxpby_aX_bX
+ (const int N, const float alpha, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+
+void ATL_sgemove_a1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_sgemove_a0
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_sgemove_aX
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+
+void ATL_sgescal_b1
+ (const int M, const int N, const float beta, float *C, const int ldc);
+void ATL_sgescal_b0
+ (const int M, const int N, const float beta, float *C, const int ldc);
+void ATL_sgescal_bX
+ (const int M, const int N, const float beta, float *C, const int ldc);
+
+void ATL_dgeadd(const int M, const int N, const double alpha,
+ const double *A, const int lda, const double beta,
+ double *C, const int ldc);
+void ATL_dgemove(const int M, const int N, const double alpha,
+ const double *A, const int lda, double *C, const int ldc);
+void ATL_dgemoveT(const int N, const int M, const double alpha,
+ const double *A, const int lda, double *C, const int ldc);
+void ATL_dsyreflect(const enum ATLAS_UPLO Uplo, const int N,
+ double *C, const int ldc);
+void ATL_dgecopy(const int M, const int N, const double *A, const int lda,
+ double *C, const int ldc);
+
+void ATL_dgescal(const int M, const int N, const double beta,
+ double *C, const int ldc);
+void ATL_dtrscal
+ (const enum ATLAS_UPLO Uplo, const int M, const int N, const double alpha,
+ double *A, const int lda);
+void ATL_dhescal
+ (const enum ATLAS_UPLO Uplo, const int M, const int N, const double alpha,
+ double *A, const int lda);
+
+void ATL_dgezero(const int M, const int N, double *C, const int ldc);
+
+void ATL_dzero(const int N, double *X, const int incX);
+void ATL_dset(const int N, const double alpha, double *X, const int incX);
+void ATL_dscal(const int N, const double alpha, double *X, const int incX);
+void ATL_dcopy(const int N, const double *X, const int incX,
+ double *Y, const int incY);
+void ATL_dcpsc(const int N, const double alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void ATL_daxpy(const int N, const double alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void ATL_daxpy_x1_y1(const int N, const double alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void ATL_daxpby(const int N, const double alpha, const double *X,
+ const int incX, const double beta, double *Y, const int incY);
+
+void ATL_dgeadd_a1_b1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_a1_b1
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dgeadd_a0_b1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_a0_b1
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dgeadd_aX_b1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_aX_b1
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dgeadd_a1_b0
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_a1_b0
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dgeadd_a0_b0
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_a0_b0
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dgeadd_aX_b0
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_aX_b0
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dgeadd_a1_bX
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_a1_bX
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dgeadd_a0_bX
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_a0_bX
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dgeadd_aX_bX
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double beta, double *C, const int ldc);
+void ATL_daxpby_aX_bX
+ (const int N, const double alpha, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+
+void ATL_dgemove_a1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dgemove_a0
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dgemove_aX
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+
+void ATL_dgescal_b1
+ (const int M, const int N, const double beta, double *C, const int ldc);
+void ATL_dgescal_b0
+ (const int M, const int N, const double beta, double *C, const int ldc);
+void ATL_dgescal_bX
+ (const int M, const int N, const double beta, double *C, const int ldc);
+
+void ATL_cgeadd(const int M, const int N, const float *alpha,
+ const float *A, const int lda, const float *beta,
+ float *C, const int ldc);
+void ATL_cgemove(const int M, const int N, const float *alpha,
+ const float *A, const int lda, float *C, const int ldc);
+void ATL_cgemoveT(const int N, const int M, const float *alpha,
+ const float *A, const int lda, float *C, const int ldc);
+void ATL_csyreflect(const enum ATLAS_UPLO Uplo, const int N,
+ float *C, const int ldc);
+void ATL_cgecopy(const int M, const int N, const float *A, const int lda,
+ float *C, const int ldc);
+
+void ATL_cgescal(const int M, const int N, const float *beta,
+ float *C, const int ldc);
+void ATL_ctrscal
+ (const enum ATLAS_UPLO Uplo, const int M, const int N, const float *alpha,
+ float *A, const int lda);
+void ATL_chescal
+ (const enum ATLAS_UPLO Uplo, const int M, const int N, const float alpha,
+ float *A, const int lda);
+
+void ATL_cgezero(const int M, const int N, float *C, const int ldc);
+
+void ATL_czero(const int N, float *X, const int incX);
+void ATL_cset(const int N, const float *alpha, float *X, const int incX);
+void ATL_cscal(const int N, const float *alpha, float *X, const int incX);
+void ATL_ccopy(const int N, const float *X, const int incX,
+ float *Y, const int incY);
+void ATL_ccpsc(const int N, const float *alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void ATL_caxpy(const int N, const float *alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void ATL_caxpy_x1_y1(const int N, const float *alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void ATL_caxpby(const int N, const float *alpha, const float *X,
+ const int incX, const float *beta, float *Y, const int incY);
+
+void ATL_cgeadd_a1_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_a1_b1
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_a0_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_a0_b1
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_aX_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_aX_b1
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_a1_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_a1_b0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_a0_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_a0_b0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_aX_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_aX_b0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_a1_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_a1_bX
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_a0_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_a0_bX
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_aX_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_aX_bX
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+
+void ATL_cgemove_a1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_cgemove_a0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_cgemove_aX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+
+void ATL_cgescal_b1
+ (const int M, const int N, const float *beta, float *C, const int ldc);
+void ATL_cgescal_b0
+ (const int M, const int N, const float *beta, float *C, const int ldc);
+void ATL_cgescal_bX
+ (const int M, const int N, const float *beta, float *C, const int ldc);
+
+void ATL_zgeadd(const int M, const int N, const double *alpha,
+ const double *A, const int lda, const double *beta,
+ double *C, const int ldc);
+void ATL_zgemove(const int M, const int N, const double *alpha,
+ const double *A, const int lda, double *C, const int ldc);
+void ATL_zgemoveT(const int N, const int M, const double *alpha,
+ const double *A, const int lda, double *C, const int ldc);
+void ATL_zsyreflect(const enum ATLAS_UPLO Uplo, const int N,
+ double *C, const int ldc);
+void ATL_zgecopy(const int M, const int N, const double *A, const int lda,
+ double *C, const int ldc);
+
+void ATL_zgescal(const int M, const int N, const double *beta,
+ double *C, const int ldc);
+void ATL_ztrscal
+ (const enum ATLAS_UPLO Uplo, const int M, const int N, const double *alpha,
+ double *A, const int lda);
+void ATL_zhescal /* note: alpha is a by-value double here, whereas ATL_ztrscal takes const double *alpha */
+ (const enum ATLAS_UPLO Uplo, const int M, const int N, const double alpha,
+ double *A, const int lda);
+
+void ATL_zgezero(const int M, const int N, double *C, const int ldc);
+
+void ATL_zzero(const int N, double *X, const int incX);
+void ATL_zset(const int N, const double *alpha, double *X, const int incX);
+void ATL_zscal(const int N, const double *alpha, double *X, const int incX);
+void ATL_zcopy(const int N, const double *X, const int incX,
+ double *Y, const int incY);
+void ATL_zcpsc(const int N, const double *alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void ATL_zaxpy(const int N, const double *alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void ATL_zaxpy_x1_y1(const int N, const double *alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void ATL_zaxpby(const int N, const double *alpha, const double *X,
+ const int incX, const double *beta, double *Y, const int incY);
+
+void ATL_zgeadd_a1_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_a1_b1
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_a0_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_a0_b1
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_aX_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_aX_b1
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_a1_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_a1_b0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_a0_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_a0_b0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_aX_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_aX_b0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_a1_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_a1_bX
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_a0_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_a0_bX
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_aX_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_aX_bX
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+
+void ATL_zgemove_a1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_zgemove_a0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_zgemove_aX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+
+void ATL_zgescal_b1
+ (const int M, const int N, const double *beta, double *C, const int ldc);
+void ATL_zgescal_b0
+ (const int M, const int N, const double *beta, double *C, const int ldc);
+void ATL_zgescal_bX
+ (const int M, const int N, const double *beta, double *C, const int ldc);
+
+/*
+ * Specialized complex auxiliary routines
+ */
+
+void ATL_ccplxinvert
+ (const int N, float *X, const int incX, float *Y, const int incY);
+
+void ATL_chereflect(const enum ATLAS_UPLO Uplo, const int N,
+ float *C, const int ldc);
+void ATL_cscalConj
+ (const int N, const float *alpha, float *X, const int incX);
+void ATL_ccopyConj
+ (const int N, const float *X, const int incX, float *Y, const int incY);
+void ATL_cmoveConj
+ (const int N, const float *alpha, const float *X, const int incX,
+ float *Y, const int incY);
+void ATL_caxpyConj
+ (const int N, const float *alpha, const float *X, const int incX,
+ float *Y, const int incY);
+void ATL_caxpyConj_x1_y1(const int N, const float *alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void ATL_caxpbyConj
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgemoveC(const int N, const int M, const float *alpha,
+ const float *A, const int lda, float *C, const int ldc);
+
+void ATL_cgeaddConj_aXi0_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_a1_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_a0_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aXi0_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aX_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aXi0_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_a1_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_a0_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aXi0_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aX_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aXi0_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_a1_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_a0_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aXi0_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aX_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aXi0_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_a1_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_a0_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aXi0_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_cgeaddConj_aX_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_aXi0_b1
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_caxpby_aXi0_b1
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_aXi0_b1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_aXi0_b0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_caxpby_aXi0_b0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_aXi0_b0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_aXi0_bXi0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_caxpby_aXi0_bXi0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_aXi0_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_aXi0_bX
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_caxpby_aXi0_bX
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_aXi0_bX
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_a1_bXi0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_a1_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_a0_bXi0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_a0_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+void ATL_caxpby_aX_bXi0
+ (const int N, const float *alpha, const float *X, const int incX,
+ const float *beta, float *Y, const int incY);
+void ATL_cgeadd_aX_bXi0
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *beta, float *C, const int ldc);
+
+void ATL_cgemove_aXi0
+ (const int M, const int N, const float *alpha0, const float *A,
+ const int lda, float *C, const int ldc);
+
+void ATL_cgescal_bXi0
+ (const int M, const int N, const float *beta, float *C, const int ldc);
+
+void ATL_zcplxinvert
+ (const int N, double *X, const int incX, double *Y, const int incY);
+
+void ATL_zhereflect(const enum ATLAS_UPLO Uplo, const int N,
+ double *C, const int ldc);
+void ATL_zscalConj
+ (const int N, const double *alpha, double *X, const int incX);
+void ATL_zcopyConj
+ (const int N, const double *X, const int incX, double *Y, const int incY);
+void ATL_zmoveConj
+ (const int N, const double *alpha, const double *X, const int incX,
+ double *Y, const int incY);
+void ATL_zaxpyConj
+ (const int N, const double *alpha, const double *X, const int incX,
+ double *Y, const int incY);
+void ATL_zaxpyConj_x1_y1(const int N, const double *alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void ATL_zaxpbyConj
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgemoveC(const int N, const int M, const double *alpha,
+ const double *A, const int lda, double *C, const int ldc);
+
+void ATL_zgeaddConj_aXi0_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_a1_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_a0_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aXi0_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aX_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aXi0_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_a1_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_a0_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aXi0_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aX_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aXi0_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_a1_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_a0_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aXi0_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aX_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aXi0_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_a1_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_a0_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aXi0_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zgeaddConj_aX_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_aXi0_b1
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zaxpby_aXi0_b1
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_aXi0_b1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_aXi0_b0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zaxpby_aXi0_b0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_aXi0_b0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_aXi0_bXi0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zaxpby_aXi0_bXi0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_aXi0_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_aXi0_bX
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zaxpby_aXi0_bX
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_aXi0_bX
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_a1_bXi0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_a1_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_a0_bXi0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_a0_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+void ATL_zaxpby_aX_bXi0
+ (const int N, const double *alpha, const double *X, const int incX,
+ const double *beta, double *Y, const int incY);
+void ATL_zgeadd_aX_bXi0
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *beta, double *C, const int ldc);
+
+void ATL_zgemove_aXi0
+ (const int M, const int N, const double *alpha0, const double *A,
+ const int lda, double *C, const int ldc);
+
+void ATL_zgescal_bXi0
+ (const int M, const int N, const double *beta, double *C, const int ldc);
+
+
+#if defined(ATL_USEPTHREADS) && !defined(ATL_flushcache) /* taken only when ATL_USEPTHREADS is set and ATL_flushcache is not already a macro */
+ #include "atlas_pthreads.h"
+ #define ATL_flushcache ATL_ptflushcache /* the prototype after the endif then declares ATL_ptflushcache */
+ #define ATL_PTCACHEMUL * ATL_NTHREADS /* expands to a multiply-by-ATL_NTHREADS suffix; presumably appended to cache-size expressions -- confirm at use sites */
+#else
+ #define ATL_PTCACHEMUL /* expands to nothing in this branch */
+#endif
+double ATL_flushcache(int size); /* name is subject to the macro renaming above */
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblascalias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblascalias.h
new file mode 100644
index 0000000..267b176
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblascalias.h
@@ -0,0 +1,40 @@
+#ifndef ATLAS_CBLASCALIAS_H /* include guard */
+ #define ATLAS_CBLASCALIAS_H
+
+#define cblas_dotc_sub cblas_cdotc_sub /* from here on, each generic cblas_<op> name is rewritten to a c-prefixed routine name */
+#define cblas_dotu_sub cblas_cdotu_sub
+#define cblas_axpy cblas_caxpy
+#define cblas_copy cblas_ccopy
+#define cblas_scal cblas_cscal
+#define cblas_swap cblas_cswap
+#define cblas_hpr2 cblas_chpr2
+#define cblas_her2 cblas_cher2
+#define cblas_hpr cblas_chpr
+#define cblas_her cblas_cher
+#define cblas_gerc cblas_cgerc
+#define cblas_geru cblas_cgeru
+#define cblas_tpsv cblas_ctpsv
+#define cblas_tbsv cblas_ctbsv
+#define cblas_trsv cblas_ctrsv
+#define cblas_tpmv cblas_ctpmv
+#define cblas_tbmv cblas_ctbmv
+#define cblas_trmv cblas_ctrmv
+#define cblas_hpmv cblas_chpmv
+#define cblas_hbmv cblas_chbmv
+#define cblas_hemv cblas_chemv
+#define cblas_gbmv cblas_cgbmv
+#define cblas_gemv cblas_cgemv
+#define cblas_trsm cblas_ctrsm
+#define cblas_trmm cblas_ctrmm
+#define cblas_her2k cblas_cher2k
+#define cblas_syr2k cblas_csyr2k
+#define cblas_herk cblas_cherk
+#define cblas_syrk cblas_csyrk
+#define cblas_hemm cblas_chemm
+#define cblas_symm cblas_csymm
+#define cblas_gemm cblas_cgemm
+#define cblas_iamax cblas_icamax /* irregular prefix: "ic" rather than "c" */
+#define cblas_nrm2 cblas_scnrm2 /* irregular prefix: "sc" rather than "c" */
+#define cblas_asum cblas_scasum /* irregular prefix: "sc" rather than "c" */
+
+#endif /* ATLAS_CBLASCALIAS_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblasdalias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblasdalias.h
new file mode 100644
index 0000000..cfc6d10
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblasdalias.h
@@ -0,0 +1,39 @@
+#ifndef ATLAS_CBLASDALIAS_H /* include guard */
+ #define ATLAS_CBLASDALIAS_H
+
+#define cblas_asum cblas_dasum /* from here on, each generic cblas_<op> name is rewritten to a d-prefixed routine name */
+#define cblas_nrm2 cblas_dnrm2
+#define cblas_dot cblas_ddot
+#define cblas_axpy cblas_daxpy
+#define cblas_copy cblas_dcopy
+#define cblas_scal cblas_dscal
+#define cblas_swap cblas_dswap
+#define cblas_rotm cblas_drotm
+#define cblas_rot cblas_drot
+#define cblas_rotmg cblas_drotmg
+#define cblas_rotg cblas_drotg
+#define cblas_spr2 cblas_dspr2
+#define cblas_syr2 cblas_dsyr2
+#define cblas_spr cblas_dspr
+#define cblas_syr cblas_dsyr
+#define cblas_ger cblas_dger
+#define cblas_tpsv cblas_dtpsv
+#define cblas_tbsv cblas_dtbsv
+#define cblas_trsv cblas_dtrsv
+#define cblas_tpmv cblas_dtpmv
+#define cblas_tbmv cblas_dtbmv
+#define cblas_trmv cblas_dtrmv
+#define cblas_spmv cblas_dspmv
+#define cblas_sbmv cblas_dsbmv
+#define cblas_symv cblas_dsymv
+#define cblas_gbmv cblas_dgbmv
+#define cblas_gemv cblas_dgemv
+#define cblas_trsm cblas_dtrsm
+#define cblas_trmm cblas_dtrmm
+#define cblas_syr2k cblas_dsyr2k
+#define cblas_syrk cblas_dsyrk
+#define cblas_symm cblas_dsymm
+#define cblas_gemm cblas_dgemm
+#define cblas_iamax cblas_idamax /* irregular prefix: "id" rather than "d" */
+
+#endif /* ATLAS_CBLASDALIAS_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblassalias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblassalias.h
new file mode 100644
index 0000000..090f9de
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblassalias.h
@@ -0,0 +1,39 @@
+#ifndef ATLAS_CBLASSALIAS_H /* include guard */
+ #define ATLAS_CBLASSALIAS_H
+
+#define cblas_asum cblas_sasum /* from here on, each generic cblas_<op> name is rewritten to an s-prefixed routine name */
+#define cblas_nrm2 cblas_snrm2
+#define cblas_dot cblas_sdot
+#define cblas_axpy cblas_saxpy
+#define cblas_copy cblas_scopy
+#define cblas_scal cblas_sscal
+#define cblas_swap cblas_sswap
+#define cblas_rotm cblas_srotm
+#define cblas_rot cblas_srot
+#define cblas_rotmg cblas_srotmg
+#define cblas_rotg cblas_srotg
+#define cblas_spr2 cblas_sspr2
+#define cblas_syr2 cblas_ssyr2
+#define cblas_spr cblas_sspr
+#define cblas_syr cblas_ssyr
+#define cblas_ger cblas_sger
+#define cblas_tpsv cblas_stpsv
+#define cblas_tbsv cblas_stbsv
+#define cblas_trsv cblas_strsv
+#define cblas_tpmv cblas_stpmv
+#define cblas_tbmv cblas_stbmv
+#define cblas_trmv cblas_strmv
+#define cblas_spmv cblas_sspmv
+#define cblas_sbmv cblas_ssbmv
+#define cblas_symv cblas_ssymv
+#define cblas_gbmv cblas_sgbmv
+#define cblas_gemv cblas_sgemv
+#define cblas_trsm cblas_strsm
+#define cblas_trmm cblas_strmm
+#define cblas_syr2k cblas_ssyr2k
+#define cblas_syrk cblas_ssyrk
+#define cblas_symm cblas_ssymm
+#define cblas_gemm cblas_sgemm
+#define cblas_iamax cblas_isamax /* irregular prefix: "is" rather than "s" */
+
+#endif /* ATLAS_CBLASSALIAS_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblastypealias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblastypealias.h
new file mode 100644
index 0000000..0c3e82f
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblastypealias.h
@@ -0,0 +1,9 @@
+#ifdef SREAL /* at most one alias header is included: the first of SREAL, DREAL, SCPLX, DCPLX that is defined wins */
+ #include "atlas_cblassalias.h"
+#elif defined(DREAL)
+ #include "atlas_cblasdalias.h"
+#elif defined(SCPLX)
+ #include "atlas_cblascalias.h"
+#elif defined(DCPLX)
+ #include "atlas_cblaszalias.h"
+#endif /* if none of the four macros is defined, no aliases are introduced */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblaszalias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblaszalias.h
new file mode 100644
index 0000000..ac01436
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblaszalias.h
@@ -0,0 +1,40 @@
+#ifndef ATLAS_CBLASZALIAS_H /* include guard */
+ #define ATLAS_CBLASZALIAS_H
+
+#define cblas_dotc_sub cblas_zdotc_sub /* from here on, each generic cblas_<op> name is rewritten to a z-prefixed routine name */
+#define cblas_dotu_sub cblas_zdotu_sub
+#define cblas_axpy cblas_zaxpy
+#define cblas_copy cblas_zcopy
+#define cblas_scal cblas_zscal
+#define cblas_swap cblas_zswap
+#define cblas_hpr2 cblas_zhpr2
+#define cblas_her2 cblas_zher2
+#define cblas_hpr cblas_zhpr
+#define cblas_her cblas_zher
+#define cblas_gerc cblas_zgerc
+#define cblas_geru cblas_zgeru
+#define cblas_tpsv cblas_ztpsv
+#define cblas_tbsv cblas_ztbsv
+#define cblas_trsv cblas_ztrsv
+#define cblas_tpmv cblas_ztpmv
+#define cblas_tbmv cblas_ztbmv
+#define cblas_trmv cblas_ztrmv
+#define cblas_hpmv cblas_zhpmv
+#define cblas_hbmv cblas_zhbmv
+#define cblas_hemv cblas_zhemv
+#define cblas_gbmv cblas_zgbmv
+#define cblas_gemv cblas_zgemv
+#define cblas_trsm cblas_ztrsm
+#define cblas_trmm cblas_ztrmm
+#define cblas_her2k cblas_zher2k
+#define cblas_syr2k cblas_zsyr2k
+#define cblas_herk cblas_zherk
+#define cblas_syrk cblas_zsyrk
+#define cblas_hemm cblas_zhemm
+#define cblas_symm cblas_zsymm
+#define cblas_gemm cblas_zgemm
+#define cblas_iamax cblas_izamax /* irregular prefix: "iz" rather than "z" */
+#define cblas_nrm2 cblas_dznrm2 /* irregular prefix: "dz" rather than "z" */
+#define cblas_asum cblas_dzasum /* irregular prefix: "dz" rather than "z" */
+
+#endif /* ATLAS_CBLASZALIAS_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_enum.h b/kaldi_io/src/tools/ATLAS/include/atlas_enum.h
new file mode 100644
index 0000000..3d638be
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_enum.h
@@ -0,0 +1,55 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1997 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef ATLAS_ENUM_H /* include guard */
+ #define ATLAS_ENUM_H
+
+ #define CBLAS_ENUM_ONLY /* defined only for the duration of the cblas.h include; presumably restricts cblas.h to its enum declarations -- confirm in cblas.h */
+ #include "cblas.h"
+ #undef CBLAS_ENUM_ONLY
+
+ #define ATLAS_ORDER CBLAS_ORDER /* every ATLAS_xxx / AtlasXxx name below is a plain macro alias for the matching CBLAS_xxx / CblasXxx name */
+ #define AtlasRowMajor CblasRowMajor
+ #define AtlasColMajor CblasColMajor
+ #define ATLAS_TRANS CBLAS_TRANSPOSE /* the one alias whose spelling differs: TRANS versus TRANSPOSE */
+ #define AtlasNoTrans CblasNoTrans
+ #define AtlasTrans CblasTrans
+ #define AtlasConjTrans CblasConjTrans
+ #define ATLAS_UPLO CBLAS_UPLO
+ #define AtlasUpper CblasUpper
+ #define AtlasLower CblasLower
+ #define ATLAS_DIAG CBLAS_DIAG
+ #define AtlasNonUnit CblasNonUnit
+ #define AtlasUnit CblasUnit
+ #define ATLAS_SIDE CBLAS_SIDE
+ #define AtlasLeft CblasLeft
+ #define AtlasRight CblasRight
+
+#endif /* ATLAS_ENUM_H */
+
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_f77.h b/kaldi_io/src/tools/ATLAS/include/atlas_f77.h
new file mode 100644
index 0000000..1586fba
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_f77.h
@@ -0,0 +1,83 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1997 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef ATLAS_F77_H /* C-side types and helper macros for passing integers and character arguments to Fortran 77 */
+#define ATLAS_F77_H
+
+ #ifndef ATL_F77_SUBROUTINE /* may be predefined by the build; otherwise a subroutine is declared as void */
+ #define ATL_F77_SUBROUTINE void
+ #endif
+ #ifndef F77_INTEGER /* may be predefined by the build; otherwise int */
+ #define F77_INTEGER int
+ #else
+ #define ATL_FunkyInts /* records that F77_INTEGER was supplied externally rather than defaulted here */
+ #endif
+ #if defined(CRAY) /* character arguments are _fcd descriptors from <fortran.h> */
+ #define UseTransChar 1
+ #include <fortran.h>
+ #define F77_CHAR _fcd
+ #define ATL_F2C_TransChar(c) (*(_fcdtocp(c) ))
+ #define ATL_C2F_TransChar(c) (_cptofcd(&(c), 1))
+ #elif defined(StringStructVal) /* c is the {cp,len} struct itself; note: no ATL_C2F_TransChar in this branch */
+ typedef struct {char *cp; F77_INTEGER len;} F77_CHAR;
+ #define ATL_F2C_TransChar(c) (*((c).cp)) /* argument parenthesized so expressions such as *p or a[i] bind correctly */
+ #define UseTransChar 2
+ #elif defined(StringStructPtr) /* c is a pointer to the {cp,len} struct; note: no ATL_C2F_TransChar in this branch */
+ typedef struct {char *cp; F77_INTEGER len;} F77_CHAR;
+ #define ATL_F2C_TransChar(c) (*((c)->cp)) /* argument parenthesized so expressions such as p+1 bind correctly */
+ #define UseTransChar 3
+ #else /* default: character arguments are char *, with F77_INTEGER lengths appended through ATL_STRLEN_n */
+ #define ATL_DeclareSlens
+ #define F77_CHAR char *
+ #define ATL_F2C_TransChar(c) (*(c))
+ #define ATL_C2F_TransChar(c) (&(c))
+ #define ATL_STRLEN_1 ,F77_INTEGER ATL_Slen1 /* ATL_STRLEN_n: n extra typed parameters, leading comma included */
+ #define ATL_STRLEN_2 ,F77_INTEGER ATL_Slen1, F77_INTEGER ATL_Slen2
+ #define ATL_STRLEN_3 ,F77_INTEGER ATL_Slen1, F77_INTEGER ATL_Slen2, \
+ F77_INTEGER ATL_Slen3
+ #define ATL_STRLEN_4 ,F77_INTEGER ATL_Slen1, F77_INTEGER ATL_Slen2, \
+ F77_INTEGER ATL_Slen3, F77_INTEGER ATL_Slen4
+ #define ATL_STRLEN_1_para ,ATL_Slen1 /* ATL_STRLEN_n_para: the same names without types, leading comma included */
+ #define ATL_STRLEN_2_para ,ATL_Slen1, ATL_Slen2
+ #define ATL_STRLEN_3_para ,ATL_Slen1, ATL_Slen2, ATL_Slen3
+ #define ATL_STRLEN_4_para ,ATL_Slen1, ATL_Slen2, ATL_Slen3, ATL_Slen4
+ #endif
+
+ #ifndef ATL_STRLEN_1 /* the other three conventions: all ATL_STRLEN macros expand to nothing */
+ #define ATL_STRLEN_1
+ #define ATL_STRLEN_2
+ #define ATL_STRLEN_3
+ #define ATL_STRLEN_4
+ #define ATL_STRLEN_1_para
+ #define ATL_STRLEN_2_para
+ #define ATL_STRLEN_3_para
+ #define ATL_STRLEN_4_para
+ #endif
+
+#endif /* ATLAS_F77_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_f77blas.h b/kaldi_io/src/tools/ATLAS/include/atlas_f77blas.h
new file mode 100644
index 0000000..a7c109d
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_f77blas.h
@@ -0,0 +1,849 @@
+#ifndef ATLAS_F77_LVLS
+#define ATLAS_F77_LVLS
+
+#include "atlas_f77.h"
+
+#if defined( StringSunStyle ) /* only this style appends F77_INTEGER parameters (one per character argument) to a prototype */
+#define F77_CHAR_DECL F77_CHAR /* input character*1 */
+#define F77_1_CHAR , F77_INTEGER /* carries its own leading comma, so prototypes put no comma in front of it */
+#define F77_2_CHAR F77_1_CHAR F77_1_CHAR
+#define F77_3_CHAR F77_2_CHAR F77_1_CHAR
+#define F77_4_CHAR F77_3_CHAR F77_1_CHAR
+#elif defined( StringCrayStyle )
+#define F77_CHAR_DECL F77_CHAR /* input character*1 */
+#elif defined( StringStructVal )
+#define F77_CHAR_DECL F77_CHAR /* input character*1 */
+#elif defined( StringStructPtr )
+#define F77_CHAR_DECL F77_CHAR * /* input character*1 */
+#endif
+
+#ifndef F77_1_CHAR /* every style except StringSunStyle: the F77_n_CHAR suffixes expand to nothing */
+#define F77_1_CHAR
+#define F77_2_CHAR
+#define F77_3_CHAR
+#define F77_4_CHAR
+#endif
+
+#ifndef F77_CHAR_DECL /* no String* style selected; NOTE(review): with atlas_f77.h's default F77_CHAR (char *) this is char ** - confirm intended */
+ #define F77_CHAR_DECL F77_CHAR * /* input character*1 */
+#endif
+
+#define F77_INT_DECL const F77_INTEGER * /* input integer */
+
+#ifdef TREAL /* NOTE(review): the two branches below are currently identical */
+#define F77_SIN_DECL const TYPE * /* input scalar */
+#define F77_SINOUT_DECL TYPE * /* input/output scalar */
+#define F77_RIN_DECL const TYPE * /* input real scalar */
+#define F77_RINOUT_DECL TYPE * /* input/output real scalar */
+#else
+#define F77_SIN_DECL const TYPE * /* input scalar */
+#define F77_SINOUT_DECL TYPE * /* input/output scalar */
+#define F77_RIN_DECL const TYPE * /* input real scalar */
+#define F77_RINOUT_DECL TYPE * /* input/output real scalar */
+#endif
+
+#define F77_VIN_DECL const TYPE * /* input vector */
+#define F77_VINOUT_DECL TYPE * /* input/output vector */
+
+#define F77_MIN_DECL const TYPE * /* input matrix */
+#define F77_MINOUT_DECL TYPE * /* input/output matrix */
+
+#if defined( CRAY ) /* CRAY builds insert the 'fortran' keyword into every external declaration */
+#define F77_VOID_FUN extern fortran void /* subroutine */
+#define F77_INT_FUN extern fortran int /* integer function */
+#define F77_TYPE_FUN extern fortran TYPE /* real function */
+#define F77_DBLE_FUN extern fortran double /* dble function */
+#else /* all other builds: ordinary extern declarations */
+#define F77_VOID_FUN extern void /* subroutine */
+#define F77_INT_FUN extern int /* integer function */
+#define F77_TYPE_FUN extern TYPE /* real function */
+#define F77_DBLE_FUN extern double /* dble function */
+#endif
+
+#if defined( NoChange ) /* one of NoChange / UpCase / Add_ / Add__ picks the spelling behind every F77<op> macro */
+/*
+ * Name mapping used when a FORTRAN routine is called from C under its
+ * lower-case name with no suffix. nrm2, asum and dot map to *wrap*
+ * names; axpby and set map to fatlas_* names:
+ *
+ * FORTRAN DECLARATION C CALL
+ * SUBROUTINE DGEMM(...) dgemm(...)
+ */
+#if defined( SREAL ) /* s-prefixed names; the only precision that also maps dsdot and sdsdot */
+
+#define F77rotg srotg
+#define F77rotmg srotmg
+#define F77nrm2 swrapnrm2
+#define F77asum swrapasum
+#define F77amax isamax
+#define F77scal sscal
+#define F77axpy saxpy
+#define F77axpby fatlas_saxpby
+#define F77set fatlas_sset
+#define F77copy scopy
+#define F77swap sswap
+#define F77rot srot
+#define F77rotm srotm
+#define F77dot swrapdot
+#define F77dsdot dswrapdot
+#define F77sdsdot sdswrapdot
+
+#define F77gemv sgemv
+#define F77gbmv sgbmv
+#define F77sbmv ssbmv
+#define F77spmv sspmv
+#define F77symv ssymv
+#define F77tbmv stbmv
+#define F77tpmv stpmv
+#define F77trmv strmv
+#define F77tbsv stbsv
+#define F77tpsv stpsv
+#define F77trsv strsv
+#define F77ger sger
+#define F77spr sspr
+#define F77syr ssyr
+#define F77spr2 sspr2
+#define F77syr2 ssyr2
+
+#define F77gemm sgemm
+#define F77symm ssymm
+#define F77syrk ssyrk
+#define F77syr2k ssyr2k
+#define F77trmm strmm
+#define F77trsm strsm
+
+#elif defined( DREAL ) /* d-prefixed names */
+
+#define F77rotg drotg
+#define F77rotmg drotmg
+#define F77nrm2 dwrapnrm2
+#define F77asum dwrapasum
+#define F77amax idamax
+#define F77scal dscal
+#define F77axpy daxpy
+#define F77axpby fatlas_daxpby
+#define F77set fatlas_dset
+#define F77copy dcopy
+#define F77swap dswap
+#define F77rot drot
+#define F77rotm drotm
+#define F77dot dwrapdot
+
+#define F77gemv dgemv
+#define F77gbmv dgbmv
+#define F77sbmv dsbmv
+#define F77spmv dspmv
+#define F77symv dsymv
+#define F77tbmv dtbmv
+#define F77tpmv dtpmv
+#define F77trmv dtrmv
+#define F77tbsv dtbsv
+#define F77tpsv dtpsv
+#define F77trsv dtrsv
+#define F77ger dger
+#define F77spr dspr
+#define F77syr dsyr
+#define F77spr2 dspr2
+#define F77syr2 dsyr2
+
+#define F77gemm dgemm
+#define F77symm dsymm
+#define F77syrk dsyrk
+#define F77syr2k dsyr2k
+#define F77trmm dtrmm
+#define F77trsm dtrsm
+
+#elif defined( SCPLX ) /* c-prefixed names; no rotmg/rotm, adds rscal, dotc/dotu instead of dot, h* routines instead of sbmv/spmv/symv/spr/syr */
+
+#define F77rotg crotg
+#define F77nrm2 scwrapnrm2
+#define F77asum scwrapasum
+#define F77amax icamax
+#define F77scal cscal
+#define F77rscal csscal
+#define F77axpy caxpy
+#define F77axpby fatlas_caxpby
+#define F77set fatlas_cset
+#define F77copy ccopy
+#define F77swap cswap
+#define F77rot csrot
+#define F77dotc cwrapdotc
+#define F77dotu cwrapdotu
+
+#define F77gbmv cgbmv
+#define F77gemv cgemv
+#define F77hbmv chbmv
+#define F77hpmv chpmv
+#define F77hemv chemv
+#define F77tbmv ctbmv
+#define F77tpmv ctpmv
+#define F77trmv ctrmv
+#define F77tbsv ctbsv
+#define F77tpsv ctpsv
+#define F77trsv ctrsv
+#define F77gerc cgerc
+#define F77geru cgeru
+#define F77hpr chpr
+#define F77her cher
+#define F77hpr2 chpr2
+#define F77her2 cher2
+
+#define F77gemm cgemm
+#define F77hemm chemm
+#define F77herk cherk
+#define F77her2k cher2k
+#define F77symm csymm
+#define F77syrk csyrk
+#define F77syr2k csyr2k
+#define F77trmm ctrmm
+#define F77trsm ctrsm
+
+#elif defined( DCPLX ) /* z-prefixed names; same set of routines as SCPLX */
+
+#define F77rotg zrotg
+#define F77nrm2 dzwrapnrm2
+#define F77asum dzwrapasum
+#define F77amax izamax
+#define F77scal zscal
+#define F77rscal zdscal
+#define F77axpy zaxpy
+#define F77axpby fatlas_zaxpby
+#define F77set fatlas_zset
+#define F77copy zcopy
+#define F77swap zswap
+#define F77rot zdrot
+#define F77dotc zwrapdotc
+#define F77dotu zwrapdotu
+
+#define F77gbmv zgbmv
+#define F77gemv zgemv
+#define F77hbmv zhbmv
+#define F77hpmv zhpmv
+#define F77hemv zhemv
+#define F77tbmv ztbmv
+#define F77tpmv ztpmv
+#define F77trmv ztrmv
+#define F77tbsv ztbsv
+#define F77tpsv ztpsv
+#define F77trsv ztrsv
+#define F77gerc zgerc
+#define F77geru zgeru
+#define F77hpr zhpr
+#define F77her zher
+#define F77hpr2 zhpr2
+#define F77her2 zher2
+
+#define F77gemm zgemm
+#define F77hemm zhemm
+#define F77herk zherk
+#define F77her2k zher2k
+#define F77symm zsymm
+#define F77syrk zsyrk
+#define F77syr2k zsyr2k
+#define F77trmm ztrmm
+#define F77trsm ztrsm
+
+#endif /* SREAL / DREAL / SCPLX / DCPLX (NoChange) */
+
+#elif defined( UpCase )
+/*
+ * Name mapping used when a FORTRAN routine is called from C under its
+ * upper-case name with no suffix. The table mirrors the NoChange one
+ * with every target name upper-cased:
+ *
+ * FORTRAN DECLARATION C CALL
+ * SUBROUTINE DGEMM(...) DGEMM(...)
+ */
+#if defined( SREAL ) /* S-prefixed names, including DSWRAPDOT and SDSWRAPDOT */
+
+#define F77rotg SROTG
+#define F77rotmg SROTMG
+#define F77nrm2 SWRAPNRM2
+#define F77asum SWRAPASUM
+#define F77amax ISAMAX
+#define F77scal SSCAL
+#define F77axpy SAXPY
+#define F77axpby FATLAS_SAXPBY
+#define F77set FATLAS_SSET
+#define F77copy SCOPY
+#define F77swap SSWAP
+#define F77rot SROT
+#define F77rotm SROTM
+#define F77dot SWRAPDOT
+#define F77dsdot DSWRAPDOT
+#define F77sdsdot SDSWRAPDOT
+
+#define F77gemv SGEMV
+#define F77gbmv SGBMV
+#define F77sbmv SSBMV
+#define F77spmv SSPMV
+#define F77symv SSYMV
+#define F77tbmv STBMV
+#define F77tpmv STPMV
+#define F77trmv STRMV
+#define F77tbsv STBSV
+#define F77tpsv STPSV
+#define F77trsv STRSV
+#define F77ger SGER
+#define F77spr SSPR
+#define F77syr SSYR
+#define F77spr2 SSPR2
+#define F77syr2 SSYR2
+
+#define F77gemm SGEMM
+#define F77symm SSYMM
+#define F77syrk SSYRK
+#define F77syr2k SSYR2K
+#define F77trmm STRMM
+#define F77trsm STRSM
+
+#elif defined( DREAL ) /* D-prefixed names */
+
+#define F77rotg DROTG
+#define F77rotmg DROTMG
+#define F77nrm2 DWRAPNRM2
+#define F77asum DWRAPASUM
+#define F77amax IDAMAX
+#define F77scal DSCAL
+#define F77axpy DAXPY
+#define F77axpby FATLAS_DAXPBY
+#define F77set FATLAS_DSET
+#define F77copy DCOPY
+#define F77swap DSWAP
+#define F77rot DROT
+#define F77rotm DROTM
+#define F77dot DWRAPDOT
+
+#define F77gemv DGEMV
+#define F77gbmv DGBMV
+#define F77sbmv DSBMV
+#define F77spmv DSPMV
+#define F77symv DSYMV
+#define F77tbmv DTBMV
+#define F77tpmv DTPMV
+#define F77trmv DTRMV
+#define F77tbsv DTBSV
+#define F77tpsv DTPSV
+#define F77trsv DTRSV
+#define F77ger DGER
+#define F77spr DSPR
+#define F77syr DSYR
+#define F77spr2 DSPR2
+#define F77syr2 DSYR2
+
+#define F77gemm DGEMM
+#define F77symm DSYMM
+#define F77syrk DSYRK
+#define F77syr2k DSYR2K
+#define F77trmm DTRMM
+#define F77trsm DTRSM
+
+#elif defined( SCPLX ) /* C-prefixed names; same set of routines as the NoChange SCPLX table */
+
+#define F77rotg CROTG
+#define F77nrm2 SCWRAPNRM2
+#define F77asum SCWRAPASUM
+#define F77amax ICAMAX
+#define F77scal CSCAL
+#define F77rscal CSSCAL
+#define F77axpy CAXPY
+#define F77axpby FATLAS_CAXPBY
+#define F77set FATLAS_CSET
+#define F77copy CCOPY
+#define F77swap CSWAP
+#define F77rot CSROT
+#define F77dotc CWRAPDOTC
+#define F77dotu CWRAPDOTU
+
+#define F77gbmv CGBMV
+#define F77gemv CGEMV
+#define F77hbmv CHBMV
+#define F77hpmv CHPMV
+#define F77hemv CHEMV
+#define F77tbmv CTBMV
+#define F77tpmv CTPMV
+#define F77trmv CTRMV
+#define F77tbsv CTBSV
+#define F77tpsv CTPSV
+#define F77trsv CTRSV
+#define F77gerc CGERC
+#define F77geru CGERU
+#define F77hpr CHPR
+#define F77her CHER
+#define F77hpr2 CHPR2
+#define F77her2 CHER2
+
+#define F77gemm CGEMM
+#define F77hemm CHEMM
+#define F77herk CHERK
+#define F77her2k CHER2K
+#define F77symm CSYMM
+#define F77syrk CSYRK
+#define F77syr2k CSYR2K
+#define F77trmm CTRMM
+#define F77trsm CTRSM
+
+#elif defined( DCPLX ) /* Z-prefixed names */
+
+#define F77rotg ZROTG
+#define F77nrm2 DZWRAPNRM2
+#define F77asum DZWRAPASUM
+#define F77amax IZAMAX
+#define F77scal ZSCAL
+#define F77rscal ZDSCAL
+#define F77axpy ZAXPY
+#define F77axpby FATLAS_ZAXPBY
+#define F77set FATLAS_ZSET
+#define F77copy ZCOPY
+#define F77swap ZSWAP
+#define F77rot ZDROT
+#define F77dotc ZWRAPDOTC
+#define F77dotu ZWRAPDOTU
+
+#define F77gbmv ZGBMV
+#define F77gemv ZGEMV
+#define F77hbmv ZHBMV
+#define F77hpmv ZHPMV
+#define F77hemv ZHEMV
+#define F77tbmv ZTBMV
+#define F77tpmv ZTPMV
+#define F77trmv ZTRMV
+#define F77tbsv ZTBSV
+#define F77tpsv ZTPSV
+#define F77trsv ZTRSV
+#define F77gerc ZGERC
+#define F77geru ZGERU
+#define F77hpr ZHPR
+#define F77her ZHER
+#define F77hpr2 ZHPR2
+#define F77her2 ZHER2
+
+#define F77gemm ZGEMM
+#define F77hemm ZHEMM
+#define F77herk ZHERK
+#define F77her2k ZHER2K
+#define F77symm ZSYMM
+#define F77syrk ZSYRK
+#define F77syr2k ZSYR2K
+#define F77trmm ZTRMM
+#define F77trsm ZTRSM
+
+#endif /* SREAL / DREAL / SCPLX / DCPLX (UpCase) */
+
+#elif defined( Add_ ) || defined( Add__ )
+/*
+ * Name mapping used when a FORTRAN routine is called from C under its
+ * lower-case name plus one trailing underscore. Add_ and Add__ differ
+ * only for the fatlas_* names, which get two underscores under Add__:
+ *
+ * FORTRAN DECLARATION C CALL
+ * SUBROUTINE DGEMM(...) dgemm_(...)
+ */
+#if defined( SREAL ) /* s-prefixed names, including dswrapdot_ and sdswrapdot_ */
+
+#define F77rotg srotg_
+#define F77rotmg srotmg_
+#define F77nrm2 swrapnrm2_
+#define F77asum swrapasum_
+#define F77amax isamax_
+#define F77scal sscal_
+#define F77axpy saxpy_
+#ifdef Add_
+ #define F77axpby fatlas_saxpby_
+ #define F77set fatlas_sset_
+#else /* Add__; NOTE(review): presumably doubled because these names already contain an underscore - confirm */
+ #define F77axpby fatlas_saxpby__
+ #define F77set fatlas_sset__
+#endif
+#define F77copy scopy_
+#define F77swap sswap_
+#define F77rot srot_
+#define F77rotm srotm_
+#define F77dot swrapdot_
+#define F77dsdot dswrapdot_
+#define F77sdsdot sdswrapdot_
+
+#define F77gemv sgemv_
+#define F77gbmv sgbmv_
+#define F77sbmv ssbmv_
+#define F77spmv sspmv_
+#define F77symv ssymv_
+#define F77tbmv stbmv_
+#define F77tpmv stpmv_
+#define F77trmv strmv_
+#define F77tbsv stbsv_
+#define F77tpsv stpsv_
+#define F77trsv strsv_
+#define F77ger sger_
+#define F77spr sspr_
+#define F77syr ssyr_
+#define F77spr2 sspr2_
+#define F77syr2 ssyr2_
+
+#define F77gemm sgemm_
+#define F77symm ssymm_
+#define F77syrk ssyrk_
+#define F77syr2k ssyr2k_
+#define F77trmm strmm_
+#define F77trsm strsm_
+
+#elif defined( DREAL ) /* d-prefixed names */
+
+#define F77rotg drotg_
+#define F77rotmg drotmg_
+#define F77nrm2 dwrapnrm2_
+#define F77asum dwrapasum_
+#define F77amax idamax_
+#define F77scal dscal_
+#define F77axpy daxpy_
+#ifdef Add_
+ #define F77axpby fatlas_daxpby_
+ #define F77set fatlas_dset_
+#else /* Add__ */
+ #define F77axpby fatlas_daxpby__
+ #define F77set fatlas_dset__
+#endif
+#define F77copy dcopy_
+#define F77swap dswap_
+#define F77rot drot_
+#define F77rotm drotm_
+#define F77dot dwrapdot_
+
+#define F77gemv dgemv_
+#define F77gbmv dgbmv_
+#define F77sbmv dsbmv_
+#define F77spmv dspmv_
+#define F77symv dsymv_
+#define F77tbmv dtbmv_
+#define F77tpmv dtpmv_
+#define F77trmv dtrmv_
+#define F77tbsv dtbsv_
+#define F77tpsv dtpsv_
+#define F77trsv dtrsv_
+#define F77ger dger_
+#define F77spr dspr_
+#define F77syr dsyr_
+#define F77spr2 dspr2_
+#define F77syr2 dsyr2_
+
+#define F77gemm dgemm_
+#define F77symm dsymm_
+#define F77syrk dsyrk_
+#define F77syr2k dsyr2k_
+#define F77trmm dtrmm_
+#define F77trsm dtrsm_
+
+#elif defined( SCPLX ) /* c-prefixed names */
+
+#define F77rotg crotg_
+#define F77nrm2 scwrapnrm2_
+#define F77asum scwrapasum_
+#define F77amax icamax_
+#define F77scal cscal_
+#define F77rscal csscal_
+#define F77axpy caxpy_
+#ifdef Add_
+ #define F77axpby fatlas_caxpby_
+ #define F77set fatlas_cset_
+#else /* Add__ */
+ #define F77axpby fatlas_caxpby__
+ #define F77set fatlas_cset__
+#endif
+#define F77copy ccopy_
+#define F77swap cswap_
+#define F77rot csrot_
+#define F77dotc cwrapdotc_
+#define F77dotu cwrapdotu_
+
+#define F77gbmv cgbmv_
+#define F77gemv cgemv_
+#define F77hbmv chbmv_
+#define F77hpmv chpmv_
+#define F77hemv chemv_
+#define F77tbmv ctbmv_
+#define F77tpmv ctpmv_
+#define F77trmv ctrmv_
+#define F77tbsv ctbsv_
+#define F77tpsv ctpsv_
+#define F77trsv ctrsv_
+#define F77gerc cgerc_
+#define F77geru cgeru_
+#define F77hpr chpr_
+#define F77her cher_
+#define F77hpr2 chpr2_
+#define F77her2 cher2_
+
+#define F77gemm cgemm_
+#define F77hemm chemm_
+#define F77herk cherk_
+#define F77her2k cher2k_
+#define F77symm csymm_
+#define F77syrk csyrk_
+#define F77syr2k csyr2k_
+#define F77trmm ctrmm_
+#define F77trsm ctrsm_
+
+#elif defined( DCPLX ) /* z-prefixed names */
+
+#define F77rotg zrotg_
+#define F77nrm2 dzwrapnrm2_
+#define F77asum dzwrapasum_
+#define F77amax izamax_
+#define F77scal zscal_
+#define F77rscal zdscal_
+#define F77axpy zaxpy_
+#ifdef Add_
+ #define F77axpby fatlas_zaxpby_
+ #define F77set fatlas_zset_
+#else /* Add__ */
+ #define F77axpby fatlas_zaxpby__
+ #define F77set fatlas_zset__
+#endif
+#define F77copy zcopy_
+#define F77swap zswap_
+#define F77rot zdrot_
+#define F77dotc zwrapdotc_
+#define F77dotu zwrapdotu_
+
+#define F77gbmv zgbmv_
+#define F77gemv zgemv_
+#define F77hbmv zhbmv_
+#define F77hpmv zhpmv_
+#define F77hemv zhemv_
+#define F77tbmv ztbmv_
+#define F77tpmv ztpmv_
+#define F77trmv ztrmv_
+#define F77tbsv ztbsv_
+#define F77tpsv ztpsv_
+#define F77trsv ztrsv_
+#define F77gerc zgerc_
+#define F77geru zgeru_
+#define F77hpr zhpr_
+#define F77her zher_
+#define F77hpr2 zhpr2_
+#define F77her2 zher2_
+
+#define F77gemm zgemm_
+#define F77hemm zhemm_
+#define F77herk zherk_
+#define F77her2k zher2k_
+#define F77symm zsymm_
+#define F77syrk zsyrk_
+#define F77syr2k zsyr2k_
+#define F77trmm ztrmm_
+#define F77trsm ztrsm_
+
+#endif /* SREAL / DREAL / SCPLX / DCPLX (Add_ / Add__) */
+
+#endif /* NoChange / UpCase / Add_ / Add__; if none is defined, no F77<op> macro is defined here */
+
+#ifdef TREAL /* prototypes for the vector routines; real builds declare rotg with four in/out scalars plus rotmg */
+F77_VOID_FUN F77rotg
+( F77_SINOUT_DECL, F77_SINOUT_DECL, F77_SINOUT_DECL, F77_SINOUT_DECL );
+F77_VOID_FUN F77rotmg
+( F77_SINOUT_DECL, F77_SINOUT_DECL, F77_SINOUT_DECL, F77_SIN_DECL,
+ F77_VINOUT_DECL );
+#else /* non-real builds: only rotg, and its second argument is input-only */
+F77_VOID_FUN F77rotg
+( F77_SINOUT_DECL, F77_SIN_DECL, F77_SINOUT_DECL, F77_SINOUT_DECL );
+#endif
+F77_VOID_FUN F77nrm2 /* void, with a trailing in/out real scalar; NOTE(review): presumably receives the result - confirm */
+( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_RINOUT_DECL );
+F77_VOID_FUN F77asum /* same shape as F77nrm2 */
+( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_RINOUT_DECL );
+F77_INT_FUN F77amax /* the only prototype in this header that returns a value (int) */
+( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL );
+F77_VOID_FUN F77scal
+( F77_INT_DECL, F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL );
+#ifdef TCPLX /* rscal takes a real scalar (F77_RIN_DECL) and is declared only for TCPLX builds */
+F77_VOID_FUN F77rscal
+( F77_INT_DECL, F77_RIN_DECL, F77_VINOUT_DECL, F77_INT_DECL );
+#endif
+void F77set /* plain void rather than F77_VOID_FUN: no 'extern', and no 'fortran' keyword on CRAY */
+( F77_INT_DECL, F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL );
+void F77axpby /* plain void, like F77set */
+( F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL );
+F77_VOID_FUN F77axpy
+( F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_VINOUT_DECL, F77_INT_DECL );
+F77_VOID_FUN F77copy
+( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL );
+F77_VOID_FUN F77swap
+( F77_INT_DECL, F77_VINOUT_DECL, F77_INT_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL );
+F77_VOID_FUN F77rot
+( F77_INT_DECL, F77_VINOUT_DECL, F77_INT_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL, F77_SIN_DECL, F77_SIN_DECL );
+#ifdef TREAL /* rotm is declared for real builds only */
+F77_VOID_FUN F77rotm
+( F77_INT_DECL, F77_VINOUT_DECL, F77_INT_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL, F77_VIN_DECL );
+#endif
+#ifdef TREAL /* dot products: dot for real builds (plus dsdot/sdsdot under SREAL), dotc/dotu otherwise */
+F77_VOID_FUN F77dot
+( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_SINOUT_DECL );
+#ifdef SREAL
+F77_VOID_FUN F77dsdot /* last argument is double * rather than a TYPE-based macro */
+( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VIN_DECL,
+ F77_INT_DECL, double * );
+F77_VOID_FUN F77sdsdot /* takes an extra input scalar after the length */
+( F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_VIN_DECL, F77_INT_DECL, F77_SINOUT_DECL );
+#endif
+#else
+F77_VOID_FUN F77dotc
+( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_SINOUT_DECL );
+F77_VOID_FUN F77dotu
+( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_SINOUT_DECL );
+#endif
+
+F77_VOID_FUN F77gbmv /* matrix-vector prototypes start here; each ends with F77_n_CHAR, n = number of F77_CHAR_DECL arguments */
+( F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_INT_DECL, F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL,
+ F77_VIN_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL F77_1_CHAR ); /* no comma before F77_1_CHAR: the macro supplies its own when non-empty */
+F77_VOID_FUN F77gemv
+( F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL,
+ F77_VIN_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL F77_1_CHAR );
+#ifdef TREAL /* real builds: ger and the s* routines; the #else branch declares gerc/geru and the h* routines */
+F77_VOID_FUN F77ger /* no character argument, hence no F77_n_CHAR suffix */
+( F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL,
+ F77_INT_DECL );
+F77_VOID_FUN F77sbmv
+( F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL,
+ F77_MIN_DECL, F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR );
+F77_VOID_FUN F77spmv /* no F77_INT_DECL follows the matrix argument here, unlike F77symv */
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL,
+ F77_MIN_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR );
+F77_VOID_FUN F77symv
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL,
+ F77_MIN_DECL, F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR );
+F77_VOID_FUN F77spr
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_MINOUT_DECL F77_1_CHAR );
+F77_VOID_FUN F77syr
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_MINOUT_DECL, F77_INT_DECL F77_1_CHAR );
+F77_VOID_FUN F77spr2
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL
+ F77_1_CHAR );
+F77_VOID_FUN F77syr2
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL,
+ F77_INT_DECL F77_1_CHAR );
+#else
+F77_VOID_FUN F77gerc /* gerc and geru share one parameter list */
+( F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL,
+ F77_INT_DECL );
+F77_VOID_FUN F77geru
+( F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL,
+ F77_INT_DECL );
+F77_VOID_FUN F77hbmv
+( F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL,
+ F77_MIN_DECL, F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR );
+F77_VOID_FUN F77hpmv
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL,
+ F77_MIN_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR );
+F77_VOID_FUN F77hemv
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL,
+ F77_MIN_DECL, F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR );
+F77_VOID_FUN F77hpr /* hpr and her take a real scalar (F77_RIN_DECL); hpr2 and her2 take F77_SIN_DECL */
+( F77_CHAR_DECL, F77_INT_DECL, F77_RIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_MINOUT_DECL F77_1_CHAR );
+F77_VOID_FUN F77her
+( F77_CHAR_DECL, F77_INT_DECL, F77_RIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_MINOUT_DECL, F77_INT_DECL F77_1_CHAR );
+F77_VOID_FUN F77hpr2
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL
+ F77_1_CHAR );
+F77_VOID_FUN F77her2
+( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL,
+ F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL,
+ F77_INT_DECL F77_1_CHAR );
+#endif
+F77_VOID_FUN F77tbmv /* the six t* routines are declared for every build and take three character arguments */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL,
+ F77_INT_DECL, F77_MIN_DECL, F77_INT_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL F77_3_CHAR );
+F77_VOID_FUN F77tpmv
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL,
+ F77_MIN_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL F77_3_CHAR );
+F77_VOID_FUN F77trmv
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL,
+ F77_MIN_DECL, F77_INT_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL F77_3_CHAR );
+F77_VOID_FUN F77tbsv /* same parameter list as F77tbmv */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL,
+ F77_INT_DECL, F77_MIN_DECL, F77_INT_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL F77_3_CHAR );
+F77_VOID_FUN F77tpsv /* same parameter list as F77tpmv */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL,
+ F77_MIN_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL F77_3_CHAR );
+F77_VOID_FUN F77trsv /* same parameter list as F77trmv */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL,
+ F77_MIN_DECL, F77_INT_DECL, F77_VINOUT_DECL,
+ F77_INT_DECL F77_3_CHAR );
+
+F77_VOID_FUN F77gemm /* matrix-matrix prototypes; none of them is guarded by TREAL/TCPLX */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_INT_DECL, F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL,
+ F77_MIN_DECL, F77_INT_DECL, F77_SIN_DECL, F77_MINOUT_DECL,
+ F77_INT_DECL F77_2_CHAR );
+F77_VOID_FUN F77hemm /* NOTE(review): F77hemm, F77her2k and F77herk are only #defined for SCPLX/DCPLX, so other builds declare these literal names */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_MIN_DECL,
+ F77_INT_DECL, F77_SIN_DECL, F77_MINOUT_DECL, F77_INT_DECL
+ F77_2_CHAR );
+F77_VOID_FUN F77her2k /* second scalar is real (F77_RIN_DECL); F77syr2k uses F77_SIN_DECL there */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_MIN_DECL,
+ F77_INT_DECL, F77_RIN_DECL, F77_MINOUT_DECL, F77_INT_DECL
+ F77_2_CHAR );
+F77_VOID_FUN F77herk /* both scalars are real (F77_RIN_DECL); F77syrk uses F77_SIN_DECL for both */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_RIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_RIN_DECL,
+ F77_MINOUT_DECL, F77_INT_DECL F77_2_CHAR );
+F77_VOID_FUN F77symm /* same parameter list as F77hemm */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_MIN_DECL,
+ F77_INT_DECL, F77_SIN_DECL, F77_MINOUT_DECL, F77_INT_DECL
+ F77_2_CHAR );
+F77_VOID_FUN F77syr2k
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_MIN_DECL,
+ F77_INT_DECL, F77_SIN_DECL, F77_MINOUT_DECL, F77_INT_DECL
+ F77_2_CHAR );
+F77_VOID_FUN F77syrk
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL,
+ F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_SIN_DECL,
+ F77_MINOUT_DECL, F77_INT_DECL F77_2_CHAR );
+F77_VOID_FUN F77trmm /* trmm and trsm share one parameter list with four character arguments */
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL,
+ F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_MIN_DECL,
+ F77_INT_DECL, F77_MINOUT_DECL, F77_INT_DECL F77_4_CHAR );
+F77_VOID_FUN F77trsm
+( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL,
+ F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_MIN_DECL,
+ F77_INT_DECL, F77_MINOUT_DECL, F77_INT_DECL F77_4_CHAR );
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_f77wrap.h b/kaldi_io/src/tools/ATLAS/include/atlas_f77wrap.h
new file mode 100644
index 0000000..db6099c
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_f77wrap.h
@@ -0,0 +1,1088 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_F77WRAP_H
+#define ATLAS_F77WRAP_H
+/*
+ * =====================================================================
+ * Include Files
+ * =====================================================================
+ */
+#include "atlas_misc.h"
+#include "atlas_f77.h"
+/*
+ * =====================================================================
+ * Multi-threaded/reference implementation function names re-definition
+ * =====================================================================
+ *
+ * Uncomment the following definition macros to call the multi-threaded
+ * implementation or define those macros at compile time.
+ *
+ * #define USE_L1_PTHREADS
+ * #define USE_L2_PTHREADS
+ * #define USE_L3_PTHREADS
+ *
+ * Uncomment the following definition macros to call the reference im-
+ * plementation or define those macros at compile time.
+ *
+ * #define USE_L1_REFERENCE
+ * #define USE_L2_REFERENCE
+ * #define USE_L3_REFERENCE
+ *
+ * =====================================================================
+ */
+
+/*
+ * ATL_USEPTHREADS turns on USE_L3_PTHREADS, which makes the Level 3
+ * section below include the atlas_pt* headers. The macro is only
+ * defined here when the build has not already supplied it (e.g. with
+ * -DUSE_L3_PTHREADS), so a compile-time definition is not redefined
+ * with a different replacement list.
+ */
+#if defined( ATL_USEPTHREADS ) && !defined( USE_L3_PTHREADS )
+#define USE_L3_PTHREADS
+#endif
+
+/*
+ * =====================================================================
+ * ATLAS Levels 1, 2 and 3 Prototypes
+ * =====================================================================
+ */
+/*
+ * Level 1 header selection. USE_L1_PTHREADS is tested first, so it
+ * wins when both USE_L1_PTHREADS and USE_L1_REFERENCE are defined;
+ * with neither defined, atlas_level1.h is included.
+ */
+#if defined( USE_L1_PTHREADS )
+#include "atlas_ptalias1.h"
+#include "atlas_ptlevel1.h"
+#elif defined( USE_L1_REFERENCE )
+#include "atlas_refalias1.h"
+#include "atlas_reflevel1.h"
+#else
+#include "atlas_level1.h"
+#endif
+
+/*
+ * Level 2 header selection. USE_L2_PTHREADS is tested first, so it
+ * wins when both USE_L2_PTHREADS and USE_L2_REFERENCE are defined;
+ * with neither defined, atlas_level2.h is included.
+ */
+#if defined( USE_L2_PTHREADS )
+#include "atlas_ptalias2.h"
+#include "atlas_ptlevel2.h"
+#elif defined( USE_L2_REFERENCE )
+#include "atlas_refalias2.h"
+#include "atlas_reflevel2.h"
+#else
+#include "atlas_level2.h"
+#endif
+
+/*
+ * Level 3 header selection. USE_L3_PTHREADS is tested first, so it
+ * wins when both USE_L3_PTHREADS and USE_L3_REFERENCE are defined;
+ * with neither defined, atlas_level3.h is included.
+ */
+#if defined( USE_L3_PTHREADS )
+#include "atlas_ptalias3.h"
+#include "atlas_ptlevel3.h"
+#elif defined( USE_L3_REFERENCE )
+#include "atlas_refalias3.h"
+#include "atlas_reflevel3.h"
+#else
+#include "atlas_level3.h"
+#endif
+/*
+ * =====================================================================
+ * #define macro constants
+ * =====================================================================
+ */
+/*
+ * Name prefixes for the wrapper routines, built with Mjoin (defined
+ * outside this view; presumably token concatenation -- confirm).
+ *
+ * PATLF77WRAP: ATL_F77wrap_ joined with PRE.
+ * ATLPUF77WRAP: same as PATLF77WRAP when TREAL is defined; otherwise
+ * ATL_F77wrap_, PRE, then UPR.
+ * ATLUPF77WRAP: same as PATLF77WRAP when TREAL is defined; otherwise
+ * ATL_F77wrap_, UPR, then PRE.
+ */
+#define PATLF77WRAP Mjoin( ATL_F77wrap_, PRE )
+
+#ifdef TREAL
+#define ATLPUF77WRAP Mjoin( ATL_F77wrap_, PRE )
+#define ATLUPF77WRAP Mjoin( ATL_F77wrap_, PRE )
+#else
+#define ATLPUF77WRAP Mjoin( Mjoin( ATL_F77wrap_, PRE ), UPR )
+#define ATLUPF77WRAP Mjoin( Mjoin( ATL_F77wrap_, UPR ), PRE )
+#endif
+
+/*
+ * Integer codes for option arguments, one decade per option kind
+ * (meanings read from the macro names; the values presumably mirror
+ * the ATLAS enumerations -- confirm against atlas_enum.h).
+ */
+/* 11x: transposition */
+#define F77_INOTRAN 111
+#define F77_ITRAN 112
+#define F77_ICOTRAN 113
+
+/* 12x: upper / lower */
+#define F77_IUPPER 121
+#define F77_ILOWER 122
+
+/* 13x: non-unit / unit */
+#define F77_INONUNIT 131
+#define F77_IUNIT 132
+
+/* 14x: left / right */
+#define F77_ILEFT 141
+#define F77_IRIGHT 142
+/*
+ * =====================================================================
+ * #define macro functions
+ * =====================================================================
+ */
+/*
+ * Vector pointer adjustment helpers. n_ and incx_ are pointers (they
+ * are dereferenced) to the element count and the increment; x_ is the
+ * vector pointer.
+ *
+ * V1N: x_ moved by (1-n)*incx when n > 0, otherwise x_ unchanged.
+ * VN1: x_ moved by (n-1)*incx when n > 0, otherwise x_ unchanged.
+ * W1N: V1N when incx < 0, otherwise x_ unchanged.
+ *
+ * Without TREAL (presumably the complex types) the offset is doubled.
+ * The doubling is written as a multiplication because the offset can
+ * be negative, and left-shifting a negative value is undefined
+ * behaviour in C. Every macro argument is parenthesised before it is
+ * dereferenced so that pointer expressions such as p + 1 expand
+ * correctly.
+ */
+#ifdef TREAL
+#define V1N( n_, x_, incx_ ) \
+ ( (*(n_)) > 0 ? (x_)+(1-(*(n_)))*(*(incx_)) : (x_) )
+#define VN1( n_, x_, incx_ ) \
+ ( (*(n_)) > 0 ? (x_)+((*(n_))-1)*(*(incx_)) : (x_) )
+#else
+#define V1N( n_, x_, incx_ ) \
+ ( (*(n_)) > 0 ? (x_)+( ( (1-(*(n_)))*(*(incx_)) ) * 2 ) : (x_) )
+#define VN1( n_, x_, incx_ ) \
+ ( (*(n_)) > 0 ? (x_)+( ( ((*(n_))-1)*(*(incx_)) ) * 2 ) : (x_) )
+#endif
+/* Identical for both cases: only a negative increment moves the pointer. */
+#define W1N( n_, x_, incx_ ) \
+ ( (*(incx_)) < 0 ? V1N( n_, x_, incx_ ) : (x_) )
+/*
+ * =====================================================================
+ * FORTRAN <-> C interface
+ * =====================================================================
+ *
+ * These macros identify how these wrappers will be called, as follows:
+ *
+ * Add_: the FORTRAN compiler expects the name of C functions to be
+ * in all lower case and to have an underscore postfixed to it (Suns, Intel
+ * compilers expect this).
+ *
+ * NoChange: the FORTRAN compiler expects the name of C functions to be
+ * in all lower case (IBM RS6K compilers do this).
+ *
+ * UpCase: the FORTRAN compiler expects the name of C functions to be
+ * in all upcase. (Cray compilers expect this).
+ *
+ * Add__: the FORTRAN compiler in use is f2c, a FORTRAN to C conver-
+ * ter.
+ */
+#if defined( Add_ )
+/*
+ * These defines set up the naming scheme required to have a FORTRAN
+ * routine calling a C routine.
+ *
+ * FORTRAN CALL C declaration
+ * CALL ATL_F77WRAP_SGEMM(...) void atl_f77wrap_sgemm_(...)
+ *
+ * This is the default.
+ */
+/*
+ * Add_ name table. The first of SREAL, DREAL, SCPLX and DCPLX that is
+ * defined selects which ATL_F77wrap_* names are remapped to their
+ * lower-case spelling with one trailing underscore; if none of them is
+ * defined, nothing is remapped here.
+ */
+#if defined( SREAL )
+/* SREAL names (presumably single precision real) */
+
+#define ATL_F77wrap_srotg atl_f77wrap_srotg_
+#define ATL_F77wrap_srotmg atl_f77wrap_srotmg_
+#define ATL_F77wrap_snrm2 atl_f77wrap_snrm2_
+#define ATL_F77wrap_sasum atl_f77wrap_sasum_
+#define ATL_F77wrap_sscal atl_f77wrap_sscal_
+#define ATL_F77wrap_isamax atl_f77wrap_isamax_
+#define ATL_F77wrap_saxpy atl_f77wrap_saxpy_
+#define ATL_F77wrap_scopy atl_f77wrap_scopy_
+#define ATL_F77wrap_sswap atl_f77wrap_sswap_
+#define ATL_F77wrap_srot atl_f77wrap_srot_
+#define ATL_F77wrap_srotm atl_f77wrap_srotm_
+#define ATL_F77wrap_sdot atl_f77wrap_sdot_
+#define ATL_F77wrap_dsdot atl_f77wrap_dsdot_
+#define ATL_F77wrap_sdsdot atl_f77wrap_sdsdot_
+
+#define ATL_F77wrap_sgbmv atl_f77wrap_sgbmv_
+#define ATL_F77wrap_sgemv atl_f77wrap_sgemv_
+#define ATL_F77wrap_sger atl_f77wrap_sger_
+#define ATL_F77wrap_ssbmv atl_f77wrap_ssbmv_
+#define ATL_F77wrap_sspmv atl_f77wrap_sspmv_
+#define ATL_F77wrap_ssymv atl_f77wrap_ssymv_
+#define ATL_F77wrap_sspr atl_f77wrap_sspr_
+#define ATL_F77wrap_ssyr atl_f77wrap_ssyr_
+#define ATL_F77wrap_sspr2 atl_f77wrap_sspr2_
+#define ATL_F77wrap_ssyr2 atl_f77wrap_ssyr2_
+#define ATL_F77wrap_stbmv atl_f77wrap_stbmv_
+#define ATL_F77wrap_stpmv atl_f77wrap_stpmv_
+#define ATL_F77wrap_strmv atl_f77wrap_strmv_
+#define ATL_F77wrap_stbsv atl_f77wrap_stbsv_
+#define ATL_F77wrap_stpsv atl_f77wrap_stpsv_
+#define ATL_F77wrap_strsv atl_f77wrap_strsv_
+
+#define ATL_F77wrap_sgemm atl_f77wrap_sgemm_
+#define ATL_F77wrap_ssymm atl_f77wrap_ssymm_
+#define ATL_F77wrap_ssyrk atl_f77wrap_ssyrk_
+#define ATL_F77wrap_ssyr2k atl_f77wrap_ssyr2k_
+#define ATL_F77wrap_strmm atl_f77wrap_strmm_
+#define ATL_F77wrap_strsm atl_f77wrap_strsm_
+
+#elif defined( DREAL )
+/* DREAL names (presumably double precision real) */
+
+#define ATL_F77wrap_drotg atl_f77wrap_drotg_
+#define ATL_F77wrap_drotmg atl_f77wrap_drotmg_
+#define ATL_F77wrap_dnrm2 atl_f77wrap_dnrm2_
+#define ATL_F77wrap_dasum atl_f77wrap_dasum_
+#define ATL_F77wrap_dscal atl_f77wrap_dscal_
+#define ATL_F77wrap_idamax atl_f77wrap_idamax_
+#define ATL_F77wrap_daxpy atl_f77wrap_daxpy_
+#define ATL_F77wrap_dcopy atl_f77wrap_dcopy_
+#define ATL_F77wrap_dswap atl_f77wrap_dswap_
+#define ATL_F77wrap_drot atl_f77wrap_drot_
+#define ATL_F77wrap_drotm atl_f77wrap_drotm_
+#define ATL_F77wrap_ddot atl_f77wrap_ddot_
+
+#define ATL_F77wrap_dgbmv atl_f77wrap_dgbmv_
+#define ATL_F77wrap_dgemv atl_f77wrap_dgemv_
+#define ATL_F77wrap_dger atl_f77wrap_dger_
+#define ATL_F77wrap_dsbmv atl_f77wrap_dsbmv_
+#define ATL_F77wrap_dspmv atl_f77wrap_dspmv_
+#define ATL_F77wrap_dsymv atl_f77wrap_dsymv_
+#define ATL_F77wrap_dspr atl_f77wrap_dspr_
+#define ATL_F77wrap_dsyr atl_f77wrap_dsyr_
+#define ATL_F77wrap_dspr2 atl_f77wrap_dspr2_
+#define ATL_F77wrap_dsyr2 atl_f77wrap_dsyr2_
+#define ATL_F77wrap_dtbmv atl_f77wrap_dtbmv_
+#define ATL_F77wrap_dtpmv atl_f77wrap_dtpmv_
+#define ATL_F77wrap_dtrmv atl_f77wrap_dtrmv_
+#define ATL_F77wrap_dtbsv atl_f77wrap_dtbsv_
+#define ATL_F77wrap_dtpsv atl_f77wrap_dtpsv_
+#define ATL_F77wrap_dtrsv atl_f77wrap_dtrsv_
+
+#define ATL_F77wrap_dgemm atl_f77wrap_dgemm_
+#define ATL_F77wrap_dsymm atl_f77wrap_dsymm_
+#define ATL_F77wrap_dsyrk atl_f77wrap_dsyrk_
+#define ATL_F77wrap_dsyr2k atl_f77wrap_dsyr2k_
+#define ATL_F77wrap_dtrmm atl_f77wrap_dtrmm_
+#define ATL_F77wrap_dtrsm atl_f77wrap_dtrsm_
+
+#elif defined( SCPLX )
+/* SCPLX names (presumably single precision complex) */
+
+#define ATL_F77wrap_crotg atl_f77wrap_crotg_
+#define ATL_F77wrap_scnrm2 atl_f77wrap_scnrm2_
+#define ATL_F77wrap_scasum atl_f77wrap_scasum_
+#define ATL_F77wrap_cscal atl_f77wrap_cscal_
+#define ATL_F77wrap_csscal atl_f77wrap_csscal_
+#define ATL_F77wrap_icamax atl_f77wrap_icamax_
+#define ATL_F77wrap_caxpy atl_f77wrap_caxpy_
+#define ATL_F77wrap_ccopy atl_f77wrap_ccopy_
+#define ATL_F77wrap_cswap atl_f77wrap_cswap_
+#define ATL_F77wrap_csrot atl_f77wrap_csrot_
+#define ATL_F77wrap_cdotc atl_f77wrap_cdotc_
+#define ATL_F77wrap_cdotu atl_f77wrap_cdotu_
+
+#define ATL_F77wrap_cgbmv atl_f77wrap_cgbmv_
+#define ATL_F77wrap_cgemv atl_f77wrap_cgemv_
+#define ATL_F77wrap_cgerc atl_f77wrap_cgerc_
+#define ATL_F77wrap_cgeru atl_f77wrap_cgeru_
+#define ATL_F77wrap_chbmv atl_f77wrap_chbmv_
+#define ATL_F77wrap_chpmv atl_f77wrap_chpmv_
+#define ATL_F77wrap_chemv atl_f77wrap_chemv_
+#define ATL_F77wrap_chpr atl_f77wrap_chpr_
+#define ATL_F77wrap_cher atl_f77wrap_cher_
+#define ATL_F77wrap_chpr2 atl_f77wrap_chpr2_
+#define ATL_F77wrap_cher2 atl_f77wrap_cher2_
+#define ATL_F77wrap_ctbmv atl_f77wrap_ctbmv_
+#define ATL_F77wrap_ctpmv atl_f77wrap_ctpmv_
+#define ATL_F77wrap_ctrmv atl_f77wrap_ctrmv_
+#define ATL_F77wrap_ctbsv atl_f77wrap_ctbsv_
+#define ATL_F77wrap_ctpsv atl_f77wrap_ctpsv_
+#define ATL_F77wrap_ctrsv atl_f77wrap_ctrsv_
+
+#define ATL_F77wrap_cgemm atl_f77wrap_cgemm_
+#define ATL_F77wrap_chemm atl_f77wrap_chemm_
+#define ATL_F77wrap_cherk atl_f77wrap_cherk_
+#define ATL_F77wrap_cher2k atl_f77wrap_cher2k_
+#define ATL_F77wrap_csymm atl_f77wrap_csymm_
+#define ATL_F77wrap_csyrk atl_f77wrap_csyrk_
+#define ATL_F77wrap_csyr2k atl_f77wrap_csyr2k_
+#define ATL_F77wrap_ctrmm atl_f77wrap_ctrmm_
+#define ATL_F77wrap_ctrsm atl_f77wrap_ctrsm_
+
+#elif defined( DCPLX )
+/* DCPLX names (presumably double precision complex) */
+
+#define ATL_F77wrap_zrotg atl_f77wrap_zrotg_
+#define ATL_F77wrap_dznrm2 atl_f77wrap_dznrm2_
+#define ATL_F77wrap_dzasum atl_f77wrap_dzasum_
+#define ATL_F77wrap_zscal atl_f77wrap_zscal_
+#define ATL_F77wrap_zdscal atl_f77wrap_zdscal_
+#define ATL_F77wrap_izamax atl_f77wrap_izamax_
+#define ATL_F77wrap_zaxpy atl_f77wrap_zaxpy_
+#define ATL_F77wrap_zcopy atl_f77wrap_zcopy_
+#define ATL_F77wrap_zswap atl_f77wrap_zswap_
+#define ATL_F77wrap_zdrot atl_f77wrap_zdrot_
+#define ATL_F77wrap_zdotc atl_f77wrap_zdotc_
+#define ATL_F77wrap_zdotu atl_f77wrap_zdotu_
+
+#define ATL_F77wrap_zgbmv atl_f77wrap_zgbmv_
+#define ATL_F77wrap_zgemv atl_f77wrap_zgemv_
+#define ATL_F77wrap_zgerc atl_f77wrap_zgerc_
+#define ATL_F77wrap_zgeru atl_f77wrap_zgeru_
+#define ATL_F77wrap_zhbmv atl_f77wrap_zhbmv_
+#define ATL_F77wrap_zhpmv atl_f77wrap_zhpmv_
+#define ATL_F77wrap_zhemv atl_f77wrap_zhemv_
+#define ATL_F77wrap_zhpr atl_f77wrap_zhpr_
+#define ATL_F77wrap_zher atl_f77wrap_zher_
+#define ATL_F77wrap_zhpr2 atl_f77wrap_zhpr2_
+#define ATL_F77wrap_zher2 atl_f77wrap_zher2_
+#define ATL_F77wrap_ztbmv atl_f77wrap_ztbmv_
+#define ATL_F77wrap_ztpmv atl_f77wrap_ztpmv_
+#define ATL_F77wrap_ztrmv atl_f77wrap_ztrmv_
+#define ATL_F77wrap_ztbsv atl_f77wrap_ztbsv_
+#define ATL_F77wrap_ztpsv atl_f77wrap_ztpsv_
+#define ATL_F77wrap_ztrsv atl_f77wrap_ztrsv_
+
+#define ATL_F77wrap_zgemm atl_f77wrap_zgemm_
+#define ATL_F77wrap_zhemm atl_f77wrap_zhemm_
+#define ATL_F77wrap_zherk atl_f77wrap_zherk_
+#define ATL_F77wrap_zher2k atl_f77wrap_zher2k_
+#define ATL_F77wrap_zsymm atl_f77wrap_zsymm_
+#define ATL_F77wrap_zsyrk atl_f77wrap_zsyrk_
+#define ATL_F77wrap_zsyr2k atl_f77wrap_zsyr2k_
+#define ATL_F77wrap_ztrmm atl_f77wrap_ztrmm_
+#define ATL_F77wrap_ztrsm atl_f77wrap_ztrsm_
+
+#endif
+
+#elif defined( UpCase )
+/*
+ * These defines set up the naming scheme required to have a FORTRAN
+ * routine calling a C routine with the following interface:
+ *
+ * FORTRAN CALL C declaration
+ * CALL ATL_F77WRAP_SGEMM(...) void ATL_F77WRAP_SGEMM(...)
+ *
+ */
+/*
+ * UpCase name table. The first of SREAL, DREAL, SCPLX and DCPLX that
+ * is defined selects which ATL_F77wrap_* names are remapped to their
+ * all-upper-case spelling (no underscore is appended); if none of
+ * them is defined, nothing is remapped here.
+ */
+#if defined( SREAL )
+/* SREAL names (presumably single precision real) */
+
+#define ATL_F77wrap_srotg ATL_F77WRAP_SROTG
+#define ATL_F77wrap_srotmg ATL_F77WRAP_SROTMG
+#define ATL_F77wrap_snrm2 ATL_F77WRAP_SNRM2
+#define ATL_F77wrap_sasum ATL_F77WRAP_SASUM
+#define ATL_F77wrap_sscal ATL_F77WRAP_SSCAL
+#define ATL_F77wrap_isamax ATL_F77WRAP_ISAMAX
+#define ATL_F77wrap_saxpy ATL_F77WRAP_SAXPY
+#define ATL_F77wrap_scopy ATL_F77WRAP_SCOPY
+#define ATL_F77wrap_sswap ATL_F77WRAP_SSWAP
+#define ATL_F77wrap_srot ATL_F77WRAP_SROT
+#define ATL_F77wrap_srotm ATL_F77WRAP_SROTM
+#define ATL_F77wrap_sdot ATL_F77WRAP_SDOT
+#define ATL_F77wrap_dsdot ATL_F77WRAP_DSDOT
+#define ATL_F77wrap_sdsdot ATL_F77WRAP_SDSDOT
+
+#define ATL_F77wrap_sgbmv ATL_F77WRAP_SGBMV
+#define ATL_F77wrap_sgemv ATL_F77WRAP_SGEMV
+#define ATL_F77wrap_sger ATL_F77WRAP_SGER
+#define ATL_F77wrap_ssbmv ATL_F77WRAP_SSBMV
+#define ATL_F77wrap_sspmv ATL_F77WRAP_SSPMV
+#define ATL_F77wrap_ssymv ATL_F77WRAP_SSYMV
+#define ATL_F77wrap_sspr ATL_F77WRAP_SSPR
+#define ATL_F77wrap_ssyr ATL_F77WRAP_SSYR
+#define ATL_F77wrap_sspr2 ATL_F77WRAP_SSPR2
+#define ATL_F77wrap_ssyr2 ATL_F77WRAP_SSYR2
+#define ATL_F77wrap_stbmv ATL_F77WRAP_STBMV
+#define ATL_F77wrap_stpmv ATL_F77WRAP_STPMV
+#define ATL_F77wrap_strmv ATL_F77WRAP_STRMV
+#define ATL_F77wrap_stbsv ATL_F77WRAP_STBSV
+#define ATL_F77wrap_stpsv ATL_F77WRAP_STPSV
+#define ATL_F77wrap_strsv ATL_F77WRAP_STRSV
+
+#define ATL_F77wrap_sgemm ATL_F77WRAP_SGEMM
+#define ATL_F77wrap_ssymm ATL_F77WRAP_SSYMM
+#define ATL_F77wrap_ssyrk ATL_F77WRAP_SSYRK
+#define ATL_F77wrap_ssyr2k ATL_F77WRAP_SSYR2K
+#define ATL_F77wrap_strmm ATL_F77WRAP_STRMM
+#define ATL_F77wrap_strsm ATL_F77WRAP_STRSM
+
+#elif defined( DREAL )
+/* DREAL names (presumably double precision real) */
+
+#define ATL_F77wrap_drotg ATL_F77WRAP_DROTG
+#define ATL_F77wrap_drotmg ATL_F77WRAP_DROTMG
+#define ATL_F77wrap_dnrm2 ATL_F77WRAP_DNRM2
+#define ATL_F77wrap_dasum ATL_F77WRAP_DASUM
+#define ATL_F77wrap_dscal ATL_F77WRAP_DSCAL
+#define ATL_F77wrap_idamax ATL_F77WRAP_IDAMAX
+#define ATL_F77wrap_daxpy ATL_F77WRAP_DAXPY
+#define ATL_F77wrap_dcopy ATL_F77WRAP_DCOPY
+#define ATL_F77wrap_dswap ATL_F77WRAP_DSWAP
+#define ATL_F77wrap_drot ATL_F77WRAP_DROT
+#define ATL_F77wrap_drotm ATL_F77WRAP_DROTM
+#define ATL_F77wrap_ddot ATL_F77WRAP_DDOT
+
+#define ATL_F77wrap_dgbmv ATL_F77WRAP_DGBMV
+#define ATL_F77wrap_dgemv ATL_F77WRAP_DGEMV
+#define ATL_F77wrap_dger ATL_F77WRAP_DGER
+#define ATL_F77wrap_dsbmv ATL_F77WRAP_DSBMV
+#define ATL_F77wrap_dspmv ATL_F77WRAP_DSPMV
+#define ATL_F77wrap_dsymv ATL_F77WRAP_DSYMV
+#define ATL_F77wrap_dspr ATL_F77WRAP_DSPR
+#define ATL_F77wrap_dsyr ATL_F77WRAP_DSYR
+#define ATL_F77wrap_dspr2 ATL_F77WRAP_DSPR2
+#define ATL_F77wrap_dsyr2 ATL_F77WRAP_DSYR2
+#define ATL_F77wrap_dtbmv ATL_F77WRAP_DTBMV
+#define ATL_F77wrap_dtpmv ATL_F77WRAP_DTPMV
+#define ATL_F77wrap_dtrmv ATL_F77WRAP_DTRMV
+#define ATL_F77wrap_dtbsv ATL_F77WRAP_DTBSV
+#define ATL_F77wrap_dtpsv ATL_F77WRAP_DTPSV
+#define ATL_F77wrap_dtrsv ATL_F77WRAP_DTRSV
+
+#define ATL_F77wrap_dgemm ATL_F77WRAP_DGEMM
+#define ATL_F77wrap_dsymm ATL_F77WRAP_DSYMM
+#define ATL_F77wrap_dsyrk ATL_F77WRAP_DSYRK
+#define ATL_F77wrap_dsyr2k ATL_F77WRAP_DSYR2K
+#define ATL_F77wrap_dtrmm ATL_F77WRAP_DTRMM
+#define ATL_F77wrap_dtrsm ATL_F77WRAP_DTRSM
+
+#elif defined( SCPLX )
+/* SCPLX names (presumably single precision complex) */
+
+#define ATL_F77wrap_crotg ATL_F77WRAP_CROTG
+#define ATL_F77wrap_scnrm2 ATL_F77WRAP_SCNRM2
+#define ATL_F77wrap_scasum ATL_F77WRAP_SCASUM
+#define ATL_F77wrap_cscal ATL_F77WRAP_CSCAL
+#define ATL_F77wrap_csscal ATL_F77WRAP_CSSCAL
+#define ATL_F77wrap_icamax ATL_F77WRAP_ICAMAX
+#define ATL_F77wrap_caxpy ATL_F77WRAP_CAXPY
+#define ATL_F77wrap_ccopy ATL_F77WRAP_CCOPY
+#define ATL_F77wrap_cswap ATL_F77WRAP_CSWAP
+#define ATL_F77wrap_csrot ATL_F77WRAP_CSROT
+#define ATL_F77wrap_cdotc ATL_F77WRAP_CDOTC
+#define ATL_F77wrap_cdotu ATL_F77WRAP_CDOTU
+
+#define ATL_F77wrap_cgbmv ATL_F77WRAP_CGBMV
+#define ATL_F77wrap_cgemv ATL_F77WRAP_CGEMV
+#define ATL_F77wrap_cgerc ATL_F77WRAP_CGERC
+#define ATL_F77wrap_cgeru ATL_F77WRAP_CGERU
+#define ATL_F77wrap_chbmv ATL_F77WRAP_CHBMV
+#define ATL_F77wrap_chpmv ATL_F77WRAP_CHPMV
+#define ATL_F77wrap_chemv ATL_F77WRAP_CHEMV
+#define ATL_F77wrap_chpr ATL_F77WRAP_CHPR
+#define ATL_F77wrap_cher ATL_F77WRAP_CHER
+#define ATL_F77wrap_chpr2 ATL_F77WRAP_CHPR2
+#define ATL_F77wrap_cher2 ATL_F77WRAP_CHER2
+#define ATL_F77wrap_ctbmv ATL_F77WRAP_CTBMV
+#define ATL_F77wrap_ctpmv ATL_F77WRAP_CTPMV
+#define ATL_F77wrap_ctrmv ATL_F77WRAP_CTRMV
+#define ATL_F77wrap_ctbsv ATL_F77WRAP_CTBSV
+#define ATL_F77wrap_ctpsv ATL_F77WRAP_CTPSV
+#define ATL_F77wrap_ctrsv ATL_F77WRAP_CTRSV
+
+#define ATL_F77wrap_cgemm ATL_F77WRAP_CGEMM
+#define ATL_F77wrap_chemm ATL_F77WRAP_CHEMM
+#define ATL_F77wrap_cherk ATL_F77WRAP_CHERK
+#define ATL_F77wrap_cher2k ATL_F77WRAP_CHER2K
+#define ATL_F77wrap_csymm ATL_F77WRAP_CSYMM
+#define ATL_F77wrap_csyrk ATL_F77WRAP_CSYRK
+#define ATL_F77wrap_csyr2k ATL_F77WRAP_CSYR2K
+#define ATL_F77wrap_ctrmm ATL_F77WRAP_CTRMM
+#define ATL_F77wrap_ctrsm ATL_F77WRAP_CTRSM
+
+#elif defined( DCPLX )
+/* DCPLX names (presumably double precision complex) */
+
+#define ATL_F77wrap_zrotg ATL_F77WRAP_ZROTG
+#define ATL_F77wrap_dznrm2 ATL_F77WRAP_DZNRM2
+#define ATL_F77wrap_dzasum ATL_F77WRAP_DZASUM
+#define ATL_F77wrap_zscal ATL_F77WRAP_ZSCAL
+#define ATL_F77wrap_zdscal ATL_F77WRAP_ZDSCAL
+#define ATL_F77wrap_izamax ATL_F77WRAP_IZAMAX
+#define ATL_F77wrap_zaxpy ATL_F77WRAP_ZAXPY
+#define ATL_F77wrap_zcopy ATL_F77WRAP_ZCOPY
+#define ATL_F77wrap_zswap ATL_F77WRAP_ZSWAP
+#define ATL_F77wrap_zdrot ATL_F77WRAP_ZDROT
+#define ATL_F77wrap_zdotc ATL_F77WRAP_ZDOTC
+#define ATL_F77wrap_zdotu ATL_F77WRAP_ZDOTU
+
+#define ATL_F77wrap_zgbmv ATL_F77WRAP_ZGBMV
+#define ATL_F77wrap_zgemv ATL_F77WRAP_ZGEMV
+#define ATL_F77wrap_zgerc ATL_F77WRAP_ZGERC
+#define ATL_F77wrap_zgeru ATL_F77WRAP_ZGERU
+#define ATL_F77wrap_zhbmv ATL_F77WRAP_ZHBMV
+#define ATL_F77wrap_zhpmv ATL_F77WRAP_ZHPMV
+#define ATL_F77wrap_zhemv ATL_F77WRAP_ZHEMV
+#define ATL_F77wrap_zhpr ATL_F77WRAP_ZHPR
+#define ATL_F77wrap_zher ATL_F77WRAP_ZHER
+#define ATL_F77wrap_zhpr2 ATL_F77WRAP_ZHPR2
+#define ATL_F77wrap_zher2 ATL_F77WRAP_ZHER2
+#define ATL_F77wrap_ztbmv ATL_F77WRAP_ZTBMV
+#define ATL_F77wrap_ztpmv ATL_F77WRAP_ZTPMV
+#define ATL_F77wrap_ztrmv ATL_F77WRAP_ZTRMV
+#define ATL_F77wrap_ztbsv ATL_F77WRAP_ZTBSV
+#define ATL_F77wrap_ztpsv ATL_F77WRAP_ZTPSV
+#define ATL_F77wrap_ztrsv ATL_F77WRAP_ZTRSV
+
+#define ATL_F77wrap_zgemm ATL_F77WRAP_ZGEMM
+#define ATL_F77wrap_zhemm ATL_F77WRAP_ZHEMM
+#define ATL_F77wrap_zherk ATL_F77WRAP_ZHERK
+#define ATL_F77wrap_zher2k ATL_F77WRAP_ZHER2K
+#define ATL_F77wrap_zsymm ATL_F77WRAP_ZSYMM
+#define ATL_F77wrap_zsyrk ATL_F77WRAP_ZSYRK
+#define ATL_F77wrap_zsyr2k ATL_F77WRAP_ZSYR2K
+#define ATL_F77wrap_ztrmm ATL_F77WRAP_ZTRMM
+#define ATL_F77wrap_ztrsm ATL_F77WRAP_ZTRSM
+
+#endif
+
+#elif defined( NoChange )
+/*
+ * These defines set up the naming scheme required to have a FORTRAN
+ * routine calling a C routine with the following interface:
+ *
+ * FORTRAN CALL C declaration
+ * CALL ATL_F77WRAP_SGEMM(...) void atl_f77wrap_sgemm(...)
+ */
+/*
+ * NoChange name table. The first of SREAL, DREAL, SCPLX and DCPLX
+ * that is defined selects which ATL_F77wrap_* names are remapped to
+ * their all-lower-case spelling (no underscore is appended); if none
+ * of them is defined, nothing is remapped here.
+ */
+#if defined( SREAL )
+/* SREAL names (presumably single precision real) */
+
+#define ATL_F77wrap_srotg atl_f77wrap_srotg
+#define ATL_F77wrap_srotmg atl_f77wrap_srotmg
+#define ATL_F77wrap_snrm2 atl_f77wrap_snrm2
+#define ATL_F77wrap_sasum atl_f77wrap_sasum
+#define ATL_F77wrap_sscal atl_f77wrap_sscal
+#define ATL_F77wrap_isamax atl_f77wrap_isamax
+#define ATL_F77wrap_saxpy atl_f77wrap_saxpy
+#define ATL_F77wrap_scopy atl_f77wrap_scopy
+#define ATL_F77wrap_sswap atl_f77wrap_sswap
+#define ATL_F77wrap_srot atl_f77wrap_srot
+#define ATL_F77wrap_srotm atl_f77wrap_srotm
+#define ATL_F77wrap_sdot atl_f77wrap_sdot
+#define ATL_F77wrap_dsdot atl_f77wrap_dsdot
+#define ATL_F77wrap_sdsdot atl_f77wrap_sdsdot
+
+#define ATL_F77wrap_sgbmv atl_f77wrap_sgbmv
+#define ATL_F77wrap_sgemv atl_f77wrap_sgemv
+#define ATL_F77wrap_sger atl_f77wrap_sger
+#define ATL_F77wrap_ssbmv atl_f77wrap_ssbmv
+#define ATL_F77wrap_sspmv atl_f77wrap_sspmv
+#define ATL_F77wrap_ssymv atl_f77wrap_ssymv
+#define ATL_F77wrap_sspr atl_f77wrap_sspr
+#define ATL_F77wrap_ssyr atl_f77wrap_ssyr
+#define ATL_F77wrap_sspr2 atl_f77wrap_sspr2
+#define ATL_F77wrap_ssyr2 atl_f77wrap_ssyr2
+#define ATL_F77wrap_stbmv atl_f77wrap_stbmv
+#define ATL_F77wrap_stpmv atl_f77wrap_stpmv
+#define ATL_F77wrap_strmv atl_f77wrap_strmv
+#define ATL_F77wrap_stbsv atl_f77wrap_stbsv
+#define ATL_F77wrap_stpsv atl_f77wrap_stpsv
+#define ATL_F77wrap_strsv atl_f77wrap_strsv
+
+#define ATL_F77wrap_sgemm atl_f77wrap_sgemm
+#define ATL_F77wrap_ssymm atl_f77wrap_ssymm
+#define ATL_F77wrap_ssyrk atl_f77wrap_ssyrk
+#define ATL_F77wrap_ssyr2k atl_f77wrap_ssyr2k
+#define ATL_F77wrap_strmm atl_f77wrap_strmm
+#define ATL_F77wrap_strsm atl_f77wrap_strsm
+
+#elif defined( DREAL )
+/* DREAL names (presumably double precision real) */
+
+#define ATL_F77wrap_drotg atl_f77wrap_drotg
+#define ATL_F77wrap_drotmg atl_f77wrap_drotmg
+#define ATL_F77wrap_dnrm2 atl_f77wrap_dnrm2
+#define ATL_F77wrap_dasum atl_f77wrap_dasum
+#define ATL_F77wrap_dscal atl_f77wrap_dscal
+#define ATL_F77wrap_idamax atl_f77wrap_idamax
+#define ATL_F77wrap_daxpy atl_f77wrap_daxpy
+#define ATL_F77wrap_dcopy atl_f77wrap_dcopy
+#define ATL_F77wrap_dswap atl_f77wrap_dswap
+#define ATL_F77wrap_drot atl_f77wrap_drot
+#define ATL_F77wrap_drotm atl_f77wrap_drotm
+#define ATL_F77wrap_ddot atl_f77wrap_ddot
+
+#define ATL_F77wrap_dgbmv atl_f77wrap_dgbmv
+#define ATL_F77wrap_dgemv atl_f77wrap_dgemv
+#define ATL_F77wrap_dger atl_f77wrap_dger
+#define ATL_F77wrap_dsbmv atl_f77wrap_dsbmv
+#define ATL_F77wrap_dspmv atl_f77wrap_dspmv
+#define ATL_F77wrap_dsymv atl_f77wrap_dsymv
+#define ATL_F77wrap_dspr atl_f77wrap_dspr
+#define ATL_F77wrap_dsyr atl_f77wrap_dsyr
+#define ATL_F77wrap_dspr2 atl_f77wrap_dspr2
+#define ATL_F77wrap_dsyr2 atl_f77wrap_dsyr2
+#define ATL_F77wrap_dtbmv atl_f77wrap_dtbmv
+#define ATL_F77wrap_dtpmv atl_f77wrap_dtpmv
+#define ATL_F77wrap_dtrmv atl_f77wrap_dtrmv
+#define ATL_F77wrap_dtbsv atl_f77wrap_dtbsv
+#define ATL_F77wrap_dtpsv atl_f77wrap_dtpsv
+#define ATL_F77wrap_dtrsv atl_f77wrap_dtrsv
+
+#define ATL_F77wrap_dgemm atl_f77wrap_dgemm
+#define ATL_F77wrap_dsymm atl_f77wrap_dsymm
+#define ATL_F77wrap_dsyrk atl_f77wrap_dsyrk
+#define ATL_F77wrap_dsyr2k atl_f77wrap_dsyr2k
+#define ATL_F77wrap_dtrmm atl_f77wrap_dtrmm
+#define ATL_F77wrap_dtrsm atl_f77wrap_dtrsm
+
+#elif defined( SCPLX )
+/* SCPLX names (presumably single precision complex) */
+
+#define ATL_F77wrap_crotg atl_f77wrap_crotg
+#define ATL_F77wrap_scnrm2 atl_f77wrap_scnrm2
+#define ATL_F77wrap_scasum atl_f77wrap_scasum
+#define ATL_F77wrap_cscal atl_f77wrap_cscal
+#define ATL_F77wrap_csscal atl_f77wrap_csscal
+#define ATL_F77wrap_icamax atl_f77wrap_icamax
+#define ATL_F77wrap_caxpy atl_f77wrap_caxpy
+#define ATL_F77wrap_ccopy atl_f77wrap_ccopy
+#define ATL_F77wrap_cswap atl_f77wrap_cswap
+#define ATL_F77wrap_csrot atl_f77wrap_csrot
+#define ATL_F77wrap_cdotc atl_f77wrap_cdotc
+#define ATL_F77wrap_cdotu atl_f77wrap_cdotu
+
+#define ATL_F77wrap_cgbmv atl_f77wrap_cgbmv
+#define ATL_F77wrap_cgemv atl_f77wrap_cgemv
+#define ATL_F77wrap_cgerc atl_f77wrap_cgerc
+#define ATL_F77wrap_cgeru atl_f77wrap_cgeru
+#define ATL_F77wrap_chbmv atl_f77wrap_chbmv
+#define ATL_F77wrap_chpmv atl_f77wrap_chpmv
+#define ATL_F77wrap_chemv atl_f77wrap_chemv
+#define ATL_F77wrap_chpr atl_f77wrap_chpr
+#define ATL_F77wrap_cher atl_f77wrap_cher
+#define ATL_F77wrap_chpr2 atl_f77wrap_chpr2
+#define ATL_F77wrap_cher2 atl_f77wrap_cher2
+#define ATL_F77wrap_ctbmv atl_f77wrap_ctbmv
+#define ATL_F77wrap_ctpmv atl_f77wrap_ctpmv
+#define ATL_F77wrap_ctrmv atl_f77wrap_ctrmv
+#define ATL_F77wrap_ctbsv atl_f77wrap_ctbsv
+#define ATL_F77wrap_ctpsv atl_f77wrap_ctpsv
+#define ATL_F77wrap_ctrsv atl_f77wrap_ctrsv
+
+#define ATL_F77wrap_cgemm atl_f77wrap_cgemm
+#define ATL_F77wrap_chemm atl_f77wrap_chemm
+#define ATL_F77wrap_cherk atl_f77wrap_cherk
+#define ATL_F77wrap_cher2k atl_f77wrap_cher2k
+#define ATL_F77wrap_csymm atl_f77wrap_csymm
+#define ATL_F77wrap_csyrk atl_f77wrap_csyrk
+#define ATL_F77wrap_csyr2k atl_f77wrap_csyr2k
+#define ATL_F77wrap_ctrmm atl_f77wrap_ctrmm
+#define ATL_F77wrap_ctrsm atl_f77wrap_ctrsm
+
+#elif defined( DCPLX )
+/* DCPLX names (presumably double precision complex) */
+
+#define ATL_F77wrap_zrotg atl_f77wrap_zrotg
+#define ATL_F77wrap_dznrm2 atl_f77wrap_dznrm2
+#define ATL_F77wrap_dzasum atl_f77wrap_dzasum
+#define ATL_F77wrap_zscal atl_f77wrap_zscal
+#define ATL_F77wrap_zdscal atl_f77wrap_zdscal
+#define ATL_F77wrap_izamax atl_f77wrap_izamax
+#define ATL_F77wrap_zaxpy atl_f77wrap_zaxpy
+#define ATL_F77wrap_zcopy atl_f77wrap_zcopy
+#define ATL_F77wrap_zswap atl_f77wrap_zswap
+#define ATL_F77wrap_zdrot atl_f77wrap_zdrot
+#define ATL_F77wrap_zdotc atl_f77wrap_zdotc
+#define ATL_F77wrap_zdotu atl_f77wrap_zdotu
+
+#define ATL_F77wrap_zgbmv atl_f77wrap_zgbmv
+#define ATL_F77wrap_zgemv atl_f77wrap_zgemv
+#define ATL_F77wrap_zgerc atl_f77wrap_zgerc
+#define ATL_F77wrap_zgeru atl_f77wrap_zgeru
+#define ATL_F77wrap_zhbmv atl_f77wrap_zhbmv
+#define ATL_F77wrap_zhpmv atl_f77wrap_zhpmv
+#define ATL_F77wrap_zhemv atl_f77wrap_zhemv
+#define ATL_F77wrap_zhpr atl_f77wrap_zhpr
+#define ATL_F77wrap_zher atl_f77wrap_zher
+#define ATL_F77wrap_zhpr2 atl_f77wrap_zhpr2
+#define ATL_F77wrap_zher2 atl_f77wrap_zher2
+#define ATL_F77wrap_ztbmv atl_f77wrap_ztbmv
+#define ATL_F77wrap_ztpmv atl_f77wrap_ztpmv
+#define ATL_F77wrap_ztrmv atl_f77wrap_ztrmv
+#define ATL_F77wrap_ztbsv atl_f77wrap_ztbsv
+#define ATL_F77wrap_ztpsv atl_f77wrap_ztpsv
+#define ATL_F77wrap_ztrsv atl_f77wrap_ztrsv
+
+#define ATL_F77wrap_zgemm atl_f77wrap_zgemm
+#define ATL_F77wrap_zhemm atl_f77wrap_zhemm
+#define ATL_F77wrap_zherk atl_f77wrap_zherk
+#define ATL_F77wrap_zher2k atl_f77wrap_zher2k
+#define ATL_F77wrap_zsymm atl_f77wrap_zsymm
+#define ATL_F77wrap_zsyrk atl_f77wrap_zsyrk
+#define ATL_F77wrap_zsyr2k atl_f77wrap_zsyr2k
+#define ATL_F77wrap_ztrmm atl_f77wrap_ztrmm
+#define ATL_F77wrap_ztrsm atl_f77wrap_ztrsm
+
+#endif
+
+#elif defined( Add__ )
+/*
+ * These defines set up the naming scheme required to have a FORTRAN
+ * routine calling a C routine with the following interface:
+ *
+ * FORTRAN CALL C declaration
+ * CALL ATL_F77WRAP_SGEMM(...) void atl_f77wrap_sgemm__(...)
+ */
+#if defined( SREAL )
+
+#define ATL_F77wrap_srotg atl_f77wrap_srotg__
+#define ATL_F77wrap_srotmg atl_f77wrap_srotmg__
+#define ATL_F77wrap_snrm2 atl_f77wrap_snrm2__
+#define ATL_F77wrap_sasum atl_f77wrap_sasum__
+#define ATL_F77wrap_sscal atl_f77wrap_sscal__
+#define ATL_F77wrap_isamax atl_f77wrap_isamax__
+#define ATL_F77wrap_saxpy atl_f77wrap_saxpy__
+#define ATL_F77wrap_scopy atl_f77wrap_scopy__
+#define ATL_F77wrap_sswap atl_f77wrap_sswap__
+#define ATL_F77wrap_srot atl_f77wrap_srot__
+#define ATL_F77wrap_srotm atl_f77wrap_srotm__
+#define ATL_F77wrap_sdot atl_f77wrap_sdot__
+#define ATL_F77wrap_dsdot atl_f77wrap_dsdot__
+#define ATL_F77wrap_sdsdot atl_f77wrap_sdsdot__
+
+#define ATL_F77wrap_sgbmv atl_f77wrap_sgbmv__
+#define ATL_F77wrap_sgemv atl_f77wrap_sgemv__
+#define ATL_F77wrap_sger atl_f77wrap_sger__
+#define ATL_F77wrap_ssbmv atl_f77wrap_ssbmv__
+#define ATL_F77wrap_sspmv atl_f77wrap_sspmv__
+#define ATL_F77wrap_ssymv atl_f77wrap_ssymv__
+#define ATL_F77wrap_sspr atl_f77wrap_sspr__
+#define ATL_F77wrap_ssyr atl_f77wrap_ssyr__
+#define ATL_F77wrap_sspr2 atl_f77wrap_sspr2__
+#define ATL_F77wrap_ssyr2 atl_f77wrap_ssyr2__
+#define ATL_F77wrap_stbmv atl_f77wrap_stbmv__
+#define ATL_F77wrap_stpmv atl_f77wrap_stpmv__
+#define ATL_F77wrap_strmv atl_f77wrap_strmv__
+#define ATL_F77wrap_stbsv atl_f77wrap_stbsv__
+#define ATL_F77wrap_stpsv atl_f77wrap_stpsv__
+#define ATL_F77wrap_strsv atl_f77wrap_strsv__
+
+#define ATL_F77wrap_sgemm atl_f77wrap_sgemm__
+#define ATL_F77wrap_ssymm atl_f77wrap_ssymm__
+#define ATL_F77wrap_ssyrk atl_f77wrap_ssyrk__
+#define ATL_F77wrap_ssyr2k atl_f77wrap_ssyr2k__
+#define ATL_F77wrap_strmm atl_f77wrap_strmm__
+#define ATL_F77wrap_strsm atl_f77wrap_strsm__
+
+#elif defined( DREAL )
+
+#define ATL_F77wrap_drotg atl_f77wrap_drotg__
+#define ATL_F77wrap_drotmg atl_f77wrap_drotmg__
+#define ATL_F77wrap_dnrm2 atl_f77wrap_dnrm2__
+#define ATL_F77wrap_dasum atl_f77wrap_dasum__
+#define ATL_F77wrap_dscal atl_f77wrap_dscal__
+#define ATL_F77wrap_idamax atl_f77wrap_idamax__
+#define ATL_F77wrap_daxpy atl_f77wrap_daxpy__
+#define ATL_F77wrap_dcopy atl_f77wrap_dcopy__
+#define ATL_F77wrap_dswap atl_f77wrap_dswap__
+#define ATL_F77wrap_drot atl_f77wrap_drot__
+#define ATL_F77wrap_drotm atl_f77wrap_drotm__
+#define ATL_F77wrap_ddot atl_f77wrap_ddot__
+
+#define ATL_F77wrap_dgbmv atl_f77wrap_dgbmv__
+#define ATL_F77wrap_dgemv atl_f77wrap_dgemv__
+#define ATL_F77wrap_dger atl_f77wrap_dger__
+#define ATL_F77wrap_dsbmv atl_f77wrap_dsbmv__
+#define ATL_F77wrap_dspmv atl_f77wrap_dspmv__
+#define ATL_F77wrap_dsymv atl_f77wrap_dsymv__
+#define ATL_F77wrap_dspr atl_f77wrap_dspr__
+#define ATL_F77wrap_dsyr atl_f77wrap_dsyr__
+#define ATL_F77wrap_dspr2 atl_f77wrap_dspr2__
+#define ATL_F77wrap_dsyr2 atl_f77wrap_dsyr2__
+#define ATL_F77wrap_dtbmv atl_f77wrap_dtbmv__
+#define ATL_F77wrap_dtpmv atl_f77wrap_dtpmv__
+#define ATL_F77wrap_dtrmv atl_f77wrap_dtrmv__
+#define ATL_F77wrap_dtbsv atl_f77wrap_dtbsv__
+#define ATL_F77wrap_dtpsv atl_f77wrap_dtpsv__
+#define ATL_F77wrap_dtrsv atl_f77wrap_dtrsv__
+
+#define ATL_F77wrap_dgemm atl_f77wrap_dgemm__
+#define ATL_F77wrap_dsymm atl_f77wrap_dsymm__
+#define ATL_F77wrap_dsyrk atl_f77wrap_dsyrk__
+#define ATL_F77wrap_dsyr2k atl_f77wrap_dsyr2k__
+#define ATL_F77wrap_dtrmm atl_f77wrap_dtrmm__
+#define ATL_F77wrap_dtrsm atl_f77wrap_dtrsm__
+
+#elif defined( SCPLX )
+
+#define ATL_F77wrap_crotg atl_f77wrap_crotg__
+#define ATL_F77wrap_scnrm2 atl_f77wrap_scnrm2__
+#define ATL_F77wrap_scasum atl_f77wrap_scasum__
+#define ATL_F77wrap_cscal atl_f77wrap_cscal__
+#define ATL_F77wrap_csscal atl_f77wrap_csscal__
+#define ATL_F77wrap_icamax atl_f77wrap_icamax__
+#define ATL_F77wrap_caxpy atl_f77wrap_caxpy__
+#define ATL_F77wrap_ccopy atl_f77wrap_ccopy__
+#define ATL_F77wrap_cswap atl_f77wrap_cswap__
+#define ATL_F77wrap_csrot atl_f77wrap_csrot__
+#define ATL_F77wrap_cdotc atl_f77wrap_cdotc__
+#define ATL_F77wrap_cdotu atl_f77wrap_cdotu__
+
+#define ATL_F77wrap_cgbmv atl_f77wrap_cgbmv__
+#define ATL_F77wrap_cgemv atl_f77wrap_cgemv__
+#define ATL_F77wrap_cgerc atl_f77wrap_cgerc__
+#define ATL_F77wrap_cgeru atl_f77wrap_cgeru__
+#define ATL_F77wrap_chbmv atl_f77wrap_chbmv__
+#define ATL_F77wrap_chpmv atl_f77wrap_chpmv__
+#define ATL_F77wrap_chemv atl_f77wrap_chemv__
+#define ATL_F77wrap_chpr atl_f77wrap_chpr__
+#define ATL_F77wrap_cher atl_f77wrap_cher__
+#define ATL_F77wrap_chpr2 atl_f77wrap_chpr2__
+#define ATL_F77wrap_cher2 atl_f77wrap_cher2__
+#define ATL_F77wrap_ctbmv atl_f77wrap_ctbmv__
+#define ATL_F77wrap_ctpmv atl_f77wrap_ctpmv__
+#define ATL_F77wrap_ctrmv atl_f77wrap_ctrmv__
+#define ATL_F77wrap_ctbsv atl_f77wrap_ctbsv__
+#define ATL_F77wrap_ctpsv atl_f77wrap_ctpsv__
+#define ATL_F77wrap_ctrsv atl_f77wrap_ctrsv__
+
+#define ATL_F77wrap_cgemm atl_f77wrap_cgemm__
+#define ATL_F77wrap_chemm atl_f77wrap_chemm__
+#define ATL_F77wrap_cherk atl_f77wrap_cherk__
+#define ATL_F77wrap_cher2k atl_f77wrap_cher2k__
+#define ATL_F77wrap_csymm atl_f77wrap_csymm__
+#define ATL_F77wrap_csyrk atl_f77wrap_csyrk__
+#define ATL_F77wrap_csyr2k atl_f77wrap_csyr2k__
+#define ATL_F77wrap_ctrmm atl_f77wrap_ctrmm__
+#define ATL_F77wrap_ctrsm atl_f77wrap_ctrsm__
+
+#elif defined( DCPLX )
+
+#define ATL_F77wrap_zrotg atl_f77wrap_zrotg__
+#define ATL_F77wrap_dznrm2 atl_f77wrap_dznrm2__
+#define ATL_F77wrap_dzasum atl_f77wrap_dzasum__
+#define ATL_F77wrap_zscal atl_f77wrap_zscal__
+#define ATL_F77wrap_zdscal atl_f77wrap_zdscal__
+#define ATL_F77wrap_izamax atl_f77wrap_izamax__
+#define ATL_F77wrap_zaxpy atl_f77wrap_zaxpy__
+#define ATL_F77wrap_zcopy atl_f77wrap_zcopy__
+#define ATL_F77wrap_zswap atl_f77wrap_zswap__
+#define ATL_F77wrap_zdrot atl_f77wrap_zdrot__
+#define ATL_F77wrap_zdotc atl_f77wrap_zdotc__
+#define ATL_F77wrap_zdotu atl_f77wrap_zdotu__
+
+#define ATL_F77wrap_zgbmv atl_f77wrap_zgbmv__
+#define ATL_F77wrap_zgemv atl_f77wrap_zgemv__
+#define ATL_F77wrap_zgerc atl_f77wrap_zgerc__
+#define ATL_F77wrap_zgeru atl_f77wrap_zgeru__
+#define ATL_F77wrap_zhbmv atl_f77wrap_zhbmv__
+#define ATL_F77wrap_zhpmv atl_f77wrap_zhpmv__
+#define ATL_F77wrap_zhemv atl_f77wrap_zhemv__
+#define ATL_F77wrap_zhpr atl_f77wrap_zhpr__
+#define ATL_F77wrap_zher atl_f77wrap_zher__
+#define ATL_F77wrap_zhpr2 atl_f77wrap_zhpr2__
+#define ATL_F77wrap_zher2 atl_f77wrap_zher2__
+#define ATL_F77wrap_ztbmv atl_f77wrap_ztbmv__
+#define ATL_F77wrap_ztpmv atl_f77wrap_ztpmv__
+#define ATL_F77wrap_ztrmv atl_f77wrap_ztrmv__
+#define ATL_F77wrap_ztbsv atl_f77wrap_ztbsv__
+#define ATL_F77wrap_ztpsv atl_f77wrap_ztpsv__
+#define ATL_F77wrap_ztrsv atl_f77wrap_ztrsv__
+
+#define ATL_F77wrap_zgemm atl_f77wrap_zgemm__
+#define ATL_F77wrap_zhemm atl_f77wrap_zhemm__
+#define ATL_F77wrap_zherk atl_f77wrap_zherk__
+#define ATL_F77wrap_zher2k atl_f77wrap_zher2k__
+#define ATL_F77wrap_zsymm atl_f77wrap_zsymm__
+#define ATL_F77wrap_zsyrk atl_f77wrap_zsyrk__
+#define ATL_F77wrap_zsyr2k atl_f77wrap_zsyr2k__
+#define ATL_F77wrap_ztrmm atl_f77wrap_ztrmm__
+#define ATL_F77wrap_ztrsm atl_f77wrap_ztrsm__
+
+#endif
+
+#endif
+/*
+ * =====================================================================
+ * Prototypes for F77 interface wrappers ATLAS BLAS routines
+ * =====================================================================
+ */
+void Mjoin( PATLF77WRAP, rotg )
+( TYPE *, TYPE *, TYPE *, TYPE * );
+#ifdef TREAL
+void Mjoin( PATLF77WRAP, rotmg )
+( TYPE *, TYPE *, TYPE *, TYPE *,
+ TYPE * );
+#endif
+void Mjoin( ATLUPF77WRAP, nrm2 )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE * );
+void Mjoin( ATLUPF77WRAP, asum )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE * );
+void Mjoin( PATLF77WRAP, scal )
+( F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+#ifdef TCPLX
+void Mjoin( ATLPUF77WRAP, scal )
+( F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+#endif
+void Mjoin( Mjoin( ATL_F77wrap_i, PRE ), amax )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, axpy )
+( F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER *,
+ TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, copy )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, swap )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( ATLPUF77WRAP, rot )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE *, TYPE * );
+#ifdef TREAL
+void Mjoin( PATLF77WRAP, rotm )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE * );
+#endif
+#ifdef TREAL
+void Mjoin( PATLF77WRAP, dot )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE * );
+#else
+void Mjoin( PATLF77WRAP, dotc )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE * );
+void Mjoin( PATLF77WRAP, dotu )
+( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE * );
+#endif
+void ATL_F77wrap_dsdot
+( F77_INTEGER *, float *, F77_INTEGER *, float *,
+ F77_INTEGER *, double * );
+void ATL_F77wrap_sdsdot
+( F77_INTEGER *, float *, float *, F77_INTEGER *,
+ float *, F77_INTEGER *, float * );
+
+void Mjoin( PATLF77WRAP, gbmv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER *,
+ TYPE *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, gemv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, TYPE *,
+ TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER * );
+#ifdef TREAL
+void Mjoin( PATLF77WRAP, ger )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, sbmv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, TYPE *,
+ TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, spmv )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ TYPE *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, symv )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, spr )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE * );
+void Mjoin( PATLF77WRAP, syr )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, spr2 )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE * );
+void Mjoin( PATLF77WRAP, syr2 )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+#else
+void Mjoin( PATLF77WRAP, gerc )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, geru )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, hbmv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, TYPE *,
+ TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, hpmv )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ TYPE *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, hemv )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, hpr )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE * );
+void Mjoin( PATLF77WRAP, her )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, hpr2 )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE * );
+void Mjoin( PATLF77WRAP, her2 )
+( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+#endif
+void Mjoin( PATLF77WRAP, tbmv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, tpmv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, trmv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, tbsv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER * );
+void Mjoin( PATLF77WRAP, tpsv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, trsv )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER * );
+
+void Mjoin( PATLF77WRAP, gemm )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER *,
+ TYPE *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER * );
+#ifdef TCPLX
+void Mjoin( PATLF77WRAP, hemm )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, herk )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+ TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, her2k )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+#endif
+void Mjoin( PATLF77WRAP, symm )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, syrk )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+ TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, syr2k )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+ F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, trmm )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, trsm )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+ F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+ F77_INTEGER *, TYPE *, F77_INTEGER * );
+
+#endif
+/*
+ * End of atlas_f77wrap.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_fopen.h b/kaldi_io/src/tools/ATLAS/include/atlas_fopen.h
new file mode 100644
index 0000000..aaed713
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_fopen.h
@@ -0,0 +1,40 @@
+#ifndef ATLAS_FOPEN_H
+#define ATLAS_FOPEN_H
+
+static int FileExists(const char *path)
+/*
+ * RETURNS: 1 if path can be opened for reading, 0 otherwise
+ */
+{
+   FILE *probe = fopen(path, "r");
+
+   if (probe == NULL)
+      return(0);
+   fclose(probe);
+   return(1);
+}
+
+#ifdef ATL_FOPENDELAY
+static FILE *ATL_fopen(const char *path, const char *mode)
+/*
+ * Overload fopen so it waits for NFS propagation upon first read failure.
+ * The retry command is skipped if it would not fit in ln (no overflow).
+ */
+{
+   FILE *fp;
+   char ln[256];
+   int n;
+
+   fp = fopen(path, mode);
+   if (fp == NULL && *mode == 'r')  /* give NFS time to produce file */
+   {
+      n = snprintf(ln, sizeof(ln), "make waitfile waitfile=%s\n", path);
+      if (n > 0 && (size_t)n < sizeof(ln) && system(ln) == 0)
+         fp = fopen(path, mode);
+   }
+   return(fp);
+}
+#define fopen ATL_fopen
+#endif
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_kern3.h b/kaldi_io/src/tools/ATLAS/include/atlas_kern3.h
new file mode 100644
index 0000000..97e8bcc
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_kern3.h
@@ -0,0 +1,110 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef ATLAS_KERN3_H
+#define ATLAS_KERN3_H
+
+#include "atlas_misc.h"
+#include Mstr(Mjoin(Mjoin(atlas_,PRE),NCmm.h))
+#include "atlas_lvl3.h"
+#include "atlas_kernel3.h"
+#include "atlas_reflevel3.h"
+/*
+ * Gemm entry points
+ */
+#define CgemmNN Mjoin(PATL,gemmNN) /* each C* name is PATL joined with the gemm suffix shown */
+#define CgemmNT Mjoin(PATL,gemmNT)
+#define CgemmTN Mjoin(PATL,gemmTN)
+#define CgemmNC Mjoin(PATL,gemmNC)
+#define CgemmCN Mjoin(PATL,gemmCN)
+/* aliased_gemm names follow the same scheme; only the NN and TN forms are defined here */
+#define CAgemmNN Mjoin(PATL,aliased_gemmNN)
+#define CAgemmTN Mjoin(PATL,aliased_gemmTN)
+
+#ifdef Left_ /* Left_/Right_ selects Side_ (Atlas enum name) and SideNM (one-letter tag) */
+ #define Side_ AtlasLeft
+ #define SideNM L
+#elif defined(Right_)
+ #define Side_ AtlasRight
+ #define SideNM R
+#endif
+/* Upper_/Lower_ selects Uplo_ and UploNM; neither is defined if no selector is set */
+#ifdef Upper_
+ #define Uplo_ AtlasUpper
+ #define UploNM U
+#elif defined(Lower_)
+ #define Uplo_ AtlasLower
+ #define UploNM L
+#endif
+/* UnitDiag_/NonUnitDiag_ selects Unit_ and UnitNM */
+#ifdef UnitDiag_
+ #define Unit_ AtlasUnit
+ #define UnitNM U
+#elif defined(NonUnitDiag_)
+ #define Unit_ AtlasNonUnit
+ #define UnitNM N
+#endif
+/* Transpose_/Notranspose_/ConjTrans_ selects Trans_ and TransNM; first match wins */
+#ifdef Transpose_
+ #define Trans_ AtlasTrans
+ #define TransNM T
+#elif defined(Notranspose_)
+ #define Trans_ AtlasNoTrans
+ #define TransNM N
+#elif defined(ConjTrans_)
+ #define Trans_ AtlasConjTrans
+ #define TransNM C
+#endif
+
+#ifndef TRSM_Xover /* every *_Xover below defaults to NB unless it was already defined */
+ #define TRSM_Xover NB
+#endif
+#ifndef TRMM_Xover
+ #define TRMM_Xover NB
+#endif
+#ifndef HER2K_Xover
+ #define HER2K_Xover NB
+#endif
+#ifndef SYR2K_Xover
+ #define SYR2K_Xover NB
+#endif
+#ifndef HERK_Xover
+ #define HERK_Xover NB
+#endif
+#ifndef SYRK_Xover
+ #define SYRK_Xover NB
+#endif
+#ifndef HEMM_Xover
+ #define HEMM_Xover NB
+#endif
+#ifndef SYMM_Xover
+ #define SYMM_Xover NB
+#endif
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_kernel2.h b/kaldi_io/src/tools/ATLAS/include/atlas_kernel2.h
new file mode 100644
index 0000000..4663def
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_kernel2.h
@@ -0,0 +1,5408 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Contributor(s) : R. Clint Whaley
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_KERNEL2_H
+#define ATLAS_KERNEL2_H
+/*
+ * =====================================================================
+ * Macro function definitions
+ * =====================================================================
+ */
+#define ATL_GetPartSBMV ATL_GetPartSYMV
+#define ATL_GetPartSPMV ATL_GetPartSYMV
+#define ATL_GetPartP1 ATL_GetPartR1
+
+#define MLpprev( n_, a_, lda_ ) \
+ { a_ -= ( (((n_) * (lda_)) + (((n_)*((n_)+1)) >> 1)) SHIFT ); lda_ += (n_); } /* a_ back by n_*lda_ + n_*(n_+1)/2, SHIFT applied; lda_ += n_ */
+#define MUpprev( n_, a_, lda_ ) \
+ { a_ -= ( (((n_) * (lda_)) - (((n_)*((n_)-1)) >> 1)) SHIFT ); lda_ -= (n_); } /* a_ back by n_*lda_ - n_*(n_-1)/2, SHIFT applied; lda_ -= n_ */
+#define MLpnext( n_, a_, lda_ ) \
+ { a_ += ( (((n_) * (lda_)) - (((n_)*((n_)-1)) >> 1)) SHIFT ); lda_ -= (n_); } /* a_ forward by n_*lda_ - n_*(n_-1)/2, SHIFT applied; lda_ -= n_ */
+#define MUpnext( n_, a_, lda_ ) \
+ { a_ += ( (((n_) * (lda_)) + (((n_)*((n_)+1)) >> 1)) SHIFT ); lda_ += (n_); } /* a_ forward by n_*lda_ + n_*(n_+1)/2, SHIFT applied; lda_ += n_ */
+/* The four macros above modify both a_ and lda_ in place (presumably packed storage -- confirm); the four below move a_ by n_*(lda_+1) with SHIFT applied, leave lda_ untouched, and have identical L and U forms. All expand to a brace block and evaluate n_ more than once. */
+#define MLrprev( n_, a_, lda_ ) \
+ { a_ -= ( ((n_) * ((lda_)+1)) SHIFT ); }
+#define MUrprev( n_, a_, lda_ ) \
+ { a_ -= ( ((n_) * ((lda_)+1)) SHIFT ); }
+#define MLrnext( n_, a_, lda_ ) \
+ { a_ += ( ((n_) * ((lda_)+1)) SHIFT ); }
+#define MUrnext( n_, a_, lda_ ) \
+ { a_ += ( ((n_) * ((lda_)+1)) SHIFT ); }
+/*
+ * =====================================================================
+ * Recursive Level 2 BLAS function prototypes
+ * =====================================================================
+ */
+void ATL_strsvLTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvLNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvLTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvLNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvUTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvUNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvUTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvUNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strsvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvLTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvLNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvLTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvLNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvUTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvUNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvUTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvUNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpsvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvLTU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvLNU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvLTN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvLNN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvUTU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvUNU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvUTN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvUNN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvLT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvLN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvUT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbsvUN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvLTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvLNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvLTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvLNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvUTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvUNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvUTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvUNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_strmvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvLTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvLNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvLTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvLNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvUTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvUNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvUTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvUNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stpmvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvLTU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvLNU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvLTN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvLNN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvUTU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvUNU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvUTN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvUNN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvLT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvLN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvUT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_stbmvUN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ssyr2U
+(
+ const int,
+ const float *,
+ const float *,
+ float *, const int
+);
+
+void ATL_ssyr2L
+(
+ const int,
+ const float *,
+ const float *,
+ float *, const int
+);
+
+void ATL_sspr2U
+(
+ const int,
+ const float *,
+ const float *,
+ float *, const int
+);
+
+void ATL_sspr2L
+(
+ const int,
+ const float *,
+ const float *,
+ float *, const int
+);
+
+void ATL_ssyrU
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_ssyrL
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_ssprU
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_ssprL
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_ssymvU
+(
+ const int,
+ const float *, const int,
+ const float *,
+ const float,
+ float *
+);
+
+void ATL_ssymvL
+(
+ const int,
+ const float *, const int,
+ const float *,
+ const float,
+ float *
+);
+
+void ATL_sspmvU
+(
+ const int,
+ const float *, const int,
+ const float *,
+ const float,
+ float *
+);
+
+void ATL_sspmvL
+(
+ const int,
+ const float *, const int,
+ const float *,
+ const float,
+ float *
+);
+
+void ATL_ssbmvU
+(
+ const int, const int,
+ const float *, const int,
+ const float *,
+ const float,
+ float *
+);
+
+void ATL_ssbmvL
+(
+ const int, const int,
+ const float *, const int,
+ const float *,
+ const float,
+ float *
+);
+
+void ATL_sgpmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgprU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sgprL
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sgpr
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sgpr1U_a1_x1_yX
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sgpr1L_a1_x1_yX
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sgpmvUT_a1_x1_bX_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvUN_a1_x1_bX_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvUT_a1_x1_b1_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvUN_a1_x1_b1_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvUT_a1_x1_b0_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvUN_a1_x1_b0_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvLT_a1_x1_bX_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvLN_a1_x1_bX_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvLT_a1_x1_b1_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvLN_a1_x1_b1_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvLT_a1_x1_b0_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgpmvLN_a1_x1_b0_y1
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgbmvT_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgbmvN_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgbmvT_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgbmvN_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgbmvT_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sgbmvN_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_dtrsvLTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvLNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvLTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvLNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvUTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvUNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvUTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvUNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrsvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+/*
+ * Twelve ATL_dtpsv* prototypes, all returning void.
+ * The eight three-letter variants take (int, const double *, int, double *);
+ * the four two-letter variants take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably the packed-storage counterpart of ATL_dtrsv* --
+ * confirm against the ATLAS sources.
+ */
+void ATL_dtpsvLTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvLNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvLTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvLNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvUTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvUNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvUTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvUNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpsvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+/*
+ * Twelve ATL_dtbsv* prototypes, all returning void.
+ * The eight three-letter variants take
+ * (int, int, const double *, int, double *); the four two-letter variants
+ * take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably the banded counterpart of ATL_dtrsv*, with the
+ * extra int being the band width -- confirm against the ATLAS sources.
+ */
+void ATL_dtbsvLTU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvLNU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvLTN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvLNN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvUTU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvUNU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvUTN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvUNN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvLT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvLN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvUT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbsvUN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+/*
+ * Twelve ATL_dtrmv* prototypes, all returning void.
+ * The eight three-letter variants take (int, const double *, int, double *);
+ * the four two-letter variants take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably triangular matrix-vector multiply kernels named
+ * by Upper/Lower, NoTrans/Trans and Unit/NonUnit -- confirm against the
+ * ATLAS sources.
+ */
+void ATL_dtrmvLTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvLNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvLTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvLNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvUTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvUNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvUTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvUNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtrmvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+/*
+ * Twelve ATL_dtpmv* prototypes, all returning void.
+ * The eight three-letter variants take (int, const double *, int, double *);
+ * the four two-letter variants take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably the packed-storage counterpart of ATL_dtrmv* --
+ * confirm against the ATLAS sources.
+ */
+void ATL_dtpmvLTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvLNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvLTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvLNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvUTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvUNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvUTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvUNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtpmvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+/*
+ * Twelve ATL_dtbmv* prototypes, all returning void.
+ * The eight three-letter variants take
+ * (int, int, const double *, int, double *); the four two-letter variants
+ * take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably the banded counterpart of ATL_dtrmv*, with the
+ * extra int being the band width -- confirm against the ATLAS sources.
+ */
+void ATL_dtbmvLTU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvLNU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvLTN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvLNN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvUTU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvUNU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvUTN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvUNN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvLT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvLN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvUT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_dtbmvUN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+/*
+ * Eight prototypes, all returning void, in U and L variants:
+ *   ATL_dsyr2?, ATL_dspr2?: (int, const double *, const double *,
+ *                            double *, int)
+ *   ATL_dsyr?,  ATL_dspr? : (int, const double *, const double *, int,
+ *                            double *, int)
+ * Only the final double * is non-const.
+ * NOTE(review): presumably symmetric rank-1 / rank-2 update kernels for
+ * full (sy) and packed (sp) storage -- confirm against the ATLAS sources.
+ */
+void ATL_dsyr2U
+(
+ const int,
+ const double *,
+ const double *,
+ double *, const int
+);
+
+void ATL_dsyr2L
+(
+ const int,
+ const double *,
+ const double *,
+ double *, const int
+);
+
+void ATL_dspr2U
+(
+ const int,
+ const double *,
+ const double *,
+ double *, const int
+);
+
+void ATL_dspr2L
+(
+ const int,
+ const double *,
+ const double *,
+ double *, const int
+);
+
+void ATL_dsyrU
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dsyrL
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dsprU
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dsprL
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+/*
+ * Six prototypes, all returning void, in U and L variants:
+ *   ATL_dsymv?, ATL_dspmv?: (int, const double *, int, const double *,
+ *                            double, double *)
+ *   ATL_dsbmv?            : same list with one additional leading int.
+ * Only the final double * is non-const.
+ * NOTE(review): presumably symmetric matrix-vector kernels for full,
+ * packed and banded storage, with the double scalar acting as beta --
+ * confirm against the ATLAS sources.
+ */
+void ATL_dsymvU
+(
+ const int,
+ const double *, const int,
+ const double *,
+ const double,
+ double *
+);
+
+void ATL_dsymvL
+(
+ const int,
+ const double *, const int,
+ const double *,
+ const double,
+ double *
+);
+
+void ATL_dspmvU
+(
+ const int,
+ const double *, const int,
+ const double *,
+ const double,
+ double *
+);
+
+void ATL_dspmvL
+(
+ const int,
+ const double *, const int,
+ const double *,
+ const double,
+ double *
+);
+
+void ATL_dsbmvU
+(
+ const int, const int,
+ const double *, const int,
+ const double *,
+ const double,
+ double *
+);
+
+void ATL_dsbmvL
+(
+ const int, const int,
+ const double *, const int,
+ const double *,
+ const double,
+ double *
+);
+/*
+ * Six prototypes, all returning void:
+ *   ATL_dgpmv  takes enum ATLAS_UPLO and enum ATLAS_TRANS selectors, two
+ *              ints, a double, two (const double *, int) pairs, a second
+ *              double, and a (double *, int) pair.
+ *   ATL_dgpr   takes an enum ATLAS_UPLO selector, two ints, a double, two
+ *              (const double *, int) pairs, and a (double *, int) pair.
+ *   ATL_dgprU, ATL_dgprL, ATL_dgpr1U_a1_x1_yX, ATL_dgpr1L_a1_x1_yX take the
+ *              ATL_dgpr list without the enum.
+ * NOTE(review): presumably general matrix-vector / rank-1 kernels on
+ * packed storage -- confirm against the ATLAS sources.
+ */
+void ATL_dgpmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgprU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dgprL
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dgpr
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dgpr1U_a1_x1_yX
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dgpr1L_a1_x1_yX
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+/*
+ * Twelve ATL_dgpmv??_a1_x1_b?_y1 prototypes (UT, UN, LT, LN combined with
+ * bX, b1, b0). All return void and share one parameter list: two ints, a
+ * double, two (const double *, int) pairs, a second double, and a
+ * (double *, int) pair.
+ * NOTE(review): the bX/b1/b0 tag presumably names the beta case each
+ * kernel is specialised for -- confirm against the ATLAS sources.
+ */
+void ATL_dgpmvUT_a1_x1_bX_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvUN_a1_x1_bX_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvUT_a1_x1_b1_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvUN_a1_x1_b1_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvUT_a1_x1_b0_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvUN_a1_x1_b0_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvLT_a1_x1_bX_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvLN_a1_x1_bX_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvLT_a1_x1_b1_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvLN_a1_x1_b1_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvLT_a1_x1_b0_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgpmvLN_a1_x1_b0_y1
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+/*
+ * Six ATL_dgbmv?_a1_x1_b?_y1 prototypes (T and N combined with bX, b1,
+ * b0). All return void and share one parameter list: four ints, a double,
+ * two (const double *, int) pairs, a second double, and a (double *, int)
+ * pair.
+ * NOTE(review): presumably general banded matrix-vector kernels, two of
+ * the four ints being the sub- and super-diagonal counts -- confirm
+ * against the ATLAS sources.
+ */
+void ATL_dgbmvT_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgbmvN_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgbmvT_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgbmvN_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgbmvT_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dgbmvN_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+/*
+ * Twenty-four ATL_ctrsv* prototypes, all returning void.
+ * The sixteen three-letter variants take (int, const float *, int, float *);
+ * the eight two-letter variants take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): the c prefix presumably means single-precision complex
+ * passed as float arrays, and the middle letter H/C/T/N a transpose or
+ * conjugate mode -- confirm against the ATLAS sources.
+ */
+void ATL_ctrsvLHU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLCU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLHN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLCN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUHU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUCU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUHN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUCN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrsvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+/*
+ * Twenty-four ATL_ctpsv* prototypes, all returning void.
+ * The sixteen three-letter variants take (int, const float *, int, float *);
+ * the eight two-letter variants take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably the packed-storage counterpart of ATL_ctrsv* --
+ * confirm against the ATLAS sources.
+ */
+void ATL_ctpsvLHU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLCU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLHN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLCN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUHU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUCU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUHN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUCN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpsvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+/*
+ * Twenty-four ATL_ctbsv* prototypes, all returning void.
+ * The sixteen three-letter variants take
+ * (int, int, const float *, int, float *); the eight two-letter variants
+ * take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably the banded counterpart of ATL_ctrsv*, with the
+ * extra int being the band width -- confirm against the ATLAS sources.
+ */
+void ATL_ctbsvLHU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLCU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLTU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLNU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLHN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLCN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLTN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLNN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUHU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUCU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUTU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUNU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUHN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUCN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUTN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUNN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLH
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLC
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvLN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUH
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUC
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbsvUN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+/*
+ * Twenty-four ATL_ctrmv* prototypes, all returning void.
+ * The sixteen three-letter variants take (int, const float *, int, float *);
+ * the eight two-letter variants take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably complex triangular matrix-vector multiply
+ * kernels, the middle letter H/C/T/N naming a transpose or conjugate
+ * mode -- confirm against the ATLAS sources.
+ */
+void ATL_ctrmvLHU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLCU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLHN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLCN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUHU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUCU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUHN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUCN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctrmvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+/*
+ * Twenty-four ATL_ctpmv* prototypes, all returning void.
+ * The sixteen three-letter variants take (int, const float *, int, float *);
+ * the eight two-letter variants take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably the packed-storage counterpart of ATL_ctrmv* --
+ * confirm against the ATLAS sources.
+ */
+void ATL_ctpmvLHU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLCU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLHN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLCN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUHU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUCU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUTU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUNU
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUHN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUCN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUTN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUNN
+(
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctpmvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *
+);
+/*
+ * Twenty-four ATL_ctbmv* prototypes, all returning void.
+ * The sixteen three-letter variants take
+ * (int, int, const float *, int, float *); the eight two-letter variants
+ * take a leading enum ATLAS_DIAG in addition.
+ * NOTE(review): presumably the banded counterpart of ATL_ctrmv*, with the
+ * extra int being the band width -- confirm against the ATLAS sources.
+ */
+void ATL_ctbmvLHU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLCU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLTU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLNU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLHN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLCN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLTN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLNN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUHU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUCU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUTU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUNU
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUHN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUCN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUTN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUNN
+(
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLH
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLC
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvLN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUH
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUC
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_ctbmvUN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *
+);
+/*
+ * Eight prototypes, all returning void, in U and L variants:
+ *   ATL_cher2?, ATL_chpr2?: (int, const float *, const float *,
+ *                            float *, int)
+ *   ATL_cher?,  ATL_chpr? : (int, const float *, const float *, int,
+ *                            float *, int)
+ * Only the final float * is non-const.
+ * NOTE(review): presumably Hermitian rank-1 / rank-2 update kernels for
+ * full (he) and packed (hp) storage -- confirm against the ATLAS sources.
+ */
+void ATL_cher2U
+(
+ const int,
+ const float *,
+ const float *,
+ float *, const int
+);
+
+void ATL_cher2L
+(
+ const int,
+ const float *,
+ const float *,
+ float *, const int
+);
+
+void ATL_chpr2U
+(
+ const int,
+ const float *,
+ const float *,
+ float *, const int
+);
+
+void ATL_chpr2L
+(
+ const int,
+ const float *,
+ const float *,
+ float *, const int
+);
+
+void ATL_cherU
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cherL
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_chprU
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_chprL
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+/*
+ * Six prototypes, all returning void, in U and L variants:
+ *   ATL_chemv?, ATL_chpmv?: (int, const float *, int, const float *,
+ *                            const float *, float *)
+ *   ATL_chbmv?            : same list with one additional leading int.
+ * Only the final float * is non-const.
+ * NOTE(review): presumably Hermitian matrix-vector kernels for full,
+ * packed and banded storage, the second-to-last pointer carrying a
+ * complex beta -- confirm against the ATLAS sources.
+ */
+void ATL_chemvU
+(
+ const int,
+ const float *, const int,
+ const float *,
+ const float *,
+ float *
+);
+
+void ATL_chemvL
+(
+ const int,
+ const float *, const int,
+ const float *,
+ const float *,
+ float *
+);
+
+void ATL_chpmvU
+(
+ const int,
+ const float *, const int,
+ const float *,
+ const float *,
+ float *
+);
+
+void ATL_chpmvL
+(
+ const int,
+ const float *, const int,
+ const float *,
+ const float *,
+ float *
+);
+
+void ATL_chbmvU
+(
+ const int, const int,
+ const float *, const int,
+ const float *,
+ const float *,
+ float *
+);
+
+void ATL_chbmvL
+(
+ const int, const int,
+ const float *, const int,
+ const float *,
+ const float *,
+ float *
+);
+/*
+ * Eleven prototypes, all returning void:
+ *   ATL_cgpmv  takes enum ATLAS_UPLO and enum ATLAS_TRANS selectors, two
+ *              ints, a const float *, two (const float *, int) pairs, a
+ *              second const float *, and a (float *, int) pair.
+ *   ATL_cgpru, ATL_cgprc take an enum ATLAS_UPLO selector, two ints, a
+ *              const float *, two (const float *, int) pairs, and a
+ *              (float *, int) pair.
+ *   The U/L-suffixed ATL_cgpru?, ATL_cgprc? and ATL_cgpr1??_a1_x1_yX
+ *              variants take that same list without the enum.
+ * NOTE(review): scalars are passed by pointer here, presumably because
+ * they are complex; the u/c letter presumably distinguishes unconjugated
+ * from conjugated updates -- confirm against the ATLAS sources.
+ */
+void ATL_cgpmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpruU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgpruL
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgpru
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgprcU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgprcL
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgprc
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgpr1uU_a1_x1_yX
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgpr1uL_a1_x1_yX
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgpr1cU_a1_x1_yX
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_cgpr1cL_a1_x1_yX
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+/*
+ * Thirty-two ATL_cgpmv??_a1_x1_b?_y1 prototypes: U and L, each combined
+ * with Nc, C, T, N and with bX, b1, bXi0, b0. All return void and share
+ * one parameter list: two ints, a const float *, two (const float *, int)
+ * pairs, a second const float *, and a (float *, int) pair.
+ * NOTE(review): the b-tag presumably names the beta case each kernel is
+ * specialised for (bXi0 looking like a beta with zero imaginary part) --
+ * confirm against the ATLAS sources.
+ */
+void ATL_cgpmvUNc_a1_x1_bX_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUC_a1_x1_bX_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUT_a1_x1_bX_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUN_a1_x1_bX_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUNc_a1_x1_b1_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUC_a1_x1_b1_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUT_a1_x1_b1_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUN_a1_x1_b1_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUNc_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUC_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUT_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUN_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUNc_a1_x1_b0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUC_a1_x1_b0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUT_a1_x1_b0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvUN_a1_x1_b0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLNc_a1_x1_bX_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLC_a1_x1_bX_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLT_a1_x1_bX_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLN_a1_x1_bX_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLNc_a1_x1_b1_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLC_a1_x1_b1_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLT_a1_x1_b1_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLN_a1_x1_b1_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLNc_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLC_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLT_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLN_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLNc_a1_x1_b0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLC_a1_x1_b0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLT_a1_x1_b0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgpmvLN_a1_x1_b0_y1
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvNc_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvC_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvT_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvN_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvNc_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvC_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvT_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvN_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvNc_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvC_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvT_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvN_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvNc_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvC_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvT_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_cgbmvN_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_ztrsvLHU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLCU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLHN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLCN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUHU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUCU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUHN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUCN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrsvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLHU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLCU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLHN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLCN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUHU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUCU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUHN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUCN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpsvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLHU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLCU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLTU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLNU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLHN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLCN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLTN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLNN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUHU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUCU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUTU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUNU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUHN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUCN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUTN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUNN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLH
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLC
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvLN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUH
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUC
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbsvUN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLHU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLCU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLHN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLCN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUHU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUCU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUHN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUCN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztrmvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLHU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLCU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLHN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLCN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUHU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUCU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUTU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUNU
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUHN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUCN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUTN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUNN
+(
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvLN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUH
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUC
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUT
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztpmvUN
+(
+ const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLHU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLCU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLTU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLNU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLHN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLCN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLTN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLNN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUHU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUCU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUTU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUNU
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUHN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUCN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUTN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUNN
+(
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLH
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLC
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvLN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUH
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUC
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUT
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_ztbmvUN
+(
+ const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_zher2U
+(
+ const int,
+ const double *,
+ const double *,
+ double *, const int
+);
+
+void ATL_zher2L
+(
+ const int,
+ const double *,
+ const double *,
+ double *, const int
+);
+
+void ATL_zhpr2U
+(
+ const int,
+ const double *,
+ const double *,
+ double *, const int
+);
+
+void ATL_zhpr2L
+(
+ const int,
+ const double *,
+ const double *,
+ double *, const int
+);
+
+void ATL_zherU
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zherL
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zhprU
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zhprL
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zhemvU
+(
+ const int,
+ const double *, const int,
+ const double *,
+ const double *,
+ double *
+);
+
+void ATL_zhemvL
+(
+ const int,
+ const double *, const int,
+ const double *,
+ const double *,
+ double *
+);
+
+void ATL_zhpmvU
+(
+ const int,
+ const double *, const int,
+ const double *,
+ const double *,
+ double *
+);
+
+void ATL_zhpmvL
+(
+ const int,
+ const double *, const int,
+ const double *,
+ const double *,
+ double *
+);
+
+void ATL_zhbmvU
+(
+ const int, const int,
+ const double *, const int,
+ const double *,
+ const double *,
+ double *
+);
+
+void ATL_zhbmvL
+(
+ const int, const int,
+ const double *, const int,
+ const double *,
+ const double *,
+ double *
+);
+
+void ATL_zgpmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpruU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgpruL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgpru
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgprcU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgprcL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgprc
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgpr1uU_a1_x1_yX
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgpr1uL_a1_x1_yX
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgpr1cU_a1_x1_yX
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgpr1cL_a1_x1_yX
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zgpmvUNc_a1_x1_bX_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUC_a1_x1_bX_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUT_a1_x1_bX_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUN_a1_x1_bX_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUNc_a1_x1_b1_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUC_a1_x1_b1_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUT_a1_x1_b1_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUN_a1_x1_b1_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUNc_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUC_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUT_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUN_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUNc_a1_x1_b0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUC_a1_x1_b0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUT_a1_x1_b0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvUN_a1_x1_b0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLNc_a1_x1_bX_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLC_a1_x1_bX_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLT_a1_x1_bX_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLN_a1_x1_bX_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLNc_a1_x1_b1_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLC_a1_x1_b1_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLT_a1_x1_b1_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLN_a1_x1_b1_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLNc_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLC_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLT_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLN_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLNc_a1_x1_b0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLC_a1_x1_b0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLT_a1_x1_b0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgpmvLN_a1_x1_b0_y1
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvNc_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvC_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvT_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvN_a1_x1_bX_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvNc_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvC_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvT_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvN_a1_x1_b1_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvNc_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvC_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvT_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvN_a1_x1_bXi0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvNc_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvC_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvT_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zgbmvN_a1_x1_b0_y1
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+
+#endif
+/*
+ * End of atlas_kernel2.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_kernel3.h b/kaldi_io/src/tools/ATLAS/include/atlas_kernel3.h
new file mode 100644
index 0000000..a929c2d
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_kernel3.h
@@ -0,0 +1,1393 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef ATLAS_KERNEL3_H
+#define ATLAS_KERNEL3_H
+
+/*
+ * Real level 3 kernels
+ *
+ * Prototypes for the ATL_s* symm, trsm, trmm, syrk and syr2k kernel
+ * variants; the ATL_d* set follows with the same shapes.
+ * Every matrix and scalar operand is passed as an untyped (const) void
+ * pointer, so the element type is implied only by the routine-name prefix.
+ * The trailing capital letters of each name select the variant.
+ * NOTE(review): presumably Side (L/R), Uplo (L/U), Trans (N/T) and
+ * Diag (N/U) in BLAS order - confirm against the ATLAS sources.
+ * symm, trsm, trmm and syrk return void; the syr2k variants return int.
+ * NOTE(review): the meaning of that int is not visible in this header -
+ * presumably a status code; confirm before relying on it.
+ */
+void ATL_ssymmRU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_ssymmLU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_ssymmRL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_ssymmLL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_strsmLLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmLLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmLLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmLLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmLLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmLLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmLLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmLLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmLUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmLUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmLUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmLUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmLUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmLUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmLUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmLUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmRLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmRLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmRLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmRLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmRLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmRLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmRLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmRLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmRUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmRUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmRUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmRUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmRUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmRUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strsmRUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_strmmRUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ssyrkLT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_ssyrkUT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_ssyrkLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_ssyrkUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+int ATL_ssyr2kLT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_ssyr2kUT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_ssyr2kLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_ssyr2kUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+void ATL_dsymmRU /* ATL_d* set: same variants, order and signatures as the ATL_s* kernels above */
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_dsymmLU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_dsymmRL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_dsymmLL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_dtrsmLLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmLLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmLLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmLLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmLLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmLLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmLLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmLLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmLUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmLUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmLUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmLUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmLUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmLUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmLUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmLUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmRLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmRLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmRLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmRLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmRLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmRLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmRLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmRLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmRUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmRUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmRUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmRUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmRUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmRUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrsmRUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dtrmmRUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_dsyrkLT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_dsyrkUT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_dsyrkLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_dsyrkUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+int ATL_dsyr2kLT /* the four dsyr2k variants return int, unlike the void kernels above; NOTE(review): presumably a status code - confirm */
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_dsyr2kUT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_dsyr2kLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_dsyr2kUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+
+/*
+ * Complex level 3 kernels
+ *
+ * Prototypes for the ATL_c* hemm, symm, trsm, trmm, herk, syrk, her2k and
+ * syr2k kernel variants; the ATL_z* set follows with the same shapes.
+ * Compared with the real kernels above, this section adds the hemm, herk
+ * and her2k families and a 'C' choice for the third suffix letter of the
+ * trsm/trmm names (herk/her2k use C where syrk/syr2k use T).
+ * NOTE(review): 'C' presumably selects the conjugate-transpose case -
+ * confirm against the ATLAS sources.
+ * Every matrix and scalar operand is passed as an untyped (const) void
+ * pointer. her2k and syr2k return int; every other kernel here returns
+ * void. NOTE(review): the meaning of the int is not visible in this
+ * header - presumably a status code; confirm before relying on it.
+ */
+void ATL_chemmRU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_chemmLU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_chemmRL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_chemmLL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_csymmRU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_csymmLU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_csymmRL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_csymmLL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_ctrsmLLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLLCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLLCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLLCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLLCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLUCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLUCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmLUCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmLUCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRLCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRLCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRLCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRLCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRUCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRUCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrsmRUCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ctrmmRUCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_cherkLC
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_cherkUC
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_cherkLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_cherkUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_csyrkLT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_csyrkUT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_csyrkLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_csyrkUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+int ATL_cher2kLC
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_cher2kUC
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_cher2kLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_cher2kUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_csyr2kLT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_csyr2kUT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_csyr2kLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_csyr2kUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+void ATL_zhemmRU /* ATL_z* set: same variants, order and signatures as the ATL_c* kernels above */
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_zhemmLU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_zhemmRL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_zhemmLL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_zsymmRU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_zsymmLU
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_zsymmRL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_zsymmLL
+ (const int M, const int N, const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta, void *C, const int ldc);
+void ATL_ztrsmLLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLLCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLLCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLLCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLLCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLUCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLUCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmLUCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmLUCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRLTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRLTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRLNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRLNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRLCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRLCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRLCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRLCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRUTN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRUTU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRUNN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRUNU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRUCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRUCN
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrsmRUCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_ztrmmRUCU
+ (const int M, const int N, const void *valpha, const void *A, const int lda,
+ void *C, const int ldc);
+void ATL_zherkLC
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_zherkUC
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_zherkLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_zherkUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_zsyrkLT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_zsyrkUT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_zsyrkLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+void ATL_zsyrkUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *vbeta, void *C, const int ldc);
+int ATL_zher2kLC /* the zher2k and zsyr2k variants return int, unlike the void kernels above; NOTE(review): presumably a status code - confirm */
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_zher2kUC
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_zher2kLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_zher2kUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_zsyr2kLT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_zsyr2kUT
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_zsyr2kLN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+int ATL_zsyr2kUN
+ (const int N, const int K, const void *valpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *vbeta, void *C, const int ldc);
+
+/*
+ * Real level 3 kernel auxiliaries (ATL_s* on float, ATL_d* on double)
+ */
+void ATL_ssycopyU_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_ssycopyL_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2L_N_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2L_U_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2U_N_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2U_U_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2L_N_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2L_U_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2U_N_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2U_U_a0
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_ssycopyU_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_ssycopyL_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2L_N_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2L_U_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2U_N_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2U_U_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2L_N_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2L_U_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2U_N_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2U_U_a1
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_ssycopyU_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_ssycopyL_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2L_N_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2L_U_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2U_N_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyU2U_U_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2L_N_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2L_U_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2U_N_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strcopyL2U_U_aX
+ (const int N, const float alpha, const float *A, const int lda, float *C);
+void ATL_strinvertUU(const int N, float *A, const int lda);
+void ATL_strinvertLU(const int N, float *A, const int lda);
+void ATL_strinvertUN(const int N, float *A, const int lda);
+void ATL_strinvertLN(const int N, float *A, const int lda);
+void ATL_ssyr2k_putU_bX
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_ssyr2k_putL_bX
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_strputU_bX
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_strputL_bX
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_ssyr2k_putU_b1
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_ssyr2k_putL_b1
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_strputU_b1
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_strputL_b1
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_ssyr2k_putU_b0
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_ssyr2k_putL_b0
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_strputU_b0
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_strputL_b0
+ (const int N, const float *v, const float beta, float *A, const int lda);
+void ATL_strsmKLLTN
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKLLTU
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKLLNN
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKLLNU
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKLUTN
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKLUTU
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKLUNN
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKLUNU
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKRLTN
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKRLTU
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKRLNN
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKRLNU
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKRUTN
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKRUTU
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKRUNN
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_strsmKRUNU
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_dsycopyU_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dsycopyL_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2L_N_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2L_U_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2U_N_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2U_U_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2L_N_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2L_U_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2U_N_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2U_U_a0
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dsycopyU_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dsycopyL_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2L_N_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2L_U_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2U_N_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2U_U_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2L_N_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2L_U_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2U_N_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2U_U_a1
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dsycopyU_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dsycopyL_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2L_N_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2L_U_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2U_N_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyU2U_U_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2L_N_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2L_U_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2U_N_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrcopyL2U_U_aX
+ (const int N, const double alpha, const double *A, const int lda, double *C);
+void ATL_dtrinvertUU(const int N, double *A, const int lda);
+void ATL_dtrinvertLU(const int N, double *A, const int lda);
+void ATL_dtrinvertUN(const int N, double *A, const int lda);
+void ATL_dtrinvertLN(const int N, double *A, const int lda);
+void ATL_dsyr2k_putU_bX
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dsyr2k_putL_bX
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dtrputU_bX
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dtrputL_bX
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dsyr2k_putU_b1
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dsyr2k_putL_b1
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dtrputU_b1
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dtrputL_b1
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dsyr2k_putU_b0
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dsyr2k_putL_b0
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dtrputU_b0
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dtrputL_b0
+ (const int N, const double *v, const double beta, double *A, const int lda);
+void ATL_dtrsmKLLTN
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKLLTU
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKLLNN
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKLLNU
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKLUTN
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKLUTU
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKLUNN
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKLUNU
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKRLTN
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKRLTU
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKRLNN
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKRLNU
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKRUTN
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKRUTU
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKRUNN
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_dtrsmKRUNU
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, double *C, const int ldc);
+
+/*
+ * Complex level 3 kernel auxiliaries (ATL_c* take float *, ATL_z* take double *; alpha/beta by pointer)
+ */
+void ATL_cCtrsmKL
+ (enum ATLAS_UPLO Uplo, enum ATLAS_TRANS Trans, enum ATLAS_DIAG Diag,
+ const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *B, const int ldb);
+void ATL_checopy
+ (const int N, const float *A, const int lda, float *C);
+void ATL_csycopy
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyU2L_N
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyU2Lc_N
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyU2L_U
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyU2Lc_U
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyU2U_N
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyU2Uc_N
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyU2U_U
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyU2Uc_U
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyL2L_N
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyL2Lc_N
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyL2L_U
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyL2Lc_U
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyL2U_N
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyL2Uc_N
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyL2U_U
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrcopyL2Uc_U
+ (const int N, const float *A, const int lda, float *C);
+void ATL_ctrmv_scalLNU_an1
+ (const int N, const float *alpha, const float *A, const int lda, float *X);
+void ATL_ctrmv_scalLNN_aX
+ (const int N, const float *alpha, const float *A, const int lda, float *X);
+void ATL_ctrmv_scalUNU_an1
+ (const int N, const float *alpha, const float *A, const int lda, float *X);
+void ATL_ctrmv_scalUNN_aX
+ (const int N, const float *alpha, const float *A, const int lda, float *X);
+void ATL_ctrinvertUU(const int N, float *A, const int lda);
+void ATL_ctrinvertLU(const int N, float *A, const int lda);
+void ATL_ctrinvertUN(const int N, float *A, const int lda);
+void ATL_ctrinvertLN(const int N, float *A, const int lda);
+void ATL_ctrputU_b0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputL_b0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putU_b0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putL_b0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputU_b1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputL_b1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putU_b1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putL_b1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputU_bX
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputL_bX
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putU_bX
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putL_bX
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputU_bXi0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputL_bXi0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putU_bXi0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putL_bXi0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputU_bn1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrputL_bn1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putU_bn1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_csyr2k_putL_bn1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cher2k_putU_b0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cher2k_putL_b0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cheputU_b0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cheputL_b0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cher2k_putU_b1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cher2k_putL_b1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cheputU_b1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cheputL_b1
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cher2k_putU_bXi0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cher2k_putL_bXi0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cheputU_bXi0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_cheputL_bXi0
+ (const int N, const float *v, const float *beta, float *A, const int lda);
+void ATL_ctrsm0LLTN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LLTU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LLNN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LLNU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LLCN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LLCU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LUTN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LUTU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LUNN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LUNU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LUCN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0LUCU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RLTN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RLTU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RLNN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RLNU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RLCN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RLCU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RUTN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RUTU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RUNN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RUNU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RUCN
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_ctrsm0RUCU
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, float *C, const int ldc);
+void ATL_zCtrsmKL
+ (enum ATLAS_UPLO Uplo, enum ATLAS_TRANS Trans, enum ATLAS_DIAG Diag,
+ const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *B, const int ldb);
+void ATL_zhecopy
+ (const int N, const double *A, const int lda, double *C);
+void ATL_zsycopy
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyU2L_N
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyU2Lc_N
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyU2L_U
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyU2Lc_U
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyU2U_N
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyU2Uc_N
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyU2U_U
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyU2Uc_U
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyL2L_N
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyL2Lc_N
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyL2L_U
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyL2Lc_U
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyL2U_N
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyL2Uc_N
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyL2U_U
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrcopyL2Uc_U
+ (const int N, const double *A, const int lda, double *C);
+void ATL_ztrmv_scalLNU_an1
+ (const int N, const double *alpha, const double *A, const int lda, double *X);
+void ATL_ztrmv_scalLNN_aX
+ (const int N, const double *alpha, const double *A, const int lda, double *X);
+void ATL_ztrmv_scalUNU_an1
+ (const int N, const double *alpha, const double *A, const int lda, double *X);
+void ATL_ztrmv_scalUNN_aX
+ (const int N, const double *alpha, const double *A, const int lda, double *X);
+void ATL_ztrinvertUU(const int N, double *A, const int lda);
+void ATL_ztrinvertLU(const int N, double *A, const int lda);
+void ATL_ztrinvertUN(const int N, double *A, const int lda);
+void ATL_ztrinvertLN(const int N, double *A, const int lda);
+void ATL_ztrputU_b0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputL_b0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putU_b0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putL_b0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputU_b1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputL_b1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putU_b1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putL_b1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputU_bX
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputL_bX
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putU_bX
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putL_bX
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputU_bXi0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputL_bXi0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putU_bXi0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putL_bXi0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputU_bn1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrputL_bn1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putU_bn1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zsyr2k_putL_bn1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zher2k_putU_b0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zher2k_putL_b0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zheputU_b0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zheputL_b0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zher2k_putU_b1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zher2k_putL_b1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zheputU_b1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zheputL_b1
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zher2k_putU_bXi0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zher2k_putL_bXi0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zheputU_bXi0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_zheputL_bXi0
+ (const int N, const double *v, const double *beta, double *A, const int lda);
+void ATL_ztrsm0LLTN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LLTU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LLNN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LLNU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LLCN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LLCU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LUTN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LUTU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LUNN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LUNU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LUCN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0LUCU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RLTN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RLTU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RLNN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RLNU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RLCN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RLCU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RUTN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RUTU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RUNN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RUNU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RUCN
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+void ATL_ztrsm0RUCU
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, double *C, const int ldc);
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_lapack.h b/kaldi_io/src/tools/ATLAS/include/atlas_lapack.h
new file mode 100644
index 0000000..4b370b8
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_lapack.h
@@ -0,0 +1,239 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef ATLAS_LAPACK_H
+ #define ATLAS_LAPACK_H
+
+#include "atlas_misc.h"
+#include "cblas.h"
+
+#ifdef PATL
+
+#include "atlas_cblastypealias.h"
+/*
+ * Type-generic names: with PATL defined, ATL_<rout> expands to Mjoin(PATL,<rout>)
+ */
+#define ATL_getriR Mjoin(PATL,getriR)
+#define ATL_getriC Mjoin(PATL,getriC)
+#define ATL_getri Mjoin(PATL,getri)
+#define ATL_lauumRL Mjoin(PATL,lauumRL)
+#define ATL_lauumRU Mjoin(PATL,lauumRU)
+#define ATL_lauumCL Mjoin(PATL,lauumCL)
+#define ATL_lauumCU Mjoin(PATL,lauumCU)
+#define ATL_lauum Mjoin(PATL,lauum)
+#define ATL_trtriRL Mjoin(PATL,trtriRL)
+#define ATL_trtriRU Mjoin(PATL,trtriRU)
+#define ATL_trtriCL Mjoin(PATL,trtriCL)
+#define ATL_trtriCU Mjoin(PATL,trtriCU)
+#define ATL_trtri Mjoin(PATL,trtri)
+#define ATL_potrfU Mjoin(PATL,potrfU)
+#define ATL_potrfL Mjoin(PATL,potrfL)
+#define ATL_potrs Mjoin(PATL,potrs)
+#define ATL_potrf Mjoin(PATL,potrf)
+#define ATL_getrfR Mjoin(PATL,getrfR)
+#define ATL_getrfC Mjoin(PATL,getrfC)
+#define ATL_getrs Mjoin(PATL,getrs)
+#define ATL_getrf Mjoin(PATL,getrf)
+#define ATL_laswp Mjoin(PATL,laswp)
+
+#endif
+/* Element type is spelled out (float) rather than taken from an external TYPE macro. */
+int ATL_sgetri(const enum CBLAS_ORDER Order, const int N, float *A, const int lda,
+ const int *ipiv, float *wrk, int *lwrk);
+int ATL_sgetriR(const int N, float *A, const int lda, const int *ipiv,
+ float *wrk, const int lwrk);
+int ATL_sgetriC(const int N, float *A, const int lda, const int *ipiv,
+ float *wrk, const int lwrk);
+void ATL_slauum(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int ATL_spotrf(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+void ATL_spotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const float *A, const int lda,
+ float *B, const int ldb);
+int ATL_sgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ float *A, const int lda, int *ipiv);
+void ATL_sgetrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const float *A, const int lda,
+ const int *ipiv, float *B, const int ldb);
+void ATL_slaswp(const int N, float *A, const int lda0, const int K1,
+ const int K2, const int *ipiv, const int inci);
+int ATL_sgetrfC(const int M, const int N, float *A, const int lda,
+ int *ipiv);
+int ATL_sgetrfR(const int M, const int N, float *A, const int lda,
+ int *ipiv);
+void ATL_slauumRU(const int N, float *A, const int lda);
+void ATL_slauumRL(const int N, float *A, const int lda);
+void ATL_slauumCU(const int N, float *A, const int lda);
+void ATL_slauumCL(const int N, float *A, const int lda);
+int ATL_spotrfU(const int N, float *A, const int lda);
+int ATL_spotrfL(const int N, float *A, const int lda);
+int ATL_strtri(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_DIAG Diag, const int N,
+ float *A, const int lda);
+int ATL_strtriRU(const enum CBLAS_DIAG Diag, const int N, float *A,
+ const int lda);
+int ATL_strtriRL(const enum CBLAS_DIAG Diag, const int N, float *A,
+ const int lda);
+int ATL_strtriCU(const enum CBLAS_DIAG Diag, const int N, float *A,
+ const int lda);
+int ATL_strtriCL(const enum CBLAS_DIAG Diag, const int N, float *A,
+ const int lda);
+/* Element type is spelled out (double) rather than taken from an external TYPE macro. */
+int ATL_dgetri(const enum CBLAS_ORDER Order, const int N, double *A, const int lda,
+ const int *ipiv, double *wrk, int *lwrk);
+int ATL_dgetriR(const int N, double *A, const int lda, const int *ipiv,
+ double *wrk, const int lwrk);
+int ATL_dgetriC(const int N, double *A, const int lda, const int *ipiv,
+ double *wrk, const int lwrk);
+void ATL_dlauum(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int ATL_dpotrf(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+void ATL_dpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const double *A, const int lda,
+ double *B, const int ldb);
+int ATL_dgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ double *A, const int lda, int *ipiv);
+void ATL_dgetrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const double *A, const int lda,
+ const int *ipiv, double *B, const int ldb);
+void ATL_dlaswp(const int N, double *A, const int lda0, const int K1,
+ const int K2, const int *ipiv, const int inci);
+int ATL_dgetrfC(const int M, const int N, double *A, const int lda,
+ int *ipiv);
+int ATL_dgetrfR(const int M, const int N, double *A, const int lda,
+ int *ipiv);
+void ATL_dlauumRU(const int N, double *A, const int lda);
+void ATL_dlauumRL(const int N, double *A, const int lda);
+void ATL_dlauumCU(const int N, double *A, const int lda);
+void ATL_dlauumCL(const int N, double *A, const int lda);
+int ATL_dpotrfU(const int N, double *A, const int lda);
+int ATL_dpotrfL(const int N, double *A, const int lda);
+int ATL_dtrtri(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_DIAG Diag, const int N,
+ double *A, const int lda);
+int ATL_dtrtriRU(const enum CBLAS_DIAG Diag, const int N, double *A,
+ const int lda);
+int ATL_dtrtriRL(const enum CBLAS_DIAG Diag, const int N, double *A,
+ const int lda);
+int ATL_dtrtriCU(const enum CBLAS_DIAG Diag, const int N, double *A,
+ const int lda);
+int ATL_dtrtriCL(const enum CBLAS_DIAG Diag, const int N, double *A,
+ const int lda);
+/* Element type is float, matching the other ATL_c* prototypes in this header. */
+int ATL_cgetri(const enum CBLAS_ORDER Order, const int N, float *A, const int lda,
+ const int *ipiv, float *wrk, int *lwrk);
+int ATL_cgetriR(const int N, float *A, const int lda, const int *ipiv,
+ float *wrk, const int lwrk);
+int ATL_cgetriC(const int N, float *A, const int lda, const int *ipiv,
+ float *wrk, const int lwrk);
+void ATL_clauum(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int ATL_cpotrf(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+void ATL_cpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const float *A, const int lda,
+ float *B, const int ldb);
+int ATL_cgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ float *A, const int lda, int *ipiv);
+void ATL_cgetrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const float *A, const int lda,
+ const int *ipiv, float *B, const int ldb);
+void ATL_claswp(const int N, float *A, const int lda0, const int K1,
+ const int K2, const int *ipiv, const int inci);
+int ATL_cgetrfC(const int M, const int N, float *A, const int lda,
+ int *ipiv);
+int ATL_cgetrfR(const int M, const int N, float *A, const int lda,
+ int *ipiv);
+void ATL_clauumRU(const int N, float *A, const int lda);
+void ATL_clauumRL(const int N, float *A, const int lda);
+void ATL_clauumCU(const int N, float *A, const int lda);
+void ATL_clauumCL(const int N, float *A, const int lda);
+int ATL_cpotrfRU(const int N, float *A, const int lda);
+int ATL_cpotrfRL(const int N, float *A, const int lda);
+int ATL_cpotrfU(const int N, float *A, const int lda);
+int ATL_cpotrfL(const int N, float *A, const int lda);
+int ATL_ctrtri(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_DIAG Diag, const int N,
+ float *A, const int lda);
+int ATL_ctrtriRU(const enum CBLAS_DIAG Diag, const int N, float *A,
+ const int lda);
+int ATL_ctrtriRL(const enum CBLAS_DIAG Diag, const int N, float *A,
+ const int lda);
+int ATL_ctrtriCU(const enum CBLAS_DIAG Diag, const int N, float *A,
+ const int lda);
+int ATL_ctrtriCL(const enum CBLAS_DIAG Diag, const int N, float *A,
+ const int lda);
+/* Element type is double, matching the other ATL_z* prototypes in this header. */
+int ATL_zgetri(const enum CBLAS_ORDER Order, const int N, double *A, const int lda,
+ const int *ipiv, double *wrk, int *lwrk);
+int ATL_zgetriR(const int N, double *A, const int lda, const int *ipiv,
+ double *wrk, const int lwrk);
+int ATL_zgetriC(const int N, double *A, const int lda, const int *ipiv,
+ double *wrk, const int lwrk);
+void ATL_zlauum(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int ATL_zpotrf(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+void ATL_zpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const double *A, const int lda,
+ double *B, const int ldb);
+int ATL_zgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ double *A, const int lda, int *ipiv);
+void ATL_zgetrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const double *A, const int lda,
+ const int *ipiv, double *B, const int ldb);
+void ATL_zlaswp(const int N, double *A, const int lda0, const int K1,
+ const int K2, const int *ipiv, const int inci);
+int ATL_zgetrfC(const int M, const int N, double *A, const int lda,
+ int *ipiv);
+int ATL_zgetrfR(const int M, const int N, double *A, const int lda,
+ int *ipiv);
+void ATL_zlauumRU(const int N, double *A, const int lda);
+void ATL_zlauumRL(const int N, double *A, const int lda);
+void ATL_zlauumCU(const int N, double *A, const int lda);
+void ATL_zlauumCL(const int N, double *A, const int lda);
+int ATL_zpotrfRU(const int N, double *A, const int lda);
+int ATL_zpotrfRL(const int N, double *A, const int lda);
+int ATL_zpotrfU(const int N, double *A, const int lda);
+int ATL_zpotrfL(const int N, double *A, const int lda);
+int ATL_ztrtri(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_DIAG Diag, const int N,
+ double *A, const int lda);
+int ATL_ztrtriRU(const enum CBLAS_DIAG Diag, const int N, double *A,
+ const int lda);
+int ATL_ztrtriRL(const enum CBLAS_DIAG Diag, const int N, double *A,
+ const int lda);
+int ATL_ztrtriCU(const enum CBLAS_DIAG Diag, const int N, double *A,
+ const int lda);
+int ATL_ztrtriCL(const enum CBLAS_DIAG Diag, const int N, double *A,
+ const int lda);
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_level1.h b/kaldi_io/src/tools/ATLAS/include/atlas_level1.h
new file mode 100644
index 0000000..d4d61d8
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_level1.h
@@ -0,0 +1,127 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Prototypes for ATLAS Level 1 functions not declared in atlas_aux.h
+ */
+#ifndef ATLAS_LEVEL1_H
+#define ATLAS_LEVEL1_H
+
+/*
+ * Many Level 1 BLAS routines are actually handled by the ATLAS auxiliary routines
+ */
+#include "atlas_aux.h"
+/* Float-vector routines: ATL_sdsdot (extra float alpha) returns float, ATL_dsdot returns double. */
+float ATL_sdsdot(const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY);
+double ATL_dsdot(const int N, const float *X, const int incX,
+ const float *Y, const int incY);
+/*
+ * Routines with all four types
+ */
+void ATL_sswap(const int N, float *X, const int incX,
+ float *Y, const int incY);
+int ATL_isamax(const int N, const float *X, const int incX);
+
+void ATL_dswap(const int N, double *X, const int incX,
+ double *Y, const int incY);
+int ATL_idamax(const int N, const double *X, const int incX);
+
+void ATL_cswap(const int N, float *X, const int incX,
+ float *Y, const int incY);
+int ATL_icamax(const int N, const float *X, const int incX);
+
+void ATL_zswap(const int N, double *X, const int incX,
+ double *Y, const int incY);
+int ATL_izamax(const int N, const double *X, const int incX);
+
+/*
+ * Routines with real types
+ */
+void ATL_srotg(float *a, float *b, float *c, float *s);
+void ATL_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
+void ATL_srot(const int N, float *X, const int incX,
+ float *Y, const int incY, const float c, const float s);
+void ATL_srotm(const int N, float *X, const int incX,
+ float *Y, const int incY, const float *P);
+float ATL_sdot(const int N, const float *X, const int incX,
+ const float *Y, const int incY);
+void ATL_sssq(const int N, const float *X, const int incX,
+ float *scal0, float *ssq0);
+float ATL_snrm2(const int N, const float *X, const int incX);
+float ATL_sasum(const int N, const float *X, const int incX);
+
+void ATL_drotg(double *a, double *b, double *c, double *s);
+void ATL_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
+void ATL_drot(const int N, double *X, const int incX,
+ double *Y, const int incY, const double c, const double s);
+void ATL_drotm(const int N, double *X, const int incX,
+ double *Y, const int incY, const double *P);
+double ATL_ddot(const int N, const double *X, const int incX,
+ const double *Y, const int incY);
+void ATL_dssq(const int N, const double *X, const int incX,
+ double *scal0, double *ssq0);
+double ATL_dnrm2(const int N, const double *X, const int incX);
+double ATL_dasum(const int N, const double *X, const int incX);
+
+/*
+ * Routines with complex types
+ */
+void ATL_csrot(const int N, float *X, const int incX,
+ float *Y, const int incY, const float c, const float s);
+void ATL_crotg(float *a, const float *b, float *c, float *s);
+void ATL_cdotu_sub(const int N, const float *X, const int incX,
+ const float *Y, const int incY, float *dot);
+void ATL_cdotc_sub(const int N, const float *X, const int incX,
+ const float *Y, const int incY, float *dot);
+void ATL_cssq(const int N, const float *X, const int incX,
+ float *scal0, float *ssq0);
+float ATL_scnrm2(const int N, const float *X, const int incX);
+float ATL_scasum(const int N, const float *X, const int incX);
+
+void ATL_zdrot(const int N, double *X, const int incX,
+ double *Y, const int incY, const double c, const double s);
+void ATL_zrotg(double *a, const double *b, double *c, double *s);
+void ATL_zdotu_sub(const int N, const double *X, const int incX,
+ const double *Y, const int incY, double *dot);
+void ATL_zdotc_sub(const int N, const double *X, const int incX,
+ const double *Y, const int incY, double *dot);
+void ATL_zssq(const int N, const double *X, const int incX,
+ double *scal0, double *ssq0);
+double ATL_dznrm2(const int N, const double *X, const int incX);
+double ATL_dzasum(const int N, const double *X, const int incX);
+
+/* Single-letter-prefix aliases for the complex asum/nrm2 routines declared above. */
+#define ATL_casum ATL_scasum
+#define ATL_zasum ATL_dzasum
+#define ATL_cnrm2 ATL_scnrm2
+#define ATL_znrm2 ATL_dznrm2
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_level2.h b/kaldi_io/src/tools/ATLAS/include/atlas_level2.h
new file mode 100644
index 0000000..d05f6d5
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_level2.h
@@ -0,0 +1,267 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * ===========================================================================
+ * Prototypes for level 2 BLAS
+ * ===========================================================================
+ */
+#ifndef ATLAS_LEVEL2_H
+#define ATLAS_LEVEL2_H
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void ATL_sgemv(const enum ATLAS_TRANS TransA, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgbmv(const enum ATLAS_TRANS TransA, const int M, const int N,
+ const int KL, const int KU, const float alpha,
+ const float *A, const int lda, const float *X,
+ const int incX, const float beta, float *Y, const int incY);
+void ATL_strmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const float *A, const int lda, float *X, const int incX);
+void ATL_stbmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const int K,
+ const float *A, const int lda, float *X, const int incX);
+void ATL_stpmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const float *Ap,
+ float *X, const int incX);
+void ATL_strsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const float *A, const int lda, float *X, const int incX);
+void ATL_stbsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const int K,
+ const float *A, const int lda, float *X, const int incX);
+void ATL_stpsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const float *Ap, float *X, const int incX);
+
+void ATL_dgemv(const enum ATLAS_TRANS TransA, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgbmv(const enum ATLAS_TRANS TransA, const int M, const int N,
+ const int KL, const int KU, const double alpha,
+ const double *A, const int lda, const double *X,
+ const int incX, const double beta, double *Y, const int incY);
+void ATL_dtrmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const double *A, const int lda, double *X, const int incX);
+void ATL_dtbmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const int K,
+ const double *A, const int lda, double *X, const int incX);
+void ATL_dtpmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const double *Ap,
+ double *X, const int incX);
+void ATL_dtrsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const double *A, const int lda, double *X, const int incX);
+void ATL_dtbsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const int K,
+ const double *A, const int lda, double *X, const int incX);
+void ATL_dtpsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const double *Ap, double *X, const int incX);
+
+void ATL_cgemv(const enum ATLAS_TRANS TransA, const int M, const int N,
+ const float *alpha, const float *A, const int lda,
+ const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgbmv(const enum ATLAS_TRANS TransA, const int M, const int N,
+ const int KL, const int KU, const float *alpha,
+ const float *A, const int lda, const float *X,
+ const int incX, const float *beta, float *Y, const int incY);
+void ATL_ctrmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const float *A, const int lda, float *X, const int incX);
+void ATL_ctbmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const int K,
+ const float *A, const int lda, float *X, const int incX);
+void ATL_ctpmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const float *Ap,
+ float *X, const int incX);
+void ATL_ctrsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const float *A, const int lda, float *X, const int incX);
+void ATL_ctbsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const int K,
+ const float *A, const int lda, float *X, const int incX);
+void ATL_ctpsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const float *Ap, float *X, const int incX);
+
+void ATL_zgemv(const enum ATLAS_TRANS TransA, const int M, const int N,
+ const double *alpha, const double *A, const int lda,
+ const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgbmv(const enum ATLAS_TRANS TransA, const int M, const int N,
+ const int KL, const int KU, const double *alpha,
+ const double *A, const int lda, const double *X,
+ const int incX, const double *beta, double *Y, const int incY);
+void ATL_ztrmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const double *A, const int lda, double *X, const int incX);
+void ATL_ztbmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const int K,
+ const double *A, const int lda, double *X, const int incX);
+void ATL_ztpmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const double *Ap,
+ double *X, const int incX);
+void ATL_ztrsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const double *A, const int lda, double *X, const int incX);
+void ATL_ztbsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N, const int K,
+ const double *A, const int lda, double *X, const int incX);
+void ATL_ztpsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA,
+ const enum ATLAS_DIAG Diag, const int N,
+ const double *Ap, double *X, const int incX);
+
+
+/*
+ * Routines with S and D prefixes only
+ */
+void ATL_ssymv(const enum ATLAS_UPLO Uplo, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_ssbmv(const enum ATLAS_UPLO Uplo, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sspmv(const enum ATLAS_UPLO Uplo, const int N, const float alpha,
+ const float *Ap, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void ATL_sger(const int M, const int N, const float alpha,
+ const float *X, const int incX, const float *Y, const int incY,
+ float *A, const int lda);
+void ATL_ssyr(const enum ATLAS_UPLO Uplo, const int N, const float alpha,
+ const float *X, const int incX, float *A, const int lda);
+void ATL_sspr(const enum ATLAS_UPLO Uplo, const int N, const float alpha,
+ const float *X, const int incX, float *Ap);
+void ATL_ssyr2(const enum ATLAS_UPLO Uplo, const int N, const float alpha,
+ const float *X, const int incX, const float *Y, const int incY,
+ float *A, const int lda);
+void ATL_sspr2(const enum ATLAS_UPLO Uplo, const int N, const float alpha,
+ const float *X, const int incX, const float *Y, const int incY,
+ float *Ap); /* no lda; named Ap like ATL_sspr */
+
+void ATL_dsymv(const enum ATLAS_UPLO Uplo, const int N,
+ const double alpha, const double *A, const int lda,
+ const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dsbmv(const enum ATLAS_UPLO Uplo, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dspmv(const enum ATLAS_UPLO Uplo, const int N, const double alpha,
+ const double *Ap, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void ATL_dger(const int M, const int N, const double alpha,
+ const double *X, const int incX, const double *Y, const int incY,
+ double *A, const int lda);
+void ATL_dsyr(const enum ATLAS_UPLO Uplo, const int N, const double alpha,
+ const double *X, const int incX, double *A, const int lda);
+void ATL_dspr(const enum ATLAS_UPLO Uplo, const int N, const double alpha,
+ const double *X, const int incX, double *Ap);
+void ATL_dsyr2(const enum ATLAS_UPLO Uplo, const int N, const double alpha,
+ const double *X, const int incX, const double *Y, const int incY,
+ double *A, const int lda);
+void ATL_dspr2(const enum ATLAS_UPLO Uplo, const int N, const double alpha,
+ const double *X, const int incX, const double *Y, const int incY,
+ double *Ap); /* no lda; named Ap like ATL_dspr */
+
+
+/*
+ * Routines with C and Z prefixes only
+ */
+void ATL_chemv(const enum ATLAS_UPLO Uplo, const int N,
+ const float *alpha, const float *A, const int lda,
+ const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_chbmv(const enum ATLAS_UPLO Uplo, const int N, const int K,
+ const float *alpha, const float *A, const int lda,
+ const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_chpmv(const enum ATLAS_UPLO Uplo, const int N,
+ const float *alpha, const float *Ap,
+ const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgeru(const int M, const int N, const float *alpha,
+ const float *X, const int incX, const float *Y, const int incY,
+ float *A, const int lda);
+void ATL_cgerc(const int M, const int N, const float *alpha,
+ const float *X, const int incX, const float *Y, const int incY,
+ float *A, const int lda);
+void ATL_cher(const enum ATLAS_UPLO Uplo, const int N, const float alpha,
+ const float *X, const int incX, float *A, const int lda);
+void ATL_chpr(const enum ATLAS_UPLO Uplo, const int N, const float alpha,
+ const float *X, const int incX, float *Ap); /* no lda; named Ap like ATL_chpr2 */
+void ATL_cher2(const enum ATLAS_UPLO Uplo, const int N,
+ const float *alpha, const float *X, const int incX,
+ const float *Y, const int incY, float *A, const int lda);
+void ATL_chpr2(const enum ATLAS_UPLO Uplo, const int N,
+ const float *alpha, const float *X, const int incX,
+ const float *Y, const int incY, float *Ap);
+
+void ATL_zhemv(const enum ATLAS_UPLO Uplo, const int N,
+ const double *alpha, const double *A, const int lda,
+ const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zhbmv(const enum ATLAS_UPLO Uplo, const int N, const int K,
+ const double *alpha, const double *A, const int lda,
+ const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zhpmv(const enum ATLAS_UPLO Uplo, const int N,
+ const double *alpha, const double *Ap,
+ const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgeru(const int M, const int N, const double *alpha,
+ const double *X, const int incX, const double *Y, const int incY,
+ double *A, const int lda);
+void ATL_zgerc(const int M, const int N, const double *alpha,
+ const double *X, const int incX, const double *Y, const int incY,
+ double *A, const int lda);
+void ATL_zher(const enum ATLAS_UPLO Uplo, const int N, const double alpha,
+ const double *X, const int incX, double *A, const int lda);
+void ATL_zhpr(const enum ATLAS_UPLO Uplo, const int N, const double alpha,
+ const double *X, const int incX, double *Ap); /* no lda; named Ap like ATL_zhpr2 */
+void ATL_zher2(const enum ATLAS_UPLO Uplo, const int N,
+ const double *alpha, const double *X, const int incX,
+ const double *Y, const int incY, double *A, const int lda);
+void ATL_zhpr2(const enum ATLAS_UPLO Uplo, const int N,
+ const double *alpha, const double *X, const int incX,
+ const double *Y, const int incY, double *Ap);
+
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_level3.h b/kaldi_io/src/tools/ATLAS/include/atlas_level3.h
new file mode 100644
index 0000000..023c63c
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_level3.h
@@ -0,0 +1,181 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1997 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/*
+ * ===========================================================================
+ * Prototypes for level 3 BLAS
+ * ===========================================================================
+ */
+#ifndef ATLAS_LEVEL3_H
+#define ATLAS_LEVEL3_H
+
+
+/* Routines declared once per type prefix (s, d, c, z): GetNB, GetNCNB, gemm,
+ * symm, syrk, syr2k, trmm, trsm.  The s/d versions take alpha and beta by
+ * value (float/double); the c/z versions take them as float/double pointers. */
+int ATL_sGetNB(void);
+int ATL_sGetNCNB(void);
+void ATL_sgemm(const enum ATLAS_TRANS TransA, const enum ATLAS_TRANS TransB,
+ const int M, const int N, const int K, const float alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float beta, float *C, const int ldc);
+void ATL_ssymm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const int M, const int N, const float alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float beta, float *C, const int ldc);
+void ATL_ssyrk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const float alpha,
+ const float *A, const int lda, const float beta,
+ float *C, const int ldc);
+void ATL_ssyr2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const float alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float beta, float *C, const int ldc);
+void ATL_strmm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag,
+ const int M, const int N, const float alpha,
+ const float *A, const int lda, float *B, const int ldb);
+void ATL_strsm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag,
+ const int M, const int N, const float alpha,
+ const float *A, const int lda, float *B, const int ldb);
+/* d prefix: same eight routines as the s group, with double in place of float. */
+int ATL_dGetNB(void);
+int ATL_dGetNCNB(void);
+void ATL_dgemm(const enum ATLAS_TRANS TransA, const enum ATLAS_TRANS TransB,
+ const int M, const int N, const int K, const double alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double beta, double *C, const int ldc);
+void ATL_dsymm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const int M, const int N, const double alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double beta, double *C, const int ldc);
+void ATL_dsyrk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const double alpha,
+ const double *A, const int lda, const double beta,
+ double *C, const int ldc);
+void ATL_dsyr2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const double alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double beta, double *C, const int ldc);
+void ATL_dtrmm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag,
+ const int M, const int N, const double alpha,
+ const double *A, const int lda, double *B, const int ldb);
+void ATL_dtrsm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag,
+ const int M, const int N, const double alpha,
+ const double *A, const int lda, double *B, const int ldb);
+/* c prefix: float data; alpha and beta are passed as const float pointers rather than by value. */
+int ATL_cGetNB(void);
+int ATL_cGetNCNB(void);
+void ATL_cgemm(const enum ATLAS_TRANS TransA, const enum ATLAS_TRANS TransB,
+ const int M, const int N, const int K, const float *alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float *beta, float *C, const int ldc);
+void ATL_csymm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const int M, const int N, const float *alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float *beta, float *C, const int ldc);
+void ATL_csyrk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const float *alpha,
+ const float *A, const int lda, const float *beta,
+ float *C, const int ldc);
+void ATL_csyr2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const float *alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float *beta, float *C, const int ldc);
+void ATL_ctrmm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag,
+ const int M, const int N, const float *alpha,
+ const float *A, const int lda, float *B, const int ldb);
+void ATL_ctrsm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag,
+ const int M, const int N, const float *alpha,
+ const float *A, const int lda, float *B, const int ldb);
+/* z prefix: same as the c group with double in place of float (alpha/beta are const double pointers). */
+int ATL_zGetNB(void);
+int ATL_zGetNCNB(void);
+void ATL_zgemm(const enum ATLAS_TRANS TransA, const enum ATLAS_TRANS TransB,
+ const int M, const int N, const int K, const double *alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double *beta, double *C, const int ldc);
+void ATL_zsymm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const int M, const int N, const double *alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double *beta, double *C, const int ldc);
+void ATL_zsyrk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const double *alpha,
+ const double *A, const int lda, const double *beta,
+ double *C, const int ldc);
+void ATL_zsyr2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const double *alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double *beta, double *C, const int ldc);
+void ATL_ztrmm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag,
+ const int M, const int N, const double *alpha,
+ const double *A, const int lda, double *B, const int ldb);
+void ATL_ztrsm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag,
+ const int M, const int N, const double *alpha,
+ const double *A, const int lda, double *B, const int ldb);
+
+
+/* Routines declared for the c and z prefixes only: hemm, herk, her2k.
+ * Note the mixed scalar passing: hemm takes alpha/beta by pointer, herk takes
+ * both by value, and her2k takes alpha by pointer but beta by value. */
+void ATL_chemm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const int M, const int N, const float *alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float *beta, float *C, const int ldc);
+void ATL_cherk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const float alpha,
+ const float *A, const int lda, const float beta,
+ float *C, const int ldc);
+void ATL_cher2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const float *alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float beta, float *C, const int ldc);
+/* z prefix: the ATL_chemm / ATL_cherk / ATL_cher2k signatures with double in place of float. */
+void ATL_zhemm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo,
+ const int M, const int N, const double *alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double *beta, double *C, const int ldc);
+void ATL_zherk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const double alpha,
+ const double *A, const int lda, const double beta,
+ double *C, const int ldc);
+void ATL_zher2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans,
+ const int N, const int K, const double *alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double beta, double *C, const int ldc);
+
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_lvl2.h b/kaldi_io/src/tools/ATLAS/include/atlas_lvl2.h
new file mode 100644
index 0000000..b09a021
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_lvl2.h
@@ -0,0 +1,294 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "atlas_level2.h"
+#include "atlas_kernel2.h"
+#ifndef ATLAS_LVL2_H
+#define ATLAS_LVL2_H
+
+/* Real kernels (s = float, d = double): per prefix, one ger1 prototype plus
+ * gemvS/gemvT/gemvN prototypes for each of the bX, b1 and b0 suffixes.  All
+ * gemv variants share one parameter list; alpha and beta are passed by value. */
+void ATL_sger1_a1_x1_yX
+ (const int M, const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A, const int lda);
+void ATL_sgemvS_a1_x1_bX_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgemvT_a1_x1_bX_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgemvN_a1_x1_bX_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgemvS_a1_x1_b1_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgemvT_a1_x1_b1_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgemvN_a1_x1_b1_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgemvS_a1_x1_b0_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgemvT_a1_x1_b0_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_sgemvN_a1_x1_b0_y1
+ (const int M, const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void ATL_dger1_a1_x1_yX /* d prefix: from here on, the s kernels above with double in place of float */
+ (const int M, const int N, const double alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *A, const int lda);
+void ATL_dgemvS_a1_x1_bX_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgemvT_a1_x1_bX_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgemvN_a1_x1_bX_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgemvS_a1_x1_b1_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgemvT_a1_x1_b1_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgemvN_a1_x1_b1_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgemvS_a1_x1_b0_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgemvT_a1_x1_b0_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void ATL_dgemvN_a1_x1_b0_y1
+ (const int M, const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+
+/* Complex kernels (c = float, z = double): per prefix, ger1u and ger1c plus
+ * gemvS/gemvC/gemvNc/gemvT/gemvN for each of the bXi0, bX, b1, b0 suffixes.
+ * alpha and beta are passed by pointer, unlike the real kernels. */
+void ATL_cger1u_a1_x1_yX
+ (const int M, const int N, const float *alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A, const int lda);
+void ATL_cger1c_a1_x1_yX
+ (const int M, const int N, const float *alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A, const int lda);
+void ATL_cgemvS_a1_x1_bXi0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvC_a1_x1_bXi0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvNc_a1_x1_bXi0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvT_a1_x1_bXi0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvN_a1_x1_bXi0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvS_a1_x1_bX_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvC_a1_x1_bX_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvNc_a1_x1_bX_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvT_a1_x1_bX_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvN_a1_x1_bX_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvS_a1_x1_b1_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvC_a1_x1_b1_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvNc_a1_x1_b1_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvT_a1_x1_b1_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvN_a1_x1_b1_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvS_a1_x1_b0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvC_a1_x1_b0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvNc_a1_x1_b0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvT_a1_x1_b0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_cgemvN_a1_x1_b0_y1
+ (const int M, const int N, const float *alpha, const float *A,
+ const int lda, const float *X, const int incX, const float *beta,
+ float *Y, const int incY);
+void ATL_zger1u_a1_x1_yX /* z prefix: from here on, the c kernels above with double in place of float */
+ (const int M, const int N, const double *alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *A, const int lda);
+void ATL_zger1c_a1_x1_yX
+ (const int M, const int N, const double *alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *A, const int lda);
+void ATL_zgemvS_a1_x1_bXi0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvC_a1_x1_bXi0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvNc_a1_x1_bXi0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvT_a1_x1_bXi0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvN_a1_x1_bXi0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvS_a1_x1_bX_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvC_a1_x1_bX_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvNc_a1_x1_bX_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvT_a1_x1_bX_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvN_a1_x1_bX_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvS_a1_x1_b1_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvC_a1_x1_b1_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvNc_a1_x1_b1_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvT_a1_x1_b1_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvN_a1_x1_b1_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvS_a1_x1_b0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvC_a1_x1_b0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvNc_a1_x1_b0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvT_a1_x1_b0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+void ATL_zgemvN_a1_x1_b0_y1
+ (const int M, const int N, const double *alpha, const double *A,
+ const int lda, const double *X, const int incX, const double *beta,
+ double *Y, const int incY);
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_lvl3.h b/kaldi_io/src/tools/ATLAS/include/atlas_lvl3.h
new file mode 100644
index 0000000..eab93c0
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_lvl3.h
@@ -0,0 +1,512 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1997 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ATLAS_LVL3_H
+#define ATLAS_LVL3_H
+
+#include "atlas_misc.h"
+#include "atlas_f77.h"
+#include "atlas_level3.h"
+#if defined(SREAL) /* pick the "mm" and "Xover" headers that match the precision macro */
+ #include "smm.h"
+ #include "sXover.h"
+#elif defined(DREAL)
+ #include "dmm.h"
+ #include "dXover.h"
+#elif defined(QREAL)
+ #include "qmm.h"
+ #include "qXover.h"
+#elif defined(SCPLX)
+ #ifdef ATL_NCMM /* ATL_NCMM selects atlas_cNCmm.h in place of cmm.h */
+ #include "atlas_cNCmm.h"
+ #else
+ #include "cmm.h"
+ #endif
+ #include "cXover.h"
+#elif defined(DCPLX)
+ #ifdef ATL_NCMM /* ATL_NCMM selects atlas_zNCmm.h here ... */
+ #include "atlas_zNCmm.h"
+ #else
+ #include "zmm.h"
+ #endif
+ #include "zmm.h" /* NOTE(review): ... yet zmm.h is included unconditionally on this line; the SCPLX branch has no matching line -- confirm intended */
+ #include "zXover.h"
+#endif /* with no precision macro defined, none of these headers is included */
+#ifndef ATL_3NB /* fallbacks: this whole group applies only when ATL_3NB is not yet defined */
+ #define ATL_3NB 3*NB /* NOTE(review): unparenthesized, so operator precedence at the use site applies */
+
+ #define NN_MNK_M NBNB*NB /* every *_MNK_* fallback below is this same NBNB*NB expression (also unparenthesized) */
+ #define NN_MNK_N NBNB*NB
+ #define NN_MNK_K NBNB*NB
+ #define NN_MNK_MN NBNB*NB
+ #define NN_MNK_GE NBNB*NB
+
+ #define NT_MNK_M NBNB*NB /* NB and NBNB are not defined in this header; presumably they come from the mm headers above -- TODO confirm */
+ #define NT_MNK_N NBNB*NB
+ #define NT_MNK_K NBNB*NB
+ #define NT_MNK_MN NBNB*NB
+ #define NT_MNK_GE NBNB*NB
+
+ #define TN_MNK_M NBNB*NB
+ #define TN_MNK_N NBNB*NB
+ #define TN_MNK_K NBNB*NB
+ #define TN_MNK_MN NBNB*NB
+ #define TN_MNK_GE NBNB*NB
+
+ #define TT_MNK_M NBNB*NB
+ #define TT_MNK_N NBNB*NB
+ #define TT_MNK_K NBNB*NB
+ #define TT_MNK_MN NBNB*NB
+ #define TT_MNK_GE NBNB*NB
+#endif
+
+#ifndef CN_MNK_M /* CN_* values default to the TN_* values */
+ #define CN_MNK_M TN_MNK_M
+ #define CN_MNK_N TN_MNK_N
+ #define CN_MNK_K TN_MNK_K
+ #define CN_MNK_MN TN_MNK_MN
+ #define CN_MNK_GE TN_MNK_GE
+#endif
+#ifndef NC_MNK_M /* NC_* values default to the NT_* values */
+ #define NC_MNK_M NT_MNK_M
+ #define NC_MNK_N NT_MNK_N
+ #define NC_MNK_K NT_MNK_K
+ #define NC_MNK_MN NT_MNK_MN
+ #define NC_MNK_GE NT_MNK_GE
+#endif
+#ifndef CT_MNK_M /* CT_*, TC_* and CC_* (below) all default to the TT_* values */
+ #define CT_MNK_M TT_MNK_M
+ #define CT_MNK_N TT_MNK_N
+ #define CT_MNK_K TT_MNK_K
+ #define CT_MNK_MN TT_MNK_MN
+ #define CT_MNK_GE TT_MNK_GE
+#endif
+#ifndef TC_MNK_M /* each group is keyed on its _M macro only; the other four are defined without their own check */
+ #define TC_MNK_M TT_MNK_M
+ #define TC_MNK_N TT_MNK_N
+ #define TC_MNK_K TT_MNK_K
+ #define TC_MNK_MN TT_MNK_MN
+ #define TC_MNK_GE TT_MNK_GE
+#endif
+#ifndef CC_MNK_M
+ #define CC_MNK_M TT_MNK_M
+ #define CC_MNK_N TT_MNK_N
+ #define CC_MNK_K TT_MNK_K
+ #define CC_MNK_MN TT_MNK_MN
+ #define CC_MNK_GE TT_MNK_GE
+#endif
+
+#define CPAT Mjoin(C_ATL_, PRE); /* pastes C_ATL_ onto the expansion of PRE; NOTE(review): the trailing ';' is part of the expansion -- confirm users of CPAT expect it */
+
+#ifndef ATL_MaxMalloc
+ #define ATL_MaxMalloc 67108864
+#endif
+
+typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR); /* same parameter list as the row2blkT_aX / col2blk_aX family declared later in this header; taken by mmJIK2 and mmIJK2 */
+typedef void (*MAT2BLK2)(const int, const int, const SCALAR, const TYPE*,
+ const int, TYPE*, const int); /* type of the A2blk and B2blk arguments of mmK */
+typedef void (*MATSCAL)(const int, const int, const SCALAR, TYPE*, const int); /* same parameter list as gescal_bX, gescal_bn1, gescal_b0 */
+typedef void (*PUTBLK)(int, int, TYPE*, TYPE*, int, const SCALAR); /* same parameter list as putblk_bX, putblk_bn1, putblk_b1, putblk_b0 */
+typedef void (*NBCLEANUP)(const TYPE*, const TYPE*, TYPE*, const int); /* no user of this type is visible in this header */
+typedef int (*MMINTR)(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const SCALAR,
+ const TYPE *, const int, const TYPE *, const int,
+ const SCALAR, TYPE *, const int); /* int-returning; same parameter list as mmJIK, mmIJK, mmJKI and the NCmm routines declared with Mjoin(PATL,...) */
+typedef void (*NBMM0)(const int, const int, const int, const TYPE,
+ const TYPE*, const int, const TYPE*, const int,
+ const TYPE, TYPE*, const int); /* same parameter list as the pKBmm, pNBmm, pMBmm routines; note alpha and beta are TYPE, not SCALAR */
+
+void ATL_xerbla(int p, char *rout, char *form, ...);
+int Mjoin(PATL,GetNB)(void);
+int Mjoin(PATL,GetNCNB)(void);
+
+void Mjoin(PATL, gescal_bX)(const int, const int, const SCALAR, TYPE*,
+ const int);
+void Mjoin(PATL, gescal_bn1)(const int, const int, const SCALAR, TYPE*,
+ const int);
+void Mjoin(PATL, gescal_b0)(const int, const int, const SCALAR, TYPE*,
+ const int);
+
+void Mjoin(PATL,pKBmm_bX)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pNBmm_bX)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pMBmm_bX)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pKBmm_b1)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pNBmm_b1)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pMBmm_b1)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pKBmm_b0)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pNBmm_b0)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pMBmm_b0)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,pKBmm)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+
+void Mjoin(PATL,MBJBmm)(const int N, const int K, const TYPE *A, const TYPE *B,
+ const TYPE beta, TYPE *C, const int ldc);
+void Mjoin(PATL,IBJBmm)(int IB, int JB, int K, const TYPE *A, const TYPE *B,
+ const TYPE beta, TYPE *C, const int ldc);
+void Mjoin(PATL,IBNBmm)(const int M, const int K, const TYPE *A, const TYPE *B,
+ const TYPE beta, TYPE *C, const int ldc);
+#ifdef TCPLX
+
+void Mjoin(PATL,CNBmm_b0)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,CNBmm_b1)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL,CNBmm_bX)(const int M, const int N, const int K,
+ const TYPE alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const TYPE beta,
+ TYPE *C, const int ldc);
+void Mjoin(PATL, gescal_bXi0)(const int, const int, const SCALAR, TYPE*,
+ const int);
+
+void Mjoin(PATL,row2blkT_aXi0)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkT2_aXi0)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blk_aXi0)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blk2_aXi0)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+
+void Mjoin(PATL,row2blkC_aX)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkC2_aX)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blkConj_aX)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blkConj2_aX)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkC_a1)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkC2_a1)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blkConj_a1)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blkConj2_a1)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkC_aXi0)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkC2_aXi0)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blkConj_aXi0)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blkConj2_aXi0)
+ (const int, const int, const TYPE*, const int, TYPE*, const SCALAR);
+
+void Mjoin(PATL,mmJIK2)
+ (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb,
+ const SCALAR alpha, const TYPE *pA0, const TYPE *B, int ldb, TYPE *pB0,
+ int incB, MAT2BLK B2blk, const SCALAR beta, TYPE *C, int ldc,
+ MATSCAL gescal, NBMM0 NBmm0);
+
+void Mjoin(PATL,mmIJK2)
+ (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb,
+ const SCALAR alpha, const TYPE *A, const int lda, TYPE *pA0, const int incA,
+ MAT2BLK A2blk, TYPE *pB0, const SCALAR beta, TYPE *C, int ldc,
+ MATSCAL gescal, NBMM0 NBmm0);
+
+#else /* real */
+
+void Mjoin(PATL,putblk_bX)(int M, int N, TYPE *V, TYPE *C, int ldc, const SCALAR beta);
+void Mjoin(PATL,putblk_bn1)(int M, int N, TYPE *V, TYPE *C, int ldc, const SCALAR beta);
+void Mjoin(PATL,putblk_b1)(int M, int N, TYPE *V, TYPE *C, int ldc, const SCALAR beta);
+void Mjoin(PATL,putblk_b0)(int M, int N, TYPE *V, TYPE *C, int ldc, const SCALAR beta);
+void ATL_gereal2cplx(const int M, const int N, TYPE *alpha, TYPE *R, int ldr,
+ TYPE *I, int ldi, TYPE *beta, TYPE *C, int ldc);
+
+void NBmm_b1(const int M, const int N, const int K, const TYPE alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const TYPE beta, TYPE *C, const int ldc);
+void NBmm_b0(const int M, const int N, const int K, const TYPE alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const TYPE beta, TYPE *C, const int ldc);
+void NBmm_bX(const int M, const int N, const int K, const TYPE alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const TYPE beta, TYPE *C, const int ldc);
+void Mjoin(PATL,mmJIK2)(int K, int nMb, int nNb, int nKb, int ib, int jb,
+ int kb, const SCALAR alpha, const TYPE *pA0,
+ const TYPE *B, int ldb, TYPE *pB0, int incB,
+ MAT2BLK B2blk, const SCALAR beta, TYPE *C, int ldc,
+ TYPE *pC, PUTBLK putblk, NBMM0 NBmm0);
+
+void Mjoin(PATL,mmIJK2)(int K, int nMb, int nNb, int nKb, int ib, int jb,
+ int kb, const SCALAR alpha, const TYPE *A, int lda,
+ TYPE *pA0, int incA, MAT2BLK A2blk, const TYPE *pB0,
+ const SCALAR beta, TYPE *C, int ldc, TYPE *pC,
+ PUTBLK putblk, NBMM0 NBmm0);
+
+
+void Mjoin(PATL,aliased_gemm)
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,f77gemm)
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,gemm)
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,small_mm)
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,big_mm)
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+#endif
+
+#ifdef USERGEMM
+int Mjoin(PATU,usergemm)(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const SCALAR,
+ const TYPE*, const int, const TYPE*,
+ const int, const SCALAR, TYPE*, const int);
+#endif
+int Mjoin(PATL,NCmmJIK)(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const SCALAR,
+ const TYPE*, const int, const TYPE*,
+ const int, const SCALAR, TYPE*, const int);
+int Mjoin(PATL,NCmmIJK)(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const SCALAR,
+ const TYPE*, const int, const TYPE*,
+ const int, const SCALAR, TYPE*, const int);
+int Mjoin(PATL,NCmmJIK_c)(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const SCALAR,
+ const TYPE*, const int, const TYPE*,
+ const int, const SCALAR, TYPE*, const int);
+int Mjoin(PATL,NCmmIJK_c)(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const SCALAR,
+ const TYPE*, const int, const TYPE*,
+ const int, const SCALAR, TYPE*, const int);
+
+void Mjoin(PATL,row2blkT2_aX)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkT_aX)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blk2_aX)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blk_aX)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkT2_an1)(int, int, const TYPE*, int, TYPE*,
+ const SCALAR);
+void Mjoin(PATL,row2blkT_an1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blk2_an1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blk_an1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkT2_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,row2blkT_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blk2_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+void Mjoin(PATL,col2blk_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR);
+
+int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K,
+ const SCALAR alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const SCALAR beta,
+ TYPE *C, const int ldc);
+int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K,
+ const SCALAR alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const SCALAR beta,
+ TYPE *C, const int ldc);
+int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K,
+ const SCALAR alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const SCALAR beta,
+ TYPE *C, const int ldc);
+int Mjoin(PATL,mmJKI)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K,
+ const SCALAR alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const SCALAR beta,
+ TYPE *C, const int ldc);
+
+void Mjoin(PATL,mmK)
+ (int M, int m, int N, int n, int nblk, int kr, int KR, const SCALAR alphaA,
+ const SCALAR alphaB, const SCALAR beta, const TYPE *A, const int lda,
+ const int incA, TYPE *pA, const int incAW, const TYPE *B, const int ldb,
+ const int incB, TYPE *pB, const int incBW, TYPE *C, const int ldc,
+ MAT2BLK2 A2blk, MAT2BLK2 B2blk, NBMM0 NBmm0, NBMM0 NBmm1);
+
+int Mjoin(PATL,mmBPP)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K,
+ const SCALAR alpha, const TYPE *A, const int lda,
+ const TYPE *B, const int ldb, const SCALAR beta,
+ TYPE *C, const int ldc);
+
+
+void Mjoin(PATL,gemmTT)
+ (const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,aliased_gemmTT)
+ (const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,gemmTN)
+ (const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,aliased_gemmTN)
+ (const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,gemmNT)
+ (const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,aliased_gemmNT)
+ (const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,gemmNN)
+ (const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void Mjoin(PATL,aliased_gemmNN)
+ (const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+
+
+void NCmmNNIJK_c
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmNTIJK_c
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmTNIJK_c
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmTTIJK_c
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmNNIJK
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmNTIJK
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmTNIJK
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmTTIJK
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmNNJIK_c
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmNTJIK_c
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmTNJIK_c
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmTTJIK_c
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmNNJIK
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmNTJIK
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmTNJIK
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+void NCmmTTJIK
+ (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, TYPE *C, const int ldc);
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_misc.h b/kaldi_io/src/tools/ATLAS/include/atlas_misc.h
new file mode 100644
index 0000000..88f754d
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_misc.h
@@ -0,0 +1,416 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1997 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "atlas_enum.h"
+
+#ifndef ATLAS_MISC_H
+#define ATLAS_MISC_H
+#include "atlas_type.h"
+#ifdef ATL_PROFILE
+ extern int ATL_ProfGemmCameFrom;
+#endif
+/*
+ * Some useful macro functions
+ */
+#if (defined(PentiumCPS) || defined(ATL_USEPTHREADS)) && !defined(WALL) /* either macro turns WALL on */
+ #define WALL
+#endif
+#ifndef time00 /* a time00 defined before this header is left alone */
+ #if defined(WALL)
+ #define time00 ATL_walltime /* WALL set: time00 names ATL_walltime */
+ #else
+ #define time00 ATL_cputime /* otherwise time00 names ATL_cputime */
+ #endif
+#endif
+#define Mabs(x) ( (x) >= 0 ? (x) : -(x) ) /* absolute value; x is evaluated twice */
+#define Mmax(x, y) ( (x) > (y) ? (x) : (y) ) /* larger argument; the chosen one is evaluated twice */
+#define Mmin(x, y) ( (x) > (y) ? (y) : (x) ) /* smaller argument; the chosen one is evaluated twice */
+#define Mlowcase(C) ( ((C) > 64 && (C) < 91) ? (C) | 32 : (C) ) /* codes 65..90 ('A'..'Z' in ASCII) get bit 0x20 set; others pass through */
+#define Mupcase(C) ( ((C) > 96 && (C) < 123) ? (C) & 0xDF : (C) ) /* codes 97..122 ('a'..'z' in ASCII) get bit 0x20 cleared; others pass through */
+/*
+ * token-pasting and stringizing helpers (no packed indexing macros, upper or lower, follow in this header)
+ */
+
+#define Mjoin(pre, nam) my_join(pre, nam) /* extra level so pre and nam are macro-expanded before being pasted */
+#define my_join(pre, nam) pre ## nam /* raw paste; use Mjoin unless the arguments must stay unexpanded */
+#define Mstr2(m) # m /* raw stringize of the argument as written */
+#define Mstr(m) Mstr2(m) /* extra level so m is macro-expanded before being stringized */
+
+#define ATL_assert(n_) /* when n_ is false, calls ATL_xerbla with the stringized condition, the line and the file */ \
+{ /* NOTE(review): bare brace block, so "ATL_assert(x);" directly before an else leaves a stray ';' */ \
+ if (!(n_)) \
+ { \
+ ATL_xerbla(0, __FILE__, "assertion %s failed, line %d of file %s\n", \
+ Mstr(n_), __LINE__, __FILE__); /* what ATL_xerbla does afterwards is not visible in this header */ \
+ } \
+}
+
+/*
+ * Define some C99 features that we use when we know the compiler supports them
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__/100 >= 1999) /* C99 reports 199901L, so /100 yields 1999 */
+ #define INLINE inline
+ #define RESTRICT restrict
+#else /* older or unknown standard: both macros expand to nothing */
+ #define INLINE
+ #define RESTRICT
+#endif
+
+#if defined(SREAL) /* per-precision names: TYPE is the element type, PATL the routine prefix pasted on with Mjoin */
+ #define EPS 5.0e-7
+ #define TYPE float
+ #define PRE s
+ #define UPR s
+ #define PREU S
+ #define PATL ATL_s
+ #define PATU ATLU_s
+ #define UATL ATLU_s
+ #define CBLA cblas_s
+ #define PATLU ATL_s
+ #define ATL_rone 1.0f
+ #define ATL_rnone -1.0f
+ #define ATL_rzero 0.0f
+ #define ATL_typify(m_) Mjoin(m_,f) /* appends an f suffix to m_ */
+ #include "atlas_ssysinfo.h"
+#elif defined(DREAL)
+ #define EPS 1.0e-15
+ #define TYPE double
+ #define PRE d
+ #define UPR d
+ #define PREU D
+ #define PATL ATL_d
+ #define PATU ATLU_d
+ #define UATL ATLU_d
+ #define CBLA cblas_d
+ #define PATLU ATL_d
+ #define ATL_rone 1.0
+ #define ATL_rnone -1.0
+ #define ATL_rzero 0.0
+ #define ATL_typify(m_) m_ /* leaves m_ unchanged */
+ #include "atlas_dsysinfo.h"
+#elif defined (QREAL) /* NOTE(review): this branch defines no UATL, PATLU, ATL_rone, ATL_rnone, ATL_rzero or ATL_typify and includes no sysinfo header */
+ #define EPS 1.9259299443872358530559779425849273E-34L
+ #define TYPE long double
+ #define PRE q
+ #define UPR q
+ #define PREU Q
+ #define PATL ATL_q
+ #define PATU ATLU_q
+ #define CBLA cblas_q
+#elif defined(SCPLX) /* TYPE stays float; UPR, PATLU and UATL carry the s prefix while PRE, PATL and PATU carry c */
+ #define EPS 5.0e-7
+ #define TYPE float
+ #define PRE c
+ #define UPR s
+ #define PREU C
+ #define PATL ATL_c
+ #define PATLU ATL_s
+ #define PATU ATLU_c
+ #define UATL ATLU_s
+ #define ATL_rone 1.0f
+ #define ATL_rnone -1.0f
+ #define ATL_rzero 0.0f
+ #define ATL_typify(m_) Mjoin(m_,f)
+ #define CBLA cblas_c
+ #include "atlas_csysinfo.h"
+#elif defined(DCPLX) /* TYPE stays double; UPR, PATLU and UATL carry the d prefix while PRE, PATL and PATU carry z */
+ #define TYPE double
+ #define PRE z
+ #define UPR d
+ #define PREU Z
+ #define PATL ATL_z
+ #define PATLU ATL_d
+ #define PATU ATLU_z
+ #define UATL ATLU_d
+ #define EPS 1.0e-15
+ #define ATL_rone 1.0
+ #define ATL_rnone -1.0
+ #define ATL_rzero 0.0
+ #define ATL_typify(m_) m_
+ #define CBLA cblas_z
+ #include "atlas_zsysinfo.h"
+#endif
+
+#if defined (SREAL) || defined (DREAL) || defined (SCPLX) || defined (DCPLX) /* QREAL is not listed, so these three stay undefined for it */
+ #define ATL_sizeof Mjoin(PATL,size) /* expands to PATL with "size" pasted on; that name is defined elsewhere */
+ #define ATL_MulBySize Mjoin(PATL,MulBySize)
+ #define ATL_DivBySize Mjoin(PATL,DivBySize)
+#endif
+
+#if ( defined(SREAL) || defined(DREAL) || defined(QREAL) ) /* real precisions: a scalar is passed by value */
+ #define TREAL
+ #define SHIFT
+ #define SCALAR TYPE
+ #define SADD &
+ #define SVAL
+ #define SVVAL *
+ #define SCALAR_IS_ONE(M_scalar) ((M_scalar) == ATL_rone) /* NOTE(review): ATL_rone/rnone/rzero are not defined by the QREAL branch of the precision table */
+ #define SCALAR_IS_NONE(M_scalar) ((M_scalar) == ATL_rnone)
+ #define SCALAR_IS_ZERO(M_scalar) ((M_scalar) == ATL_rzero)
+#elif defined(SCPLX) || defined(DCPLX) /* complex precisions: a scalar is a TYPE pointer, [0] and [1] being the two parts */
+ #define TCPLX
+/*
+ * b = a*b + v on complex values ([0] real, [1] imaginary); tmp is scratch that holds the new real part
+ */
+ #define CMULT2(v, a, b, tmp) \
+ { \
+ tmp = *(a) * *(b) - *(a+1) * *(b+1); \
+ *(b+1) = *(a) * *(b+1) + *(a+1) * *(b) + *(v+1); /* imaginary part is written first, while *(b) still holds the old real part */ \
+ *(b) = tmp + *v; \
+ }
+ #define SHIFT << 1 /* presumably doubles an element count into a TYPE count -- TODO confirm at the use sites */
+ #define SCALAR TYPE *
+ #define SADD
+ #define SVAL *
+ #define SVVAL
+ #define SCALAR_IS_ONE(M_scalar) \
+ ( (*(M_scalar) == ATL_rone) && ((M_scalar)[1] == ATL_rzero) )
+ #define SCALAR_IS_NONE(M_scalar) \
+ ( (*(M_scalar) == ATL_rnone) && ((M_scalar)[1] == ATL_rzero) )
+ #define SCALAR_IS_ZERO(M_scalar) \
+ ( (*(M_scalar) == ATL_rzero) && ((M_scalar)[1] == ATL_rzero) )
+#endif
+
+#if defined(ALPHA1) /* each ALPHA* macro picks a name suffix NM and, for some cases, an ATL_MulByALPHA form */
+ #define ATL_MulByALPHA(x_) (x_)
+ #define NM _a1
+#elif defined (ALPHA0)
+ #define ATL_MulByALPHA(x_) ATL_rzero /* x_ is dropped entirely */
+ #define NM _a0
+#elif defined (ALPHAN1)
+ #define ATL_MulByALPHA(x_) (-(x_))
+ #define NM _an1
+#elif defined (ALPHAXI0)
+ #define ATL_MulByALPHA(x_) (ralpha*(x_)) /* ralpha must be in scope where the macro is used */
+ #define NM _aXi0
+#elif defined (ALPHA1C) /* the four C-suffixed cases define NM only, not ATL_MulByALPHA */
+ #define NM _a1c
+#elif defined (ALPHAN1C)
+ #define NM _an1c
+#elif defined (ALPHAXI0C)
+ #define NM _aXi0c
+#elif defined (ALPHAXC)
+ #define NM _aXc
+#elif defined (ALPHAX)
+ #define ATL_MulByALPHA(x_) (alpha*(x_)) /* alpha must be in scope where the macro is used */
+ #define NM _aX
+#endif
+
+#if defined(BETA1) /* each BETA* macro picks a name suffix BNM and, for some cases, ATL_MulByBETA and MSTAT */
+ #define ATL_MulByBETA(x_) (x_)
+ #define MSTAT A[i] += v[i] /* MSTAT forms use A, v and i from the scope where they are expanded */
+ #define BNM _b1
+#elif defined(BETA1C) /* the C-suffixed cases define BNM only */
+ #define BNM _b1c
+#elif defined(BETAN1)
+ #define ATL_MulByBETA(x_) (-(x_))
+ #define MSTAT A[i] = v[i] - A[i]
+ #define BNM _bn1
+#elif defined(BETAN1C)
+ #define BNM _bn1c
+#elif defined(BETA0)
+ #define ATL_MulByBETA(x_) ATL_rzero /* x_ is dropped entirely */
+ #define MSTAT A[i] = v[i]
+ #define BNM _b0
+#elif defined (BETAXI0) /* defines no MSTAT, unlike the other non-C cases */
+ #define BNM _bXi0
+ #define ATL_MulByBETA(x_) (rbeta*(x_)) /* rbeta must be in scope where the macro is used */
+#elif defined (BETAXI0C)
+ #define BNM _bXi0c
+#elif defined (BETAX)
+ #define ATL_MulByBETA(x_) (beta*(x_)) /* beta must be in scope where the macro is used */
+ #define MSTAT A[i] = beta*A[i] + v[i]
+ #define BNM _bX
+#elif defined (BETAXC)
+ #define BNM _bXc
+#endif
+
+/* any alignment below this forces data copy in gemm */
+#ifndef ATL_MinMMAlign /* default of 16 unless defined before this point */
+ #define ATL_MinMMAlign 16
+#endif
+#if (ATL_MinMMAlign == 1 || ATL_MinMMAlign == 0) /* no requirement: every pointer counts as aligned */
+ #define ATL_DataIsMinAligned(ptr) 1
+#elif (ATL_MinMMAlign == 2) /* power-of-two cases: shift the low bits out and back, then compare with the original address */
+ #define ATL_DataIsMinAligned(ptr) \
+ ( (((size_t) (ptr))>>1)<<1 == (size_t) (ptr) )
+#elif (ATL_MinMMAlign == 4)
+ #define ATL_DataIsMinAligned(ptr) \
+ ( (((size_t) (ptr))>>2)<<2 == (size_t) (ptr) )
+#elif (ATL_MinMMAlign == 8)
+ #define ATL_DataIsMinAligned(ptr) \
+ ( (((size_t) (ptr))>>3)<<3 == (size_t) (ptr) )
+#elif (ATL_MinMMAlign == 16)
+ #define ATL_DataIsMinAligned(ptr) \
+ ( (((size_t) (ptr))>>4)<<4 == (size_t) (ptr) )
+#elif (ATL_MinMMAlign == 32)
+ #define ATL_DataIsMinAligned(ptr) \
+ ( (((size_t) (ptr))>>5)<<5 == (size_t) (ptr) )
+#elif (ATL_MinMMAlign == 64)
+ #define ATL_DataIsMinAligned(ptr) \
+ ( (((size_t) (ptr))>>6)<<6 == (size_t) (ptr) )
+#elif (ATL_MinMMAlign == 128)
+ #define ATL_DataIsMinAligned(ptr) \
+ ( (((size_t) (ptr))>>7)<<7 == (size_t) (ptr) )
+#else /* any other value: same test done with divide and multiply */
+ #define ATL_DataIsMinAligned(ptr) \
+ ( (((size_t) (ptr))/ATL_MinMMAlign)*ATL_MinMMAlign == (size_t) (ptr) )
+#endif
+
+#define ATL_Cachelen 32 /* fixed here, so only the == 32 branch below is ever taken from this header as written */
+#if (ATL_Cachelen == 4) /* power-of-two lengths use shifts for multiply and divide */
+ #define ATL_MulByCachelen(N_) ( (N_) << 2 )
+ #define ATL_DivByCachelen(N_) ( (N_) >> 2 )
+#elif (ATL_Cachelen == 8)
+ #define ATL_MulByCachelen(N_) ( (N_) << 3 )
+ #define ATL_DivByCachelen(N_) ( (N_) >> 3 )
+#elif (ATL_Cachelen == 16)
+ #define ATL_MulByCachelen(N_) ( (N_) << 4 )
+ #define ATL_DivByCachelen(N_) ( (N_) >> 4 )
+#elif (ATL_Cachelen == 32)
+ #define ATL_MulByCachelen(N_) ( (N_) << 5 )
+ #define ATL_DivByCachelen(N_) ( (N_) >> 5 )
+#elif (ATL_Cachelen == 64)
+ #define ATL_MulByCachelen(N_) ( (N_) << 6 )
+ #define ATL_DivByCachelen(N_) ( (N_) >> 6 )
+#elif (ATL_Cachelen == 128)
+ #define ATL_MulByCachelen(N_) ( (N_) << 7 )
+ #define ATL_DivByCachelen(N_) ( (N_) >> 7 )
+#elif (ATL_Cachelen == 256)
+ #define ATL_MulByCachelen(N_) ( (N_) << 8 )
+ #define ATL_DivByCachelen(N_) ( (N_) >> 8 )
+#else /* any other length falls back to plain multiply and divide */
+ #define ATL_MulByCachelen(N_) ( (N_) * ATL_Cachelen )
+ #define ATL_DivByCachelen(N_) ( (N_) / ATL_Cachelen )
+#endif
+
+#if (ATL_Cachelen < ATL_MinMMAlign)
+ Force a compilation error if our required alignment is at least the
+ minimum!!@^
+#endif
+
+#define ATL_AlignPtr(vp) /* rounds vp down to an ATL_Cachelen multiple, then adds ATL_Cachelen, so even an aligned vp moves up by one full length */ \
+ (void*) (ATL_Cachelen + ATL_MulByCachelen(ATL_DivByCachelen((size_t) (vp))))
+
+#define ATL_FindPtrAdjust(vp, iadj_) /* sets iadj_ from the offset of vp past the previous ATL_Cachelen multiple, or to 0 */ \
+{ \
+ (iadj_) = ((size_t)(vp))-ATL_MulByCachelen(ATL_DivByCachelen((size_t)(vp)));\
+ if (iadj_) /* nonzero offset: kept only if ATL_DivBySize then ATL_MulBySize gives it back unchanged */ \
+ { \
+ if ( (iadj_) == ATL_MulBySize(ATL_DivBySize(iadj_)) ) \
+ (iadj_) = ATL_DivBySize(iadj_); /* presumably converts bytes to elements; the Size macros are defined elsewhere -- TODO confirm */ \
+ else (iadj_) = 0; \
+ }\
+}
+#define ATL_FindMatAdjust(vp_, lda_, iadj_) /* runs ATL_FindPtrAdjust only when ATL_MulBySize(lda_) is a whole multiple of ATL_Cachelen */ \
+{ \
+ if (ATL_MulByCachelen(ATL_DivByCachelen(ATL_MulBySize(lda_))) \
+ == ATL_MulBySize(lda_)) \
+ { \
+ ATL_FindPtrAdjust(vp_, iadj_); \
+ } \
+ else (iadj_) = 0; /* otherwise no adjustment is reported */ \
+}
+
+#define ATL_sqrtLL(x, res) /* x87 fsqrt through GCC-style extended asm: "=t" is the top of the x87 stack, "0" puts x in that same register */ \
+ asm ("fsqrt" : "=t" (res) : "0" (x)); /* NOTE(review): the expansion already ends in ';' */
+
+/*
+ * Number of inc-byte elements to skip from vp to reach an align-byte boundary,
+ * else N. Static to encourage inlining; inc <= 0 or align <= 0 returns N.
+ */
+static int ATL_AlignOffset
+(const int N, /* max return value */
+ const void *vp, /* pointer to be aligned */
+ const int inc, /* size of each elt, in bytes */
+ const int align) /* required alignment, in bytes */
+{
+ const int p = (inc > 0) ? align/inc : 0; /* elts per alignment unit; 0 marks unusable sizes */
+ const size_t k=(size_t)vp, j = (inc > 0) ? k/inc : 0; /* address, and address in whole elts */
+ int iret; /* p > 0 below keeps the division by p away from zero */
+ if (p > 0 && k == (j)*inc && p*inc == align) /* vp on an elt boundary and inc divides align */
+ {
+ iret = ((j+p-1) / p)*p - j; /* round j up to a multiple of p, minus j */
+ if (iret <= N) return(iret);
+ }
+ return(N);
+}
+
+/*
+ * Gcc links in crap that MSVC++ and DVF can't handle if you use stdout
+ * or stderr, so use this beautiful kludge to avoid this problem -- RCW
+ */
+#ifdef GCCWIN
+
+#include <stdarg.h>
+static int WINFPRINTF(FILE *fpout, char *form, ...)
+{ /* fprintf stand-in for GCCWIN builds: a NULL stream goes through vprintf, anything else through vfprintf */
+ int ierr=0;
+ va_list argptr;
+
+ va_start(argptr, form);
+ if (fpout == NULL) ierr = vprintf(form, argptr); /* stdout and stderr are both redefined to NULL below, so both land here */
+ else ierr = vfprintf(fpout, form, argptr);
+ va_end(argptr);
+
+ return(ierr); /* return value of whichever of vprintf or vfprintf ran */
+}
+
+#ifdef stdout /* drop any existing stdout, stderr and assert macros before redefining them */
+ #undef stdout
+#endif
+#ifdef stderr
+ #undef stderr
+#endif
+#ifdef assert
+ #undef assert
+#endif
+
+#define stdout NULL /* NULL is what WINFPRINTF treats as "use vprintf" */
+#define stderr NULL
+#define fprintf WINFPRINTF /* every later fprintf call in the translation unit is rerouted */
+#define assert WINASSERT
+#define WINASSERT(n_) /* on a false n_, prints the stringized condition, line and file with printf, then exit(1) */ \
+{ /* NOTE(review): bare brace block, so "assert(x);" directly before an else leaves a stray ';' */ \
+ if (!(n_)) \
+ { \
+ printf("assertion %s failed, line %d of file %s\n", \
+ Mstr(n_), __LINE__, __FILE__); \
+ exit(1); \
+ } \
+}
+
+#endif
+
+#include "atlas_aux.h"
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_mv.h b/kaldi_io/src/tools/ATLAS/include/atlas_mv.h
new file mode 100644
index 0000000..f26da5f
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_mv.h
@@ -0,0 +1,45 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ATLAS_MV_H
+ #define ATLAS_MV_H
+
+/*
+ * Dispatch header: after atlas_misc.h, include exactly one of the
+ * atlas_[s,d,c,z]mv.h headers according to which of SREAL, DREAL, SCPLX
+ * or DCPLX is defined.  If none of them is defined, nothing beyond
+ * atlas_misc.h is included.
+ */
+#include "atlas_misc.h"
+#if defined(SREAL)
+ #include "atlas_smv.h"
+#elif defined(DREAL)
+ #include "atlas_dmv.h"
+#elif defined(SCPLX)
+ #include "atlas_cmv.h"
+#elif defined(DCPLX)
+ #include "atlas_zmv.h"
+#endif
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_pkblas.h b/kaldi_io/src/tools/ATLAS/include/atlas_pkblas.h
new file mode 100644
index 0000000..b9c7d82
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_pkblas.h
@@ -0,0 +1,569 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 2003 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef ATLAS_PKBLAS_H
+#define ATLAS_PKBLAS_H
+
+#include "atlas_misc.h"
+#ifndef ATL_NOL3
+#include "atlas_lvl3.h"
+#endif
+
+#define CBLAS_ENUM_ONLY
+#include "cblas.h"
+#undef CBLAS_ENUM_ONLY
+
+/*
+ * Selector types for the packed routines.  PACK_UPLO is a real enum with
+ * its own three enumerators; PACK_ORDER, PACK_TRANS, PACK_DIAG and
+ * PACK_SIDE are only macro aliases for the corresponding CBLAS enum tags,
+ * and their Pack* values alias the Cblas* enumerators (PackConj aliases
+ * AtlasConj).
+ */
+enum PACK_UPLO {PackUpper=121, PackLower=122, PackGen=123};
+
+#define PACK_ORDER CBLAS_ORDER
+ #define PackRowMajor CblasRowMajor
+ #define PackColMajor CblasColMajor
+#define PACK_TRANS CBLAS_TRANSPOSE
+ #define PackNoTrans CblasNoTrans
+ #define PackTrans CblasTrans
+ #define PackConjTrans CblasConjTrans
+ #define PackConj AtlasConj
+#define PACK_DIAG CBLAS_DIAG
+ #define PackNonUnit CblasNonUnit
+ #define PackUnit CblasUnit
+#define PACK_SIDE CBLAS_SIDE
+ #define PackLeft CblasLeft
+ #define PackRight CblasRight
+
+#ifndef ATL_pkMaxMalloc
+ #define ATL_pkMaxMalloc ATL_MaxMalloc
+#endif
+
+/*
+ * Index computations for packed storage.
+ *   Without TCPLX:
+ *      MindexPL(I,J,lda) = ((J * (2*lda - J - 1)) >> 1) + I
+ *      MindexPU(I,J,lda) = (((2*lda + J - 1) * J) >> 1) + I
+ *   With TCPLX the >>1 is dropped and I is added twice, i.e. the product
+ *   term is not halved and the row term is 2*I.
+ * MindexP picks MindexPU for PackUpper, MindexPL for PackLower, and for
+ * any other uplo_ yields (J*lda + I) followed by the SHIFT token, which is
+ * defined outside this header.
+ * All arguments are expanded more than once, so they must be free of side
+ * effects.
+ */
+#ifdef TCPLX
+ #define MindexPL(I_,J_,lda_) ( (((J_)*((lda_)+(lda_)-(J_)-1))) + (I_)+(I_) )
+ #define MindexPU(I_,J_,lda_) ( ((((lda_)+(lda_)+(J_)-1)*(J_))) + (I_)+(I_) )
+#else
+ #define MindexPL(I_,J_,lda_) ( (((J_)*((lda_)+(lda_)-(J_)-1))>>1) + (I_) )
+ #define MindexPU(I_,J_,lda_) ( ((((lda_)+(lda_)+(J_)-1)*(J_))>>1) + (I_) )
+#endif
+#define MindexP(uplo_,I_,J_,lda_) \
+ ( (uplo_) == PackUpper ? MindexPU(I_,J_,lda_) : \
+ ( (uplo_) == PackLower ? MindexPL(I_,J_,lda_) : \
+ (((J_)*(lda_)+(I_))SHIFT) ) )
+/*
+ * Mpld(uplo_, J_, lda_): leading dimension selected by uplo_:
+ *    PackUpper -> lda_ + J_
+ *    PackLower -> lda_ - J_
+ *    otherwise -> lda_
+ * The whole expansion is parenthesized so the macro can be used as an
+ * operand (e.g. 2*Mpld(...) or Mpld(...)+1) without the surrounding
+ * operator binding to only part of the conditional expression.
+ * uplo_ and lda_ are expanded more than once; pass side-effect-free
+ * arguments.
+ */
+#define Mpld(uplo_,J_,lda_) \
+   ( (uplo_) == PackUpper ? (lda_)+(J_) : \
+     ( (uplo_) == PackLower ? (lda_)-(J_) : (lda_) ) )
+
+
+/*
+ * Single precision real prototypes: data is float and the scalars alpha
+ * and beta are passed by value.
+ *
+ * ATL_sgpmm: A, B and C each come with their own PACK_UPLO selector, an
+ * (I,J) offset pair and a leading dimension; A and B also have a
+ * PACK_TRANS selector.
+ * NOTE(review): presumably computes C = alpha*op(A)*op(B) + beta*C on
+ * packed or general operands -- confirm against the implementation.
+ */
+void ATL_sgpmm(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum PACK_UPLO UB, const enum PACK_TRANS TB,
+ const enum PACK_UPLO UC,
+ const int M, const int N, const int K, const float alpha,
+ const float *A, const int IA, const int JA, const int lda,
+ const float *B, const int IB, const int JB, const int ldb,
+ const float beta, float *C, const int IC, const int JC,
+ const int ldc);
+/*
+ * NOTE(review): although the name carries the single precision prefix,
+ * the scalar and pointer types are spelled with the SCALAR and TYPE
+ * macros, so this declaration follows whatever precision the including
+ * file was compiled for.  It presumably only matches the definition when
+ * TYPE is float -- confirm against atlas_misc.h.
+ */
+void ATL_sprankK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, int R,
+ const SCALAR alpha, const TYPE *A, int lda,
+ const TYPE *B, int ldb, const SCALAR beta,
+ const enum PACK_UPLO UC, TYPE *C, int ldc);
+int ATL_spmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, const enum PACK_UPLO UC,
+ TYPE *C, const int ldc);
+int ATL_spmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const float alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float beta, const enum PACK_UPLO UC,
+ float *C, const int ldc);
+void ATL_spcol2blkF(const int M, const int N, const float alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_sprow2blkTF(const int M, const int N, const float alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_spcol2blk_a1(const int M, const int N, const float alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_spcol2blk_aX(const int M, const int N, const float alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_sprow2blkT_a1(const int M, const int N, const float alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_sprow2blkT_aX(const int M, const int N, const float alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_spputblk(const int M, const int N, const TYPE *V, TYPE *C,
+ int ldc, int ldcinc, const SCALAR beta);
+void ATL_spputblk_diag
+ (const int M, const int N, const float *V, const enum ATLAS_UPLO UC,
+ float *C, int ldc, int ldcinc, const float alpha, const float beta);
+void ATL_spputblk_aX
+ (const int M, const int N, const float *V, float *C, int ldc, int ldcinc,
+ const float alpha, const float beta);
+void ATL_ssprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, const float alpha,
+ const float *A, const int IA, const int JA, const int lda,
+ const float beta,
+ float *C, const int IC, const int JC, const int ldc);
+void ATL_shprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, const float alpha,
+ const float *A, const int IA, const int JA, const int lda,
+ const float beta,
+ float *C, const int IC, const int JC, const int ldc);
+void ATL_shprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, int R, const float alpha,
+ const float *A, int lda, const float beta,
+ float *C, const int ldc);
+int ATL_sphk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA,
+ const enum ATLAS_TRANS TA, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float beta, const int CP, float *C, const int ldc);
+void ATL_ssprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, int R, const float alpha,
+ const float *A, int lda, const float beta,
+ float *C, const int ldc);
+int ATL_sprk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA,
+ const enum ATLAS_TRANS TA, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float beta, const int CP, float *C, const int ldc);
+
+void ATL_dgpmm(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum PACK_UPLO UB, const enum PACK_TRANS TB,
+ const enum PACK_UPLO UC,
+ const int M, const int N, const int K, const double alpha,
+ const double *A, const int IA, const int JA, const int lda,
+ const double *B, const int IB, const int JB, const int ldb,
+ const double beta, double *C, const int IC, const int JC,
+ const int ldc);
+void ATL_dprankK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, int R,
+ const SCALAR alpha, const TYPE *A, int lda,
+ const TYPE *B, int ldb, const SCALAR beta,
+ const enum PACK_UPLO UC, TYPE *C, int ldc);
+int ATL_dpmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, const enum PACK_UPLO UC,
+ TYPE *C, const int ldc);
+int ATL_dpmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const double alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double beta, const enum PACK_UPLO UC,
+ double *C, const int ldc);
+void ATL_dpcol2blkF(const int M, const int N, const double alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_dprow2blkTF(const int M, const int N, const double alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_dpcol2blk_a1(const int M, const int N, const double alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_dpcol2blk_aX(const int M, const int N, const double alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_dprow2blkT_a1(const int M, const int N, const double alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_dprow2blkT_aX(const int M, const int N, const double alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_dpputblk(const int M, const int N, const TYPE *V, TYPE *C,
+ int ldc, int ldcinc, const SCALAR beta);
+void ATL_dpputblk_diag
+ (const int M, const int N, const double *V, const enum ATLAS_UPLO UC,
+ double *C, int ldc, int ldcinc, const double alpha, const double beta);
+void ATL_dpputblk_aX
+ (const int M, const int N, const double *V, double *C, int ldc, int ldcinc,
+ const double alpha, const double beta);
+void ATL_dsprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, const double alpha,
+ const double *A, const int IA, const int JA, const int lda,
+ const double beta,
+ double *C, const int IC, const int JC, const int ldc);
+void ATL_dhprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, const double alpha,
+ const double *A, const int IA, const int JA, const int lda,
+ const double beta,
+ double *C, const int IC, const int JC, const int ldc);
+void ATL_dhprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, int R, const double alpha,
+ const double *A, int lda, const double beta,
+ double *C, const int ldc);
+int ATL_dphk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA,
+ const enum ATLAS_TRANS TA, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double beta, const int CP, double *C, const int ldc);
+void ATL_dsprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, int R, const double alpha,
+ const double *A, int lda, const double beta,
+ double *C, const int ldc);
+int ATL_dprk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA,
+ const enum ATLAS_TRANS TA, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double beta, const int CP, double *C, const int ldc);
+
+/*
+ * Single precision complex prototypes: data is addressed through float
+ * pointers and the scalars alpha and beta are passed as const float*
+ * (ATL_chprk is the exception and takes them by value).
+ * NOTE(review): presumably each complex value is a (real, imaginary)
+ * float pair -- confirm against the implementation.
+ */
+void ATL_cgpmm(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum PACK_UPLO UB, const enum PACK_TRANS TB,
+ const enum PACK_UPLO UC,
+ const int M, const int N, const int K, const float* alpha,
+ const float *A, const int IA, const int JA, const int lda,
+ const float *B, const int IB, const int JB, const int ldb,
+ const float* beta, float *C, const int IC, const int JC,
+ const int ldc);
+void ATL_cprankK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, int R,
+ const SCALAR alpha, const TYPE *A, int lda,
+ const TYPE *B, int ldb, const SCALAR beta,
+ const enum PACK_UPLO UC, TYPE *C, int ldc);
+int ATL_cpmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, const enum PACK_UPLO UC,
+ TYPE *C, const int ldc);
+int ATL_cpmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const float* alpha,
+ const float *A, const int lda, const float *B, const int ldb,
+ const float* beta, const enum PACK_UPLO UC,
+ float *C, const int ldc);
+void ATL_cpcol2blkF(const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkTF(const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blk_a1(const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blk_aX(const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkT_a1(const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkT_aX(const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpputblk(const int M, const int N, const TYPE *V, TYPE *C,
+ int ldc, int ldcinc, const SCALAR beta);
+void ATL_cpputblk_diag
+ (const int M, const int N, const float *V, const enum ATLAS_UPLO UC,
+ float *C, int ldc, int ldcinc, const float* alpha, const float* beta);
+void ATL_cpputblk_aX
+ (const int M, const int N, const float *V, float *C, int ldc, int ldcinc,
+ const float* alpha, const float* beta);
+void ATL_csprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, const float* alpha,
+ const float *A, const int IA, const int JA, const int lda,
+ const float* beta,
+ float *C, const int IC, const int JC, const int ldc);
+/*
+ * Unlike ATL_csprk, alpha and beta are single floats passed by value here,
+ * not const float* values.  ATL_chprk_rK and ATL_cphk_kmm, by contrast,
+ * take const float* scalars.
+ * NOTE(review): presumably alpha and beta are real because this is a
+ * Hermitian rank-K update -- confirm against the implementation.
+ */
+void ATL_chprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, const float alpha,
+ const float *A, const int IA, const int JA, const int lda,
+ const float beta,
+ float *C, const int IC, const int JC, const int ldc);
+void ATL_chprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, int R, const float* alpha,
+ const float *A, int lda, const float* beta,
+ float *C, const int ldc);
+int ATL_cphk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA,
+ const enum ATLAS_TRANS TA, const int N, const int K,
+ const float* alpha, const float *A, const int lda,
+ const float* beta, const int CP, float *C, const int ldc);
+void ATL_csprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, int R, const float* alpha,
+ const float *A, int lda, const float* beta,
+ float *C, const int ldc);
+int ATL_cprk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA,
+ const enum ATLAS_TRANS TA, const int N, const int K,
+ const float* alpha, const float *A, const int lda,
+ const float* beta, const int CP, float *C, const int ldc);
+
+void ATL_zgpmm(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum PACK_UPLO UB, const enum PACK_TRANS TB,
+ const enum PACK_UPLO UC,
+ const int M, const int N, const int K, const double* alpha,
+ const double *A, const int IA, const int JA, const int lda,
+ const double *B, const int IB, const int JB, const int ldb,
+ const double* beta, double *C, const int IC, const int JC,
+ const int ldc);
+void ATL_zprankK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, int R,
+ const SCALAR alpha, const TYPE *A, int lda,
+ const TYPE *B, int ldb, const SCALAR beta,
+ const enum PACK_UPLO UC, TYPE *C, int ldc);
+int ATL_zpmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const SCALAR alpha,
+ const TYPE *A, const int lda, const TYPE *B, const int ldb,
+ const SCALAR beta, const enum PACK_UPLO UC,
+ TYPE *C, const int ldc);
+int ATL_zpmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA,
+ const enum PACK_UPLO UB, const enum ATLAS_TRANS TB,
+ const int M, const int N, const int K, const double* alpha,
+ const double *A, const int lda, const double *B, const int ldb,
+ const double* beta, const enum PACK_UPLO UC,
+ double *C, const int ldc);
+void ATL_zpcol2blkF(const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkTF(const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blk_a1(const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blk_aX(const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkT_a1(const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkT_aX(const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpputblk(const int M, const int N, const TYPE *V, TYPE *C,
+ int ldc, int ldcinc, const SCALAR beta);
+void ATL_zpputblk_diag
+ (const int M, const int N, const double *V, const enum ATLAS_UPLO UC,
+ double *C, int ldc, int ldcinc, const double* alpha, const double* beta);
+void ATL_zpputblk_aX
+ (const int M, const int N, const double *V, double *C, int ldc, int ldcinc,
+ const double* alpha, const double* beta);
+void ATL_zsprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, const double* alpha,
+ const double *A, const int IA, const int JA, const int lda,
+ const double* beta,
+ double *C, const int IC, const int JC, const int ldc);
+/*
+ * Unlike ATL_zsprk, alpha and beta are single doubles passed by value
+ * here, not const double* values.  ATL_zhprk_rK and ATL_zphk_kmm, by
+ * contrast, take const double* scalars.
+ * NOTE(review): presumably alpha and beta are real because this is a
+ * Hermitian rank-K update -- confirm against the implementation.
+ */
+void ATL_zhprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, const double alpha,
+ const double *A, const int IA, const int JA, const int lda,
+ const double beta,
+ double *C, const int IC, const int JC, const int ldc);
+void ATL_zhprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, int R, const double* alpha,
+ const double *A, int lda, const double* beta,
+ double *C, const int ldc);
+int ATL_zphk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA,
+ const enum ATLAS_TRANS TA, const int N, const int K,
+ const double* alpha, const double *A, const int lda,
+ const double* beta, const int CP, double *C, const int ldc);
+void ATL_zsprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA,
+ const enum ATLAS_UPLO UC, const int CP,
+ const int N, const int K, int R, const double* alpha,
+ const double *A, int lda, const double* beta,
+ double *C, const int ldc);
+int ATL_zprk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA,
+ const enum ATLAS_TRANS TA, const int N, const int K,
+ const double* alpha, const double *A, const int lda,
+ const double* beta, const int CP, double *C, const int ldc);
+
+void ATL_cpcol2blk_aX_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkT_aX_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blk_a1_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkT_a1_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blkConjF
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blkConj_a1
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blkConj_aX
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blk_aXi0
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blkConj_aXi0
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc,float*V);
+void ATL_cprow2blkHF
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkH_a1
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkH_aX
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkH_aXi0
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkT_aXi0
+ (const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blkConjF_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blkConj_a1_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blkConj_aX_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blk_aXi0_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cpcol2blkConj_aXi0_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc,float*V);
+void ATL_cprow2blkHF_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkH_a1_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkH_aX_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkH_aXi0_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+void ATL_cprow2blkT_aXi0_blk
+ (const int blk, const int M, const int N, const float* alpha,
+ const float *A, int lda, const int ldainc, float *V);
+
+void ATL_cprow2blkT_KB_aXi0
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_cprow2blkT_KB_aX
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_cprow2blkT_KB_a1
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_cprow2blkH_KB_aXi0
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_cprow2blkH_KB_aX
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_cprow2blkH_KB_a1
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_zpcol2blk_aX_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkT_aX_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blk_a1_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkT_a1_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blkConjF
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blkConj_a1
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blkConj_aX
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blk_aXi0
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blkConj_aXi0
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc,double*V);
+void ATL_zprow2blkHF
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkH_a1
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkH_aX
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkH_aXi0
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkT_aXi0
+ (const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blkConjF_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blkConj_a1_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blkConj_aX_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blk_aXi0_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zpcol2blkConj_aXi0_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc,double*V);
+void ATL_zprow2blkHF_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkH_a1_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkH_aX_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkH_aXi0_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+void ATL_zprow2blkT_aXi0_blk
+ (const int blk, const int M, const int N, const double* alpha,
+ const double *A, int lda, const int ldainc, double *V);
+
+void ATL_zprow2blkT_KB_aXi0
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_zprow2blkT_KB_aX
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_zprow2blkT_KB_a1
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_zprow2blkH_KB_aXi0
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_zprow2blkH_KB_aX
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+void ATL_zprow2blkH_KB_a1
+ (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda,
+ const int ldainc, TYPE *V);
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_prefetch.h b/kaldi_io/src/tools/ATLAS/include/atlas_prefetch.h
new file mode 100644
index 0000000..83ee2df
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_prefetch.h
@@ -0,0 +1,197 @@
+#ifndef ATLAS_PREFETCH_H
+#define ATLAS_PREFETCH_H
+/*
+ * Altivec prefetch model not well utilized by SSE-like prefetch, so have
+ * special commands for it.
+ */
+#if defined(ATL_AltiVec)
+ #include "atlas_altivec.h"
+#endif
+/*
+ *
+ * ATL_pfl1R(mem) : fetch location mem to L1, with intent to read *only*
+ * ATL_pfl1W(mem) : fetch location mem to L1, with intent to read/write
+ * ATL_pfl1WO(mem) : fetch location mem to L1, with intent to write ONLY
+ */
+
+/*
+ * 3DNow! targets (GNU-style inline asm only): the L1 read prefetch issues
+ * "prefetch" and the L1 write prefetch issues "prefetchw".  L2 macros are
+ * provided only when ATL_SSE1 is defined as well, and issue "prefetcht1" for
+ * both the read and the write variant.  When __GNUC__ is not defined this
+ * branch defines nothing.
+ */
+#if defined(ATL_3DNow)
+ #ifdef __GNUC__
+ #define ATL_pfl1R(mem) \
+ __asm__ __volatile__ ("prefetch %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1W(mem) \
+ __asm__ __volatile__ ("prefetchw %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1WO ATL_pfl1W
+ #define ATL_GOT_L1PREFETCH
+ #ifdef ATL_SSE1
+ #define ATL_pfl2R(mem) \
+ __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl2W(mem) \
+ __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl2WO ATL_pfl2W
+ #define ATL_GOT_L2PREFETCH
+ #endif
+ #endif
+/*
+ * SSE without 3DNow! (GNU-style inline asm only): every L1 macro issues
+ * "prefetchnta" and every L2 macro issues "prefetcht1"; the read and write
+ * variants are identical, and the write-only names alias the write macros.
+ */
+#elif defined(ATL_SSE1) || defined (ATL_SSE2) /* SSE prefetch is available */
+ #ifdef __GNUC__
+ #define ATL_pfl1R(mem) \
+ __asm__ __volatile__ ("prefetchnta %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1W(mem) \
+ __asm__ __volatile__ ("prefetchnta %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1WO ATL_pfl1W
+ #define ATL_GOT_L1PREFETCH
+
+ #define ATL_pfl2R(mem) \
+ __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl2W(mem) \
+ __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl2WO ATL_pfl2W
+ #define ATL_GOT_L2PREFETCH
+ #endif
+/*
+ * Sun compiler on SPARC: uses the <sun_prefetch.h> read_many/write_many
+ * calls.  The L1 and L2 macros expand to the same calls, and only the R and
+ * W variants are defined in this branch.
+ */
+#elif defined(__SUNPRO_C) && defined(__sparc) /* && __SUNPRO_CC > 0x600 */
+ #include <sun_prefetch.h>
+ #define ATL_pfl1R(mem) sparc_prefetch_read_many((void*)(mem))
+ #define ATL_pfl1W(mem) sparc_prefetch_write_many((void*)(mem))
+ #define ATL_GOT_L1PREFETCH
+ #define ATL_pfl2R(mem) sparc_prefetch_read_many((void*)(mem))
+ #define ATL_pfl2W(mem) sparc_prefetch_write_many((void*)(mem))
+ #define ATL_GOT_L2PREFETCH
+/*
+ * ATL_ARCH_21264: read uses "ldt" and write uses "lds", both targeting
+ * register f31; write-only uses "wh64".  Two spellings are provided, one for
+ * GNU inline asm and one for the DEC compiler's asm() from c_asm.h.  Only L1
+ * macros are defined.
+ */
+#elif defined(ATL_ARCH_21264)
+ #ifdef __GNUC__
+ #define ATL_pfl1R(mem) \
+ __asm__ __volatile__ ("ldt $f31, %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1W(mem) \
+ __asm__ __volatile__ ("lds $f31, %0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1WO(mem) \
+ __asm__ __volatile__ ("wh64 %0" : : "m" (*((char *)(mem))))
+ #define ATL_GOT_L1PREFETCH
+ #elif defined(__DECC)
+ #include "c_asm.h"
+ #define ATL_pfl1R(mem) asm ("ldt %f31,(%a0) ;", mem)
+ #define ATL_pfl1W(mem) asm ("lds %f31,(%a0) ;", mem)
+ #define ATL_pfl1WO(mem) asm ("wh64 (%a0) ;", mem)
+ #define ATL_GOT_L1PREFETCH
+ #endif
+/*
+ * Note: SunUS5/10 seems to get no benefit from prefetch, so don't enable
+ */
+/*
+ * Remaining UltraSPARC targets with GNU inline asm: the "prefetch"
+ * instruction is issued with function code 0 (L1 read), 2 (L1 and L2 write)
+ * or 3 (L2 read).  Only the R and W variants are defined in this branch.
+ */
+#elif defined(ATL_ARCH_USIV) || defined(ATL_ARCH_SunUSIII) || \
+ defined(ATL_ARCH_SunUSII) || defined(ATL_ARCH_SunUSI)
+ #ifdef __GNUC__
+ #define ATL_pfl1R(mem) \
+ __asm__ __volatile__ ("prefetch %0,0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1W(mem) \
+ __asm__ __volatile__ ("prefetch %0,2" : : "m" (*((char *)(mem))))
+ #define ATL_GOT_L1PREFETCH
+ #define ATL_pfl2R(mem) \
+ __asm__ __volatile__ ("prefetch %0,3" : : "m" (*((char *)(mem))))
+ #define ATL_pfl2W(mem) \
+ __asm__ __volatile__ ("prefetch %0,2" : : "m" (*((char *)(mem))))
+ #define ATL_GOT_L2PREFETCH
+ #endif
+/*
+ * Gives gigantic slowdown on POWER4, so don't enable there, just use gcc
+ * builtin
+ *
+ * NOTE(review): the original condition tested ATL_ARCH_PPCG5 twice.  The
+ * duplicate term is removed here (logically identical); a different
+ * architecture may have been intended for the second term -- confirm.
+ */
+#elif defined(ATL_ARCH_PPCG5) || defined(ATL_ARCH_POWER5)
+ #if defined(__GNUC__) || defined(__IBM_GCC_ASM)
+ /* L1 read: dcbt with a trailing 0 operand; L1 write: dcbtst */
+ #define ATL_pfl1R(mem) \
+ __asm__ __volatile__ ("dcbt 0, %0, 0" : : "r" ((mem)))
+ #define ATL_pfl1W(mem) \
+ __asm__ __volatile__ ("dcbtst 0, %0" : : "r" ((mem)))
+ /* dcbt with a trailing 1 operand */
+ #define ATL_pfST(mem) \
+ __asm__ __volatile__ ("dcbt 0, %0, 1" : : "r" ((mem)))
+ /*
+ * NOTE: mem is an asm *output* here ("=r"), and the rlwinm/ori sequence
+ * rewrites it before the dcbt, so the argument must be a modifiable
+ * lvalue and its value is changed by the macro.  str must be a
+ * compile-time constant (it is bound with the "i" constraint).
+ */
+ #define ATL_pfl1STi(mem, str) \
+ __asm__ __volatile__ ("rlwinm %0, %0, 0, 0, 24\n\t" \
+ "ori %0, %0, 96+%2\n\t" \
+ "dcbt 0, %0, 8" \
+ : "=r" (mem) \
+ : "0" (mem), "i" (str))
+
+ #define ATL_GOT_L1PREFETCH
+ #define ATL_L1LS 128
+ #endif
+#elif defined(ATL_ARCH_IA64Itan) || defined(ATL_ARCH_IA64Itan2)
+/*
+ * Have to use nt2, because the FPU ignores L1.
+ * NOTE: just let icc do the prefetching; the intrinsic version is kept here
+ * for reference only (it is disabled by the "&& 0" in its condition).
+ */
+ #if defined(__ECC) && 0
+ #include "ia64intrin.h"
+ #define ATL_pfl1R(mem) __lfetch(2, (mem))
+ #define ATL_pfl1W(mem) __lfetch_excl(2, (mem))
+ #define ATL_GOT_L1PREFETCH
+ #elif defined(__GNUC__) && !defined(__ECC)
+ #define ATL_pfl1R(mem) \
+ __asm__ (" lfetch.nt2 [%0]": : "r"((void *)(mem)))
+ #define ATL_pfl1W(mem) \
+ __asm__ (" lfetch.excl [%0]": : "r"((void *)(mem)))
+ #define ATL_GOT_L1PREFETCH
+ #endif
+/* HP PA-RISC 2.0 with GNU inline asm: loads targeting %r0 (ldw read, ldd write) */
+#elif defined(ATL_ARCH_HPPA20) && defined(__GNUC__)
+ #define ATL_pfl1R(mem) \
+ __asm__ __volatile__ ("ldw %0, %%r0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1W(mem) \
+ __asm__ __volatile__ ("ldd %0, %%r0" : : "m" (*((char *)(mem))))
+ #define ATL_GOT_L1PREFETCH
+/*
+ * AltiVec fallback, skipped when ATL_NoFakePF is defined.  ATL_pfavR and
+ * ATL_pfavW are not defined in this header (presumably they come from
+ * atlas_altivec.h, included at the top -- confirm).
+ */
+#elif defined(ATL_AltiVec) && !defined(ATL_pfl1R)
+ #ifndef ATL_NoFakePF
+ /* 33619968 is ATL_GetCtrl(0, 1, 2), or fetch 1 32-byte block */
+ #define ATL_pfl1R(mem) ATL_pfavR(mem, 33619968, 3)
+ #define ATL_pfl1W(mem) ATL_pfavW(mem, 33619968, 2)
+ #define ATL_GOT_L1PREFETCH
+ #endif
+/* ATL_ARCH_MIPSICE9 with GNU inline asm: "pref" hint 6 (read) / 7 (write); also sets both line sizes */
+#elif defined(ATL_ARCH_MIPSICE9) && defined(__GNUC__)
+ #define ATL_pfl1R(mem) \
+ __asm__ __volatile__ ("pref 6,%0" : : "m" (*((char *)(mem))))
+ #define ATL_pfl1W(mem) \
+ __asm__ __volatile__ ("pref 7,%0" : : "m" (*((char *)(mem))))
+ #define ATL_GOT_L1PREFETCH
+ #define ATL_L1LS 32
+ #define ATL_L2LS 64
+#elif defined(__GNUC__) /* last ditch, use gcc predefined func */
+ #define ATL_pfl1R(mem) __builtin_prefetch(mem, 0, 3)
+ #define ATL_pfl1W(mem) __builtin_prefetch(mem, 1, 3)
+ #define ATL_GOT_L1PREFETCH
+#endif
+/*
+ * If an architecture branch supplied a write prefetch but no write-only
+ * variant, alias the write-only name to the write macro.  This is done for
+ * L2 as well as L1 so that ATL_pfl2WO is always usable whenever ATL_pfl2W is.
+ */
+#if defined(ATL_pfl1W) && !defined(ATL_pfl1WO)
+ #define ATL_pfl1WO ATL_pfl1W
+#endif
+#if defined(ATL_pfl2W) && !defined(ATL_pfl2WO)
+ #define ATL_pfl2WO ATL_pfl2W
+#endif
+
+/*
+ * ATL_NOL1PREFETCH / ATL_NOL2PREFETCH: discard whatever prefetch macros were
+ * selected for that level, so the empty fallbacks below take over.
+ */
+#ifdef ATL_NOL1PREFETCH
+ #ifdef ATL_GOT_L1PREFETCH
+ #undef ATL_pfl1R
+ #undef ATL_pfl1W
+ #undef ATL_pfl1WO
+ #undef ATL_GOT_L1PREFETCH
+ #endif
+#endif
+#ifdef ATL_NOL2PREFETCH
+ #ifdef ATL_GOT_L2PREFETCH
+ #undef ATL_pfl2R
+ #undef ATL_pfl2W
+ #undef ATL_pfl2WO
+ #undef ATL_GOT_L2PREFETCH
+ #endif
+#endif
+#ifndef ATL_GOT_L1PREFETCH /* dummy calls cpp takes out of code */
+ #define ATL_pfl1R(mem)
+ #define ATL_pfl1W(mem)
+ #define ATL_pfl1WO(mem)
+#endif
+#ifndef ATL_GOT_L2PREFETCH /* dummy calls cpp takes out of code */
+ #define ATL_pfl2R(mem)
+ #define ATL_pfl2W(mem)
+ #define ATL_pfl2WO(mem) /* previously missing: keep L2 in step with L1 */
+#endif
+
+/*
+ * Cache line size defaults: L1 falls back to 64 unless an architecture
+ * branch above already set it, and L2 falls back to the L1 value.
+ */
+#if !defined(ATL_L1LS)
+ #define ATL_L1LS 64
+#endif
+#if !defined(ATL_L2LS)
+ #define ATL_L2LS ATL_L1LS
+#endif
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptalias1.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias1.h
new file mode 100644
index 0000000..2a45eda
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias1.h
@@ -0,0 +1,60 @@
+/*
+ * The include guard is defined *before* it is tested, so the #ifndef below
+ * is always false and none of the aliases in this file take effect.  This is
+ * how the file is switched off (see the remark on the #define line).
+ * Removing the first #define would enable the renaming of each ATL_<name>
+ * to its ATL_<prefix>pt<name> form.
+ */
+#define ATLAS_PTALIAS1_H /* no threaded routs for Level 1 and 2 yet */
+#ifndef ATLAS_PTALIAS1_H
+#define ATLAS_PTALIAS1_H
+/*
+ * Real BLAS
+ */
+ #define ATL_dsdot ATL_dsptdot
+ #define ATL_sdsdot ATL_sdsptdot
+ #define ATL_sasum ATL_sptasum
+ #define ATL_snrm2 ATL_sptnrm2
+ #define ATL_sdot ATL_sptdot
+ #define ATL_saxpy ATL_sptaxpy
+ #define ATL_scopy ATL_sptcopy
+ #define ATL_sscal ATL_sptscal
+ #define ATL_sswap ATL_sptswap
+ #define ATL_srotm ATL_sptrotm
+ #define ATL_srot ATL_sptrot
+ #define ATL_srotmg ATL_sptrotmg
+ #define ATL_srotg ATL_sptrotg
+ #define ATL_isamax ATL_isptamax
+
+ #define ATL_dasum ATL_dptasum
+ #define ATL_dnrm2 ATL_dptnrm2
+ #define ATL_ddot ATL_dptdot
+ #define ATL_daxpy ATL_dptaxpy
+ #define ATL_dcopy ATL_dptcopy
+ #define ATL_dscal ATL_dptscal
+ #define ATL_dswap ATL_dptswap
+ #define ATL_drotm ATL_dptrotm
+ #define ATL_drot ATL_dptrot
+ #define ATL_drotmg ATL_dptrotmg
+ #define ATL_drotg ATL_dptrotg
+ #define ATL_idamax ATL_idptamax
+
+/*
+ * Complex BLAS
+ */
+ #define ATL_cdotc_sub ATL_cptdotc_sub
+ #define ATL_cdotu_sub ATL_cptdotu_sub
+ #define ATL_caxpy ATL_cptaxpy
+ #define ATL_ccopy ATL_cptcopy
+ #define ATL_cscal ATL_cptscal
+ #define ATL_cswap ATL_cptswap
+ #define ATL_icamax ATL_icptamax
+ #define ATL_csscal ATL_csptscal
+ #define ATL_scnrm2 ATL_scptnrm2
+ #define ATL_scasum ATL_scptasum
+
+ #define ATL_zdotc_sub ATL_zptdotc_sub
+ #define ATL_zdotu_sub ATL_zptdotu_sub
+ #define ATL_zaxpy ATL_zptaxpy
+ #define ATL_zcopy ATL_zptcopy
+ #define ATL_zscal ATL_zptscal
+ #define ATL_zswap ATL_zptswap
+ #define ATL_izamax ATL_izptamax
+ #define ATL_zdscal ATL_zdptscal
+ #define ATL_dznrm2 ATL_dzptnrm2
+ #define ATL_dzasum ATL_dzptasum
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptalias2.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias2.h
new file mode 100644
index 0000000..66b1e0e
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias2.h
@@ -0,0 +1,80 @@
+/*
+ * The include guard is defined *before* it is tested, so the #ifndef below
+ * is always false and none of the aliases in this file take effect.  This is
+ * how the file is switched off (see the remark on the #define line).
+ * Removing the first #define would enable the renaming of each ATL_<name>
+ * to its ATL_<prefix>pt<name> form.
+ */
+#define ATLAS_PTALIAS2_H /* no threaded routs for Level 1 and 2 yet */
+#ifndef ATLAS_PTALIAS2_H
+#define ATLAS_PTALIAS2_H
+/*
+ * Real BLAS
+ */
+ #define ATL_sspr2 ATL_sptspr2
+ #define ATL_ssyr2 ATL_sptsyr2
+ #define ATL_sspr ATL_sptspr
+ #define ATL_ssyr ATL_sptsyr
+ #define ATL_sger ATL_sptger
+ #define ATL_stpsv ATL_spttpsv
+ #define ATL_stbsv ATL_spttbsv
+ #define ATL_strsv ATL_spttrsv
+ #define ATL_stpmv ATL_spttpmv
+ #define ATL_stbmv ATL_spttbmv
+ #define ATL_strmv ATL_spttrmv
+ #define ATL_sspmv ATL_sptspmv
+ #define ATL_ssbmv ATL_sptsbmv
+ #define ATL_ssymv ATL_sptsymv
+ #define ATL_sgbmv ATL_sptgbmv
+ #define ATL_sgemv ATL_sptgemv
+
+ #define ATL_dspr2 ATL_dptspr2
+ #define ATL_dsyr2 ATL_dptsyr2
+ #define ATL_dspr ATL_dptspr
+ #define ATL_dsyr ATL_dptsyr
+ #define ATL_dger ATL_dptger
+ #define ATL_dtpsv ATL_dpttpsv
+ #define ATL_dtbsv ATL_dpttbsv
+ #define ATL_dtrsv ATL_dpttrsv
+ #define ATL_dtpmv ATL_dpttpmv
+ #define ATL_dtbmv ATL_dpttbmv
+ #define ATL_dtrmv ATL_dpttrmv
+ #define ATL_dspmv ATL_dptspmv
+ #define ATL_dsbmv ATL_dptsbmv
+ #define ATL_dsymv ATL_dptsymv
+ #define ATL_dgbmv ATL_dptgbmv
+ #define ATL_dgemv ATL_dptgemv
+
+/*
+ * Complex BLAS
+ */
+ #define ATL_chpr2 ATL_cpthpr2
+ #define ATL_cher2 ATL_cpther2
+ #define ATL_chpr ATL_cpthpr
+ #define ATL_cher ATL_cpther
+ #define ATL_cgerc ATL_cptgerc
+ #define ATL_cgeru ATL_cptgeru
+ #define ATL_ctpsv ATL_cpttpsv
+ #define ATL_ctbsv ATL_cpttbsv
+ #define ATL_ctrsv ATL_cpttrsv
+ #define ATL_ctpmv ATL_cpttpmv
+ #define ATL_ctbmv ATL_cpttbmv
+ #define ATL_ctrmv ATL_cpttrmv
+ #define ATL_chpmv ATL_cpthpmv
+ #define ATL_chbmv ATL_cpthbmv
+ #define ATL_chemv ATL_cpthemv
+ #define ATL_cgbmv ATL_cptgbmv
+ #define ATL_cgemv ATL_cptgemv
+
+ #define ATL_zhpr2 ATL_zpthpr2
+ #define ATL_zher2 ATL_zpther2
+ #define ATL_zhpr ATL_zpthpr
+ #define ATL_zher ATL_zpther
+ #define ATL_zgerc ATL_zptgerc
+ #define ATL_zgeru ATL_zptgeru
+ #define ATL_ztpsv ATL_zpttpsv
+ #define ATL_ztbsv ATL_zpttbsv
+ #define ATL_ztrsv ATL_zpttrsv
+ #define ATL_ztpmv ATL_zpttpmv
+ #define ATL_ztbmv ATL_zpttbmv
+ #define ATL_ztrmv ATL_zpttrmv
+ #define ATL_zhpmv ATL_zpthpmv
+ #define ATL_zhbmv ATL_zpthbmv
+ #define ATL_zhemv ATL_zpthemv
+ #define ATL_zgbmv ATL_zptgbmv
+ #define ATL_zgemv ATL_zptgemv
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptalias3.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias3.h
new file mode 100644
index 0000000..2a25d23
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias3.h
@@ -0,0 +1,43 @@
+/*
+ * Including this header renames each Level 3 ATL_<name> identifier to its
+ * ATL_<prefix>pt<name> form (for example ATL_sgemm becomes ATL_sptgemm), for
+ * all four precision prefixes s, d, c and z.  The ATL_?pt* targets are the
+ * routines prototyped in atlas_ptlevel3.h.  The renaming is purely textual,
+ * so it affects every use of these names that follows the #include.
+ */
+#ifndef ATLAS_PTALIAS3_H
+#define ATLAS_PTALIAS3_H
+/*
+ * Real BLAS
+ */
+ #define ATL_strsm ATL_spttrsm
+ #define ATL_strmm ATL_spttrmm
+ #define ATL_ssyr2k ATL_sptsyr2k
+ #define ATL_ssyrk ATL_sptsyrk
+ #define ATL_ssymm ATL_sptsymm
+ #define ATL_sgemm ATL_sptgemm
+
+ #define ATL_dtrsm ATL_dpttrsm
+ #define ATL_dtrmm ATL_dpttrmm
+ #define ATL_dsyr2k ATL_dptsyr2k
+ #define ATL_dsyrk ATL_dptsyrk
+ #define ATL_dsymm ATL_dptsymm
+ #define ATL_dgemm ATL_dptgemm
+
+/*
+ * Complex BLAS
+ */
+ #define ATL_ctrsm ATL_cpttrsm
+ #define ATL_ctrmm ATL_cpttrmm
+ #define ATL_cher2k ATL_cpther2k
+ #define ATL_csyr2k ATL_cptsyr2k
+ #define ATL_cherk ATL_cptherk
+ #define ATL_csyrk ATL_cptsyrk
+ #define ATL_chemm ATL_cpthemm
+ #define ATL_csymm ATL_cptsymm
+ #define ATL_cgemm ATL_cptgemm
+
+ #define ATL_ztrsm ATL_zpttrsm
+ #define ATL_ztrmm ATL_zpttrmm
+ #define ATL_zher2k ATL_zpther2k
+ #define ATL_zsyr2k ATL_zptsyr2k
+ #define ATL_zherk ATL_zptherk
+ #define ATL_zsyrk ATL_zptsyrk
+ #define ATL_zhemm ATL_zpthemm
+ #define ATL_zsymm ATL_zptsymm
+ #define ATL_zgemm ATL_zptgemm
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptlevel3.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptlevel3.h
new file mode 100644
index 0000000..d1bded3
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptlevel3.h
@@ -0,0 +1,284 @@
+
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_PTLEVEL3_H
+#define ATLAS_PTLEVEL3_H
+/*
+ * =====================================================================
+ * Include files
+ * =====================================================================
+ */
+#include "atlas_enum.h"
+#include "atlas_pthreads.h"
+/*
+ * =====================================================================
+ * Single precision real: multi-threaded Level 3 ATLAS BLAS prototypes.
+ * Scalars are passed by value as const float.
+ * =====================================================================
+ */
+void ATL_sptgeadd(const int, const int, const float, const float *,
+                  const int, const float, float *, const int);
+void ATL_sptgezero(const int, const int, float *, const int);
+void ATL_sptgescal(const int, const int, const float, float *,
+                   const int);
+void ATL_spttrscal(const enum ATLAS_UPLO, const int, const int,
+                   const float, float *, const int);
+
+void ATL_sptgemm(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+                 const int, const int, const int, const float,
+                 const float *, const int, const float *, const int,
+                 const float, float *, const int);
+void ATL_sptsymm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const int, const int, const float, const float *,
+                 const int, const float *, const int, const float,
+                 float *, const int);
+void ATL_sptsyrk(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                 const int, const int, const float, const float *,
+                 const int, const float, float *, const int);
+void ATL_sptsyr2k(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                  const int, const int, const float, const float *,
+                  const int, const float *, const int, const float,
+                  float *, const int);
+void ATL_spttrmm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                 const int, const int, const float, const float *,
+                 const int, float *, const int);
+void ATL_spttrsm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                 const int, const int, const float, const float *,
+                 const int, float *, const int);
+/*
+ * =====================================================================
+ * Double precision real: multi-threaded Level 3 ATLAS BLAS prototypes.
+ * Scalars are passed by value as const double.
+ * =====================================================================
+ */
+void ATL_dptgeadd(const int, const int, const double, const double *,
+                  const int, const double, double *, const int);
+void ATL_dptgezero(const int, const int, double *, const int);
+void ATL_dptgescal(const int, const int, const double, double *,
+                   const int);
+void ATL_dpttrscal(const enum ATLAS_UPLO, const int, const int,
+                   const double, double *, const int);
+
+void ATL_dptgemm(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+                 const int, const int, const int, const double,
+                 const double *, const int, const double *, const int,
+                 const double, double *, const int);
+void ATL_dptsymm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const int, const int, const double, const double *,
+                 const int, const double *, const int, const double,
+                 double *, const int);
+void ATL_dptsyrk(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                 const int, const int, const double, const double *,
+                 const int, const double, double *, const int);
+void ATL_dptsyr2k(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                  const int, const int, const double, const double *,
+                  const int, const double *, const int, const double,
+                  double *, const int);
+void ATL_dpttrmm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                 const int, const int, const double, const double *,
+                 const int, double *, const int);
+void ATL_dpttrsm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                 const int, const int, const double, const double *,
+                 const int, double *, const int);
+/*
+ * =====================================================================
+ * Single precision complex: multi-threaded Level 3 ATLAS BLAS
+ * prototypes.  Scalars are passed as const float *, except in
+ * ATL_cpthescal, which takes its scalar by value.
+ * =====================================================================
+ */
+void ATL_cptgeadd(const int, const int, const float *, const float *,
+                  const int, const float *, float *, const int);
+void ATL_cptgezero(const int, const int, float *, const int);
+void ATL_cptgescal(const int, const int, const float *, float *,
+                   const int);
+void ATL_cpttrscal(const enum ATLAS_UPLO, const int, const int,
+                   const float *, float *, const int);
+void ATL_cpthescal(const enum ATLAS_UPLO, const int, const int,
+                   const float, float *, const int);
+
+void ATL_cptgemm(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+                 const int, const int, const int, const float *,
+                 const float *, const int, const float *, const int,
+                 const float *, float *, const int);
+void ATL_cptsymm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const int, const int, const float *, const float *,
+                 const int, const float *, const int, const float *,
+                 float *, const int);
+void ATL_cptsyrk(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                 const int, const int, const float *, const float *,
+                 const int, const float *, float *, const int);
+void ATL_cptsyr2k(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                  const int, const int, const float *, const float *,
+                  const int, const float *, const int, const float *,
+                  float *, const int);
+void ATL_cpttrmm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                 const int, const int, const float *, const float *,
+                 const int, float *, const int);
+void ATL_cpttrsm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                 const int, const int, const float *, const float *,
+                 const int, float *, const int);
+/*
+ * =====================================================================
+ * Double precision complex: multi-threaded Level 3 ATLAS BLAS
+ * prototypes.  Scalars are passed as const double *, except in
+ * ATL_zpthescal, which takes its scalar by value.
+ * =====================================================================
+ */
+void ATL_zptgeadd(const int, const int, const double *, const double *,
+                  const int, const double *, double *, const int);
+void ATL_zptgezero(const int, const int, double *, const int);
+void ATL_zptgescal(const int, const int, const double *, double *,
+                   const int);
+void ATL_zpttrscal(const enum ATLAS_UPLO, const int, const int,
+                   const double *, double *, const int);
+void ATL_zpthescal(const enum ATLAS_UPLO, const int, const int,
+                   const double, double *, const int);
+
+void ATL_zptgemm(const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+                 const int, const int, const int, const double *,
+                 const double *, const int, const double *, const int,
+                 const double *, double *, const int);
+void ATL_zptsymm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const int, const int, const double *, const double *,
+                 const int, const double *, const int, const double *,
+                 double *, const int);
+void ATL_zptsyrk(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                 const int, const int, const double *, const double *,
+                 const int, const double *, double *, const int);
+void ATL_zptsyr2k(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                  const int, const int, const double *, const double *,
+                  const int, const double *, const int, const double *,
+                  double *, const int);
+void ATL_zpttrmm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                 const int, const int, const double *, const double *,
+                 const int, double *, const int);
+void ATL_zpttrsm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                 const int, const int, const double *, const double *,
+                 const int, double *, const int);
+
+/*
+ * Hermitian routines (hemm / herk / her2k) for the two complex precisions.
+ * Unlike the other complex prototypes in this header, herk takes both of its
+ * scalars by value and her2k takes its second scalar by value; hemm passes
+ * both scalars by pointer.
+ */
+void ATL_cpthemm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const int, const int, const float *, const float *,
+                 const int, const float *, const int, const float *,
+                 float *, const int);
+void ATL_cptherk(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                 const int, const int, const float, const float *,
+                 const int, const float, float *, const int);
+void ATL_cpther2k(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                  const int, const int, const float *, const float *,
+                  const int, const float *, const int, const float,
+                  float *, const int);
+
+void ATL_zpthemm(const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                 const int, const int, const double *, const double *,
+                 const int, const double *, const int, const double *,
+                 double *, const int);
+void ATL_zptherk(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                 const int, const int, const double, const double *,
+                 const int, const double, double *, const int);
+void ATL_zpther2k(const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                  const int, const int, const double *, const double *,
+                  const int, const double *, const int, const double,
+                  double *, const int);
+
+
+#endif
+/*
+ * End of atlas_ptlevel3.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptlvl3.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptlvl3.h
new file mode 100644
index 0000000..916afd0
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptlvl3.h
@@ -0,0 +1,389 @@
+
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_PTLVL3_H
+#define ATLAS_PTLVL3_H
+/*
+ * =====================================================================
+ * Include files
+ * =====================================================================
+ */
+#include "atlas_ptmisc.h"
+#include "atlas_level3.h"
+#include "atlas_rblas3.h"
+/*
+ * =====================================================================
+ * macro constants
+ * =====================================================================
+ */
+/* Default Level 3 crossover: 8 when TREAL is defined, 4 otherwise. */
+#if !defined(TREAL)
+#define ATL_XOVER_L3_DEFAULT 4
+#else
+#define ATL_XOVER_L3_DEFAULT 8 /* number of NB x NB blocks */
+#endif
+/*
+ * =====================================================================
+ * macro functions
+ * =====================================================================
+ */
+/*
+ * Mpt3:  a_ viewed as char *, advanced by i_ * siz_.
+ * Mvpt3: the same address, cast to void *.
+ */
+#define Mpt3(a_, i_, siz_) ((char *)(a_) + ((i_) * (siz_)))
+#define Mvpt3(a_, i_, siz_) ((void *)Mpt3((a_), (i_), (siz_)))
+/*
+ * =====================================================================
+ * typedef definitions
+ * =====================================================================
+ */
+/*
+ * Pointer to a gemm-shaped routine: takes an unsigned int and a
+ * pthread_attr_t * ahead of the two transpose flags, three int sizes and
+ * type-erased (void *) scalar and matrix arguments; returns a PT_TREE_T.
+ */
+typedef PT_TREE_T (*PT_GEMM_FUN_T)
+(
+ const unsigned int, pthread_attr_t *,
+ const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int
+);
+
+/*
+ * Pointer to a trmm-shaped routine (side / uplo / trans / diag flags, two
+ * int sizes, type-erased scalar and matrices); returns a PT_TREE_T.
+ */
+typedef PT_TREE_T (*PT_TRMM_FUN_T)
+(
+ const unsigned int, pthread_attr_t *,
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const void *, const void *,
+ const int, void *, const int
+);
+
+/*
+ * Pointer to a syr2k-shaped routine.  Unlike the two typedefs above it
+ * returns int rather than PT_TREE_T, and it takes two ATLAS_TRANS flags.
+ */
+typedef int (*PT_SYR2K_FUN_T)
+(
+ const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_TRANS, const int, const int,
+ const void *, const void *, const int, const void *,
+ const int, const void *, void *, const int
+);
+
+
+/*
+ * Per-type dispatch record: a size, three untyped constant pointers, and the
+ * function pointers used for each Level 3 operation.
+ * NOTE(review): presumably size is the element size in bytes and
+ * negone/one/zero point at the constants -1, 1 and 0 in the element type --
+ * confirm against the ptl3settype implementation.
+ */
+typedef struct
+{
+ size_t size;
+ void * negone, * one, * zero;
+ PT_FUN_T geadd0, gemm0, symm0, hemm0, syrk0, syr2k0,
+ herk0, her2k0, trmm0, trsm0;
+ PT_GEMM_FUN_T ptgemm;
+ PT_TRMM_FUN_T pttrmm;
+ PT_SYR2K_FUN_T ptsyr2k0, pther2k0;
+} PT_LVL3_TYPE_T;
+
+/*
+ * Argument bundle for a gemm call, with type-erased operands.
+ * NOTE(review): in this and the following bundles the field names suggest
+ * al/be = alpha/beta scalars, a/b/c = matrices, la/lb/lc = their leading
+ * dimensions -- confirm against the code that fills them in.
+ */
+typedef struct
+{
+ const void * a, * al, * b, * be;
+ void * c;
+ enum ATLAS_TRANS ta, tb;
+ int k, la, lb, lc, m, n;
+} PT_GEMM_ARGS_T;
+
+/* Argument bundle for a symm/hemm-style call (side and uplo flags). */
+typedef struct
+{
+ const void * a, * al, * b, * be;
+ void * c;
+ enum ATLAS_SIDE si;
+ enum ATLAS_UPLO up;
+ int la, lb, lc, m, n;
+} PT_SYMM_ARGS_T;
+
+/* Argument bundle for a syrk/herk-style call (single input operand a). */
+typedef struct
+{
+ const void * a, * al, * be;
+ void * c;
+ enum ATLAS_UPLO up;
+ enum ATLAS_TRANS tr;
+ int l, la, lc, m, n, k;
+} PT_SYRK_ARGS_T;
+
+/*
+ * Argument bundle for a syr2k/her2k-style call; compared with
+ * PT_SYRK_ARGS_T it adds the second operand b, its lb, and an extra
+ * constant pointer ac.
+ */
+typedef struct
+{
+ const void * a, * al, * ac, * b, * be;
+ void * c;
+ enum ATLAS_UPLO up;
+ enum ATLAS_TRANS tr;
+ int l, la, lb, lc, m, n, k;
+} PT_SYR2K_ARGS_T;
+
+/* Argument bundle for a trmm call; b is the only writable operand. */
+typedef struct
+{
+ const void * a, * al;
+ void * b;
+ enum ATLAS_SIDE si;
+ enum ATLAS_UPLO up;
+ enum ATLAS_TRANS tr;
+ enum ATLAS_DIAG di;
+ int la, lb, m, n;
+} PT_TRMM_ARGS_T;
+
+/* Argument bundle for a trsm call; same field list as PT_TRMM_ARGS_T. */
+typedef struct
+{
+ const void * a, * al;
+ void * b;
+ enum ATLAS_SIDE si;
+ enum ATLAS_UPLO up;
+ enum ATLAS_TRANS tr;
+ enum ATLAS_DIAG di;
+ int la, lb, m, n;
+} PT_TRSM_ARGS_T;
+
+/*
+ * =====================================================================
+ * Function prototypes
+ * =====================================================================
+ */
+/*
+ * Type-independent entry points: each takes the PT_LVL3_TYPE_T dispatch
+ * record first, then two unsigned ints, a pthread_attr_t * and an int, and
+ * only then the operation's own (type-erased) arguments.  All return a
+ * PT_TREE_T.
+ */
+PT_TREE_T ATL_Sgemm(const PT_LVL3_TYPE_T *, const unsigned int,
+                    const unsigned int, pthread_attr_t *, const int,
+                    const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+                    const int, const int, const int, const void *,
+                    const void *, const int, const void *, const int,
+                    const void *, void *, const int);
+PT_TREE_T ATL_Ssymm(const PT_LVL3_TYPE_T *, const unsigned int,
+                    const unsigned int, pthread_attr_t *, const int,
+                    const enum ATLAS_TRANS, const enum ATLAS_SIDE,
+                    const enum ATLAS_UPLO, const int, const int,
+                    const void *, const void *, const int, const void *,
+                    const int, const void *, void *, const int);
+PT_TREE_T ATL_Ssyrk(const PT_LVL3_TYPE_T *, const unsigned int,
+                    const unsigned int, pthread_attr_t *, const int,
+                    const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                    const enum ATLAS_TRANS, const int, const int,
+                    const int, const int, const void *, const void *,
+                    const int, const void *, void *, const int);
+PT_TREE_T ATL_Ssyr2k(const PT_LVL3_TYPE_T *, const unsigned int,
+                     const unsigned int, pthread_attr_t *, const int,
+                     const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+                     const enum ATLAS_TRANS, const int, const int,
+                     const int, const int, const void *, const void *,
+                     const void *, const int, const void *, const int,
+                     const void *, void *, const int);
+PT_TREE_T ATL_Strmm(const PT_LVL3_TYPE_T *, const unsigned int,
+                    const unsigned int, pthread_attr_t *, const int,
+                    const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                    const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                    const int, const int, const void *, const void *,
+                    const int, void *, const int);
+PT_TREE_T ATL_Strsm(const PT_LVL3_TYPE_T *, const unsigned int,
+                    const unsigned int, pthread_attr_t *, const int,
+                    const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+                    const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+                    const int, const int, const void *, const void *,
+                    const int, void *, const int);
+
+#if defined( TREAL ) || defined( TCPLX )
+
+/*
+ * Typed helpers, declared only when TREAL or TCPLX is defined.  Names are
+ * built with Mjoin( PATL, ... ); Mjoin and PATL are defined elsewhere
+ * (presumably token-pasting the precision prefix -- confirm).
+ */
+int Mjoin( PATL, GetNB ) ( void );
+
+/* Takes a writable PT_LVL3_TYPE_T * (presumably to fill it in -- confirm). */
+void Mjoin( PATL, ptl3settype ) ( PT_LVL3_TYPE_T * );
+
+/* gemm variants named by transpose case; all share one argument list. */
+void Mjoin( PATL, gemmNN )
+( const int, const int, const int, const SCALAR,
+ const TYPE *, const int, const TYPE *, const int,
+ const SCALAR, TYPE *, const int );
+void Mjoin( PATL, gemmNT )
+( const int, const int, const int, const SCALAR,
+ const TYPE *, const int, const TYPE *, const int,
+ const SCALAR, TYPE *, const int );
+void Mjoin( PATL, gemmTN )
+( const int, const int, const int, const SCALAR,
+ const TYPE *, const int, const TYPE *, const int,
+ const SCALAR, TYPE *, const int );
+
+/* The NC and CN variants are declared for complex builds only. */
+#if defined( TCPLX )
+void Mjoin( PATL, gemmNC )
+( const int, const int, const int, const SCALAR,
+ const TYPE *, const int, const TYPE *, const int,
+ const SCALAR, TYPE *, const int );
+void Mjoin( PATL, gemmCN )
+( const int, const int, const int, const SCALAR,
+ const TYPE *, const int, const TYPE *, const int,
+ const SCALAR, TYPE *, const int );
+#endif
+
+/*
+ * Routines taking and returning a PT_FUN_ARG_T, one per Level 3 operation.
+ * NOTE(review): the names mirror the PT_FUN_T members of PT_LVL3_TYPE_T
+ * (gemm0, symm0, ...), so these are presumably the values stored there --
+ * confirm.
+ */
+PT_FUN_ARG_T Mjoin( PATL, ptgemm0 ) ( PT_FUN_ARG_T );
+PT_FUN_ARG_T Mjoin( PATL, ptsymm0 ) ( PT_FUN_ARG_T );
+PT_FUN_ARG_T Mjoin( PATL, ptsyr2k0 ) ( PT_FUN_ARG_T );
+PT_FUN_ARG_T Mjoin( PATL, ptsyrk0 ) ( PT_FUN_ARG_T );
+PT_FUN_ARG_T Mjoin( PATL, pttrmm0 ) ( PT_FUN_ARG_T );
+PT_FUN_ARG_T Mjoin( PATL, pttrsm0 ) ( PT_FUN_ARG_T );
+
+/* Hermitian counterparts, complex builds only. */
+#if defined( TCPLX )
+PT_FUN_ARG_T Mjoin( PATL, pthemm0 ) ( PT_FUN_ARG_T );
+PT_FUN_ARG_T Mjoin( PATL, pther2k0 ) ( PT_FUN_ARG_T );
+PT_FUN_ARG_T Mjoin( PATL, ptherk0 ) ( PT_FUN_ARG_T );
+#endif
+/*
+ * =====================================================================
+ * Prototypes for the Level 3 multi-threaded ATLAS BLAS routines
+ * =====================================================================
+ */
+PT_TREE_T Mjoin( PATL, ptgemm_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+PT_TREE_T Mjoin( PATL, ptsymm_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const void *, const void *,
+ const int, const void *, const int, const void *,
+ void *, const int );
+PT_TREE_T Mjoin( PATL, ptsyr2k_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const void *, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+int Mjoin( PATL, ptsyr2k0_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_TRANS, const int, const int,
+ const void *, const void *, const int, const void *,
+ const int, const void *, void *, const int );
+PT_TREE_T Mjoin( PATL, ptsyrk_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const void *, const void *,
+ const int, const void *, void *, const int );
+PT_TREE_T Mjoin( PATL, pttrmm_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const void *, const void *,
+ const int, void *, const int );
+PT_TREE_T Mjoin( PATL, pttrsm_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const void *, const void *,
+ const int, void *, const int );
+
+void Mjoin( PATL, ptgemm )
+( const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const SCALAR,
+ const TYPE *, const int, const TYPE *, const int,
+ const SCALAR, TYPE *, const int );
+void Mjoin( PATL, ptsymm )
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const SCALAR, const TYPE *,
+ const int, const TYPE *, const int, const SCALAR,
+ TYPE *, const int );
+void Mjoin( PATL, ptsyr2k )
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const SCALAR, const TYPE *,
+ const int, const TYPE *, const int, const SCALAR,
+ TYPE *, const int );
+void Mjoin( PATL, ptsyrk )
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const SCALAR, const TYPE *,
+ const int, const SCALAR, TYPE *, const int );
+void Mjoin( PATL, pttrmm )
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const SCALAR, const TYPE *,
+ const int, TYPE *, const int );
+void Mjoin( PATL, pttrsm )
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const SCALAR, const TYPE *,
+ const int, TYPE *, const int );
+
+#if defined( TCPLX )
+PT_TREE_T Mjoin( PATL, pthemm_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const void *, const void *,
+ const int, const void *, const int, const void *,
+ void *, const int );
+PT_TREE_T Mjoin( PATL, pther2k_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const void *, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+int Mjoin( PATL, pther2k0_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_TRANS, const int, const int,
+ const void *, const void *, const int, const void *,
+ const int, const void *, void *, const int );
+PT_TREE_T Mjoin( PATL, ptherk_nt )
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const void *, const void *,
+ const int, const void *, void *, const int );
+
+void Mjoin( PATL, pthemm )
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const SCALAR, const TYPE *,
+ const int, const TYPE *, const int, const SCALAR,
+ TYPE *, const int );
+void Mjoin( PATL, pther2k )
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const SCALAR, const TYPE *,
+ const int, const TYPE *, const int, const TYPE,
+ TYPE *, const int );
+void Mjoin( PATL, ptherk )
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const TYPE, const TYPE *,
+ const int, const TYPE, TYPE *, const int );
+#endif
+
+#endif
+
+#endif
+/*
+ * End of atlas_ptlvl3.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptmisc.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptmisc.h
new file mode 100644
index 0000000..4c3db23
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptmisc.h
@@ -0,0 +1,410 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_PTMISC_H
+#define ATLAS_PTMISC_H
+/*
+ * =====================================================================
+ * Include Files
+ * =====================================================================
+ */
+#include <math.h>
+#include <pthread.h>
+
+#include "atlas_misc.h"
+#include "atlas_pthreads.h"
+/*
+ * =====================================================================
+ * #define macro constants
+ * =====================================================================
+ *
+ * ATL_XOVER_MI_DEFAULT is the smallest number of NB-by-NB blocks for
+ * which threading is enabled, where NB is the value returned by the
+ * ATLAS function Mjoin( PATL, GetNB ).
+ */
+#ifdef TREAL
+#define ATL_XOVER_MI_DEFAULT 8 /* number of NB x NB blocks */
+#else
+#define ATL_XOVER_MI_DEFAULT 4
+#endif
+
+#define NOSPLIT 0 /* For convenience */
+#define SPLIT_M 1
+#define SPLIT_N 2
+#define SPLIT_K 3
+
+/*
+ * =====================================================================
+ * macro functions
+ * =====================================================================
+ */
+#define Mptm( a_, i_, siz_ ) ( ( (char*)(a_) + ( (i_) * (siz_) ) ) ) /* char* located (i_ * siz_) bytes past a_ */
+#define Mvptm( a_, i_, siz_ ) ( (void *)(Mptm( (a_), (i_), (siz_) ))) /* same address as Mptm, cast to void* */
+/*
+ * =====================================================================
+ * typedef definitions
+ * =====================================================================
+ *
+ * Definition of the Binary (recursive) task tree: Each node of the tree
+ * mainly consists of a node number, a reference counter to enforce
+ * dependencies, an argument structure and a function to be applied.
+ */
+typedef void * PT_DATA_T; /* untyped pointer stored in a node's data field */
+typedef void * PT_FUN_VAL_T; /* untyped value returned by a PT_FUN_T */
+typedef void * PT_FUN_ARG_T; /* untyped argument received by a PT_FUN_T */
+typedef PT_FUN_VAL_T (*PT_FUN_T) ( PT_FUN_ARG_T ); /* pointer to a function taking PT_FUN_ARG_T and returning PT_FUN_VAL_T */
+
+typedef struct PT_node_T /* one node of the binary task tree */
+{
+ pthread_t pid; /* pthread handle; presumably the thread working on this node -- TODO confirm */
+ pthread_mutex_t mutex; /* NOTE(review): presumably protects count and pairs with cond -- confirm in the .c files */
+ pthread_cond_t cond; /* NOTE(review): presumably what ATL_wait_tree / ATL_signal_tree operate on -- confirm */
+ struct PT_node_T * left; /* left subtree */
+ struct PT_node_T * right; /* right subtree */
+ PT_DATA_T data; /* untyped (void *) data attached to the node */
+ PT_FUN_VAL_T * val; /* pointer to a PT_FUN_VAL_T; presumably where fun's result is kept -- TODO confirm */
+ PT_FUN_T fun; /* function to be applied */
+ PT_FUN_ARG_T arg; /* argument structure (untyped pointer) */
+ unsigned int node; /* node number */
+ unsigned int count; /* reference counter used to enforce dependencies */
+} PT_NODE_T;
+
+typedef PT_NODE_T * PT_TREE_T; /* a tree is handled as a pointer to a PT_NODE_T */
+typedef void (*PT_APPLY_FUN_T)( PT_TREE_T ); /* pointer to a void function taking a PT_TREE_T (second argument of ATL_apply_tree) */
+
+enum DIM_1DSPLIT_E
+{
+ Atlas1dSplit = 100,
+ Atlas1dNoSplit = 199
+};
+
+enum DIM_TZSPLIT_E
+{
+ AtlasTzSplitMrow = 200,
+ AtlasTzSplitKrow = 201,
+ AtlasTzSplitKcol = 202,
+ AtlasTzSplitNcol = 203,
+ AtlasTzNoSplit = 299
+};
+
+typedef enum DIM_1DSPLIT_E DIM_1DSPLIT_T;
+typedef enum DIM_TZSPLIT_E DIM_TZSPLIT_T;
+
+/*
+ * Type definitions for some auxiliaries that have been multi-threaded
+ * as well.
+ */
+typedef struct
+{
+ size_t size;
+ PT_FUN_T fun;
+} PT_MISC_TYPE_T;
+
+typedef struct
+{
+ const void * al, * be;
+ const void * a;
+ void * c;
+ int la, lc, m, n;
+} PT_GEADD_ARGS_T;
+
+typedef struct
+{
+ void * a;
+ int la, m, n;
+} PT_GEZERO_ARGS_T;
+
+typedef struct
+{
+ const void * al;
+ void * a;
+ int la, m, n;
+} PT_GESCAL_ARGS_T;
+
+typedef struct
+{
+ enum ATLAS_UPLO up;
+ const void * al;
+ void * a;
+ int k, la, m, n;
+} PT_TZSCAL_ARGS_T;
+
+/*
+ * =====================================================================
+ * Function prototypes
+ * =====================================================================
+ */
+int ATL_sGetNB ( void );
+int ATL_dGetNB ( void );
+int ATL_cGetNB ( void );
+int ATL_zGetNB ( void );
+
+DIM_1DSPLIT_T ATL_1dsplit
+(
+ const unsigned int,
+ const int,
+ const int,
+ unsigned int *,
+ unsigned int *,
+ int *,
+ int *,
+ double *
+);
+
+DIM_TZSPLIT_T ATL_tzsplit
+(
+ const enum ATLAS_UPLO,
+ const unsigned int,
+ const int,
+ const int,
+ const int,
+ const int,
+ unsigned int *,
+ unsigned int *,
+ int *,
+ int *
+);
+/*
+ * Task tree management
+ */
+PT_TREE_T ATL_init_node
+( unsigned int, PT_TREE_T, PT_TREE_T, PT_DATA_T,
+ PT_FUN_VAL_T *, PT_FUN_T, PT_FUN_ARG_T );
+
+void ATL_traverse_tree ( PT_TREE_T );
+void ATL_apply_tree ( PT_TREE_T, PT_APPLY_FUN_T );
+void ATL_free_tree ( PT_TREE_T );
+void ATL_free_node ( PT_TREE_T );
+void ATL_print_node_id ( PT_TREE_T );
+
+void ATL_thread_init ( pthread_attr_t * );
+void ATL_thread_exit ( pthread_attr_t * );
+void ATL_wait_tree ( PT_TREE_T );
+void ATL_signal_tree ( PT_TREE_T );
+void ATL_thread_tree ( PT_TREE_T, pthread_attr_t * );
+void ATL_join_tree ( PT_TREE_T );
+
+PT_TREE_T ATL_create_tree
+( unsigned int *, const int, const int );
+/*
+ * Typeless auxiliary functions
+ */
+PT_TREE_T ATL_Sgeadd
+( const PT_MISC_TYPE_T *, const unsigned int,
+ const unsigned int, pthread_attr_t *, const int,
+ const int, const int, const void *, const void *,
+ const int, const void *, void *, const int );
+PT_TREE_T ATL_Sgescal
+( const PT_MISC_TYPE_T *, const unsigned int,
+ const unsigned int, pthread_attr_t *, const int,
+ const int, const int, const void *, void *,
+ const int );
+PT_TREE_T ATL_Sgezero
+( const PT_MISC_TYPE_T *, const unsigned int,
+ const unsigned int, pthread_attr_t *, const int,
+ const int, const int, void *, const int );
+PT_TREE_T ATL_Stzscal
+( const PT_MISC_TYPE_T *, const unsigned int,
+ const unsigned int, pthread_attr_t *, const int,
+ const enum ATLAS_UPLO, const int, const int,
+ const int, const void *, void *, const int );
+/*
+ * Single precision real auxiliary functions
+ */
+PT_FUN_ARG_T ATL_sptgeadd0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_sptgescal0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_sptgezero0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_spttzscal0 ( PT_FUN_ARG_T );
+
+PT_TREE_T ATL_sptgeadd_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, const void *, const void *, const int,
+ const void *, void *, const int );
+PT_TREE_T ATL_sptgescal_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, const void *, void *, const int );
+PT_TREE_T ATL_sptgezero_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, void *, const int );
+PT_TREE_T ATL_spttrscal_nt
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const int, const int,
+ const void *, void *, const int );
+
+void ATL_sptgeadd
+( const int, const int, const float, const float *,
+ const int, const float, float *, const int );
+void ATL_sptgescal
+( const int, const int, const float, float *,
+ const int );
+void ATL_sptgezero
+( const int, const int, float *, const int );
+void ATL_spttrscal
+( const enum ATLAS_UPLO, const int, const int,
+ const float, float *, const int );
+
+/*
+ * Double precision real auxiliary functions
+ */
+PT_FUN_ARG_T ATL_dptgeadd0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_dptgescal0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_dptgezero0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_dpttzscal0 ( PT_FUN_ARG_T );
+
+PT_TREE_T ATL_dptgeadd_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, const void *, const void *, const int,
+ const void *, void *, const int );
+PT_TREE_T ATL_dptgescal_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, const void *, void *, const int );
+PT_TREE_T ATL_dptgezero_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, void *, const int );
+PT_TREE_T ATL_dpttrscal_nt
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const int, const int,
+ const void *, void *, const int );
+
+void ATL_dptgeadd
+( const int, const int, const double, const double *,
+ const int, const double, double *, const int );
+void ATL_dptgescal
+( const int, const int, const double, double *,
+ const int );
+void ATL_dptgezero
+( const int, const int, double *, const int );
+void ATL_dpttrscal
+( const enum ATLAS_UPLO, const int, const int,
+ const double, double *, const int );
+/*
+ * Single precision complex auxiliary functions
+ */
+PT_FUN_ARG_T ATL_cptgeadd0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_cptgescal0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_cptgezero0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_cpthescal0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_cpttzscal0 ( PT_FUN_ARG_T );
+
+PT_TREE_T ATL_cptgeadd_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, const void *, const void *, const int,
+ const void *, void *, const int );
+PT_TREE_T ATL_cptgescal_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, const void *, void *, const int );
+PT_TREE_T ATL_cptgezero_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, void *, const int );
+PT_TREE_T ATL_cpttrscal_nt
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const int, const int,
+ const void *, void *, const int );
+PT_TREE_T ATL_cpthescal_nt
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const int, const int,
+ const void *, void *, const int );
+
+void ATL_cptgeadd
+( const int, const int, const float *, const float *,
+ const int, const float *, float *, const int );
+void ATL_cptgezero
+( const int, const int, float *, const int );
+void ATL_cptgescal
+( const int, const int, const float *, float *,
+ const int );
+void ATL_cpttrscal
+( const enum ATLAS_UPLO, const int, const int,
+ const float *, float *, const int );
+void ATL_cpthescal
+( const enum ATLAS_UPLO, const int, const int,
+ const float, float *, const int );
+/*
+ * Double precision complex auxiliary functions
+ */
+PT_FUN_ARG_T ATL_zptgeadd0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_zptgescal0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_zptgezero0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_zpthescal0 ( PT_FUN_ARG_T );
+PT_FUN_ARG_T ATL_zpttzscal0 ( PT_FUN_ARG_T );
+
+PT_TREE_T ATL_zptgeadd_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, const void *, const void *, const int,
+ const void *, void *, const int );
+PT_TREE_T ATL_zptgescal_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, const void *, void *, const int );
+PT_TREE_T ATL_zptgezero_nt
+( const unsigned int, pthread_attr_t *, const int,
+ const int, void *, const int );
+PT_TREE_T ATL_zpttrscal_nt
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const int, const int,
+ const void *, void *, const int );
+PT_TREE_T ATL_zpthescal_nt
+( const unsigned int, pthread_attr_t *,
+ const enum ATLAS_UPLO, const int, const int,
+ const void *, void *, const int );
+
+void ATL_zptgeadd
+( const int, const int, const double *, const double *,
+ const int, const double *, double *, const int );
+void ATL_zptgezero
+( const int, const int, double *, const int );
+void ATL_zptgescal
+( const int, const int, const double *, double *,
+ const int );
+void ATL_zpttrscal
+( const enum ATLAS_UPLO, const int, const int,
+ const double *, double *, const int );
+void ATL_zpthescal
+( const enum ATLAS_UPLO, const int, const int,
+ const double, double *, const int );
+
+#endif
+/*
+ * End of atlas_ptmisc.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_r1.h b/kaldi_io/src/tools/ATLAS/include/atlas_r1.h
new file mode 100644
index 0000000..dc49fe2
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_r1.h
@@ -0,0 +1,39 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef SREAL /* include exactly one per-precision header, chosen by the first precision macro that is defined */
+ #include "atlas_sr1.h"
+#elif defined(DREAL)
+ #include "atlas_dr1.h"
+#elif defined(SCPLX)
+ #include "atlas_cr1.h"
+#elif defined(DCPLX)
+ #include "atlas_zr1.h"
+#endif /* when none of SREAL, DREAL, SCPLX, DCPLX is defined, nothing is included */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_rblas3.h b/kaldi_io/src/tools/ATLAS/include/atlas_rblas3.h
new file mode 100644
index 0000000..9ad27e7
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_rblas3.h
@@ -0,0 +1,474 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Contributor(s) : R. Clint Whaley
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_RBLAS3_H
+#define ATLAS_RBLAS3_H
+/*
+ * =====================================================================
+ * Include files
+ * =====================================================================
+ */
+#include "atlas_misc.h"
+/*
+ * =====================================================================
+ * #define macro definitions
+ * =====================================================================
+ */
+#define Mrc3( a_, i_, j_, lda_, siz_ ) \
+ ( (void*) ( (char*)(a_) + ( ( (i_)+(j_)*(lda_) )*(siz_) ) ) ) /* void* located ((i_) + (j_)*(lda_)) * (siz_) bytes past a_ */
+/*
+ * =====================================================================
+ * typedef definitions
+ * =====================================================================
+ */
+typedef void (*KR3_FUN_GEMM_T)
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+typedef void (*KR3_FUN_HEMM_T)
+( const int, const int, const void *, const void *,
+ const int, const void *, const int, const void *,
+ void *, const int );
+typedef int (*KR3_FUN_HER2K_T)
+( const int, const int, const void *, const void *,
+ const int, const void *, const int, const void *,
+ void *, const int );
+typedef void (*KR3_FUN_HERK_T)
+( const int, const int, const void *, const void *,
+ const int, const void *, void *, const int );
+typedef void (*KR3_FUN_SYMM_T)
+( const int, const int, const void *, const void *,
+ const int, const void *, const int, const void *,
+ void *, const int );
+typedef int (*KR3_FUN_SYR2K_T)
+( const int, const int, const void *, const void *,
+ const int, const void *, const int, const void *,
+ void *, const int );
+typedef void (*KR3_FUN_SYRK_T)
+( const int, const int, const void *, const void *,
+ const int, const void *, void *, const int );
+typedef void (*KR3_FUN_TRMM_T)
+( const int, const int, const void *, const void *,
+ const int, void *, const int );
+typedef void (*KR3_FUN_TRSM_T)
+( const int, const int, const void *, const void *,
+ const int, void *, const int );
+
+typedef struct
+{
+ size_t size;
+ void * one;
+ KR3_FUN_GEMM_T TgemmNN;
+ KR3_FUN_GEMM_T Tgemm;
+ KR3_FUN_SYMM_T Tsymm;
+} RC3_SYMM_T;
+
+typedef struct
+{
+ size_t size;
+ void * one;
+ KR3_FUN_GEMM_T TgemmNN;
+ KR3_FUN_GEMM_T Tgemm;
+ KR3_FUN_HEMM_T Themm;
+} RC3_HEMM_T;
+
+typedef struct
+{
+ size_t size;
+ KR3_FUN_GEMM_T Tgemm;
+ KR3_FUN_SYRK_T Tsyrk;
+} RC3_SYRK_T;
+
+typedef struct
+{
+ size_t size;
+ KR3_FUN_GEMM_T Tgemm;
+ KR3_FUN_HERK_T Therk;
+} RC3_HERK_T;
+
+typedef struct
+{
+ size_t size;
+ void * one;
+ KR3_FUN_GEMM_T Tgemm;
+ KR3_FUN_SYR2K_T Tsyr2k;
+} RC3_SYR2K_T;
+
+typedef struct
+{
+ size_t size;
+ void * one;
+ KR3_FUN_GEMM_T Tgemm;
+ KR3_FUN_HER2K_T Ther2k;
+} RC3_HER2K_T;
+
+typedef struct
+{
+ size_t size;
+ void * one;
+ KR3_FUN_GEMM_T Tgemm;
+ KR3_FUN_TRMM_T Ttrmm;
+} RC3_TRMM_T;
+
+typedef struct
+{
+ size_t size;
+ void * one, * negone;
+ KR3_FUN_GEMM_T Tgemm;
+ KR3_FUN_TRSM_T Ttrsm;
+} RC3_TRSM_T;
+
+typedef void (*RC3_FUN_HEMM_T)
+( RC3_HEMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+typedef void (*RC3_FUN_HER2K_T)
+( RC3_HER2K_T *, const int, const int, const void *,
+ const void *, const void *, const int, const void *,
+ const int, const void *, void *, const int,
+ const int );
+typedef void (*RC3_FUN_HERK_T)
+( RC3_HERK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+typedef void (*RC3_FUN_SYMM_T)
+( RC3_SYMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+typedef void (*RC3_FUN_SYR2K_T)
+( RC3_SYR2K_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+typedef void (*RC3_FUN_SYRK_T)
+( RC3_SYRK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+typedef void (*RC3_FUN_TRMM_T)
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+typedef void (*RC3_FUN_TRSM_T)
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+/*
+ * =====================================================================
+ * Level 3 recursive BLAS internal function prototypes
+ * =====================================================================
+ */
+void ATL_sgemmTN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_sgemmNT_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_sgemmNN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_dgemmTN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_dgemmNT_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_dgemmNN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_cgemmCN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_cgemmNC_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_cgemmTN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_cgemmNT_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_cgemmNN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_zgemmCN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_zgemmNC_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_zgemmTN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_zgemmNT_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+void ATL_zgemmNN_RB
+( const int, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int );
+/*
+ * =====================================================================
+ * Recursive BLAS function prototypes
+ * =====================================================================
+ */
+void ATL_rsymmRU /* symm/hemm group: RU, RL, LU and LL variants, every one with the same 12-argument shape and a void return */
+( RC3_SYMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rhemmRU /* hemm prototypes differ from their symm twins only in the first argument type (RC3_HEMM_T * instead of RC3_SYMM_T *) */
+( RC3_HEMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rsymmRL /* RC3_SYMM_T and RC3_HEMM_T are not declared in this part of the header */
+( RC3_SYMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rhemmRL
+( RC3_HEMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rsymmLU
+( RC3_SYMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rhemmLU
+( RC3_HEMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rsymmLL
+( RC3_SYMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rhemmLL
+( RC3_HEMM_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+/* syrk / syr2k prototypes (UT, UN, LT, LN): syrk takes RC3_SYRK_T * plus 9 more arguments, syr2k takes RC3_SYR2K_T * plus 11 (one extra const void *, const int pair). */
+void ATL_rsyrkUT
+( RC3_SYRK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+void ATL_rsyr2kUT
+( RC3_SYR2K_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rsyrkUN
+( RC3_SYRK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+void ATL_rsyr2kUN
+( RC3_SYR2K_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rsyrkLT
+( RC3_SYRK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+void ATL_rsyr2kLT
+( RC3_SYR2K_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+void ATL_rsyrkLN
+( RC3_SYRK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+void ATL_rsyr2kLN
+( RC3_SYR2K_T *, const int, const int, const void *,
+ const void *, const int, const void *, const int,
+ const void *, void *, const int, const int );
+/* herk / her2k prototypes (UC, UN, LC, LN): herk takes RC3_HERK_T * plus 9 more arguments; her2k takes RC3_HER2K_T * plus 12, with two consecutive const void * after the two ints. NOTE(review): presumably alpha and its conjugate -- confirm. */
+void ATL_rherkUC
+( RC3_HERK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+void ATL_rher2kUC
+( RC3_HER2K_T *, const int, const int, const void *,
+ const void *, const void *, const int, const void *,
+ const int, const void *, void *, const int,
+ const int );
+void ATL_rherkUN
+( RC3_HERK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+void ATL_rher2kUN
+( RC3_HER2K_T *, const int, const int, const void *,
+ const void *, const void *, const int, const void *,
+ const int, const void *, void *, const int,
+ const int );
+void ATL_rherkLC
+( RC3_HERK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+void ATL_rher2kLC
+( RC3_HER2K_T *, const int, const int, const void *,
+ const void *, const void *, const int, const void *,
+ const int, const void *, void *, const int,
+ const int );
+void ATL_rherkLN
+( RC3_HERK_T *, const int, const int, const void *,
+ const void *, const int, const void *, void *,
+ const int, const int );
+void ATL_rher2kLN
+( RC3_HER2K_T *, const int, const int, const void *,
+ const void *, const void *, const int, const void *,
+ const int, const void *, void *, const int,
+ const int );
+/* trmm / trsm prototypes: 12 suffixes (R or L, then U or L, then C, T or N), each declared once for trmm (RC3_TRMM_T *) and once for trsm (RC3_TRSM_T *) with the same 9-argument shape; the 7th argument is the only non-const void *. */
+void ATL_rtrmmRUC
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmRUC
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmRLC
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmRLC
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmRUT
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmRUT
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmRLT
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmRLT
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmRUN
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmRUN
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmRLN
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmRLN
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmLUC
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmLUC
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmLLC
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmLLC
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmLUT
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmLUT
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmLLT
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmLLT
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmLUN
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmLUN
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrmmLLN
+( RC3_TRMM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+void ATL_rtrsmLLN
+( RC3_TRSM_T *, const int, const int, const void *,
+ const void *, const int, void *, const int,
+ const int );
+
+#endif
+/*
+ * End of atlas_rblas3.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_refalias1.h b/kaldi_io/src/tools/ATLAS/include/atlas_refalias1.h
new file mode 100644
index 0000000..7dcac8a
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_refalias1.h
@@ -0,0 +1,59 @@
+#ifndef ATLAS_REFALIAS1_H
+#define ATLAS_REFALIAS1_H
+/*
+ * Real BLAS -- each Level 1 name below expands to the reference routine of the same operation declared in atlas_reflevel1.h (float group first)
+ */
+ #define ATL_dsdot ATL_dsrefdot
+ #define ATL_sdsdot ATL_sdsrefdot
+ #define ATL_sasum ATL_srefasum
+ #define ATL_snrm2 ATL_srefnrm2
+ #define ATL_sdot ATL_srefdot
+ #define ATL_saxpy ATL_srefaxpy
+ #define ATL_scopy ATL_srefcopy
+ #define ATL_sscal ATL_srefscal
+ #define ATL_sswap ATL_srefswap
+ #define ATL_srotm ATL_srefrotm
+ #define ATL_srot ATL_srefrot
+ #define ATL_srotmg ATL_srefrotmg
+ #define ATL_srotg ATL_srefrotg
+ #define ATL_isamax ATL_isrefamax
+/* double group: same operations, targets declared with double arguments */
+ #define ATL_dasum ATL_drefasum
+ #define ATL_dnrm2 ATL_drefnrm2
+ #define ATL_ddot ATL_drefdot
+ #define ATL_daxpy ATL_drefaxpy
+ #define ATL_dcopy ATL_drefcopy
+ #define ATL_dscal ATL_drefscal
+ #define ATL_dswap ATL_drefswap
+ #define ATL_drotm ATL_drefrotm
+ #define ATL_drot ATL_drefrot
+ #define ATL_drotmg ATL_drefrotmg
+ #define ATL_drotg ATL_drefrotg
+ #define ATL_idamax ATL_idrefamax
+
+/*
+ * Complex BLAS -- c-prefixed targets take float pointers, z-prefixed targets take double pointers
+ */
+ #define ATL_cdotc_sub ATL_crefdotc_sub
+ #define ATL_cdotu_sub ATL_crefdotu_sub
+ #define ATL_caxpy ATL_crefaxpy
+ #define ATL_ccopy ATL_crefcopy
+ #define ATL_cscal ATL_crefscal
+ #define ATL_cswap ATL_crefswap
+ #define ATL_icamax ATL_icrefamax
+ #define ATL_csscal ATL_csrefscal
+ #define ATL_scnrm2 ATL_screfnrm2
+ #define ATL_scasum ATL_screfasum
+/* z group: mirrors the c group entry for entry */
+ #define ATL_zdotc_sub ATL_zrefdotc_sub
+ #define ATL_zdotu_sub ATL_zrefdotu_sub
+ #define ATL_zaxpy ATL_zrefaxpy
+ #define ATL_zcopy ATL_zrefcopy
+ #define ATL_zscal ATL_zrefscal
+ #define ATL_zswap ATL_zrefswap
+ #define ATL_izamax ATL_izrefamax
+ #define ATL_zdscal ATL_zdrefscal
+ #define ATL_dznrm2 ATL_dzrefnrm2
+ #define ATL_dzasum ATL_dzrefasum
+/* NOTE(review): atlas_reflevel1.h also declares ATL_crefrotg, ATL_csrefrot, ATL_zrefrotg and ATL_zdrefrot, but no alias for them is defined here -- confirm this is intentional. */
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_refalias2.h b/kaldi_io/src/tools/ATLAS/include/atlas_refalias2.h
new file mode 100644
index 0000000..5871491
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_refalias2.h
@@ -0,0 +1,79 @@
+#ifndef ATLAS_REFALIAS2_H
+#define ATLAS_REFALIAS2_H
+/*
+ * Real BLAS -- each Level 2 name below expands to the reference routine of the same operation declared in atlas_reflevel2.h (float group first)
+ */
+ #define ATL_sspr2 ATL_srefspr2
+ #define ATL_ssyr2 ATL_srefsyr2
+ #define ATL_sspr ATL_srefspr
+ #define ATL_ssyr ATL_srefsyr
+ #define ATL_sger ATL_srefger
+ #define ATL_stpsv ATL_sreftpsv
+ #define ATL_stbsv ATL_sreftbsv
+ #define ATL_strsv ATL_sreftrsv
+ #define ATL_stpmv ATL_sreftpmv
+ #define ATL_stbmv ATL_sreftbmv
+ #define ATL_strmv ATL_sreftrmv
+ #define ATL_sspmv ATL_srefspmv
+ #define ATL_ssbmv ATL_srefsbmv
+ #define ATL_ssymv ATL_srefsymv
+ #define ATL_sgbmv ATL_srefgbmv
+ #define ATL_sgemv ATL_srefgemv
+/* double group: mirrors the float group entry for entry */
+ #define ATL_dspr2 ATL_drefspr2
+ #define ATL_dsyr2 ATL_drefsyr2
+ #define ATL_dspr ATL_drefspr
+ #define ATL_dsyr ATL_drefsyr
+ #define ATL_dger ATL_drefger
+ #define ATL_dtpsv ATL_dreftpsv
+ #define ATL_dtbsv ATL_dreftbsv
+ #define ATL_dtrsv ATL_dreftrsv
+ #define ATL_dtpmv ATL_dreftpmv
+ #define ATL_dtbmv ATL_dreftbmv
+ #define ATL_dtrmv ATL_dreftrmv
+ #define ATL_dspmv ATL_drefspmv
+ #define ATL_dsbmv ATL_drefsbmv
+ #define ATL_dsymv ATL_drefsymv
+ #define ATL_dgbmv ATL_drefgbmv
+ #define ATL_dgemv ATL_drefgemv
+
+/*
+ * Complex BLAS -- c-prefixed targets take float pointers, z-prefixed targets take double pointers
+ */
+ #define ATL_chpr2 ATL_crefhpr2
+ #define ATL_cher2 ATL_crefher2
+ #define ATL_chpr ATL_crefhpr
+ #define ATL_cher ATL_crefher
+ #define ATL_cgerc ATL_crefgerc
+ #define ATL_cgeru ATL_crefgeru
+ #define ATL_ctpsv ATL_creftpsv
+ #define ATL_ctbsv ATL_creftbsv
+ #define ATL_ctrsv ATL_creftrsv
+ #define ATL_ctpmv ATL_creftpmv
+ #define ATL_ctbmv ATL_creftbmv
+ #define ATL_ctrmv ATL_creftrmv
+ #define ATL_chpmv ATL_crefhpmv
+ #define ATL_chbmv ATL_crefhbmv
+ #define ATL_chemv ATL_crefhemv
+ #define ATL_cgbmv ATL_crefgbmv
+ #define ATL_cgemv ATL_crefgemv
+/* z group: mirrors the c group entry for entry */
+ #define ATL_zhpr2 ATL_zrefhpr2
+ #define ATL_zher2 ATL_zrefher2
+ #define ATL_zhpr ATL_zrefhpr
+ #define ATL_zher ATL_zrefher
+ #define ATL_zgerc ATL_zrefgerc
+ #define ATL_zgeru ATL_zrefgeru
+ #define ATL_ztpsv ATL_zreftpsv
+ #define ATL_ztbsv ATL_zreftbsv
+ #define ATL_ztrsv ATL_zreftrsv
+ #define ATL_ztpmv ATL_zreftpmv
+ #define ATL_ztbmv ATL_zreftbmv
+ #define ATL_ztrmv ATL_zreftrmv
+ #define ATL_zhpmv ATL_zrefhpmv
+ #define ATL_zhbmv ATL_zrefhbmv
+ #define ATL_zhemv ATL_zrefhemv
+ #define ATL_zgbmv ATL_zrefgbmv
+ #define ATL_zgemv ATL_zrefgemv
+/* NOTE(review): atlas_reflevel2.h also declares the gpmv/gpr routines (ATL_[sd]refgpmv, ATL_[sd]refgpr, ATL_[cz]refgpmv, ATL_[cz]refgprc, ATL_[cz]refgpru) with no alias here -- confirm this is intentional. */
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_refalias3.h b/kaldi_io/src/tools/ATLAS/include/atlas_refalias3.h
new file mode 100644
index 0000000..f10e65c
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_refalias3.h
@@ -0,0 +1,43 @@
+#ifndef ATLAS_REFALIAS3_H
+#define ATLAS_REFALIAS3_H
+/*
+ * Real BLAS -- each name below expands to an ATL_?ref* symbol; NOTE(review): targets are presumably declared in atlas_reflevel3.h -- confirm
+ */
+ #define ATL_strsm ATL_sreftrsm
+ #define ATL_strmm ATL_sreftrmm
+ #define ATL_ssyr2k ATL_srefsyr2k
+ #define ATL_ssyrk ATL_srefsyrk
+ #define ATL_ssymm ATL_srefsymm
+ #define ATL_sgemm ATL_srefgemm
+/* d group: mirrors the s group entry for entry */
+ #define ATL_dtrsm ATL_dreftrsm
+ #define ATL_dtrmm ATL_dreftrmm
+ #define ATL_dsyr2k ATL_drefsyr2k
+ #define ATL_dsyrk ATL_drefsyrk
+ #define ATL_dsymm ATL_drefsymm
+ #define ATL_dgemm ATL_drefgemm
+
+/*
+ * Complex BLAS -- adds the her2k/herk/hemm aliases that the real groups above do not have
+ */
+ #define ATL_ctrsm ATL_creftrsm
+ #define ATL_ctrmm ATL_creftrmm
+ #define ATL_cher2k ATL_crefher2k
+ #define ATL_csyr2k ATL_crefsyr2k
+ #define ATL_cherk ATL_crefherk
+ #define ATL_csyrk ATL_crefsyrk
+ #define ATL_chemm ATL_crefhemm
+ #define ATL_csymm ATL_crefsymm
+ #define ATL_cgemm ATL_crefgemm
+/* z group: mirrors the c group entry for entry */
+ #define ATL_ztrsm ATL_zreftrsm
+ #define ATL_ztrmm ATL_zreftrmm
+ #define ATL_zher2k ATL_zrefher2k
+ #define ATL_zsyr2k ATL_zrefsyr2k
+ #define ATL_zherk ATL_zrefherk
+ #define ATL_zsyrk ATL_zrefsyrk
+ #define ATL_zhemm ATL_zrefhemm
+ #define ATL_zsymm ATL_zrefsymm
+ #define ATL_zgemm ATL_zrefgemm
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflevel1.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel1.h
new file mode 100644
index 0000000..2f79ac8
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel1.h
@@ -0,0 +1,421 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_REFLEVEL1_H
+#define ATLAS_REFLEVEL1_H
+/*
+ * =====================================================================
+ * Prototypes for Level 1 Reference ATLAS BLAS routines
+ * =====================================================================
+ */
+void ATL_srefrotg /* float group: 14 prototypes, ATL_srefrotg through ATL_dsrefdot; parameters are unnamed in this header */
+(
+ float *,
+ float *,
+ float *,
+ float *
+);
+
+void ATL_srefrotmg
+(
+ float *,
+ float *,
+ float *,
+ const float,
+ float *
+);
+/* ATL_srefnrm2, ATL_srefasum and ATL_isrefamax share one argument list (const int, const float *, const int); only the return type differs (float, float, int). */
+float ATL_srefnrm2
+(
+ const int,
+ const float *, const int
+);
+
+float ATL_srefasum
+(
+ const int,
+ const float *, const int
+);
+
+int ATL_isrefamax
+(
+ const int,
+ const float *, const int
+);
+/* The routines from here to ATL_srefrotm return void; their non-const float * arguments are the only ones the prototypes allow to be modified. */
+void ATL_srefscal
+(
+ const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefswap
+(
+ const int,
+ float *, const int,
+ float *, const int
+);
+
+void ATL_srefcopy
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefaxpy
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefrot
+(
+ const int,
+ float *, const int,
+ float *, const int,
+ const float,
+ const float
+);
+
+void ATL_srefrotm
+(
+ const int,
+ float *, const int,
+ float *, const int,
+ const float *
+);
+
+float ATL_srefdot
+(
+ const int,
+ const float *, const int,
+ const float *, const int
+);
+/* ATL_sdsrefdot and ATL_dsrefdot both take float vectors: the former takes an extra const float and returns float, the latter returns double. */
+float ATL_sdsrefdot
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int
+);
+
+double ATL_dsrefdot
+(
+ const int,
+ const float *, const int,
+ const float *, const int
+);
+/* double group: these 12 prototypes mirror ATL_srefrotg .. ATL_srefdot with double in place of float (there is no double counterpart of ATL_sdsrefdot / ATL_dsrefdot). */
+void ATL_drefrotg
+(
+ double *,
+ double *,
+ double *,
+ double *
+);
+
+void ATL_drefrotmg
+(
+ double *,
+ double *,
+ double *,
+ const double,
+ double *
+);
+
+double ATL_drefnrm2
+(
+ const int,
+ const double *, const int
+);
+
+double ATL_drefasum
+(
+ const int,
+ const double *, const int
+);
+
+int ATL_idrefamax
+(
+ const int,
+ const double *, const int
+);
+
+void ATL_drefscal
+(
+ const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefswap
+(
+ const int,
+ double *, const int,
+ double *, const int
+);
+
+void ATL_drefcopy
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefaxpy
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefrot
+(
+ const int,
+ double *, const int,
+ double *, const int,
+ const double,
+ const double
+);
+
+void ATL_drefrotm
+(
+ const int,
+ double *, const int,
+ double *, const int,
+ const double *
+);
+
+double ATL_drefdot
+(
+ const int,
+ const double *, const int,
+ const double *, const int
+);
+/* c group (12 prototypes): all data is passed through float pointers. NOTE(review): presumably interleaved real/imaginary pairs -- confirm against the implementation. */
+void ATL_crefrotg
+(
+ float *,
+ const float *,
+ float *,
+ float *
+);
+
+float ATL_screfnrm2
+(
+ const int,
+ const float *, const int
+);
+
+float ATL_screfasum
+(
+ const int,
+ const float *, const int
+);
+
+int ATL_icrefamax
+(
+ const int,
+ const float *, const int
+);
+
+void ATL_crefscal
+(
+ const int,
+ const float *,
+ float *, const int
+);
+/* ATL_csrefscal takes its second argument by value (const float), whereas ATL_crefscal above takes a const float * in that position. */
+void ATL_csrefscal
+(
+ const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefswap
+(
+ const int,
+ float *, const int,
+ float *, const int
+);
+
+void ATL_crefcopy
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefaxpy
+(
+ const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_csrefrot
+(
+ const int,
+ float *, const int,
+ float *, const int,
+ const float,
+ const float
+);
+/* The two *_sub dot products return void; the trailing float * is their only non-const argument. NOTE(review): presumably it receives the result -- confirm. */
+void ATL_crefdotc_sub
+(
+ const int,
+ const float *, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_crefdotu_sub
+(
+ const int,
+ const float *, const int,
+ const float *, const int,
+ float *
+);
+/* z group: these 12 prototypes mirror ATL_crefrotg .. ATL_crefdotu_sub with double in place of float. */
+void ATL_zrefrotg
+(
+ double *,
+ const double *,
+ double *,
+ double *
+);
+
+double ATL_dzrefnrm2
+(
+ const int,
+ const double *, const int
+);
+
+double ATL_dzrefasum
+(
+ const int,
+ const double *, const int
+);
+
+int ATL_izrefamax
+(
+ const int,
+ const double *, const int
+);
+
+void ATL_zrefscal
+(
+ const int,
+ const double *,
+ double *, const int
+);
+/* ATL_zdrefscal takes its second argument by value (const double), whereas ATL_zrefscal above takes a const double * in that position. */
+void ATL_zdrefscal
+(
+ const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefswap
+(
+ const int,
+ double *, const int,
+ double *, const int
+);
+
+void ATL_zrefcopy
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefaxpy
+(
+ const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zdrefrot
+(
+ const int,
+ double *, const int,
+ double *, const int,
+ const double,
+ const double
+);
+/* The two *_sub dot products return void; the trailing double * is their only non-const argument. NOTE(review): presumably it receives the result -- confirm. */
+void ATL_zrefdotc_sub
+(
+ const int,
+ const double *, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_zrefdotu_sub
+(
+ const int,
+ const double *, const int,
+ const double *, const int,
+ double *
+);
+
+#endif
+/*
+ * End of atlas_reflevel1.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflevel2.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel2.h
new file mode 100644
index 0000000..6158d17
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel2.h
@@ -0,0 +1,788 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_REFLEVEL2_H
+#define ATLAS_REFLEVEL2_H
+
+#include "atlas_enum.h"
+/*
+ * =====================================================================
+ * Prototypes for Level 2 Reference ATLAS BLAS routines
+ * =====================================================================
+ */
+void ATL_srefgbmv /* float group: 18 prototypes, all returning void; NOTE(review): the ATLAS_* enums are presumably declared in atlas_enum.h, the only header included here -- confirm */
+(
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+/* ATL_srefgpmv: same argument list as ATL_srefgemv below, preceded by an enum ATLAS_UPLO. */
+void ATL_srefgpmv
+(
+ const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgemv
+(
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+/* ATL_srefgpr: same argument list as ATL_srefger below, preceded by an enum ATLAS_UPLO. */
+void ATL_srefgpr
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefger
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefsbmv
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefspmv
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float,
+ const float *,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefspr
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float,
+ const float *, const int,
+ float *
+);
+
+void ATL_srefspr2
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_srefsymv
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyr
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefsyr2
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+/* tb*, tp* and tr* routines: each starts with (UPLO, TRANS, DIAG) enums; tb* take two ints, and tp* pass their first data pointer without a following int. */
+void ATL_sreftbmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_sreftpsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_sreftrmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *, const int
+);
+/* double group: these 18 prototypes mirror ATL_srefgbmv .. ATL_sreftrsv with double in place of float. */
+void ATL_drefgbmv
+(
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgpmv
+(
+ const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgemv
+(
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgpr
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefger
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefsbmv
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefspmv
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double,
+ const double *,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefspr
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double,
+ const double *, const int,
+ double *
+);
+
+void ATL_drefspr2
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_drefsymv
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyr
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefsyr2
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+/* tb*, tp* and tr* routines: each starts with (UPLO, TRANS, DIAG) enums; tb* take two ints, and tp* pass their first data pointer without a following int. */
+void ATL_dreftbmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_dreftpsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_dreftrmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *, const int
+);
+/* c group (20 prototypes, all returning void): every data argument is a float pointer; ATL_crefhpr and ATL_crefher are the only prototypes here with a by-value const float argument. */
+void ATL_crefgbmv
+(
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmv
+(
+ const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemv
+(
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+/* ATL_crefgprc / ATL_crefgpru: same argument lists as ATL_crefgerc / ATL_crefgeru below, preceded by an enum ATLAS_UPLO. */
+void ATL_crefgprc
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefgpru
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefgerc
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefgeru
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefhbmv
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhpmv
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float *,
+ const float *,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhpr
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float,
+ const float *, const int,
+ float *
+);
+
+void ATL_crefhpr2
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *
+);
+
+void ATL_crefhemv
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefher
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefher2
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+/* tb*, tp* and tr* routines: each starts with (UPLO, TRANS, DIAG) enums; tb* take two ints, and tp* pass their first data pointer without a following int. */
+void ATL_creftbmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_creftpsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_creftrmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const float *, const int,
+ float *, const int
+);
+/* z group: these 20 prototypes mirror ATL_crefgbmv .. ATL_creftrsv with double in place of float; ATL_zrefhpr and ATL_zrefher are the only ones with a by-value const double argument. */
+void ATL_zrefgbmv
+(
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmv
+(
+ const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemv
+(
+ const enum ATLAS_TRANS,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+/* ATL_zrefgprc / ATL_zrefgpru: same argument lists as ATL_zrefgerc / ATL_zrefgeru below, preceded by an enum ATLAS_UPLO. */
+void ATL_zrefgprc
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefgpru
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefgerc
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefgeru
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefhbmv
+(
+ const enum ATLAS_UPLO,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhpmv
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double *,
+ const double *,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhpr
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double,
+ const double *, const int,
+ double *
+);
+
+void ATL_zrefhpr2
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *
+);
+
+void ATL_zrefhemv
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefher
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefher2
+(
+ const enum ATLAS_UPLO,
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+/* tb*, tp* and tr* routines: each starts with (UPLO, TRANS, DIAG) enums; tb* take two ints, and tp* pass their first data pointer without a following int. */
+void ATL_zreftbmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zreftpsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zreftrmv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsv
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+#endif
+/*
+ * End of atlas_reflevel2.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflevel3.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel3.h
new file mode 100644
index 0000000..eba976b
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel3.h
@@ -0,0 +1,374 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_REFLEVEL3_H
+#define ATLAS_REFLEVEL3_H
+
+#include "atlas_enum.h"
+/*
+ * =====================================================================
+ * Prototypes for Level 3 Reference ATLAS BLAS routines
+ * =====================================================================
+ */
+void ATL_srefgemm /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xGEMM order (TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsymm /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xSYMM order (SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyrk /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xSYRK order (UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyr2k /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xSYR2K order (UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sreftrmm /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xTRMM order (SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, LDA, B, LDB) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsm /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xTRSM order (SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, LDA, B, LDB) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_drefgemm /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xGEMM order (TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsymm /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xSYMM order (SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyrk /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xSYRK order (UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyr2k /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xSYR2K order (UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dreftrmm /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xTRMM order (SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, LDA, B, LDB) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsm /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xTRSM order (SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, LDA, B, LDB) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_crefgemm /* Level 3 reference BLAS prototype (float operands; the scalar slots are pointers here). Arguments presumably follow BLAS xGEMM order (TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhemm /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xHEMM order (SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefherk /* Level 3 reference BLAS prototype (float operands; both scalar slots are passed by value). Arguments presumably follow BLAS xHERK order (UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefher2k /* Level 3 reference BLAS prototype (float operands; first scalar slot is a pointer, second is by value). Arguments presumably follow BLAS xHER2K order (UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefsymm /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xSYMM order (SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyrk /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xSYRK order (UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyr2k /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xSYR2K order (UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_creftrmm /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xTRMM order (SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, LDA, B, LDB) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsm /* Level 3 reference BLAS prototype (float operands). Arguments presumably follow BLAS xTRSM order (SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, LDA, B, LDB) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_zrefgemm /* Level 3 reference BLAS prototype (double operands; the scalar slots are pointers here). Arguments presumably follow BLAS xGEMM order (TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhemm /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xHEMM order (SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefherk /* Level 3 reference BLAS prototype (double operands; both scalar slots are passed by value). Arguments presumably follow BLAS xHERK order (UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefher2k /* Level 3 reference BLAS prototype (double operands; first scalar slot is a pointer, second is by value). Arguments presumably follow BLAS xHER2K order (UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefsymm /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xSYMM order (SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyrk /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xSYRK order (UPLO, TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyr2k /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xSYR2K order (UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zreftrmm /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xTRMM order (SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, LDA, B, LDB) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsm /* Level 3 reference BLAS prototype (double operands). Arguments presumably follow BLAS xTRSM order (SIDE, UPLO, TRANS, DIAG, M, N, ALPHA, A, LDA, B, LDB) - TODO confirm against the implementation. */
+(
+ const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+#endif
+/*
+ * End of atlas_reflevel3.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflvl2.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflvl2.h
new file mode 100644
index 0000000..c557f04
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflvl2.h
@@ -0,0 +1,3184 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_REFLVL2_H
+#define ATLAS_REFLVL2_H
+/*
+ * =====================================================================
+ * Prototypes for Level 2 Reference Internal ATLAS BLAS routines
+ * =====================================================================
+ */
+void ATL_srefgbmvN /* Level 2 reference internal prototype (float operands). Arguments presumably (M, N, KL, KU, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the N/T suffix presumably selects the transpose variant - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgbmvT /* Same parameter list as ATL_srefgbmvN. */
+(
+ const int, const int,
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgpmvUN /* Level 2 reference internal prototype (float operands). Arguments presumably (M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the two-letter suffix presumably selects UPLO and TRANS - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgpmvUT /* Same parameter list as ATL_srefgpmvUN. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgpmvLN /* Same parameter list as ATL_srefgpmvUN. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgpmvLT /* Same parameter list as ATL_srefgpmvUN. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgemvN /* Level 2 reference internal prototype (float operands). Arguments presumably (M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the N/T suffix presumably selects the transpose variant - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgemvT /* Same parameter list as ATL_srefgemvN. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgprL /* Level 2 reference internal prototype (float operands). Arguments presumably (M, N, ALPHA, X, INCX, Y, INCY, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefgprU /* Same parameter list as ATL_srefgprL. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefsbmvL /* Level 2 reference internal prototype (float operands). Arguments presumably (N, K, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsbmvU /* Same parameter list as ATL_srefsbmvL. */
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefspmvL /* Level 2 reference internal prototype (float operands). Arguments presumably (N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefspmvU /* Same parameter list as ATL_srefspmvL. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsprL /* Level 2 reference internal prototype (float operands). Arguments presumably (N, ALPHA, X, INCX, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefsprU /* Same parameter list as ATL_srefsprL. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefspr2L /* Level 2 reference internal prototype (float operands). Arguments presumably (N, ALPHA, X, INCX, Y, INCY, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefspr2U /* Same parameter list as ATL_srefspr2L. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefsymvL /* Level 2 reference internal prototype (float operands). Arguments presumably (N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsymvU /* Same parameter list as ATL_srefsymvL. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyrL /* Level 2 reference internal prototype (float operands). Arguments presumably (N, ALPHA, X, INCX, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefsyrU /* Same parameter list as ATL_srefsyrL. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefsyr2L /* Level 2 reference internal prototype (float operands). Arguments presumably (N, ALPHA, X, INCX, Y, INCY, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_srefsyr2U /* Same parameter list as ATL_srefsyr2L. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbmvLNN /* Level 2 reference internal prototype (float operands). Arguments presumably (N, K, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbmvLNU /* Same parameter list as ATL_sreftbmvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbmvLTN /* Same parameter list as ATL_sreftbmvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbmvLTU /* Same parameter list as ATL_sreftbmvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbmvUNN /* Same parameter list as ATL_sreftbmvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbmvUNU /* Same parameter list as ATL_sreftbmvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbmvUTN /* Same parameter list as ATL_sreftbmvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbmvUTU /* Same parameter list as ATL_sreftbmvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmvLNN /* Level 2 reference internal prototype (float operands). Arguments presumably (N, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmvLNU /* Same parameter list as ATL_sreftpmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmvLTN /* Same parameter list as ATL_sreftpmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmvLTU /* Same parameter list as ATL_sreftpmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmvUNN /* Same parameter list as ATL_sreftpmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmvUNU /* Same parameter list as ATL_sreftpmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmvUTN /* Same parameter list as ATL_sreftpmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpmvUTU /* Same parameter list as ATL_sreftpmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmvLNN /* Level 2 reference internal prototype (float operands). Arguments presumably (N, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmvLNU /* Same parameter list as ATL_sreftrmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmvLTN /* Same parameter list as ATL_sreftrmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmvLTU /* Same parameter list as ATL_sreftrmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmvUNN /* Same parameter list as ATL_sreftrmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmvUNU /* Same parameter list as ATL_sreftrmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmvUTN /* Same parameter list as ATL_sreftrmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmvUTU /* Same parameter list as ATL_sreftrmvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsvLNN /* Level 2 reference internal prototype (float operands). Arguments presumably (N, K, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsvLNU /* Same parameter list as ATL_sreftbsvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsvLTN /* Same parameter list as ATL_sreftbsvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsvLTU /* Same parameter list as ATL_sreftbsvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsvUNN /* Same parameter list as ATL_sreftbsvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsvUNU /* Same parameter list as ATL_sreftbsvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsvUTN /* Same parameter list as ATL_sreftbsvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftbsvUTU /* Same parameter list as ATL_sreftbsvLNN. */
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpsvLNN /* Level 2 reference internal prototype (float operands). Arguments presumably (N, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpsvLNU /* Same parameter list as ATL_sreftpsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpsvLTN /* Same parameter list as ATL_sreftpsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpsvLTU /* Same parameter list as ATL_sreftpsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpsvUNN /* Same parameter list as ATL_sreftpsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpsvUNU /* Same parameter list as ATL_sreftpsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpsvUTN /* Same parameter list as ATL_sreftpsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftpsvUTU /* Same parameter list as ATL_sreftpsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsvLNN /* Level 2 reference internal prototype (float operands). Arguments presumably (N, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsvLNU /* Same parameter list as ATL_sreftrsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsvLTN /* Same parameter list as ATL_sreftrsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsvLTU /* Same parameter list as ATL_sreftrsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsvUNN /* Same parameter list as ATL_sreftrsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsvUNU /* Same parameter list as ATL_sreftrsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsvUTN /* Same parameter list as ATL_sreftrsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsvUTU /* Same parameter list as ATL_sreftrsvLNN. */
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_drefgbmvN /* Level 2 reference internal prototype (double operands). Arguments presumably (M, N, KL, KU, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the N/T suffix presumably selects the transpose variant - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgbmvT /* Same parameter list as ATL_drefgbmvN. */
+(
+ const int, const int,
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgpmvUN /* Level 2 reference internal prototype (double operands). Arguments presumably (M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the two-letter suffix presumably selects UPLO and TRANS - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgpmvUT /* Same parameter list as ATL_drefgpmvUN. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgpmvLN /* Same parameter list as ATL_drefgpmvUN. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgpmvLT /* Same parameter list as ATL_drefgpmvUN. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgemvN /* Level 2 reference internal prototype (double operands). Arguments presumably (M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the N/T suffix presumably selects the transpose variant - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgemvT /* Same parameter list as ATL_drefgemvN. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgprL /* Level 2 reference internal prototype (double operands). Arguments presumably (M, N, ALPHA, X, INCX, Y, INCY, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefgprU /* Same parameter list as ATL_drefgprL. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefsbmvL /* Level 2 reference internal prototype (double operands). Arguments presumably (N, K, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsbmvU /* Same parameter list as ATL_drefsbmvL. */
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefspmvL /* Level 2 reference internal prototype (double operands). Arguments presumably (N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefspmvU /* Same parameter list as ATL_drefspmvL. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsprL /* Level 2 reference internal prototype (double operands). Arguments presumably (N, ALPHA, X, INCX, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefsprU /* Same parameter list as ATL_drefsprL. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefspr2L /* Level 2 reference internal prototype (double operands). Arguments presumably (N, ALPHA, X, INCX, Y, INCY, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefspr2U /* Same parameter list as ATL_drefspr2L. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefsymvL /* Level 2 reference internal prototype (double operands). Arguments presumably (N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsymvU /* Same parameter list as ATL_drefsymvL. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyrL /* Level 2 reference internal prototype (double operands). Arguments presumably (N, ALPHA, X, INCX, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefsyrU /* Same parameter list as ATL_drefsyrL. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefsyr2L /* Level 2 reference internal prototype (double operands). Arguments presumably (N, ALPHA, X, INCX, Y, INCY, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_drefsyr2U /* Same parameter list as ATL_drefsyr2L. */
+(
+ const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbmvLNN /* Level 2 reference internal prototype (double operands). Arguments presumably (N, K, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbmvLNU /* Same parameter list as ATL_dreftbmvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbmvLTN /* Same parameter list as ATL_dreftbmvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbmvLTU /* Same parameter list as ATL_dreftbmvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbmvUNN /* Same parameter list as ATL_dreftbmvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbmvUNU /* Same parameter list as ATL_dreftbmvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbmvUTN /* Same parameter list as ATL_dreftbmvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbmvUTU /* Same parameter list as ATL_dreftbmvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmvLNN /* Level 2 reference internal prototype (double operands). Arguments presumably (N, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmvLNU /* Same parameter list as ATL_dreftpmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmvLTN /* Same parameter list as ATL_dreftpmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmvLTU /* Same parameter list as ATL_dreftpmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmvUNN /* Same parameter list as ATL_dreftpmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmvUNU /* Same parameter list as ATL_dreftpmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmvUTN /* Same parameter list as ATL_dreftpmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpmvUTU /* Same parameter list as ATL_dreftpmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmvLNN /* Level 2 reference internal prototype (double operands). Arguments presumably (N, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmvLNU /* Same parameter list as ATL_dreftrmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmvLTN /* Same parameter list as ATL_dreftrmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmvLTU /* Same parameter list as ATL_dreftrmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmvUNN /* Same parameter list as ATL_dreftrmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmvUNU /* Same parameter list as ATL_dreftrmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmvUTN /* Same parameter list as ATL_dreftrmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmvUTU /* Same parameter list as ATL_dreftrmvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsvLNN /* Level 2 reference internal prototype (double operands). Arguments presumably (N, K, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsvLNU /* Same parameter list as ATL_dreftbsvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsvLTN /* Same parameter list as ATL_dreftbsvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsvLTU /* Same parameter list as ATL_dreftbsvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsvUNN /* Same parameter list as ATL_dreftbsvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsvUNU /* Same parameter list as ATL_dreftbsvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsvUTN /* Same parameter list as ATL_dreftbsvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftbsvUTU /* Same parameter list as ATL_dreftbsvLNN. */
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpsvLNN /* Level 2 reference internal prototype (double operands). Arguments presumably (N, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpsvLNU /* Same parameter list as ATL_dreftpsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpsvLTN /* Same parameter list as ATL_dreftpsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpsvLTU /* Same parameter list as ATL_dreftpsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpsvUNN /* Same parameter list as ATL_dreftpsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpsvUNU /* Same parameter list as ATL_dreftpsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpsvUTN /* Same parameter list as ATL_dreftpsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftpsvUTU /* Same parameter list as ATL_dreftpsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsvLNN /* Level 2 reference internal prototype (double operands). Arguments presumably (N, A, LDA, X, INCX); the three-letter suffix presumably selects UPLO, TRANS and DIAG - TODO confirm against the implementation. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsvLNU /* Same parameter list as ATL_dreftrsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsvLTN /* Same parameter list as ATL_dreftrsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsvLTU /* Same parameter list as ATL_dreftrsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsvUNN /* Same parameter list as ATL_dreftrsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsvUNU /* Same parameter list as ATL_dreftrsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsvUTN /* Same parameter list as ATL_dreftrsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsvUTU /* Same parameter list as ATL_dreftrsvLNN. */
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_crefgbmvN /* Level 2 reference internal prototype (float operands; the scalar slots are pointers here). Arguments presumably (M, N, KL, KU, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the N/T/C/H suffix presumably selects the transpose/conjugate variant - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgbmvT /* Same parameter list as ATL_crefgbmvN. */
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgbmvC /* Same parameter list as ATL_crefgbmvN. */
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgbmvH /* Same parameter list as ATL_crefgbmvN. */
+(
+ const int, const int,
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmvUN /* Level 2 reference internal prototype (float operands; the scalar slots are pointers here). Arguments presumably (M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the two-letter suffix presumably selects UPLO and the transpose/conjugate variant - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmvUT /* Same parameter list as ATL_crefgpmvUN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmvUC /* Same parameter list as ATL_crefgpmvUN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmvUH /* Same parameter list as ATL_crefgpmvUN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmvLN /* Same parameter list as ATL_crefgpmvUN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmvLT /* Same parameter list as ATL_crefgpmvUN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmvLC /* Same parameter list as ATL_crefgpmvUN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgpmvLH /* Same parameter list as ATL_crefgpmvUN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemvN /* Level 2 reference internal prototype (float operands; the scalar slots are pointers here). Arguments presumably (M, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the N/T/C/H suffix presumably selects the transpose/conjugate variant - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemvT /* Same parameter list as ATL_crefgemvN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemvC /* Same parameter list as ATL_crefgemvN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemvH /* Same parameter list as ATL_crefgemvN. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgprcL /* Level 2 reference internal prototype (float operands). Arguments presumably (M, N, ALPHA, X, INCX, Y, INCY, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefgprcU /* Same parameter list as ATL_crefgprcL. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefgpruL /* Level 2 reference internal prototype (float operands). Arguments presumably (M, N, ALPHA, X, INCX, Y, INCY, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefgpruU /* Same parameter list as ATL_crefgpruL. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefhbmvL /* Level 2 reference internal prototype (float operands; the scalar slots are pointers here). Arguments presumably (N, K, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhbmvU /* Same parameter list as ATL_crefhbmvL. */
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhpmvL /* Level 2 reference internal prototype (float operands; the scalar slots are pointers here). Arguments presumably (N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhpmvU /* Same parameter list as ATL_crefhpmvL. */
+(
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhprL /* Level 2 reference internal prototype (float operands; the scalar slot is passed by value). Arguments presumably (N, ALPHA, X, INCX, A, LDA); the L/U suffix presumably selects UPLO - TODO confirm against the implementation. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefhprU /* Same parameter list as ATL_crefhprL. */
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefhpr2L
+(
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefhpr2U
+(
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefhemvL
+(
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhemvU
+(
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefherL
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefherU
+(
+ const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefher2L
+(
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_crefher2U
+(
+ const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvLNN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvLNU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvLTN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvLTU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvLCN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvLCU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvLHN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvLHU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvUNN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvUNU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvUTN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvUTU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvUCN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvUCU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvUHN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbmvUHU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvLNN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvLNU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvLTN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvLTU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvLCN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvLCU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvLHN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvLHU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvUNN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvUNU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvUTN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvUTU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvUCN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvUCU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvUHN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpmvUHU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvLNN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvLNU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvLTN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvLTU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvLCN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvLCU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvLHN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvLHU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvUNN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvUNU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvUTN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvUTU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvUCN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvUCU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvUHN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmvUHU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvLNN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvLNU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvLTN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvLTU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvLCN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvLCU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvLHN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvLHU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvUNN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvUNU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvUTN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvUTU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvUCN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvUCU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvUHN
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftbsvUHU
+(
+ const int, const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvLNN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvLNU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvLTN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvLTU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvLCN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvLCU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvLHN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvLHU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvUNN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvUNU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvUTN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvUTU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvUCN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvUCU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvUHN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftpsvUHU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvLNN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvLNU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvLTN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvLTU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvLCN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvLCU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvLHN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvLHU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvUNN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvUNU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvUTN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvUTU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvUCN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvUCU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvUHN
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsvUHU
+(
+ const int,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_zrefgbmvN
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgbmvT
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgbmvC
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgbmvH
+(
+ const int, const int,
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmvUN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmvUT
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmvUC
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmvUH
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmvLN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmvLT
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmvLC
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgpmvLH
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemvN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemvT
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemvC
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemvH
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgprcL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefgprcU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefgpruL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefgpruU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefhbmvL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhbmvU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhpmvL
+(
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhpmvU
+(
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhprL
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefhprU
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefhpr2L
+(
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefhpr2U
+(
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefhemvL
+(
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhemvU
+(
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefherL
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefherU
+(
+ const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefher2L
+(
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zrefher2U
+(
+ const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvLNN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvLNU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvLTN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvLTU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvLCN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvLCU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvLHN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvLHU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvUNN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvUNU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvUTN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvUTU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvUCN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvUCU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvUHN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbmvUHU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvLNN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvLNU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvLTN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvLTU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvLCN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvLCU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvLHN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvLHU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvUNN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvUNU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvUTN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvUTU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvUCN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvUCU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvUHN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpmvUHU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvLNN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvLNU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvLTN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvLTU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvLCN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvLCU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvLHN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvLHU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvUNN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvUNU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvUTN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvUTU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvUCN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvUCU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvUHN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmvUHU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvLNN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvLNU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvLTN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvLTU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvLCN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvLCU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvLHN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvLHU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvUNN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvUNU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvUTN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvUTU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvUCN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvUCU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvUHN
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftbsvUHU
+(
+ const int, const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvLNN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvLNU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvLTN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvLTU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvLCN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvLCU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvLHN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvLHU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvUNN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvUNU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvUTN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvUTU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvUCN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvUCU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvUHN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftpsvUHU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvLNN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvLNU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvLTN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvLTU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvLCN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvLCU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvLHN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvLHU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvUNN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvUNU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvUTN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvUTU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvUCN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvUCU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvUHN
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsvUHU
+(
+ const int,
+ const double *, const int,
+ double *, const int
+);
+
+#endif
+/*
+ * End of atlas_reflvl2.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflvl3.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflvl3.h
new file mode 100644
index 0000000..0451ff9
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflvl3.h
@@ -0,0 +1,2292 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATLAS_REFLVL3_H
+#define ATLAS_REFLVL3_H
+/*
+ * =====================================================================
+ * Prototypes for Level 3 Reference Internal ATLAS BLAS routines
+ * =====================================================================
+ */
+void ATL_srefgemmNN
+(
+ const int, const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgemmNT
+(
+ const int, const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgemmTN
+(
+ const int, const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefgemmTT
+(
+ const int, const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsymmLL
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsymmLU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsymmRL
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsymmRU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyrkLN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyrkLT
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyrkUN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyrkUT
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyr2kLN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyr2kLT
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyr2kUN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_srefsyr2kUT
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_sreftrmmLLNN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmLLNU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmLLTN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmLLTU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmLUNN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmLUNU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmLUTN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmLUTU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmRLNN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmRLNU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmRLTN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmRLTU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmRUNN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmRUNU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmRUTN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrmmRUTU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmLLNN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmLLNU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmLLTN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmLLTU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmLUNN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmLUNU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmLUTN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmLUTU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmRLNN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmRLNU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmRLTN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmRLTU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmRUNN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmRUNU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmRUTN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_sreftrsmRUTU
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_drefgemmNN
+(
+ const int, const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgemmNT
+(
+ const int, const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgemmTN
+(
+ const int, const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefgemmTT
+(
+ const int, const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsymmLL
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsymmLU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsymmRL
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsymmRU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyrkLN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyrkLT
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyrkUN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyrkUT
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyr2kLN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyr2kLT
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyr2kUN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_drefsyr2kUT
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_dreftrmmLLNN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmLLNU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmLLTN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmLLTU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmLUNN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmLUNU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmLUTN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmLUTU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmRLNN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmRLNU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmRLTN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmRLTU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmRUNN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmRUNU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmRUTN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrmmRUTU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmLLNN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmLLNU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmLLTN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmLLTU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmLUNN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmLUNU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmLUTN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmLUTU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmRLNN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmRLNU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmRLTN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmRLTU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmRUNN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmRUNU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmRUTN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_dreftrsmRUTU
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_crefgemmNN
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemmNT
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemmNC
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemmTN
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemmTT
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemmTC
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemmCN
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemmCT
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefgemmCC
+(
+ const int, const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhemmLL
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhemmLU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhemmRL
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefhemmRU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefherkLN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefherkLC
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefherkUN
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefherkUC
+(
+ const int, const int,
+ const float,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefher2kLN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefher2kLC
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefher2kUN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefher2kUC
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float,
+ float *, const int
+);
+
+void ATL_crefsymmLL
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsymmLU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsymmRL
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsymmRU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyrkLN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyrkLT
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyrkUN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyrkUT
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyr2kLN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyr2kLT
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyr2kUN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_crefsyr2kUT
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ const float *, const int,
+ const float *,
+ float *, const int
+);
+
+void ATL_creftrmmLLNN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLLNU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLLTN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLLTU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLLCN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLLCU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLUNN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLUNU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLUTN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLUTU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLUCN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmLUCU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRLNN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRLNU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRLTN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRLTU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRLCN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRLCU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRUNN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRUNU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRUTN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRUTU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRUCN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrmmRUCU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLLNN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLLNU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLLTN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLLTU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLLCN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLLCU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLUNN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLUNU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLUTN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLUTU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLUCN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmLUCU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRLNN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRLNU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRLTN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRLTU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRLCN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRLCU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRUNN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRUNU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRUTN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRUTU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRUCN
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_creftrsmRUCU
+(
+ const int, const int,
+ const float *,
+ const float *, const int,
+ float *, const int
+);
+
+void ATL_zrefgemmNN
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemmNT
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemmNC
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemmTN
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemmTT
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemmTC
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemmCN
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemmCT
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefgemmCC
+(
+ const int, const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhemmLL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhemmLU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhemmRL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefhemmRU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefherkLN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefherkLC
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefherkUN
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefherkUC
+(
+ const int, const int,
+ const double,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefher2kLN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefher2kLC
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefher2kUN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefher2kUC
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double,
+ double *, const int
+);
+
+void ATL_zrefsymmLL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsymmLU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsymmRL
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsymmRU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyrkLN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyrkLT
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyrkUN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyrkUT
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyr2kLN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyr2kLT
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyr2kUN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zrefsyr2kUT
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ const double *, const int,
+ const double *,
+ double *, const int
+);
+
+void ATL_zreftrmmLLNN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLLNU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLLTN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLLTU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLLCN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLLCU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLUNN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLUNU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLUTN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLUTU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLUCN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmLUCU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRLNN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRLNU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRLTN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRLTU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRLCN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRLCU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRUNN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRUNU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRUTN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRUTU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRUCN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrmmRUCU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLLNN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLLNU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLLTN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLLTU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLLCN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLLCU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLUNN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLUNU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLUTN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLUTU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLUCN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmLUCU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRLNN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRLNU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRLTN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRLTU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRLCN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRLCU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRUNN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRUNU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRUTN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRUTU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRUCN
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+void ATL_zreftrsmRUCU
+(
+ const int, const int,
+ const double *,
+ const double *, const int,
+ double *, const int
+);
+
+#endif
+/*
+ * End of atlas_reflvl3.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_refmisc.h b/kaldi_io/src/tools/ATLAS/include/atlas_refmisc.h
new file mode 100644
index 0000000..d8b600e
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_refmisc.h
@@ -0,0 +1,367 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+#ifndef ATL_REFMISC_H
+#define ATL_REFMISC_H
+/*
+ * =====================================================================
+ * Include files
+ * =====================================================================
+ */
+#include <math.h>
+#include "atlas_enum.h"
+/*
+ * =====================================================================
+ * #define macro constants
+ * =====================================================================
+ */
+#define ATL_sNONE (-1.0f) /* float literals for -1, -2, 1 and 0, used by the Ms* / Mc* macros below */
+#define ATL_sNTWO (-2.0f)
+#define ATL_sONE ( 1.0f)
+#define ATL_sZERO ( 0.0f)
+/* double literals for the same four values, used by the Md* / Mz* macros below */
+#define ATL_dNONE (-1.0)
+#define ATL_dNTWO (-2.0)
+#define ATL_dONE ( 1.0)
+#define ATL_dZERO ( 0.0)
+/*
+ * =====================================================================
+ * #define macro functions
+ * =====================================================================
+ */
+#define Msabs( a_ ) ( ( (a_) < ATL_sZERO ) ? -(a_) : (a_) ) /* absolute value; a_ is evaluated twice */
+/* Mszero: non-zero when the (real, imag) pair is exactly 0 + 0i; exact == comparison, no tolerance. */
+#define Mszero( a_r_, a_i_ ) \
+ ( ( (a_r_) == ATL_sZERO ) && ( (a_i_) == ATL_sZERO ) )
+/* Msone: non-zero when the (real, imag) pair is exactly 1 + 0i; exact == comparison, no tolerance. */
+#define Msone( a_r_, a_i_ ) \
+ ( ( (a_r_) == ATL_sONE ) && ( (a_i_) == ATL_sZERO ) )
+/* Msscl: (c_r_ + i*c_i_) *= (a_r_ + i*a_i_) in float; c_r_/c_i_ are assigned, so they must be lvalues. Expands to a brace block, not an expression. */
+#define Msscl( a_r_, a_i_, c_r_, c_i_ ) \
+ { \
+ register float tmp_r_, tmp_i_; /* temporaries so both products read the old c_r_/c_i_ */ \
+ tmp_r_ = (a_r_) * c_r_ - (a_i_) * c_i_; \
+ tmp_i_ = (a_r_) * c_i_ + (a_i_) * c_r_; \
+ c_r_ = tmp_r_; \
+ c_i_ = tmp_i_; \
+ }
+/*
+ * Msdiv performs complex division in real arithmetic, overwriting a:
+ * a_r_ + i * a_i_ = ( a_r_ + i * a_i_ ) / ( b_r_ + i * b_i_ );
+ * The algorithm is due to Robert L. Smith and can be found in D. Knuth,
+ * The Art of Computer Programming, Vol.2, p.195
+ */
+#define Msdiv( b_r_, b_i_, a_r_, a_i_ ) \
+ { \
+ register float c_i_, c_r_, tmp1_, tmp2_; \
+ if( Msabs( b_i_ ) < Msabs( b_r_ ) ) /* |b_i_| < |b_r_|: use the ratio b_i_/b_r_ */ \
+ { \
+ tmp1_ = (b_i_) / (b_r_); \
+ tmp2_ = (b_r_) + (b_i_) * tmp1_; /* shared denominator */ \
+ c_r_ = ( (a_r_) + (a_i_) * tmp1_ ) / tmp2_; \
+ c_i_ = ( (a_i_) - (a_r_) * tmp1_ ) / tmp2_; \
+ } \
+ else /* otherwise (ties included) use b_r_/b_i_, so b_i_ is the divisor */ \
+ { \
+ tmp1_ = (b_r_) / (b_i_); \
+ tmp2_ = (b_i_) + (b_r_) * tmp1_; /* shared denominator */ \
+ c_r_ = ( (a_i_) + (a_r_) * tmp1_ ) / tmp2_; \
+ c_i_ = ( -(a_r_) + (a_i_) * tmp1_ ) / tmp2_; \
+ } \
+ a_r_ = c_r_; /* stored only after both parts are computed; a_r_/a_i_ must be lvalues */ \
+ a_i_ = c_i_; \
+ }
+/* Mdabs: absolute value against the double zero constant; a_ is evaluated twice. */
+#define Mdabs( a_ ) ( ( (a_) < ATL_dZERO ) ? -(a_) : (a_) )
+/* Mdzero: non-zero when the (real, imag) pair is exactly 0 + 0i; exact == comparison, no tolerance. */
+#define Mdzero( a_r_, a_i_ ) \
+ ( ( (a_r_) == ATL_dZERO ) && ( (a_i_) == ATL_dZERO ) )
+/* Mdone: non-zero when the (real, imag) pair is exactly 1 + 0i; exact == comparison, no tolerance. */
+#define Mdone( a_r_, a_i_ ) \
+ ( ( (a_r_) == ATL_dONE ) && ( (a_i_) == ATL_dZERO ) )
+/* Mdscl: (c_r_ + i*c_i_) *= (a_r_ + i*a_i_) in double; c_r_/c_i_ are assigned, so they must be lvalues. Expands to a brace block, not an expression. */
+#define Mdscl( a_r_, a_i_, c_r_, c_i_ ) \
+ { \
+ register double tmp_r_, tmp_i_; /* temporaries so both products read the old c_r_/c_i_ */ \
+ tmp_r_ = (a_r_) * c_r_ - (a_i_) * c_i_; \
+ tmp_i_ = (a_r_) * c_i_ + (a_i_) * c_r_; \
+ c_r_ = tmp_r_; \
+ c_i_ = tmp_i_; \
+ }
+/*
+ * Mddiv performs complex division in real arithmetic, overwriting a:
+ * a_r_ + i * a_i_ = ( a_r_ + i * a_i_ ) / ( b_r_ + i * b_i_ );
+ * The algorithm is due to Robert L. Smith and can be found in D. Knuth,
+ * The Art of Computer Programming, Vol.2, p.195
+ */
+#define Mddiv( b_r_, b_i_, a_r_, a_i_ ) \
+ { \
+ register double c_i_, c_r_, tmp1_, tmp2_; \
+ if( Mdabs( b_i_ ) < Mdabs( b_r_ ) ) /* |b_i_| < |b_r_|: use the ratio b_i_/b_r_ */ \
+ { \
+ tmp1_ = (b_i_) / (b_r_); \
+ tmp2_ = (b_r_) + (b_i_) * tmp1_; /* shared denominator */ \
+ c_r_ = ( (a_r_) + (a_i_) * tmp1_ ) / tmp2_; \
+ c_i_ = ( (a_i_) - (a_r_) * tmp1_ ) / tmp2_; \
+ } \
+ else /* otherwise (ties included) use b_r_/b_i_, so b_i_ is the divisor */ \
+ { \
+ tmp1_ = (b_r_) / (b_i_); \
+ tmp2_ = (b_i_) + (b_r_) * tmp1_; /* shared denominator */ \
+ c_r_ = ( (a_i_) + (a_r_) * tmp1_ ) / tmp2_; \
+ c_i_ = ( -(a_r_) + (a_i_) * tmp1_ ) / tmp2_; \
+ } \
+ a_r_ = c_r_; /* stored only after both parts are computed; a_r_/a_i_ must be lvalues */ \
+ a_i_ = c_i_; \
+ }
+/* Mmin: the smaller of a_ and b_ (b_ on ties); the selected argument is evaluated twice. */
+#define Mmin( a_, b_ ) ( ( (a_) < (b_) ) ? (a_) : (b_) )
+/* Mmax: the larger of a_ and b_ (b_ on ties); the selected argument is evaluated twice. */
+#define Mmax( a_, b_ ) ( ( (a_) > (b_) ) ? (a_) : (b_) )
+/* Mmul: c = a * b (complex). c_r_ is stored before c_i_ is computed from a_r_/b_r_, so c must not alias a or b. */
+#define Mmul( a_r_, a_i_, b_r_, b_i_, c_r_, c_i_ ) \
+ { \
+ c_r_ = (a_r_) * (b_r_) - (a_i_) * (b_i_); \
+ c_i_ = (a_r_) * (b_i_) + (a_i_) * (b_r_); \
+ }
+/* Mmla: c += a * b (complex multiply-add). c_r_ is updated before c_i_ is computed, so c must not alias a or b. */
+#define Mmla( a_r_, a_i_, b_r_, b_i_, c_r_, c_i_ ) \
+ { \
+ c_r_ += (a_r_) * (b_r_) - (a_i_) * (b_i_); \
+ c_i_ += (a_r_) * (b_i_) + (a_i_) * (b_r_); \
+ }
+/* Mmls: c -= a * b (complex multiply-subtract). c_r_ is updated before c_i_ is computed, so c must not alias a or b. */
+#define Mmls( a_r_, a_i_, b_r_, b_i_, c_r_, c_i_ ) \
+ { \
+ c_r_ -= (a_r_) * (b_r_) - (a_i_) * (b_i_); \
+ c_i_ -= (a_r_) * (b_i_) + (a_i_) * (b_r_); \
+ }
+/* Mset: copy the pair (a_r_, a_i_) into (b_r_, b_i_) -- source first, destination second; b_r_/b_i_ must be lvalues. */
+#define Mset( a_r_, a_i_, b_r_, b_i_ ) \
+ { \
+ b_r_ = (a_r_); \
+ b_i_ = (a_i_); \
+ }
+/* Mselscal: scale the float lvalue a_ by al_. al_ == 0 stores 0 outright (no multiply), al_ == 1 leaves a_ untouched. */
+#define Mselscal( al_, a_ ) \
+ { \
+ if( (al_) == ATL_sZERO ) { (a_) = ATL_sZERO; } \
+ else if( (al_) != ATL_sONE ) { (a_) *= (al_); } \
+ }
+/* Mdelscal: scale the double lvalue a_ by al_. al_ == 0 stores 0 outright (no multiply), al_ == 1 leaves a_ untouched. */
+#define Mdelscal( al_, a_ ) \
+ { \
+ if( (al_) == ATL_dZERO ) { (a_) = ATL_dZERO; } \
+ else if( (al_) != ATL_dONE ) { (a_) *= (al_); } \
+ }
+/* Mcelscal: scale the complex float pair (a_r_, a_i_) in place by (al_r_, al_i_), with shortcuts for al == 0 and al == 1. */
+#define Mcelscal( al_r_, al_i_, a_r_, a_i_ ) \
+ { \
+ if( Mszero( (al_r_), (al_i_) ) ) /* al == 0: store zeros, no multiply */ \
+ { (a_r_) = (a_i_) = ATL_sZERO; } \
+ else if( ! Msone( (al_r_), (al_i_) ) ) /* al == 1: nothing to do */ \
+ { Msscl( (al_r_), (al_i_), (a_r_), (a_i_) ); } \
+ }
+/* Mzelscal: scale the complex double pair (a_r_, a_i_) in place by (al_r_, al_i_), with shortcuts for al == 0 and al == 1. */
+#define Mzelscal( al_r_, al_i_, a_r_, a_i_ ) \
+ { \
+ if( Mdzero( (al_r_), (al_i_) ) ) /* al == 0: store zeros, no multiply */ \
+ { (a_r_) = (a_i_) = ATL_dZERO; } \
+ else if( ! Mdone( (al_r_), (al_i_) ) ) /* al == 1: nothing to do */ \
+ { Mdscl( (al_r_), (al_i_), (a_r_), (a_i_) ); } \
+ }
+/* Msvscal: scale n_ float elements x_[0], x_[incx_], x_[2*incx_], ... by al_; al_ == 0 zero-fills, al_ == 1 is a no-op. */
+#define Msvscal( n_, al_, x_, incx_ ) \
+ { \
+ int i_, ix_; /* i_ counts elements, ix_ is the running index into x_ (always starts at 0) */ \
+ if( (al_) == ATL_sZERO ) \
+ { \
+ for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx_) ) \
+ { (x_)[ix_] = ATL_sZERO; } \
+ } \
+ else if( (al_) != ATL_sONE ) \
+ { \
+ for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx_) ) \
+ { (x_)[ix_] *= (al_); } \
+ } \
+ }
+/* Mdvscal: scale n_ double elements x_[0], x_[incx_], x_[2*incx_], ... by al_; al_ == 0 zero-fills, al_ == 1 is a no-op. */
+#define Mdvscal( n_, al_, x_, incx_ ) \
+ { \
+ int i_, ix_; /* i_ counts elements, ix_ is the running index into x_ (always starts at 0) */ \
+ if( (al_) == ATL_dZERO ) \
+ { \
+ for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx_) ) \
+ { (x_)[ix_] = ATL_dZERO; } \
+ } \
+ else if( (al_) != ATL_dONE ) \
+ { \
+ for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx_) ) \
+ { (x_)[ix_] *= (al_); } \
+ } \
+ }
+/* Mcvscal: scale n_ complex float elements by al_[0] + i*al_[1]; x_ is read as pairs (x_[ix_], x_[ix_+1]) stepping 2*incx_. */
+#define Mcvscal( n_, al_, x_, incx_ ) \
+ { \
+ int i_, ix_, incx2_ = ( 2 * (incx_) ); /* incx2_: stride between consecutive pairs */ \
+ if( Mszero( (al_)[0], (al_)[1] ) ) /* al == 0: zero-fill */ \
+ { \
+ for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx2_) ) \
+ { (x_)[ix_] = (x_)[ix_+1] = ATL_sZERO; } \
+ } \
+ else if( ! Msone( (al_)[0], (al_)[1] ) ) /* al == 1: nothing to do */ \
+ { \
+ for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx2_) ) \
+ { Msscl( (al_)[0], (al_)[1], (x_)[ix_], (x_)[ix_+1] ); } \
+ } \
+ }
+/* Mzvscal: scale n_ complex double elements by al_[0] + i*al_[1]; x_ is read as pairs (x_[ix_], x_[ix_+1]) stepping 2*incx_. */
+#define Mzvscal( n_, al_, x_, incx_ ) \
+ { \
+ int i_, ix_, incx2_ = ( 2 * (incx_) ); /* incx2_: stride between consecutive pairs */ \
+ if( Mdzero( (al_)[0], (al_)[1] ) ) /* al == 0: zero-fill */ \
+ { \
+ for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx2_) ) \
+ { (x_)[ix_] = (x_)[ix_+1] = ATL_dZERO; } \
+ } \
+ else if( ! Mdone( (al_)[0], (al_)[1] ) ) /* al == 1: nothing to do */ \
+ { \
+ for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx2_) ) \
+ { Mdscl( (al_)[0], (al_)[1], (x_)[ix_], (x_)[ix_+1] ); } \
+ } \
+ }
+/* Msgescal: scale the float entries a_[j*lda_ + i], 0 <= i < m_, 0 <= j < n_, by al_; al_ == 0 zero-fills, al_ == 1 is a no-op. */
+#define Msgescal( m_, n_, al_, a_, lda_ ) \
+ { \
+ int i_, iaij_, j_, jaj_; /* jaj_: start index of the j_-th run of m_ entries; iaij_: index of entry (i_, j_) */ \
+ if( (al_) == ATL_sZERO ) \
+ { \
+ for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += (lda_) ) \
+ { \
+ for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 1 ) \
+ { (a_)[iaij_] = ATL_sZERO; } \
+ } \
+ } \
+ else if( (al_) != ATL_sONE ) \
+ { \
+ for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += (lda_) ) \
+ { \
+ for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 1 ) \
+ { (a_)[iaij_] *= (al_); } \
+ } \
+ } \
+ }
+/* Mdgescal: scale the double entries a_[j*lda_ + i], 0 <= i < m_, 0 <= j < n_, by al_; al_ == 0 zero-fills, al_ == 1 is a no-op. */
+#define Mdgescal( m_, n_, al_, a_, lda_ ) \
+ { \
+ int i_, iaij_, j_, jaj_; /* jaj_: start index of the j_-th run of m_ entries; iaij_: index of entry (i_, j_) */ \
+ if( (al_) == ATL_dZERO ) \
+ { \
+ for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += (lda_) ) \
+ { \
+ for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 1 ) \
+ { (a_)[iaij_] = ATL_dZERO; } \
+ } \
+ } \
+ else if( (al_) != ATL_dONE ) \
+ { \
+ for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += (lda_) ) \
+ { \
+ for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 1 ) \
+ { (a_)[iaij_] *= (al_); } \
+ } \
+ } \
+ }
+/* Mcgescal: scale the m_ x n_ complex float block a_ (pairs a_[k], a_[k+1]; 2*lda_ floats between runs) in place by al_[0] + i*al_[1]. */
+#define Mcgescal( m_, n_, al_, a_, lda_ ) \
+ { \
+ /* al_, a_, m_ and n_ are expanded inside the loops: pass side-effect-free expressions */ \
+ /* 2 * (lda_), not (lda_) << 1: same value for lda_ >= 0, matches Mcvscal, and is not undefined for a negative int */ \
+ int i_, iaij_, j_, jaj_, lda2_ = ( 2 * (lda_) ); \
+ if( Mszero( (al_)[0], (al_)[1] ) ) /* al == 0: store exact zeros, no multiply */ \
+ { \
+ for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += lda2_ ) \
+ { \
+ for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 2 ) \
+ { (a_)[iaij_] = (a_)[iaij_+1] = ATL_sZERO; } \
+ } \
+ } \
+ else if( ! Msone( (al_)[0], (al_)[1] ) ) /* al == 1: nothing to do */ \
+ { \
+ for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += lda2_ ) \
+ { \
+ for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 2 ) \
+ { Msscl( (al_)[0], (al_)[1], (a_)[iaij_], (a_)[iaij_+1] ); } \
+ } \
+ } \
+ }
+/* Mzgescal: scale the m_ x n_ complex double block a_ (pairs a_[k], a_[k+1]; 2*lda_ doubles between runs) in place by al_[0] + i*al_[1]. */
+#define Mzgescal( m_, n_, al_, a_, lda_ ) \
+ { \
+ /* al_, a_, m_ and n_ are expanded inside the loops: pass side-effect-free expressions */ \
+ /* 2 * (lda_), not (lda_) << 1: same value for lda_ >= 0, matches Mzvscal, and is not undefined for a negative int */ \
+ int i_, iaij_, j_, jaj_, lda2_ = ( 2 * (lda_) ); \
+ if( Mdzero( (al_)[0], (al_)[1] ) ) /* al == 0: store exact zeros, no multiply */ \
+ { \
+ for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += lda2_ ) \
+ { \
+ for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 2 ) \
+ { (a_)[iaij_] = (a_)[iaij_+1] = ATL_dZERO; } \
+ } \
+ } \
+ else if( ! Mdone( (al_)[0], (al_)[1] ) ) /* al == 1: nothing to do */ \
+ { \
+ for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += lda2_ ) \
+ { \
+ for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 2 ) \
+ { Mdscl( (al_)[0], (al_)[1], (a_)[iaij_], (a_)[iaij_+1] ); } \
+ } \
+ } \
+ }
+
+#endif
+/*
+ * End of atlas_refmisc.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_tst.h b/kaldi_io/src/tools/ATLAS/include/atlas_tst.h
new file mode 100644
index 0000000..1ea5f5e
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_tst.h
@@ -0,0 +1,909 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Code contributers : R. Clint Whaley, Antoine P. Petitet
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ATLAS_TST_H
+ #define ATLAS_TST_H
+
+#include "atlas_enum.h"
+/* time00: takes no arguments and returns a double (NOTE(review): presumably a timer reading -- confirm units). Declared with (void) so calls are argument-checked. */
+double time00(void);
+#ifndef UseCRand /* default: seed and draw through ATL_srand()/ATL_rand() declared here */
+ void ATL_srand(int iseed);
+ int ATL_rand(void);
+ #define dumb_seed(iseed_) ATL_srand(iseed_)
+ #define dumb_rand() ( 0.5 - ((double)ATL_rand())/(2147483648.0) ) /* 0.5 - r / 2^31 */
+#else /* UseCRand defined: seed and draw through srand()/rand(); NOTE(review): no <stdlib.h> include is visible in this header -- confirm the includer supplies it */
+ #define dumb_seed(iseed_) srand(iseed_)
+ #ifndef RAND_MAX /* rather dangerous non-ansi workaround */
+ #define RAND_MAX ((unsigned long)(1<<30)) /* used only when no RAND_MAX is visible; NOTE(review): may not match the real range of rand() */
+ #endif
+ #define dumb_rand() ( 0.5 - ((double)rand())/((double)RAND_MAX) ) /* 0.5 - r / RAND_MAX */
+#endif
+/* 's'-prefixed test-support prototypes: every array argument in this group is float. */
+void ATL_ststsqtran(const int N, float *A, const int lda);
+void ATL_sgeprint
+ (char *mat, const int M, const int N, const float *A, const int lda);
+/* Routines returning a single float from read-only arrays; NOTE(review): names suggest norms / difference norms -- confirm. */
+float ATL_sgediffnrm1
+ (const int M, const int N, const float *A, const int lda,
+ const float *B, const int ldb);
+float ATL_shediffnrm
+ (const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N,
+ const float *A0, const int ld0, const float *A1, const int ld1);
+float ATL_sinfnrm(const int N, const float *X, const int incX);
+float ATL_sgenrm1
+ (const int M, const int N, const float *A, const int lda);
+float ATL_strnrm1
+ (const enum ATLAS_UPLO Upper, const enum ATLAS_DIAG Diag, const int N,
+ const float *A, const int lda);
+float ATL_sgbnrm1
+ (const int M, const int N, const int KL, const int KU,
+ const float *A, const int lda);
+float ATL_stpnrm1
+ (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, const int N,
+ const float *A);
+float ATL_stbnrm1
+ (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG,
+ const int N, const int K, const float *A, const int LDA);
+float ATL_ssynrm
+ (const enum ATLAS_UPLO UPLO, const int N, const float *A, const int LDA);
+float ATL_shenrm
+ (const enum ATLAS_UPLO UPLO, const int N, const float *A, const int LDA);
+float ATL_sspnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const float *A);
+float ATL_shpnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const float *A);
+float ATL_ssbnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const int K,
+ const float *A, const int LDA);
+float ATL_shbnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const int K,
+ const float *A, const int LDA);
+/* NOTE(review): names suggest gap-fill/check, generator, epsilon and difference helpers -- confirm; only the non-const pointers can be written. */
+void ATL_sgefillgap(const int M, const int N, float *A, const int lda0);
+int ATL_sgechkgap(const int M0, const int N, float *A, const int lda0);
+void ATL_strgen(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag,
+ const int N, float *A, const int lda, const int seed);
+void ATL_sgegen(const int M0, const int N, float *A, const int lda,
+ const int seed);
+float ATL_sepsilon(void);
+void ATL_svdiff(const int N, const float *X, const int incX,
+ const float *Y, const int incY, float *Z, const int incZ);
+void ATL_sgediff(const int M, const int N, const float *A, const int lda,
+ const float *B, const int ldb, float *C, const int ldc);
+void ATL_dtstsqtran(const int N, double *A, const int lda); /* start of the 'd'-prefixed group: every array argument is double */
+void ATL_dgeprint
+ (char *mat, const int M, const int N, const double *A, const int lda);
+/* Routines returning a single double from read-only arrays; NOTE(review): names suggest norms / difference norms -- confirm. */
+double ATL_dgediffnrm1
+ (const int M, const int N, const double *A, const int lda,
+ const double *B, const int ldb);
+double ATL_dhediffnrm
+ (const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N,
+ const double *A0, const int ld0, const double *A1, const int ld1);
+double ATL_dinfnrm(const int N, const double *X, const int incX);
+double ATL_dgenrm1
+ (const int M, const int N, const double *A, const int lda);
+double ATL_dtrnrm1
+ (const enum ATLAS_UPLO Upper, const enum ATLAS_DIAG Diag, const int N,
+ const double *A, const int lda);
+double ATL_dgbnrm1
+ (const int M, const int N, const int KL, const int KU,
+ const double *A, const int lda);
+double ATL_dtpnrm1
+ (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, const int N,
+ const double *A);
+double ATL_dtbnrm1
+ (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG,
+ const int N, const int K, const double *A, const int LDA);
+double ATL_dsynrm
+ (const enum ATLAS_UPLO UPLO, const int N, const double *A, const int LDA);
+double ATL_dhenrm
+ (const enum ATLAS_UPLO UPLO, const int N, const double *A, const int LDA);
+double ATL_dspnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const double *A);
+double ATL_dhpnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const double *A);
+double ATL_dsbnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const int K,
+ const double *A, const int LDA);
+double ATL_dhbnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const int K,
+ const double *A, const int LDA);
+/* NOTE(review): names suggest gap-fill/check, generator, epsilon and difference helpers -- confirm; only the non-const pointers can be written. */
+void ATL_dgefillgap(const int M, const int N, double *A, const int lda0);
+int ATL_dgechkgap(const int M0, const int N, double *A, const int lda0);
+void ATL_dtrgen(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag,
+ const int N, double *A, const int lda, const int seed);
+void ATL_dgegen(const int M0, const int N, double *A, const int lda,
+ const int seed);
+double ATL_depsilon(void);
+void ATL_dvdiff(const int N, const double *X, const int incX,
+ const double *Y, const int incY, double *Z, const int incZ);
+void ATL_dgediff(const int M, const int N, const double *A, const int lda,
+ const double *B, const int ldb, double *C, const int ldc);
+void ATL_ctstsqtran(const int N, float *A, const int lda); /* start of the 'c'-prefixed group: arrays are float * (NOTE(review): presumably interleaved complex -- confirm) */
+void ATL_cgeprint
+ (char *mat, const int M, const int N, const float *A, const int lda);
+/* Routines returning a single float from read-only arrays; NOTE(review): names suggest norms / difference norms -- confirm. */
+float ATL_cgediffnrm1
+ (const int M, const int N, const float *A, const int lda,
+ const float *B, const int ldb);
+float ATL_chediffnrm
+ (const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N,
+ const float *A0, const int ld0, const float *A1, const int ld1);
+float ATL_cinfnrm(const int N, const float *X, const int incX);
+float ATL_cgenrm1
+ (const int M, const int N, const float *A, const int lda);
+float ATL_ctrnrm1
+ (const enum ATLAS_UPLO Upper, const enum ATLAS_DIAG Diag, const int N,
+ const float *A, const int lda);
+float ATL_cgbnrm1
+ (const int M, const int N, const int KL, const int KU,
+ const float *A, const int lda);
+float ATL_ctpnrm1
+ (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, const int N,
+ const float *A);
+float ATL_ctbnrm1
+ (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG,
+ const int N, const int K, const float *A, const int LDA);
+float ATL_csynrm
+ (const enum ATLAS_UPLO UPLO, const int N, const float *A, const int LDA);
+float ATL_chenrm
+ (const enum ATLAS_UPLO UPLO, const int N, const float *A, const int LDA);
+float ATL_cspnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const float *A);
+float ATL_chpnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const float *A);
+float ATL_csbnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const int K,
+ const float *A, const int LDA);
+float ATL_chbnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const int K,
+ const float *A, const int LDA);
+/* NOTE(review): names suggest gap-fill/check, generator, epsilon and difference helpers -- confirm; only the non-const pointers can be written. */
+void ATL_cgefillgap(const int M, const int N, float *A, const int lda0);
+int ATL_cgechkgap(const int M0, const int N, float *A, const int lda0);
+void ATL_ctrgen(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag,
+ const int N, float *A, const int lda, const int seed);
+void ATL_cgegen(const int M0, const int N, float *A, const int lda,
+ const int seed);
+float ATL_cepsilon(void);
+void ATL_cvdiff(const int N, const float *X, const int incX,
+ const float *Y, const int incY, float *Z, const int incZ);
+void ATL_cgediff(const int M, const int N, const float *A, const int lda,
+ const float *B, const int ldb, float *C, const int ldc);
+void ATL_ztstsqtran(const int N, double *A, const int lda); /* start of the 'z'-prefixed group: arrays are double * (NOTE(review): presumably interleaved complex -- confirm) */
+void ATL_zgeprint
+ (char *mat, const int M, const int N, const double *A, const int lda);
+/* Routines returning a single double from read-only arrays; NOTE(review): names suggest norms / difference norms -- confirm. */
+double ATL_zgediffnrm1
+ (const int M, const int N, const double *A, const int lda,
+ const double *B, const int ldb);
+double ATL_zhediffnrm
+ (const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N,
+ const double *A0, const int ld0, const double *A1, const int ld1);
+double ATL_zinfnrm(const int N, const double *X, const int incX);
+double ATL_zgenrm1
+ (const int M, const int N, const double *A, const int lda);
+double ATL_ztrnrm1
+ (const enum ATLAS_UPLO Upper, const enum ATLAS_DIAG Diag, const int N,
+ const double *A, const int lda);
+double ATL_zgbnrm1
+ (const int M, const int N, const int KL, const int KU,
+ const double *A, const int lda);
+double ATL_ztpnrm1
+ (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, const int N,
+ const double *A);
+double ATL_ztbnrm1
+ (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG,
+ const int N, const int K, const double *A, const int LDA);
+double ATL_zsynrm
+ (const enum ATLAS_UPLO UPLO, const int N, const double *A, const int LDA);
+double ATL_zhenrm
+ (const enum ATLAS_UPLO UPLO, const int N, const double *A, const int LDA);
+double ATL_zspnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const double *A);
+double ATL_zhpnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const double *A);
+double ATL_zsbnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const int K,
+ const double *A, const int LDA);
+double ATL_zhbnrm
+ (const enum ATLAS_UPLO UPLO, const int N, const int K,
+ const double *A, const int LDA);
+/* NOTE(review): names suggest gap-fill/check, generator, epsilon and difference helpers -- confirm; only the non-const pointers can be written. */
+void ATL_zgefillgap(const int M, const int N, double *A, const int lda0);
+int ATL_zgechkgap(const int M0, const int N, double *A, const int lda0);
+void ATL_ztrgen(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag,
+ const int N, double *A, const int lda, const int seed);
+void ATL_zgegen(const int M0, const int N, double *A, const int lda,
+ const int seed);
+double ATL_zepsilon(void);
+void ATL_zvdiff(const int N, const double *X, const int incX,
+ const double *Y, const int incY, double *Z, const int incZ);
+void ATL_zgediff(const int M, const int N, const double *A, const int lda,
+ const double *B, const int ldb, double *C, const int ldc);
+
+/*
+ * Wrappers so that C can call F77 LAPACK; every one returns int (NOTE(review): presumably the LAPACK INFO code -- confirm)
+ */
+int ATL_sf77getri /* 's' set: float arrays */
+ (const enum ATLAS_ORDER, const int, float*, const int, int*,
+ float*, int*);
+int ATL_sf77getrf
+ (const enum ATLAS_ORDER, const int, const int, float*, const int, int*);
+int ATL_sf77potrf(const enum ATLAS_UPLO, const int, float*, const int);
+int ATL_sf77lauum(const enum ATLAS_UPLO, const int, float*, const int);
+int ATL_sf77trtri(const enum ATLAS_UPLO, const enum ATLAS_DIAG, const int,
+ float*, const int);
+int ATL_sf77posv(const enum ATLAS_UPLO, const int, const int, float*, const int, float*, const int);
+int ATL_sf77gesv(const int, const int, float*, const int, int*, float*, const int);
+int ATL_sf77gels(const enum ATLAS_TRANS, const int, const int, const int, float*, const int, float*, const int);
+int ATL_df77getri /* 'd' set: same signatures with double arrays */
+ (const enum ATLAS_ORDER, const int, double*, const int, int*,
+ double*, int*);
+int ATL_df77getrf
+ (const enum ATLAS_ORDER, const int, const int, double*, const int, int*);
+int ATL_df77potrf(const enum ATLAS_UPLO, const int, double*, const int);
+int ATL_df77lauum(const enum ATLAS_UPLO, const int, double*, const int);
+int ATL_df77trtri(const enum ATLAS_UPLO, const enum ATLAS_DIAG, const int,
+ double*, const int);
+int ATL_df77posv(const enum ATLAS_UPLO, const int, const int, double*, const int, double*, const int);
+int ATL_df77gesv(const int, const int, double*, const int, int*, double*, const int);
+int ATL_df77gels(const enum ATLAS_TRANS, const int, const int, const int, double*, const int, double*, const int);
+int ATL_cf77getri /* 'c' set: float arrays (NOTE(review): presumably interleaved complex -- confirm) */
+ (const enum ATLAS_ORDER, const int, float*, const int, int*,
+ float*, int*);
+int ATL_cf77getrf
+ (const enum ATLAS_ORDER, const int, const int, float*, const int, int*);
+int ATL_cf77potrf(const enum ATLAS_UPLO, const int, float*, const int);
+int ATL_cf77lauum(const enum ATLAS_UPLO, const int, float*, const int);
+int ATL_cf77trtri(const enum ATLAS_UPLO, const enum ATLAS_DIAG, const int,
+ float*, const int);
+int ATL_cf77posv(const enum ATLAS_UPLO, const int, const int, float*, const int, float*, const int);
+int ATL_cf77gesv(const int, const int, float*, const int, int*, float*, const int);
+int ATL_cf77gels(const enum ATLAS_TRANS, const int, const int, const int, float*, const int, float*, const int);
+int ATL_zf77getri /* 'z' set: double arrays (NOTE(review): presumably interleaved complex -- confirm) */
+ (const enum ATLAS_ORDER, const int, double*, const int, int*,
+ double*, int*);
+int ATL_zf77getrf
+ (const enum ATLAS_ORDER, const int, const int, double*, const int, int*);
+int ATL_zf77potrf(const enum ATLAS_UPLO, const int, double*, const int);
+int ATL_zf77lauum(const enum ATLAS_UPLO, const int, double*, const int);
+int ATL_zf77trtri(const enum ATLAS_UPLO, const enum ATLAS_DIAG, const int,
+ double*, const int);
+int ATL_zf77posv(const enum ATLAS_UPLO, const int, const int, double*, const int, double*, const int);
+int ATL_zf77gesv(const int, const int, double*, const int, int*, double*, const int);
+int ATL_zf77gels(const enum ATLAS_TRANS, const int, const int, const int, double*, const int, double*, const int);
+/*
+ * =====================================================================
+ * Prototypes for C-callable F77 interface to the Level 1 BLAS routines
+ * =====================================================================
+ */
+void ATL_sf77rotg /* rotg: all four pointers are writable for s/d; for c/z the second pointer is read-only */
+( float *, float *, float *, float * );
+void ATL_df77rotg
+( double *, double *, double *, double * );
+void ATL_cf77rotg
+( float *, const float *, float *, float * );
+void ATL_zf77rotg
+( double *, const double *, double *, double * );
+/* rotmg: declared for the s and d prefixes only; the fourth argument is passed by value, the rest are writable pointers. */
+void ATL_sf77rotmg
+( float *, float *, float *, const float,
+ float * );
+void ATL_df77rotmg
+( double *, double *, double *, const double,
+ double * );
+/* nrm2: float result for the s/sc names, double for d/dz; arguments are (int, read-only array, int). */
+float ATL_sf77nrm2
+( const int, const float *, const int );
+double ATL_df77nrm2
+( const int, const double *, const int );
+float ATL_scf77nrm2
+( const int, const float *, const int );
+double ATL_dzf77nrm2
+( const int, const double *, const int );
+/* asum: float result for the s/sc names, double for d/dz; arguments are (int, read-only array, int). */
+float ATL_sf77asum
+( const int, const float *, const int );
+double ATL_df77asum
+( const int, const double *, const int );
+float ATL_scf77asum
+( const int, const float *, const int );
+double ATL_dzf77asum
+( const int, const double *, const int );
+/* amax: all four variants return int (NOTE(review): presumably an element index -- confirm whether 0- or 1-based). */
+int ATL_isf77amax
+( const int, const float *, const int );
+int ATL_idf77amax
+( const int, const double *, const int );
+int ATL_icf77amax
+( const int, const float *, const int );
+int ATL_izf77amax
+( const int, const double *, const int );
+/* scal: the s/d and cs/zd variants take the scalar by value; the c/z variants take it through a read-only pointer. */
+void ATL_sf77scal
+( const int, const float, float *, const int );
+void ATL_df77scal
+( const int, const double, double *, const int );
+void ATL_cf77scal
+( const int, const float *, float *, const int );
+void ATL_zf77scal
+( const int, const double *, double *, const int );
+void ATL_csf77scal
+( const int, const float, float *, const int );
+void ATL_zdf77scal
+( const int, const double, double *, const int );
+/* set and axpby: same split -- s/d scalars by value, c/z scalars through read-only pointers; only the last array is writable. */
+void ATL_sf77set(const int, const float, float*, const int);
+void ATL_df77set(const int, const double, double*, const int);
+void ATL_cf77set(const int, const float*, float*, const int);
+void ATL_zf77set(const int, const double*, double*, const int);
+void ATL_sf77axpby
+ (const int, const float, const float*, const int, const float,
+ float*, const int);
+void ATL_df77axpby
+ (const int, const double, const double*, const int, const double,
+ double*, const int);
+void ATL_cf77axpby
+ (const int, const float*, const float*, const int, const float*,
+ float*, const int);
+void ATL_zf77axpby
+ (const int, const double*, const double*, const int, const double*,
+ double*, const int);
+/* axpy: one scalar, one read-only array with an int, then the writable array with an int. */
+void ATL_sf77axpy
+( const int, const float, const float *, const int,
+ float *, const int );
+void ATL_df77axpy
+( const int, const double, const double *, const int,
+ double *, const int );
+void ATL_cf77axpy
+( const int, const float *, const float *, const int,
+ float *, const int );
+void ATL_zf77axpy
+( const int, const double *, const double *, const int,
+ double *, const int );
+/* copy: the first array is read-only, the second is writable. */
+void ATL_sf77copy
+( const int, const float *, const int, float *,
+ const int );
+void ATL_df77copy
+( const int, const double *, const int, double *,
+ const int );
+void ATL_cf77copy
+( const int, const float *, const int, float *,
+ const int );
+void ATL_zf77copy
+( const int, const double *, const int, double *,
+ const int );
+/* swap: both arrays are writable. */
+void ATL_sf77swap
+( const int, float *, const int, float *,
+ const int );
+void ATL_df77swap
+( const int, double *, const int, double *,
+ const int );
+void ATL_cf77swap
+( const int, float *, const int, float *,
+ const int );
+void ATL_zf77swap
+( const int, double *, const int, double *,
+ const int );
+/* rot: both arrays are writable; the two trailing scalars are real and passed by value, including for the cs/zd names. */
+void ATL_sf77rot
+( const int, float *, const int, float *,
+ const int, const float, const float );
+void ATL_df77rot
+( const int, double *, const int, double *,
+ const int, const double, const double );
+void ATL_csf77rot
+( const int, float *, const int, float *,
+ const int, const float, const float );
+void ATL_zdf77rot
+( const int, double *, const int, double *,
+ const int, const double, const double );
+/* rotm: declared for s and d only; the trailing parameter is a read-only array. */
+void ATL_sf77rotm
+( const int, float *, const int, float *,
+ const int, const float * );
+void ATL_df77rotm
+( const int, double *, const int, double *,
+ const int, const double * );
+/* dot: s/d return the value; the c/z "_sub" forms return void and take a writable pointer last (NOTE(review): presumably the result -- confirm). */
+float ATL_sf77dot
+( const int, const float *, const int, const float *,
+ const int );
+double ATL_df77dot
+( const int, const double *, const int, const double *,
+ const int );
+void ATL_cf77dotu_sub
+( const int, const float *, const int, const float *,
+ const int, float * );
+void ATL_cf77dotc_sub
+( const int, const float *, const int, const float *,
+ const int, float * );
+void ATL_zf77dotu_sub
+( const int, const double *, const int, const double *,
+ const int, double * );
+void ATL_zf77dotc_sub
+( const int, const double *, const int, const double *,
+ const int, double * );
+/* Both take two read-only float arrays: the first returns float and has an extra float passed by value, the second returns double. */
+float ATL_sdsf77dot
+( const int, const float, const float *, const int,
+ const float *, const int );
+double ATL_dsf77dot
+( const int, const float *, const int, const float *,
+ const int );
+/*
+ * =====================================================================
+ * Prototypes for C-callable F77 interface to the Level 2 BLAS routines
+ * =====================================================================
+ */
+void ATL_sf77gemv /* gemv: s/d pass both scalars by value, c/z through read-only pointers; only the last array is writable */
+( const enum ATLAS_TRANS, const int, const int,
+ const float, const float *, const int, const float *,
+ const int, const float, float *, const int );
+void ATL_df77gemv
+( const enum ATLAS_TRANS, const int, const int,
+ const double, const double *, const int, const double *,
+ const int, const double, double *, const int );
+void ATL_cf77gemv
+( const enum ATLAS_TRANS, const int, const int,
+ const float *, const float *, const int, const float *,
+ const int, const float *, float *, const int );
+void ATL_zf77gemv
+( const enum ATLAS_TRANS, const int, const int,
+ const double *, const double *, const int, const double *,
+ const int, const double *, double *, const int );
+/* gbmv: same shape as gemv with two additional ints after the first two (NOTE(review): presumably the band widths KL/KU -- confirm). */
+void ATL_sf77gbmv
+( const enum ATLAS_TRANS, const int, const int,
+ const int, const int, const float, const float *,
+ const int, const float *, const int, const float,
+ float *, const int );
+void ATL_df77gbmv
+( const enum ATLAS_TRANS, const int, const int,
+ const int, const int, const double, const double *,
+ const int, const double *, const int, const double,
+ double *, const int );
+void ATL_cf77gbmv
+( const enum ATLAS_TRANS, const int, const int,
+ const int, const int, const float *, const float *,
+ const int, const float *, const int, const float *,
+ float *, const int );
+void ATL_zf77gbmv
+( const enum ATLAS_TRANS, const int, const int,
+ const int, const int, const double *, const double *,
+ const int, const double *, const int, const double *,
+ double *, const int );
+
+void ATL_sf77trmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const float *,
+ const int, float *, const int );
+void ATL_df77trmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const double *,
+ const int, double *, const int );
+void ATL_cf77trmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const float *,
+ const int, float *, const int );
+void ATL_zf77trmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const double *,
+ const int, double *, const int );
+
+void ATL_sf77tbmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const int,
+ const float *, const int, float *, const int );
+void ATL_df77tbmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const int,
+ const double *, const int, double *, const int );
+void ATL_cf77tbmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const int,
+ const float *, const int, float *, const int );
+void ATL_zf77tbmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const int,
+ const double *, const int, double *, const int );
+
+void ATL_sf77tpmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const float *,
+ float *, const int );
+void ATL_df77tpmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const double *,
+ double *, const int );
+void ATL_cf77tpmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const float *,
+ float *, const int );
+void ATL_zf77tpmv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const double *,
+ double *, const int );
+
+void ATL_sf77trsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const float *,
+ const int, float *, const int );
+void ATL_df77trsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const double *,
+ const int, double *, const int );
+void ATL_cf77trsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const float *,
+ const int, float *, const int );
+void ATL_zf77trsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const double *,
+ const int, double *, const int );
+
+void ATL_sf77tbsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const int,
+ const float *, const int, float *, const int );
+void ATL_df77tbsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const int,
+ const double *, const int, double *, const int );
+void ATL_cf77tbsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const int,
+ const float *, const int, float *, const int );
+void ATL_zf77tbsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const int,
+ const double *, const int, double *, const int );
+
+void ATL_sf77tpsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const float *,
+ float *, const int );
+void ATL_df77tpsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const double *,
+ double *, const int );
+void ATL_cf77tpsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const float *,
+ float *, const int );
+void ATL_zf77tpsv
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const enum ATLAS_DIAG, const int, const double *,
+ double *, const int );
+
+void ATL_sf77symv
+( const enum ATLAS_UPLO, const int, const float,
+ const float *, const int, const float *, const int,
+ const float, float *, const int );
+void ATL_df77symv
+( const enum ATLAS_UPLO, const int, const double,
+ const double *, const int, const double *, const int,
+ const double, double *, const int );
+
+void ATL_cf77hemv
+( const enum ATLAS_UPLO, const int, const float *,
+ const float *, const int, const float *, const int,
+ const float *, float *, const int );
+void ATL_zf77hemv
+( const enum ATLAS_UPLO, const int, const double *,
+ const double *, const int, const double *, const int,
+ const double *, double *, const int );
+
+void ATL_sf77sbmv
+( const enum ATLAS_UPLO, const int, const int,
+ const float, const float *, const int, const float *,
+ const int, const float, float *, const int );
+void ATL_df77sbmv
+( const enum ATLAS_UPLO, const int, const int,
+ const double, const double *, const int, const double *,
+ const int, const double, double *, const int );
+void ATL_cf77hbmv
+( const enum ATLAS_UPLO, const int, const int,
+ const float *, const float *, const int, const float *,
+ const int, const float *, float *, const int );
+void ATL_zf77hbmv
+( const enum ATLAS_UPLO, const int, const int,
+ const double *, const double *, const int, const double *,
+ const int, const double *, double *, const int );
+
+void ATL_sf77spmv
+( const enum ATLAS_UPLO, const int, const float,
+ const float *, const float *, const int, const float,
+ float *, const int );
+void ATL_df77spmv
+( const enum ATLAS_UPLO, const int, const double,
+ const double *, const double *, const int, const double,
+ double *, const int );
+void ATL_cf77hpmv
+( const enum ATLAS_UPLO, const int, const float *,
+ const float *, const float *, const int, const float *,
+ float *, const int );
+void ATL_zf77hpmv
+( const enum ATLAS_UPLO, const int, const double *,
+ const double *, const double *, const int, const double *,
+ double *, const int );
+
+void ATL_sf77ger
+( const int, const int, const float, const float *,
+ const int, const float *, const int, float *,
+ const int );
+void ATL_df77ger
+( const int, const int, const double, const double *,
+ const int, const double *, const int, double *,
+ const int );
+void ATL_cf77gerc
+( const int, const int, const float *, const float *,
+ const int, const float *, const int, float *,
+ const int );
+void ATL_cf77geru
+( const int, const int, const float *, const float *,
+ const int, const float *, const int, float *,
+ const int );
+void ATL_zf77gerc
+( const int, const int, const double *, const double *,
+ const int, const double *, const int, double *,
+ const int );
+void ATL_zf77geru
+( const int, const int, const double *, const double *,
+ const int, const double *, const int, double *,
+ const int );
+
+void ATL_sf77syr
+( const enum ATLAS_UPLO, const int, const float,
+ const float *, const int, float *, const int );
+void ATL_df77syr
+( const enum ATLAS_UPLO, const int, const double,
+ const double *, const int, double *, const int );
+void ATL_cf77her
+( const enum ATLAS_UPLO, const int, const float,
+ const float *, const int, float *, const int );
+void ATL_zf77her
+( const enum ATLAS_UPLO, const int, const double,
+ const double *, const int, double *, const int );
+
+void ATL_sf77spr
+( const enum ATLAS_UPLO, const int, const float,
+ const float *, const int, float * );
+void ATL_df77spr
+( const enum ATLAS_UPLO, const int, const double,
+ const double *, const int, double * );
+void ATL_cf77hpr
+( const enum ATLAS_UPLO, const int, const float,
+ const float *, const int, float * );
+void ATL_zf77hpr
+( const enum ATLAS_UPLO, const int, const double,
+ const double *, const int, double * );
+
+void ATL_sf77syr2
+( const enum ATLAS_UPLO, const int, const float,
+ const float *, const int, const float *, const int,
+ float *, const int );
+void ATL_df77syr2
+( const enum ATLAS_UPLO, const int, const double,
+ const double *, const int, const double *, const int,
+ double *, const int );
+void ATL_cf77her2
+( const enum ATLAS_UPLO, const int, const float *,
+ const float *, const int, const float *, const int,
+ float *, const int );
+void ATL_zf77her2
+( const enum ATLAS_UPLO, const int, const double *,
+ const double *, const int, const double *, const int,
+ double *, const int );
+
+void ATL_sf77spr2
+( const enum ATLAS_UPLO, const int, const float,
+ const float *, const int, const float *, const int,
+ float * );
+void ATL_df77spr2
+( const enum ATLAS_UPLO, const int, const double,
+ const double *, const int, const double *, const int,
+ double * );
+void ATL_cf77hpr2
+( const enum ATLAS_UPLO, const int, const float *,
+ const float *, const int, const float *, const int,
+ float * );
+void ATL_zf77hpr2
+( const enum ATLAS_UPLO, const int, const double *,
+ const double *, const int, const double *, const int,
+ double * );
+/*
+ * =====================================================================
+ * Prototypes for C-callable F77 interface to the Level 3 BLAS routines
+ * =====================================================================
+ */
+void ATL_sf77gemm
+( const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const float,
+ const float *, const int, const float *, const int,
+ const float, float *, const int );
+void ATL_df77gemm
+( const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const double,
+ const double *, const int, const double *, const int,
+ const double, double *, const int );
+void ATL_cf77gemm
+( const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const float *,
+ const float *, const int, const float *, const int,
+ const float *, float *, const int );
+void ATL_zf77gemm
+( const enum ATLAS_TRANS, const enum ATLAS_TRANS,
+ const int, const int, const int, const double *,
+ const double *, const int, const double *, const int,
+ const double *, double *, const int );
+
+void ATL_cf77hemm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const float *, const float *,
+ const int, const float *, const int, const float *,
+ float *, const int );
+void ATL_zf77hemm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const double *, const double *,
+ const int, const double *, const int, const double *,
+ double *, const int );
+
+void ATL_cf77herk
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const float, const float *,
+ const int, const float, float *, const int );
+void ATL_zf77herk
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const double, const double *,
+ const int, const double, double *, const int );
+
+void ATL_cf77her2k
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const float *, const float *,
+ const int, const float *, const int, const float,
+ float *, const int );
+void ATL_zf77her2k
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const double *, const double *,
+ const int, const double *, const int, const double,
+ double *, const int );
+
+void ATL_sf77symm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const float, const float *,
+ const int, const float *, const int, const float,
+ float *, const int );
+void ATL_df77symm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const double, const double *,
+ const int, const double *, const int, const double,
+ double *, const int );
+void ATL_cf77symm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const float *, const float *,
+ const int, const float *, const int, const float *,
+ float *, const int );
+void ATL_zf77symm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const int, const int, const double *, const double *,
+ const int, const double *, const int, const double *,
+ double *, const int );
+
+void ATL_sf77syrk
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const float, const float *,
+ const int, const float, float *, const int );
+void ATL_df77syrk
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const double, const double *,
+ const int, const double, double *, const int );
+void ATL_cf77syrk
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const float *, const float *,
+ const int, const float *, float *, const int );
+void ATL_zf77syrk
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const double *, const double *,
+ const int, const double *, double *, const int );
+
+void ATL_sf77syr2k
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const float, const float *,
+ const int, const float *, const int, const float,
+ float *, const int );
+void ATL_df77syr2k
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const double, const double *,
+ const int, const double *, const int, const double,
+ double *, const int );
+void ATL_cf77syr2k
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const float *, const float *,
+ const int, const float *, const int, const float *,
+ float *, const int );
+void ATL_zf77syr2k
+( const enum ATLAS_UPLO, const enum ATLAS_TRANS,
+ const int, const int, const double *, const double *,
+ const int, const double *, const int, const double *,
+ double *, const int );
+
+void ATL_sf77trmm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const float, const float *,
+ const int, float *, const int );
+void ATL_df77trmm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const double, const double *,
+ const int, double *, const int );
+void ATL_cf77trmm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const float *, const float *,
+ const int, float *, const int );
+void ATL_zf77trmm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const double *, const double *,
+ const int, double *, const int );
+
+void ATL_sf77trsm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const float, const float *,
+ const int, float *, const int );
+void ATL_df77trsm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const double, const double *,
+ const int, double *, const int );
+void ATL_cf77trsm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const float *, const float *,
+ const int, float *, const int );
+void ATL_zf77trsm
+( const enum ATLAS_SIDE, const enum ATLAS_UPLO,
+ const enum ATLAS_TRANS, const enum ATLAS_DIAG,
+ const int, const int, const double *, const double *,
+ const int, double *, const int );
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/cblas.h b/kaldi_io/src/tools/ATLAS/include/cblas.h
new file mode 100644
index 0000000..4087ffb
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/cblas.h
@@ -0,0 +1,596 @@
+#ifndef CBLAS_H
+
+#ifndef CBLAS_ENUM_DEFINED_H /* own guard: enums are defined once, ahead of the CBLAS_ENUM_ONLY check */
+ #define CBLAS_ENUM_DEFINED_H
+ enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; /* each enum below uses its own value range */
+ enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113,
+ AtlasConj=114}; /* only constant without the Cblas prefix; presumably ATLAS-specific (confirm) */
+ enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
+ enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
+ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
+#endif /* CBLAS_ENUM_DEFINED_H */
+
+#ifndef CBLAS_ENUM_ONLY
+#define CBLAS_H /* defined only when CBLAS_ENUM_ONLY is unset, so an enum-only include leaves the outer #ifndef CBLAS_H open */
+#define CBLAS_INDEX int /* return type of the cblas_i?amax prototypes declared in this header */
+
+int cblas_errprn(int ierr, int info, char *form, ...);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 1 BLAS functions (complex dot products become _sub routines)
+ * ===========================================================================
+ */
+float cblas_sdsdot(const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY);
+double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
+ const int incY);
+float cblas_sdot(const int N, const float *X, const int incX,
+ const float *Y, const int incY);
+double cblas_ddot(const int N, const double *X, const int incX,
+ const double *Y, const int incY);
+/*
+ * Functions having prefixes Z and C only
+ */
+void cblas_cdotu_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotu);
+void cblas_cdotc_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotc);
+
+void cblas_zdotu_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotu);
+void cblas_zdotc_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotc);
+
+
+/*
+ * Functions having prefixes S D SC DZ
+ */
+float cblas_snrm2(const int N, const float *X, const int incX);
+float cblas_sasum(const int N, const float *X, const int incX);
+
+double cblas_dnrm2(const int N, const double *X, const int incX);
+double cblas_dasum(const int N, const double *X, const int incX);
+
+float cblas_scnrm2(const int N, const void *X, const int incX);
+float cblas_scasum(const int N, const void *X, const int incX);
+
+double cblas_dznrm2(const int N, const void *X, const int incX);
+double cblas_dzasum(const int N, const void *X, const int incX);
+
+
+/*
+ * Functions having standard 4 prefixes (S D C Z)
+ */
+CBLAS_INDEX cblas_isamax(const int N, const float *X, const int incX);
+CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
+CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX);
+CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 1 BLAS routines
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (s, d, c, z)
+ */
+void cblas_sswap(const int N, float *X, const int incX,
+ float *Y, const int incY);
+void cblas_scopy(const int N, const float *X, const int incX,
+ float *Y, const int incY);
+void cblas_saxpy(const int N, const float alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void catlas_saxpby(const int N, const float alpha, const float *X,
+ const int incX, const float beta, float *Y, const int incY);
+void catlas_sset
+ (const int N, const float alpha, float *X, const int incX);
+
+void cblas_dswap(const int N, double *X, const int incX,
+ double *Y, const int incY);
+void cblas_dcopy(const int N, const double *X, const int incX,
+ double *Y, const int incY);
+void cblas_daxpy(const int N, const double alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void catlas_daxpby(const int N, const double alpha, const double *X,
+ const int incX, const double beta, double *Y, const int incY);
+void catlas_dset
+ (const int N, const double alpha, double *X, const int incX);
+
+void cblas_cswap(const int N, void *X, const int incX,
+ void *Y, const int incY);
+void cblas_ccopy(const int N, const void *X, const int incX,
+ void *Y, const int incY);
+void cblas_caxpy(const int N, const void *alpha, const void *X,
+ const int incX, void *Y, const int incY);
+void catlas_caxpby(const int N, const void *alpha, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void catlas_cset
+ (const int N, const void *alpha, void *X, const int incX);
+
+void cblas_zswap(const int N, void *X, const int incX,
+ void *Y, const int incY);
+void cblas_zcopy(const int N, const void *X, const int incX,
+ void *Y, const int incY);
+void cblas_zaxpy(const int N, const void *alpha, const void *X,
+ const int incX, void *Y, const int incY);
+void catlas_zaxpby(const int N, const void *alpha, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void catlas_zset
+ (const int N, const void *alpha, void *X, const int incX);
+
+
+/*
+ * Routines with S and D prefix only
+ */
+void cblas_srotg(float *a, float *b, float *c, float *s);
+void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
+void cblas_srot(const int N, float *X, const int incX,
+ float *Y, const int incY, const float c, const float s);
+void cblas_srotm(const int N, float *X, const int incX,
+ float *Y, const int incY, const float *P);
+
+void cblas_drotg(double *a, double *b, double *c, double *s);
+void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
+void cblas_drot(const int N, double *X, const int incX,
+ double *Y, const int incY, const double c, const double s);
+void cblas_drotm(const int N, double *X, const int incX,
+ double *Y, const int incY, const double *P);
+
+
+/*
+ * Routines with S D C Z CS and ZD prefixes
+ */
+void cblas_sscal(const int N, const float alpha, float *X, const int incX);
+void cblas_dscal(const int N, const double alpha, double *X, const int incX);
+void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
+void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
+void cblas_csscal(const int N, const float alpha, void *X, const int incX);
+void cblas_zdscal(const int N, const double alpha, void *X, const int incX);
+
+/*
+ * Extra reference routines provided by ATLAS, but not mandated by the standard
+ */
+void cblas_crotg(void *a, void *b, void *c, void *s);
+void cblas_zrotg(void *a, void *b, void *c, void *s);
+void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY,
+ const float c, const float s);
+void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY,
+ const double c, const double s);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 2 BLAS
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void cblas_sgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void cblas_sgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const float alpha,
+ const float *A, const int lda, const float *X,
+ const int incX, const float beta, float *Y, const int incY);
+void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *Ap, float *X, const int incX);
+void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *A, const int lda, float *X,
+ const int incX);
+void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *Ap, float *X, const int incX);
+
+void cblas_dgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void cblas_dgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const double alpha,
+ const double *A, const int lda, const double *X,
+ const int incX, const double beta, double *Y, const int incY);
+void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *Ap, double *X, const int incX);
+void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *A, const int lda, double *X,
+ const int incX);
+void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *Ap, double *X, const int incX);
+
+void cblas_cgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *X, const int incX, const void *beta,
+ void *Y, const int incY);
+void cblas_cgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const void *alpha,
+ const void *A, const int lda, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda, void *X,
+ const int incX);
+void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+
+void cblas_zgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *X, const int incX, const void *beta,
+ void *Y, const int incY);
+void cblas_zgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const void *alpha,
+ const void *A, const int lda, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda, void *X,
+ const int incX);
+void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+
+
+/*
+ * Routines with S and D prefixes only
+ */
+void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const float alpha, const float *A,
+ const int lda, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *Ap,
+ const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N,
+ const float alpha, const float *X, const int incX,
+ const float *Y, const int incY, float *A, const int lda);
+void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, float *A, const int lda);
+void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, float *Ap);
+void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A,
+ const int lda);
+void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A);
+
+void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const double alpha, const double *A,
+ const int lda, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *Ap,
+ const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
+ const double alpha, const double *X, const int incX,
+ const double *Y, const int incY, double *A, const int lda);
+void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, double *A, const int lda);
+void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, double *Ap);
+void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *A,
+ const int lda);
+void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *Ap); /* Ap (no lda): packed symmetric matrix updated in place; named Ap to match cblas_dspr */
+
+
+/*
+ * Routines with C and Z prefixes only
+ */
+void cblas_chemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *Ap,
+ const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const void *X, const int incX,
+ void *A, const int lda);
+void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const void *X,
+ const int incX, void *Ap); /* Ap (no lda): packed Hermitian matrix updated in place; named Ap to match cblas_chpr2. alpha is a real float here, not a complex pointer */
+void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *Ap);
+
+void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *Ap,
+ const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const void *X, const int incX,
+ void *A, const int lda);
+void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const void *X,
+ const int incX, void *Ap); /* Ap (no lda): packed Hermitian matrix updated in place; named Ap to match cblas_zhpr2. alpha is a real double here, not a complex pointer */
+void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *Ap);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 3 BLAS
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const float alpha, const float *A,
+ const int lda, const float *B, const int ldb,
+ const float beta, float *C, const int ldc);
+void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *B, const int ldb, const float beta,
+ float *C, const int ldc);
+void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float beta, float *C, const int ldc);
+void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float *B, const int ldb, const float beta,
+ float *C, const int ldc);
+void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ float *B, const int ldb);
+void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ float *B, const int ldb);
+
+void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const double alpha, const double *A,
+ const int lda, const double *B, const int ldb,
+ const double beta, double *C, const int ldc);
+void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ const double *B, const int ldb, const double beta,
+ double *C, const int ldc);
+void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double beta, double *C, const int ldc);
+void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double *B, const int ldb, const double beta,
+ double *C, const int ldc);
+void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ double *B, const int ldb);
+void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ double *B, const int ldb);
+
+void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const void *alpha, const void *A,
+ const int lda, const void *B, const int ldb,
+ const void *beta, void *C, const int ldc);
+void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *beta, void *C, const int ldc);
+void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+
+void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const void *alpha, const void *A,
+ const int lda, const void *B, const int ldb,
+ const void *beta, void *C, const int ldc);
+void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *beta, void *C, const int ldc);
+void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+
+
+/*
+ * Routines with prefixes C and Z only
+ */
+void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const void *A, const int lda,
+ const float beta, void *C, const int ldc);
+void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const float beta,
+ void *C, const int ldc);
+void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const void *A, const int lda,
+ const double beta, void *C, const int ldc);
+void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const double beta,
+ void *C, const int ldc);
+
+int cblas_errprn(int ierr, int info, char *form, ...); /* variadic; NOTE(review): form looks like a printf-style format string and the meaning of the int result is not visible in this header -- confirm against the ATLAS sources */
+
+#endif /* end #ifdef CBLAS_ENUM_ONLY */
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/cblas_test.h b/kaldi_io/src/tools/ATLAS/include/cblas_test.h
new file mode 100644
index 0000000..b871a47
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/cblas_test.h
@@ -0,0 +1,542 @@
+/*
+ * Added by R. Clint Whaley to make compatible with ATLAS
+ */
+#if defined(Add_) || defined(Add__) /* Add_ and Add__ both select ADD_ */
+ #define ADD_
+#elif defined(NoChange) /* NoChange selects NOCHANGE */
+ #define NOCHANGE
+#elif defined(UpCase) /* UpCase selects UPCASE */
+ #define UPCASE
+#endif /* ADD_ / NOCHANGE / UPCASE decide which spelling the F77_* macros below expand to; if none of the flags is set, none of them is defined */
+
+#ifdef ADD_ /* lowercase symbol with one trailing underscore */
+ #define F77_crotg crotgtest_
+ #define F77_zrotg zrotgtest_
+ #define F77_csrot csrottest_
+ #define F77_zdrot zdrottest_
+#elif defined NOCHANGE /* lowercase symbol, no decoration */
+ #define F77_crotg crotgtest
+ #define F77_zrotg zrotgtest
+ #define F77_csrot csrottest
+ #define F77_zdrot zdrottest
+#elif defined UPCASE /* uppercase symbol, no decoration */
+ #define F77_crotg CROTGTEST
+ #define F77_zrotg ZROTGTEST
+ #define F77_csrot CSROTTEST
+ #define F77_zdrot ZDROTTEST
+#endif /* these four names are defined before the CBLAS_TEST_H include guard opens, so they are re-evaluated on every inclusion */
+
+
+/*
+ * cblas_test.h
+ * Written by Keita Teranishi
+ */
+#ifndef CBLAS_TEST_H
+#define CBLAS_TEST_H
+#include "cblas.h"
+
+#define TRUE 1 /* the three "1" values: boolean true, test passed, row-major test */
+#define PASSED 1
+#define TEST_ROW_MJR 1
+
+#define FALSE 0 /* the matching "0" values: boolean false, test failed, column-major test */
+#define FAILED 0
+#define TEST_COL_MJR 0
+
+#define INVALID (-1) /* parenthesized so the expansion stays one operand wherever it is pasted */
+#define UNDEFINED (-1) /* same value as INVALID, parenthesized for the same reason */
+
+typedef struct { float real; float imag; } CBLAS_TEST_COMPLEX; /* complex value stored as two floats, real part first */
+typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; /* double-precision counterpart: two doubles, real part first */
+
+#if defined(ADD_)
+ #define F77_xerbla xerbla_
+/*
+ * Level 1 BLAS
+ */
+ #define F77_srotg srotgtest_
+ #define F77_srotmg srotmgtest_
+ #define F77_srot srottest_
+ #define F77_srotm srotmtest_
+ #define F77_drotg drotgtest_
+ #define F77_drotmg drotmgtest_
+ #define F77_drot drottest_
+ #define F77_drotm drotmtest_
+ #define F77_sswap sswaptest_
+ #define F77_scopy scopytest_
+ #define F77_saxpy saxpytest_
+ #define F77_isamax isamaxtest_
+ #define F77_dswap dswaptest_
+ #define F77_dcopy dcopytest_
+ #define F77_daxpy daxpytest_
+ #define F77_idamax idamaxtest_
+ #define F77_cswap cswaptest_
+ #define F77_ccopy ccopytest_
+ #define F77_caxpy caxpytest_
+ #define F77_icamax icamaxtest_
+ #define F77_zswap zswaptest_
+ #define F77_zcopy zcopytest_
+ #define F77_zaxpy zaxpytest_
+ #define F77_izamax izamaxtest_
+ #define F77_sdot sdottestsub_
+ #define F77_ddot ddottestsub_
+ #define F77_dsdot dsdottest_
+ #define F77_sscal sscaltest_
+ #define F77_dscal dscaltest_
+ #define F77_cscal cscaltest_
+ #define F77_zscal zscaltest_
+ #define F77_csscal csscaltest_
+ #define F77_zdscal zdscaltest_
+ #define F77_cdotu cdotutest_
+ #define F77_cdotc cdotctest_
+ #define F77_zdotu zdotutest_
+ #define F77_zdotc zdotctest_
+ #define F77_snrm2 snrm2testsub_
+ #define F77_sasum sasumtestsub_
+ #define F77_dnrm2 dnrm2testsub_
+ #define F77_dasum dasumtestsub_
+ #define F77_scnrm2 scnrm2testsub_
+ #define F77_scasum scasumtestsub_
+ #define F77_dznrm2 dznrm2testsub_
+ #define F77_dzasum dzasumtestsub_
+ #define F77_sdsdot sdsdottest_
+/*
+ * Level 2 BLAS
+ */
+ #define F77_s2chke cs2chke_
+ #define F77_d2chke cd2chke_
+ #define F77_c2chke cc2chke_
+ #define F77_z2chke cz2chke_
+ #define F77_ssymv cssymv_
+ #define F77_ssbmv cssbmv_
+ #define F77_sspmv csspmv_
+ #define F77_sger csger_
+ #define F77_ssyr cssyr_
+ #define F77_sspr csspr_
+ #define F77_ssyr2 cssyr2_
+ #define F77_sspr2 csspr2_
+ #define F77_dsymv cdsymv_
+ #define F77_dsbmv cdsbmv_
+ #define F77_dspmv cdspmv_
+ #define F77_dger cdger_
+ #define F77_dsyr cdsyr_
+ #define F77_dspr cdspr_
+ #define F77_dsyr2 cdsyr2_
+ #define F77_dspr2 cdspr2_
+ #define F77_chemv cchemv_
+ #define F77_chbmv cchbmv_
+ #define F77_chpmv cchpmv_
+ #define F77_cgeru ccgeru_
+ #define F77_cgerc ccgerc_
+ #define F77_cher ccher_
+ #define F77_chpr cchpr_
+ #define F77_cher2 ccher2_
+ #define F77_chpr2 cchpr2_
+ #define F77_zhemv czhemv_
+ #define F77_zhbmv czhbmv_
+ #define F77_zhpmv czhpmv_
+ #define F77_zgeru czgeru_
+ #define F77_zgerc czgerc_
+ #define F77_zher czher_
+ #define F77_zhpr czhpr_
+ #define F77_zher2 czher2_
+ #define F77_zhpr2 czhpr2_
+ #define F77_sgemv csgemv_
+ #define F77_sgbmv csgbmv_
+ #define F77_strmv cstrmv_
+ #define F77_stbmv cstbmv_
+ #define F77_stpmv cstpmv_
+ #define F77_strsv cstrsv_
+ #define F77_stbsv cstbsv_
+ #define F77_stpsv cstpsv_
+ #define F77_dgemv cdgemv_
+ #define F77_dgbmv cdgbmv_
+ #define F77_dtrmv cdtrmv_
+ #define F77_dtbmv cdtbmv_
+ #define F77_dtpmv cdtpmv_
+ #define F77_dtrsv cdtrsv_
+ #define F77_dtbsv cdtbsv_
+ #define F77_dtpsv cdtpsv_
+ #define F77_cgemv ccgemv_
+ #define F77_cgbmv ccgbmv_
+ #define F77_ctrmv cctrmv_
+ #define F77_ctbmv cctbmv_
+ #define F77_ctpmv cctpmv_
+ #define F77_ctrsv cctrsv_
+ #define F77_ctbsv cctbsv_
+ #define F77_ctpsv cctpsv_
+ #define F77_zgemv czgemv_
+ #define F77_zgbmv czgbmv_
+ #define F77_ztrmv cztrmv_
+ #define F77_ztbmv cztbmv_
+ #define F77_ztpmv cztpmv_
+ #define F77_ztrsv cztrsv_
+ #define F77_ztbsv cztbsv_
+ #define F77_ztpsv cztpsv_
+/*
+ * Level 3 BLAS
+ */
+ #define F77_s3chke cs3chke_
+ #define F77_d3chke cd3chke_
+ #define F77_c3chke cc3chke_
+ #define F77_z3chke cz3chke_
+ #define F77_chemm cchemm_
+ #define F77_cherk ccherk_
+ #define F77_cher2k ccher2k_
+ #define F77_zhemm czhemm_
+ #define F77_zherk czherk_
+ #define F77_zher2k czher2k_
+ #define F77_sgemm csgemm_
+ #define F77_ssymm cssymm_
+ #define F77_ssyrk cssyrk_
+ #define F77_ssyr2k cssyr2k_
+ #define F77_strmm cstrmm_
+ #define F77_strsm cstrsm_
+ #define F77_dgemm cdgemm_
+ #define F77_dsymm cdsymm_
+ #define F77_dsyrk cdsyrk_
+ #define F77_dsyr2k cdsyr2k_
+ #define F77_dtrmm cdtrmm_
+ #define F77_dtrsm cdtrsm_
+ #define F77_cgemm ccgemm_
+ #define F77_csymm ccsymm_
+ #define F77_csyrk ccsyrk_
+ #define F77_csyr2k ccsyr2k_
+ #define F77_ctrmm cctrmm_
+ #define F77_ctrsm cctrsm_
+ #define F77_zgemm czgemm_
+ #define F77_zsymm czsymm_
+ #define F77_zsyrk czsyrk_
+ #define F77_zsyr2k czsyr2k_
+ #define F77_ztrmm cztrmm_
+ #define F77_ztrsm cztrsm_
+#elif defined(UPCASE)
+ #define F77_xerbla XERBLA
+/*
+ * Level 1 BLAS
+ */
+ #define F77_srotg SROTGTEST
+ #define F77_srotmg SROTMGTEST
+ #define F77_srot SROTTEST
+ #define F77_srotm SROTMTEST
+ #define F77_drotg DROTGTEST
+ #define F77_drotmg DROTMGTEST
+ #define F77_drot DROTTEST
+ #define F77_drotm DROTMTEST
+ #define F77_sswap SSWAPTEST
+ #define F77_scopy SCOPYTEST
+ #define F77_saxpy SAXPYTEST
+ #define F77_isamax ISAMAXTEST
+ #define F77_dswap DSWAPTEST
+ #define F77_dcopy DCOPYTEST
+ #define F77_daxpy DAXPYTEST
+ #define F77_idamax IDAMAXTEST
+ #define F77_cswap CSWAPTEST
+ #define F77_ccopy CCOPYTEST
+ #define F77_caxpy CAXPYTEST
+ #define F77_icamax ICAMAXTEST
+ #define F77_zswap ZSWAPTEST
+ #define F77_zcopy ZCOPYTEST
+ #define F77_zaxpy ZAXPYTEST
+ #define F77_izamax IZAMAXTEST
+ #define F77_sdot SDOTTESTSUB
+ #define F77_ddot DDOTTESTSUB
+ #define F77_dsdot DSDOTTEST
+ #define F77_sscal SSCALTEST
+ #define F77_dscal DSCALTEST
+ #define F77_cscal CSCALTEST
+ #define F77_zscal ZSCALTEST
+ #define F77_csscal CSSCALTEST
+ #define F77_zdscal ZDSCALTEST
+ #define F77_cdotu CDOTUTEST
+ #define F77_cdotc CDOTCTEST
+ #define F77_zdotu ZDOTUTEST
+ #define F77_zdotc ZDOTCTEST
+ #define F77_snrm2 SNRM2TESTSUB
+ #define F77_sasum SASUMTESTSUB
+ #define F77_dnrm2 DNRM2TESTSUB
+ #define F77_dasum DASUMTESTSUB
+ #define F77_scnrm2 SCNRM2TESTSUB
+ #define F77_scasum SCASUMTESTSUB
+ #define F77_dznrm2 DZNRM2TESTSUB
+ #define F77_dzasum DZASUMTESTSUB
+ #define F77_sdsdot SDSDOTTEST
+/*
+ * Level 2 BLAS
+ */
+ #define F77_s2chke CS2CHKE
+ #define F77_d2chke CD2CHKE
+ #define F77_c2chke CC2CHKE
+ #define F77_z2chke CZ2CHKE
+ #define F77_ssymv CSSYMV
+ #define F77_ssbmv CSSBMV
+ #define F77_sspmv CSSPMV
+ #define F77_sger CSGER
+ #define F77_ssyr CSSYR
+ #define F77_sspr CSSPR
+ #define F77_ssyr2 CSSYR2
+ #define F77_sspr2 CSSPR2
+ #define F77_dsymv CDSYMV
+ #define F77_dsbmv CDSBMV
+ #define F77_dspmv CDSPMV
+ #define F77_dger CDGER
+ #define F77_dsyr CDSYR
+ #define F77_dspr CDSPR
+ #define F77_dsyr2 CDSYR2
+ #define F77_dspr2 CDSPR2
+ #define F77_chemv CCHEMV
+ #define F77_chbmv CCHBMV
+ #define F77_chpmv CCHPMV
+ #define F77_cgeru CCGERU
+ #define F77_cgerc CCGERC
+ #define F77_cher CCHER
+ #define F77_chpr CCHPR
+ #define F77_cher2 CCHER2
+ #define F77_chpr2 CCHPR2
+ #define F77_zhemv CZHEMV
+ #define F77_zhbmv CZHBMV
+ #define F77_zhpmv CZHPMV
+ #define F77_zgeru CZGERU
+ #define F77_zgerc CZGERC
+ #define F77_zher CZHER
+ #define F77_zhpr CZHPR
+ #define F77_zher2 CZHER2
+ #define F77_zhpr2 CZHPR2
+ #define F77_sgemv CSGEMV
+ #define F77_sgbmv CSGBMV
+ #define F77_strmv CSTRMV
+ #define F77_stbmv CSTBMV
+ #define F77_stpmv CSTPMV
+ #define F77_strsv CSTRSV
+ #define F77_stbsv CSTBSV
+ #define F77_stpsv CSTPSV
+ #define F77_dgemv CDGEMV
+ #define F77_dgbmv CDGBMV
+ #define F77_dtrmv CDTRMV
+ #define F77_dtbmv CDTBMV
+ #define F77_dtpmv CDTPMV
+ #define F77_dtrsv CDTRSV
+ #define F77_dtbsv CDTBSV
+ #define F77_dtpsv CDTPSV
+ #define F77_cgemv CCGEMV
+ #define F77_cgbmv CCGBMV
+ #define F77_ctrmv CCTRMV
+ #define F77_ctbmv CCTBMV
+ #define F77_ctpmv CCTPMV
+ #define F77_ctrsv CCTRSV
+ #define F77_ctbsv CCTBSV
+ #define F77_ctpsv CCTPSV
+ #define F77_zgemv CZGEMV
+ #define F77_zgbmv CZGBMV
+ #define F77_ztrmv CZTRMV
+ #define F77_ztbmv CZTBMV
+ #define F77_ztpmv CZTPMV
+ #define F77_ztrsv CZTRSV
+ #define F77_ztbsv CZTBSV
+ #define F77_ztpsv CZTPSV
+/*
+ * Level 3 BLAS
+ */
+ #define F77_s3chke CS3CHKE
+ #define F77_d3chke CD3CHKE
+ #define F77_c3chke CC3CHKE
+ #define F77_z3chke CZ3CHKE
+ #define F77_chemm CCHEMM
+ #define F77_cherk CCHERK
+ #define F77_cher2k CCHER2K
+ #define F77_zhemm CZHEMM
+ #define F77_zherk CZHERK
+ #define F77_zher2k CZHER2K
+ #define F77_sgemm CSGEMM
+ #define F77_ssymm CSSYMM
+ #define F77_ssyrk CSSYRK
+ #define F77_ssyr2k CSSYR2K
+ #define F77_strmm CSTRMM
+ #define F77_strsm CSTRSM
+ #define F77_dgemm CDGEMM
+ #define F77_dsymm CDSYMM
+ #define F77_dsyrk CDSYRK
+ #define F77_dsyr2k CDSYR2K
+ #define F77_dtrmm CDTRMM
+ #define F77_dtrsm CDTRSM
+ #define F77_cgemm CCGEMM
+ #define F77_csymm CCSYMM
+ #define F77_csyrk CCSYRK
+ #define F77_csyr2k CCSYR2K
+ #define F77_ctrmm CCTRMM
+ #define F77_ctrsm CCTRSM
+ #define F77_zgemm CZGEMM
+ #define F77_zsymm CZSYMM
+ #define F77_zsyrk CZSYRK
+ #define F77_zsyr2k CZSYR2K
+ #define F77_ztrmm CZTRMM
+ #define F77_ztrsm CZTRSM
+#elif defined(NOCHANGE)
+ #define F77_xerbla xerbla
+/*
+ * Level 1 BLAS
+ */
+ #define F77_srotg srotgtest
+ #define F77_srotmg srotmgtest
+ #define F77_srot srottest
+ #define F77_srotm srotmtest
+ #define F77_drotg drotgtest
+ #define F77_drotmg drotmgtest
+ #define F77_drot drottest
+ #define F77_drotm drotmtest
+ #define F77_sswap sswaptest
+ #define F77_scopy scopytest
+ #define F77_saxpy saxpytest
+ #define F77_isamax isamaxtest
+ #define F77_dswap dswaptest
+ #define F77_dcopy dcopytest
+ #define F77_daxpy daxpytest
+ #define F77_idamax idamaxtest
+ #define F77_cswap cswaptest
+ #define F77_ccopy ccopytest
+ #define F77_caxpy caxpytest
+ #define F77_icamax icamaxtest
+ #define F77_zswap zswaptest
+ #define F77_zcopy zcopytest
+ #define F77_zaxpy zaxpytest
+ #define F77_izamax izamaxtest
+ #define F77_sdot sdottestsub
+ #define F77_ddot ddottestsub
+ #define F77_dsdot dsdottest
+ #define F77_sscal sscaltest
+ #define F77_dscal dscaltest
+ #define F77_cscal cscaltest
+ #define F77_zscal zscaltest
+ #define F77_csscal csscaltest
+ #define F77_zdscal zdscaltest
+ #define F77_cdotu cdotutest
+ #define F77_cdotc cdotctest
+ #define F77_zdotu zdotutest
+ #define F77_zdotc zdotctest
+ #define F77_snrm2 snrm2testsub
+ #define F77_sasum sasumtestsub
+ #define F77_dnrm2 dnrm2testsub
+ #define F77_dasum dasumtestsub
+ #define F77_scnrm2 scnrm2testsub
+ #define F77_scasum scasumtestsub
+ #define F77_dznrm2 dznrm2testsub
+ #define F77_dzasum dzasumtestsub
+ #define F77_sdsdot sdsdottest
+/*
+ * Level 2 BLAS
+ */
+ #define F77_s2chke cs2chke
+ #define F77_d2chke cd2chke
+ #define F77_c2chke cc2chke
+ #define F77_z2chke cz2chke
+ #define F77_ssymv cssymv
+ #define F77_ssbmv cssbmv
+ #define F77_sspmv csspmv
+ #define F77_sger csger
+ #define F77_ssyr cssyr
+ #define F77_sspr csspr
+ #define F77_ssyr2 cssyr2
+ #define F77_sspr2 csspr2
+ #define F77_dsymv cdsymv
+ #define F77_dsbmv cdsbmv
+ #define F77_dspmv cdspmv
+ #define F77_dger cdger
+ #define F77_dsyr cdsyr
+ #define F77_dspr cdspr
+ #define F77_dsyr2 cdsyr2
+ #define F77_dspr2 cdspr2
+ #define F77_chemv cchemv
+ #define F77_chbmv cchbmv
+ #define F77_chpmv cchpmv
+ #define F77_cgeru ccgeru
+ #define F77_cgerc ccgerc
+ #define F77_cher ccher
+ #define F77_chpr cchpr
+ #define F77_cher2 ccher2
+ #define F77_chpr2 cchpr2
+ #define F77_zhemv czhemv
+ #define F77_zhbmv czhbmv
+ #define F77_zhpmv czhpmv
+ #define F77_zgeru czgeru
+ #define F77_zgerc czgerc
+ #define F77_zher czher
+ #define F77_zhpr czhpr
+ #define F77_zher2 czher2
+ #define F77_zhpr2 czhpr2
+ #define F77_sgemv csgemv
+ #define F77_sgbmv csgbmv
+ #define F77_strmv cstrmv
+ #define F77_stbmv cstbmv
+ #define F77_stpmv cstpmv
+ #define F77_strsv cstrsv
+ #define F77_stbsv cstbsv
+ #define F77_stpsv cstpsv
+ #define F77_dgemv cdgemv
+ #define F77_dgbmv cdgbmv
+ #define F77_dtrmv cdtrmv
+ #define F77_dtbmv cdtbmv
+ #define F77_dtpmv cdtpmv
+ #define F77_dtrsv cdtrsv
+ #define F77_dtbsv cdtbsv
+ #define F77_dtpsv cdtpsv
+ #define F77_cgemv ccgemv
+ #define F77_cgbmv ccgbmv
+ #define F77_ctrmv cctrmv
+ #define F77_ctbmv cctbmv
+ #define F77_ctpmv cctpmv
+ #define F77_ctrsv cctrsv
+ #define F77_ctbsv cctbsv
+ #define F77_ctpsv cctpsv
+ #define F77_zgemv czgemv
+ #define F77_zgbmv czgbmv
+ #define F77_ztrmv cztrmv
+ #define F77_ztbmv cztbmv
+ #define F77_ztpmv cztpmv
+ #define F77_ztrsv cztrsv
+ #define F77_ztbsv cztbsv
+ #define F77_ztpsv cztpsv
+/*
+ * Level 3 BLAS
+ */
+ #define F77_s3chke cs3chke
+ #define F77_d3chke cd3chke
+ #define F77_c3chke cc3chke
+ #define F77_z3chke cz3chke
+ #define F77_chemm cchemm
+ #define F77_cherk ccherk
+ #define F77_cher2k ccher2k
+ #define F77_zhemm czhemm
+ #define F77_zherk czherk
+ #define F77_zher2k czher2k
+ #define F77_sgemm csgemm
+ #define F77_ssymm cssymm
+ #define F77_ssyrk cssyrk
+ #define F77_ssyr2k cssyr2k
+ #define F77_strmm cstrmm
+ #define F77_strsm cstrsm
+ #define F77_dgemm cdgemm
+ #define F77_dsymm cdsymm
+ #define F77_dsyrk cdsyrk
+ #define F77_dsyr2k cdsyr2k
+ #define F77_dtrmm cdtrmm
+ #define F77_dtrsm cdtrsm
+ #define F77_cgemm ccgemm
+ #define F77_csymm ccsymm
+ #define F77_csyrk ccsyrk
+ #define F77_csyr2k ccsyr2k
+ #define F77_ctrmm cctrmm
+ #define F77_ctrsm cctrsm
+ #define F77_zgemm czgemm
+ #define F77_zsymm czsymm
+ #define F77_zsyrk czsyrk
+ #define F77_zsyr2k czsyr2k
+ #define F77_ztrmm cztrmm
+ #define F77_ztrsm cztrsm
+#endif
+
+void get_transpose_type(char *type, enum CBLAS_TRANSPOSE *trans); /* NOTE(review): presumably decodes the character flag at type into *trans; definition not in this header -- confirm */
+void get_uplo_type(char *type, enum CBLAS_UPLO *uplo); /* NOTE(review): presumably the same decoding for the upper/lower flag, result in *uplo -- confirm */
+void get_diag_type(char *type, enum CBLAS_DIAG *diag); /* NOTE(review): presumably the same decoding for the diagonal flag, result in *diag -- confirm */
+void get_side_type(char *type, enum CBLAS_SIDE *side); /* NOTE(review): presumably the same decoding for the left/right flag, result in *side -- confirm */
+
+#endif /* CBLAS_TEST_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/clapack.h b/kaldi_io/src/tools/ATLAS/include/clapack.h
new file mode 100644
index 0000000..c5dde3f
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/clapack.h
@@ -0,0 +1,149 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef CLAPACK_H
+
+#define CLAPACK_H
+#include "cblas.h"
+
+#ifndef ATLAS_ORDER /* fallback: when ATLAS_ORDER is not already a macro, "enum ATLAS_ORDER" in the prototypes below reads as "enum CBLAS_ORDER" */
+ #define ATLAS_ORDER CBLAS_ORDER
+#endif
+#ifndef ATLAS_UPLO /* same fallback for the enum ATLAS_UPLO parameters */
+ #define ATLAS_UPLO CBLAS_UPLO
+#endif
+#ifndef ATLAS_DIAG /* same fallback for the enum ATLAS_DIAG parameters */
+ #define ATLAS_DIAG CBLAS_DIAG
+#endif
+int clapack_sgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ float *A, const int lda, int *ipiv,
+ float *B, const int ldb);
+int clapack_sgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ float *A, const int lda, int *ipiv);
+int clapack_sgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const float *A, const int lda,
+ const int *ipiv, float *B, const int ldb);
+int clapack_sgetri(const enum CBLAS_ORDER Order, const int N, float *A,
+ const int lda, const int *ipiv);
+int clapack_sposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, float *A, const int lda,
+ float *B, const int ldb);
+int clapack_spotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_spotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const float *A, const int lda,
+ float *B, const int ldb);
+int clapack_spotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_slauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_strtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, float *A, const int lda);
+
+int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ double *A, const int lda, int *ipiv,
+ double *B, const int ldb);
+int clapack_dgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ double *A, const int lda, int *ipiv);
+int clapack_dgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const double *A, const int lda,
+ const int *ipiv, double *B, const int ldb);
+int clapack_dgetri(const enum CBLAS_ORDER Order, const int N, double *A,
+ const int lda, const int *ipiv);
+int clapack_dposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, double *A, const int lda,
+ double *B, const int ldb);
+int clapack_dpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const double *A, const int lda,
+ double *B, const int ldb);
+int clapack_dpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dlauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dtrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, double *A, const int lda);
+
+int clapack_cgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ void *A, const int lda, int *ipiv,
+ void *B, const int ldb);
+int clapack_cgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ void *A, const int lda, int *ipiv);
+int clapack_cgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const void *A, const int lda,
+ const int *ipiv, void *B, const int ldb);
+int clapack_cgetri(const enum CBLAS_ORDER Order, const int N, void *A,
+ const int lda, const int *ipiv);
+int clapack_cposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, void *A, const int lda,
+ void *B, const int ldb);
+int clapack_cpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_cpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const void *A, const int lda,
+ void *B, const int ldb);
+int clapack_cpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_clauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_ctrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, void *A, const int lda);
+
+int clapack_zgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ void *A, const int lda, int *ipiv,
+ void *B, const int ldb);
+int clapack_zgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ void *A, const int lda, int *ipiv);
+int clapack_zgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const void *A, const int lda,
+ const int *ipiv, void *B, const int ldb);
+int clapack_zgetri(const enum CBLAS_ORDER Order, const int N, void *A,
+ const int lda, const int *ipiv);
+int clapack_zposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, void *A, const int lda,
+ void *B, const int ldb);
+int clapack_zpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_zpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const void *A, const int lda,
+ void *B, const int ldb);
+int clapack_zpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_zlauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_ztrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, void *A, const int lda);
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h
new file mode 100644
index 0000000..118d3de
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h
@@ -0,0 +1,188 @@
+/*
+ * Preamble shared by the SSE gemv/ger kernels. Behaviour is selected purely by
+ * macros defined before this header is included: GER, NDPM, BLC, the type
+ * selectors SREAL/DREAL/SCPLX/DCPLX and the BETAX/BETAXI0 selectors are all
+ * consulted here.
+ */
+/* A GER build always uses the NO_TRANSPOSE configuration. */
+#ifdef GER
+#undef NO_TRANSPOSE
+#define NO_TRANSPOSE
+#endif
+
+
+/* Only the EXT1..EXT4 variants are generated below, hence the cap. */
+#if NDPM > 4
+#error Max NDPM is 4
+#endif
+
+/* SREAL/SCPLX builds need ATL_SSE1; DREAL/DCPLX builds need ATL_SSE2. */
+#if !defined(ATL_SSE1) && ( defined(SREAL) || defined(SCPLX) )
+#error This routine needs ATL_SSE1 defined
+#endif
+
+#if !defined(ATL_SSE2) && ( defined(DREAL) || defined(DCPLX) )
+#error This routine needs ATL_SSE2 defined
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "camm_util.h"
+
+/* camm_scale.h is pulled in only for non-GER builds with BETAX or BETAXI0. */
+#ifndef GER
+#if defined(BETAX) || defined(BETAXI0)
+#include "camm_scale.h"
+#endif
+#endif
+
+/*
+ * camm_dpa.h is included once per NDP value from NDPM down to 1. Before each
+ * inclusion NDP is set to k and EXT to <k>dp joined with BLC; EXTk keeps that
+ * suffix available after EXT is redefined for the next pass.
+ * (Mjoin is defined outside this file -- presumably token pasting.)
+ */
+#if NDPM >= 4
+#define EXT4 Mjoin(4dp,BLC)
+#undef NDP
+#define NDP 4
+#undef EXT
+#define EXT EXT4
+#include "camm_dpa.h"
+#endif
+
+#if NDPM >= 3
+#define EXT3 Mjoin(3dp,BLC)
+#undef NDP
+#define NDP 3
+#undef EXT
+#define EXT EXT3
+#include "camm_dpa.h"
+#endif
+
+#if NDPM >= 2
+#define EXT2 Mjoin(2dp,BLC)
+#undef NDP
+#define NDP 2
+#undef EXT
+#define EXT EXT2
+#include "camm_dpa.h"
+#endif
+
+/* The NDP == 1 variant is always generated. */
+#define EXT1 Mjoin(1dp,BLC)
+#undef NDP
+#define NDP 1
+#undef EXT
+#define EXT EXT1
+#include "camm_dpa.h"
+
+/*
+ * Final pass: NDP == NDPM again, but with a trailing "m" appended to the
+ * suffix. EXT is left at this value for the code that follows.
+ */
+#undef NDP
+#define NDP NDPM
+#undef EXT
+#define EXT Mjoin(Mjoin(NDP,Mjoin(dp,BLC)),m)
+#include "camm_dpa.h"
+
+/*
+ * Build the exported function name. Assuming Mjoin pastes its two arguments,
+ * a GER build yields ATL_<PREC>ger<IM>_a1_x1_yX, where IM is 1 for real types
+ * and 1c / 1u for complex types with / without Conj_ defined.
+ */
+#ifdef GER
+#if defined(SCPLX) || defined(DCPLX)
+#ifdef Conj_
+#define IM 1c
+#else
+#define IM 1u
+#endif
+#else
+#define IM 1
+#endif
+
+
+#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),Mjoin(ger,IM)),_a1_x1_yX)
+
+#undef MY_FUNCTION
+#define MY_FUNCTION FN
+
+/*
+ * GER signature: c and b are const, a (leading dimension lda) is the only
+ * non-const array. alpha and cinc are not referenced in the body below.
+ */
+void
+MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *c,
+ int cinc,const TYPE *b,int binc,
+ TYPE *a,int lda) {
+
+#else
+
+
+/* Non-GER build: the name becomes ATL_<PREC>gemv<FEXT>_a1_x1_<BL>_y1. */
+#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),gemv),Mjoin(FEXT,Mjoin(_a1_x1_,Mjoin(BL,_y1))))
+
+#undef MY_FUNCTION
+#define MY_FUNCTION FN
+
+/*
+ * GEMV signature: a and b are const, c is the only non-const array. alpha,
+ * binc and cinc are not referenced in the body below.
+ */
+void
+MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *a,
+ int lda,const TYPE *b,int binc,
+ const SCALAR beta,TYPE *c,int cinc) {
+
+#endif
+
+ int i,mm,nn;
+ const TYPE *ae;
+/*
+ * len is the length handed to every dp call and w the number of lda-sized
+ * steps taken through a. zz names the vector pointer that is advanced in
+ * lock-step with a: b without transpose, c otherwise.
+ */
+#ifdef NO_TRANSPOSE
+ int len=m,w=n;
+#define zz b
+#else
+ int len=n,w=m;
+#define zz c
+#endif
+
+/* zz advances by binc per step in a GER build and by 1 otherwise. */
+#ifdef GER
+#define zzinc binc
+#else
+#define zzinc 1
+
+
+/*
+ * Non-GER only: prepare c before accumulating into it.
+ * NOTE(review): memset is used here although this header includes only
+ * <stdio.h> and <stdlib.h> directly; presumably <string.h> arrives through
+ * camm_util.h or the including source file -- confirm.
+ */
+#if defined(NO_TRANSPOSE) && defined(BETA0)
+ memset(c,0,m*sizeof(*c));
+#endif
+
+/* Complex builds pass beta itself to SCALE, real builds pass its address. */
+#if defined(BETAX) || defined(BETAXI0)
+#if defined(SCPLX) || defined(DCPLX)
+ SCALE(beta,c,m);
+#endif
+#if defined(SREAL) || defined(DREAL)
+ SCALE(&beta,c,m);
+#endif
+#endif
+
+#endif
+
+ /* ae is the end sentinel for a; nn is the stride passed to each dp call. */
+ ae=a+w*lda;
+ nn=STRIDE*lda;
+
+
+#if NDPM == 1
+ /* One dp call per lda step. */
+ for (;a<ae;a+=lda,zz+=zzinc)
+ Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len);
+
+#else
+
+ /*
+ * Main loop: while a full group of NDPM*nn elements remains, issue STRIDE
+ * calls of the NDPM-wide "m" variant at consecutive lda offsets, then skip
+ * the rest of the group so a has advanced by NDPM*nn in total.
+ */
+ while (a+NDPM*nn<=ae) {
+ for (i=0;i<STRIDE;i++,a+=lda,zz+=zzinc)
+ Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len);
+
+ a+=(NDPM-1)*nn;
+ zz+=(NDPM-1)*STRIDE*zzinc;
+ }
+
+ /*
+ * Tail: at most STRIDE further starting offsets. mm counts how many
+ * nn-spaced steps are left from the current offset and selects the
+ * EXT1..EXT4 variant of matching width.
+ */
+ for (i=0;a<ae && i<STRIDE;i++,a+=lda,zz+=zzinc) {
+
+ mm=(ae-a)/nn;
+#if STRIDE > 1
+ /* Round up when the remaining lda steps are not a multiple of STRIDE. */
+ if (((ae-a)/lda)%STRIDE)
+ mm++;
+#endif
+
+ if (mm == 1)
+ Mjoin(dp,EXT1)(a,nn,b,c,STRIDE*zzinc,len);
+
+#if ( NDPM == 2 && STRIDE > 1 ) || NDPM > 2
+ else if (mm == 2)
+ Mjoin(dp,EXT2)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+#if ( NDPM == 3 && STRIDE > 1 ) || NDPM > 3
+ else if (mm == 3)
+ Mjoin(dp,EXT3)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+#if ( NDPM == 4 && STRIDE > 1 ) || NDPM > 4
+ else if (mm == 4)
+ Mjoin(dp,EXT4)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+
+ }
+
+#endif
+
+}
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext
new file mode 100644
index 0000000..f7f9a0a
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext
@@ -0,0 +1,39 @@
+
+# Root of the source tree the headers are copied from (absolute path).
+topd = /home/whaley/atlas3.8/AtlasBase
+# Option strings for the "extract" tool. None of incs, ext, extF, extC or extM
+# is referenced by a rule in this file.
+incs = -def topd /home/whaley/atlas3.8/AtlasBase \
+ -def incd /home/whaley/atlas3.8/AtlasBase/Clint \
+ -def BASEdir /home/whaley/atlas3.8/AtlasBase/Antoine/ \
+ -def basd /home/whaley/atlas3.8/AtlasBase/Clint
+ext = extract
+extF = $(ext) -langF -lnlen71 -Remtblank -llwarn2 -LAPACK1 $(incs)
+extC = $(ext) -langC -lnlen79 -Remtblank -llwarn2 $(incs)
+extM = $(ext) -langM -lnlen79 -llwarn2 $(incs)
+
+default: all
+# force_build has neither prerequisites nor a recipe.
+force_build:
+# basd, basdRCW, basdAPP and incf are likewise unused by the rules below.
+basd = /home/whaley/atlas3.8/AtlasBase/Clint
+basdRCW = /home/whaley/atlas3.8/AtlasBase/Clint
+basdAPP = /home/whaley/atlas3.8/AtlasBase/Antoine
+incf = /home/whaley/atlas3.8/AtlasBase/gen.inc
+
+files = ATL_gemv_ger_SSE.h SSE3Dnow.h camm_dpa.h camm_pipe3.h camm_scale.h \
+ camm_strat1.h camm_tpipe.h camm_util.h
+
+all : $(files)
+
+# Each header is refreshed by copying it from its directory under
+# $(topd)/kernel into the current directory.
+camm_strat1.h : $(topd)/kernel/CammMaguire/camm_strat1.h
+ cp $(topd)/kernel/CammMaguire/camm_strat1.h .
+camm_tpipe.h : $(topd)/kernel/CammMaguire/camm_tpipe.h
+ cp $(topd)/kernel/CammMaguire/camm_tpipe.h .
+camm_pipe3.h : $(topd)/kernel/CammMaguire/camm_pipe3.h
+ cp $(topd)/kernel/CammMaguire/camm_pipe3.h .
+ATL_gemv_ger_SSE.h : $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h
+ cp $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h .
+camm_util.h : $(topd)/kernel/CammMaguire/camm_util.h
+ cp $(topd)/kernel/CammMaguire/camm_util.h .
+camm_scale.h : $(topd)/kernel/CammMaguire/camm_scale.h
+ cp $(topd)/kernel/CammMaguire/camm_scale.h .
+camm_dpa.h : $(topd)/kernel/CammMaguire/camm_dpa.h
+ cp $(topd)/kernel/CammMaguire/camm_dpa.h .
+SSE3Dnow.h : $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h
+ cp $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h .
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h
new file mode 100644
index 0000000..a783749
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h
@@ -0,0 +1,709 @@
+/* Refuse to build unless ATL_GAS_x8632 or ATL_GAS_x8664 is defined. */
+#if !defined(ATL_GAS_x8632) && !defined(ATL_GAS_x8664)
+ #error "This kernel requires gas x86 assembler!"
+#endif
+/*
+ * Two-level stringification: Mstr lets its argument be macro-expanded (for
+ * example a regN alias) before Mstr2 applies the # operator to it.
+ */
+#ifndef Mstr /* Added by RCW to make multiline macros work */
+ #define Mstr2(m) # m
+ #define Mstr(m) Mstr2(m)
+#endif
+/* The meaning of the defined macros is as follows:
+ * VECLEN: The length of a singleprecision vector register
+ * vec_add: Add two single precision vectors.
+ * vec_mul: Multiply two single precision vectors.
+ * vec_mov: Moves data around
+ * vec_mov1: Load one element in a vector and zero all other entries!
+ * vec_splat: Load one element replicated in all positions in the vector.
+ * vec_load_apart: Load elements from different memory positions into a register.
+ * vec_sum: Sums a register.
+ * vec_store_one: Stores lowest element in vector to memory, no zero-extend!
+ * Meaning of suffixes is as follows:
+ * mr means memory to register
+ * rr means register to register
+ * rm means register to memory
+ * a means that instruction needs aligned data
+ * 1 means that the instructions only operates on the lowest element of the
+ * vector.
+ *
+ * The _1 instructions work under one important assumption: That you never mix
+ * them with regular instructions, e.g. loading into a register with a normal
+ * mov, and then using add_rr_1 will not work under 3dnow! since it is in
+ * reality a normal add. However, if using a mov_1 first, the upper part of
+ * the register will be zeroed, and it will therefore work. The _1 system is
+ * more robust under SSE, but other architectures might be implemented the
+ * same way as 3dnow!
+ *
+ * RCW: I added the following functionality for SSE only (note that vw may
+ * be overwritten with intermediate results, but is not used as input,
+ * and that all input arrays may be overwritten with intermediate results.
+ * VL : vector length -1):
+ * vec_red(vd, vw) : vd[0] = sum(vd[0:VL])
+ * vec_red2(v1, v2, vw) : v1[0] = sum(v1[0:VL]); v1[1] = sum(v2[0:VL])
+ * vec_red4(v0, v1, v2, v3, vw1, vw2) :
+ * v0[0] = sum(v0[0:VL]); v0[1] = sum(v1[0:VL])
+ * if type = double:
+ * v2[0] = sum(v2[0:VL]); v2[1] = sum(v3[0:VL])
+ * else
+ * v0[2] = sum(v2[0:VL]); v0[3] = sum(v3[0:VL])
+ * vec_zero(vd) : vd[0:VL] = 0.0
+ */
+
+
+/* Things to try:
+ * Non-temporal stores
+ * Sequences of instructions instead of movups
+ *
+ *
+ *
+ *
+ */
+
+
+
+/*
+ * Emit a single two-operand instruction "op reg1, reg2". All three arguments
+ * are stringified with #, so they must already be fully expanded when they
+ * reach this macro, and the asm statement declares no outputs, inputs or
+ * clobbers to the compiler.
+ */
+#define gen_vec_rr(op,reg1,reg2) \
+ __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \
+ : /* nothing */ \
+ : /* nothing */)
+
+
+/* Identity macro: expands to its argument unchanged. */
+#define w(p) p
+
+/* Each of the following emits exactly the quoted assembler text. */
+#define nop() __asm__ __volatile__ ("nop")
+
+#define rep() __asm__ __volatile__ ("rep")
+
+#define align() __asm__ __volatile__ (".align 16")
+
+
+/*
+ * x87 stack helpers, compiled only when x87double is defined.
+ * Naming: t = top of stack (%st(0)), r = stack register, m = memory.
+ * Every memory form uses the "l" (64-bit double) mnemonic suffix.
+ */
+#ifdef x87double
+
+#define st0 %%st(0)
+#define st1 %%st(1)
+#define st2 %%st(2)
+#define st3 %%st(3)
+#define st4 %%st(4)
+#define st5 %%st(5)
+#define st6 %%st(6)
+#define st7 %%st(7)
+
+
+/* "op reg": one stack-register operand. */
+#define gen_stack_rt(op,reg) \
+ __asm__ __volatile__ (#op " " #reg \
+ : /* nothing */ \
+ : /* nothing */)
+
+/* "op %st(0), reg": top of stack as source, reg as destination. */
+#define gen_stack_tr(op,reg) \
+ __asm__ __volatile__ (#op " %%st(0)," #reg \
+ : \
+ : )
+
+
+/* "op reg1, reg2": two explicit stack-register operands. */
+#define gen_stack_rr(op,reg1,reg2) \
+ __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \
+ : /* nothing */ \
+ : /* nothing */)
+
+/* "op" with no operands. */
+#define gen_stack_t(op) \
+ __asm__ __volatile__ (#op \
+ : /* nothing */ \
+ : /* nothing */)
+
+
+/* "op mem" where mem[0] is declared as a memory output. */
+#define gen_stack_tm(op,mem) \
+ __asm__ __volatile__ (#op " %0" \
+ : "=m" (((mem)[0])) \
+ : )
+
+/* "op mem" where mem[0] is declared as a memory input. */
+#define gen_stack_mt(op,mem) \
+ __asm__ __volatile__ (#op " %0" \
+ : \
+ : "m" (((mem)[0])))
+
+
+#define stack_mov_mt_push(mem) gen_stack_mt(fldl,mem)
+
+#define stack_add_tr_pop(reg) gen_stack_tr(faddp,reg)
+#define stack_add_mt(mem) gen_stack_mt(faddl,mem)
+
+#define stack_mul_tr(reg) gen_stack_tr(fmul,reg)
+#define stack_mul_tr_pop(reg) gen_stack_tr(fmulp,reg)
+/* Memory operand needs the explicit "l" suffix, like fldl/faddl/fstpl. */
+#define stack_mul_mt(mem) gen_stack_mt(fmull,mem)
+
+#define stack_mov_tm_pop(mem) gen_stack_tm(fstpl,mem)
+
+#define stack_zero_push() gen_stack_t(fldz)
+
+#endif /* x87double */
+
+#ifdef SSE
+
+/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to
+ * load/store from misaligned addresses using movups at a cost of some cycles. Loading
+ * using mul/add must always be aligned. Alignment is 16 bytes.
+ * No muladd.
+ */
+
+
+
+/*
+ * Memory -> register form: emits "op mem, reg". Only operand %0 (mem[0])
+ * appears in the instruction text; mem[1]..mem[3] are listed as additional
+ * memory inputs, presumably so the compiler knows all four elements are read.
+ */
+#define gen_vec_mr(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, " #reg \
+ : /* nothing */ \
+ : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3])))
+
+
+/*
+ * Register -> memory form: emits "op reg, mem", with mem[0]..mem[3] declared
+ * as memory outputs (again only %0 is used in the instruction text).
+ */
+#define gen_vec_rm(op,reg,mem) \
+ __asm__ __volatile__ (#op " " #reg ", %0" \
+ : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \
+ : /* nothing */ )
+
+
+
+
+/* Number of elements in the "vector" type for this configuration. */
+#define VECLEN 4
+
+#define reg0 %%xmm0
+#define reg1 %%xmm1
+#define reg2 %%xmm2
+#define reg3 %%xmm3
+#define reg4 %%xmm4
+#define reg5 %%xmm5
+#define reg6 %%xmm6
+#define reg7 %%xmm7
+#ifdef ATL_GAS_x8664
+ #define reg8 %%xmm8
+ #define reg9 %%xmm9
+ #define reg10 %%xmm10
+ #define reg11 %%xmm11
+ #define reg12 %%xmm12
+ #define reg13 %%xmm13
+ #define reg14 %%xmm14
+ #define reg15 %%xmm15
+#endif
+
+/*
+ * Thin wrappers binding an SSE mnemonic to one of the generators.
+ * Suffix key (from the header comment of this file): mr = memory to register,
+ * rm = register to memory, rr = register to register, _a = aligned form,
+ * _1 = operates on the lowest element only.
+ */
+#define vec_mov_mr(mem,reg) gen_vec_mr(movups,mem,reg)
+#define vec_mov_rm(reg,mem) gen_vec_rm(movups,reg,mem)
+#define vec_mov_mr_a(mem,reg) gen_vec_mr(movaps,mem,reg)
+#define vec_mov_rm_a(reg,mem) gen_vec_rm(movaps,reg,mem)
+#define vec_mov_rr(reg1,reg2) gen_vec_rr(movaps,reg1,reg2)
+
+#define vec_add_mr_a(mem,reg) gen_vec_mr(addps,mem,reg)
+#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulps,mem,reg)
+
+/* Both operands are registers here even though the first is named "mem". */
+#define vec_add_rr(mem,reg) gen_vec_rr(addps,mem,reg)
+#define vec_mul_rr(mem,reg) gen_vec_rr(mulps,mem,reg)
+
+#define vec_mov_mr_1(mem,reg) gen_vec_mr(movss,mem,reg)
+#define vec_mov_rm_1(reg,mem) gen_vec_rm(movss,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movss,reg1,reg2)
+
+#define vec_add_mr_1(mem,reg) gen_vec_mr(addss,mem,reg)
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addss,reg1,reg2)
+
+#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulss,mem,reg)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulss,reg1,reg2)
+
+#define vec_unpack_low(reg1,reg2) gen_vec_rr(unpcklps,reg1,reg2)
+#define vec_unpack_high(reg1,reg2) gen_vec_rr(unpckhps,reg1,reg2)
+/*
+ * shufps with a caller-supplied mode. The extra _wrap level lets regN aliases
+ * expand before they are stringified. mode is pasted into the instruction
+ * text verbatim.
+ */
+#define vec_shuffle(mode,reg1,reg2) vec_shuffle_wrap(mode,reg1,reg2)
+#define vec_shuffle_wrap(mode,reg1,reg2) \
+ __asm__ __volatile__ ("shufps " #mode ", " #reg1 ", " #reg2 \
+ : /* nothing */\
+ : /* nothing */)
+
+/* Hack! */
+/* To use this instruction be sure that register 7 is not in use!!! */
+/* It must be possible to reduce this sequence to only four instructions.
+ * please tell me how! */
+/*
+ * Horizontal sum of the four floats in reg; the result is left in the lowest
+ * element of reg. %xmm7 is hard-coded as scratch and is overwritten without
+ * being declared as a clobber.
+ */
+#define vec_sum(reg) vec_sum_wrap(reg)
+#define vec_sum_wrap(reg) \
+ __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\
+ "addps " #reg ", %%xmm7\n"\
+ "movaps %%xmm7, " #reg "\n"\
+ "shufps $1, " #reg ", %%xmm7\n"\
+ "addss %%xmm7, " #reg "\n"\
+ : /* nothing */\
+ : /* nothing */)
+
+/* RCW: added to safely replace vec_sum (vec reduce), and use SSE3 when avail */
+/* vec_zero clears vd by xor-ing it with itself. */
+#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::)
+/*
+ * Three implementations of vec_red follow, chosen by ATL_SSE3 / ATL_SSE2 /
+ * neither. The SSE3 variants never touch their work-register arguments.
+ */
+#ifdef ATL_SSE3
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__("haddps " Mstr(vr) ", " Mstr(vr) "\n"\
+ "haddps " Mstr(vr) ", " Mstr(vr) "\n" ::)
+/*
+ * haddps v1 v0 # v0 = {v1cd, v1ab, v0cd, v0ab}
+ * haddps v0 v0 # v0 = {v1abcd, v0abcd, v1abcd, v0abcd}
+ */
+ #define vec_red2(v0, v1, vwork) \
+ __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\
+ "haddps " Mstr(v0) ", " Mstr(v0) "\n" ::)
+/*
+ * haddps v1, v0 # v0 = {v1cd,v1ab,v0cd,v0ab}
+ * haddps v3, v2 # v2 = {v3cd,v3ab,v2cd,v2ab}
+ * haddps v2, v0 # v0 = {v3abcd,v2abcd,v1abcd, v0abcd}
+ */
+ #define vec_red4(v0, v1, v2, v3, w0, w1) \
+ __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\
+ "haddps " Mstr(v3) ", " Mstr(v2) "\n"\
+ "haddps " Mstr(v2) ", " Mstr(v0) "\n" ::)
+#elif defined(ATL_SSE2)
+/* SSE2: fold the upper half onto the lower, then element 1 onto element 0. */
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\
+ "pshufd $0xE5, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\
+ ::)
+#else
+/* SSE1 only: same reduction built from movhlps/shufps instead of pshufd. */
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__ ("movhlps " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\
+ "movaps " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "shufps $0xE5, " Mstr(vr) ", " Mstr(vr) "\n"\
+ "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\
+ ::)
+#endif
+#ifndef ATL_SSE3 /* codes that are the same for SSE2 and SSE1 */
+/*
+ # v0 = {v0d,v0c,v0b,v0a}
+ # v1 = {v1d,v1c,v1b,v1a}
+ movaps v0, vw # vw = {v0d,v0c,v0b,v0a}
+ unpcklps v1, v0 # v0 = {v1b,v0b,v1a,v0a}
+ unpckhps v1, vw # vw = {v1d,v0d,v1c,v0c}
+ addps vw, v0 # v0 = {v1bd,v0bd,v1ac,v0ac}
+ movhlps v0, vw # vw = {X , X,v1bd,v0bd}
+ addps vw, v0 # v0 = {X , X,v1abcd,v0abcd}
+*/
+ #define vec_red2(v0, v1, vw) \
+ __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(vw) "\n"\
+ "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\
+ "unpckhps " Mstr(v1) ", " Mstr(vw) "\n"\
+ "addps " Mstr(vw) ", " Mstr(v0) "\n"\
+ "movhlps " Mstr(v0) ", " Mstr(vw) "\n"\
+ "addps " Mstr(vw) ", " Mstr(v0) "\n"\
+ ::)
+/*
+ * movaps v0, w0 # w0 = {v0d, v0c, v0b, v0a}
+ * unpcklps v1, v0 # v0 = {v1b, v0b, v1a, v0a}
+ * movaps v2, w1 # w1 = {v2d, v2c, v2b, v2a}
+ * unpckhps v1, w0 # w0 = {v1d, v0d, v1c, v0c}
+ * unpcklps v3, v2 # v2 = {v3b, v2b, v3a, v2a}
+ * addps w0, v0 # v0 = {v1bd, v0bd, v1ac, v0ac}
+ * unpckhps v3, w1 # w1 = {v3d, v2d, v3c, v2c}
+ * movaps v0, w0 # w0 = {v1bd, v0bd, v1ac, v0ac}
+ * addps w1, v2 # v2 = {v3bd, v2bd, v3ac, v2ac}
+ * shufps $0x44,v2,v0 # v0 = {v3ac, v2ac, v1ac, v0ac}
+ * shufps $0xEE,v2,w0 # w0 = {v3bd, v2bd, v1bd, v0bd}
+ * addps w0, v0 # v0 = {v3abcd, v2abcd, v1abcd, v0abcd}
+ */
+ #define vec_red4(v0, v1, v2, v3, w0, w1) \
+ __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(w0) "\n"\
+ "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\
+ "movaps " Mstr(v2) ", " Mstr(w1) "\n"\
+ "unpckhps " Mstr(v1) ", " Mstr(w0) "\n"\
+ "unpcklps " Mstr(v3) ", " Mstr(v2) "\n"\
+ "addps " Mstr(w0) ", " Mstr(v0) "\n"\
+ "unpckhps " Mstr(v3) ", " Mstr(w1) "\n"\
+ "movaps " Mstr(v0) ", " Mstr(w0) "\n"\
+ "addps " Mstr(w1) ", " Mstr(v2) "\n"\
+ "shufps $0x44, " Mstr(v2) ", " Mstr(v0) "\n"\
+ "shufps $0xEE, " Mstr(v2) ", " Mstr(w0) "\n"\
+ "addps " Mstr(w0) ", " Mstr(v0) "\n"\
+ ::)
+#endif
+
+/*
+ * Load mem[0] into the lowest element of reg, then duplicate it into all four
+ * positions (unpcklps copies it to element 1, movlhps copies the low pair to
+ * the high pair). Only mem[0] is declared as an input.
+ */
+#define vec_splat(mem,reg) vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+ __asm__ __volatile__ ("movss %0, " #reg "\n"\
+ "unpcklps " #reg ", " #reg "\n"\
+ "movlhps " #reg ", " #reg "\n"\
+ : /* nothing */ \
+ : "m" ((mem)[0]))
+
+
+/* This instruction sequence appears courtesy of Camm Maguire. */
+/*
+ * Same instruction sequence as the non-SSE3 vec_red4, except that the final
+ * sums are written to regout. reg0, reg2, empty0 and empty1 are overwritten
+ * along the way.
+ */
+#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1)
+#define vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) \
+ __asm__ __volatile__ ("movaps " #reg0 "," #empty0 "\n"\
+ "unpcklps " #reg1 "," #reg0 "\n"\
+ "movaps " #reg2 "," #empty1 "\n"\
+ "unpckhps " #reg1 "," #empty0 "\n"\
+ "unpcklps " #reg3 "," #reg2 "\n"\
+ "addps " #empty0 "," #reg0 "\n"\
+ "unpckhps " #reg3 "," #empty1 "\n"\
+ "movaps " #reg0 "," #regout "\n"\
+ "addps " #empty1 "," #reg2 "\n"\
+ "shufps $0x44," #reg2 "," #reg0 "\n"\
+ "shufps $0xee," #reg2 "," #regout "\n"\
+ "addps " #reg0 "," #regout "\n"\
+ : /* nothing */ \
+ : /* nothing */)
+
+
+
+/* VECLEN floats; the SSE2 and 3DNow! sections define the same name differently. */
+typedef float vector[VECLEN];
+
+#endif /* end ifdef SSE */
+
+
+#ifdef SSE2
+
+/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to
+ * load/store from misaligned addresses using movups at a cost of some cycles. Loading
+ * using mul/add must always be aligned. Alignment is 16 bytes.
+ * No muladd.
+ */
+
+
+
+/*
+ * Memory -> register form for two-element vectors: emits "op mem, reg".
+ * Only %0 (mem[0]) appears in the instruction text; mem[1] is listed as a
+ * second memory input.
+ */
+#define gen_vec_mr(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, " #reg \
+ : /* nothing */ \
+ : "m" (((mem)[0])), "m" (((mem)[1])))
+
+
+/* Register -> memory form: "op reg, mem" with mem[0] and mem[1] as outputs. */
+#define gen_vec_rm(op,reg,mem) \
+ __asm__ __volatile__ (#op " " #reg ", %0" \
+ : "=m" (((mem)[0])), "=m" (((mem)[1])) \
+ : /* nothing */ )
+
+
+
+
+/* Number of elements in the "vector" type for this configuration. */
+#define VECLEN 2
+
+#define reg0 %%xmm0
+#define reg1 %%xmm1
+#define reg2 %%xmm2
+#define reg3 %%xmm3
+#define reg4 %%xmm4
+#define reg5 %%xmm5
+#define reg6 %%xmm6
+#define reg7 %%xmm7
+#ifdef ATL_GAS_x8664
+ #define reg8 %%xmm8
+ #define reg9 %%xmm9
+ #define reg10 %%xmm10
+ #define reg11 %%xmm11
+ #define reg12 %%xmm12
+ #define reg13 %%xmm13
+ #define reg14 %%xmm14
+ #define reg15 %%xmm15
+#endif
+
+
+/*
+ * Double-precision counterparts of the SSE wrappers: the same macro names,
+ * bound to the pd/sd mnemonics instead of ps/ss.
+ */
+#define vec_mov_mr(mem,reg) gen_vec_mr(movupd,mem,reg)
+#define vec_mov_rm(reg,mem) gen_vec_rm(movupd,reg,mem)
+#define vec_mov_mr_a(mem,reg) gen_vec_mr(movapd,mem,reg)
+#define vec_mov_rm_a(reg,mem) gen_vec_rm(movapd,reg,mem)
+#define vec_mov_rr(reg1,reg2) gen_vec_rr(movapd,reg1,reg2)
+
+#define vec_add_mr_a(mem,reg) gen_vec_mr(addpd,mem,reg)
+#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulpd,mem,reg)
+
+/* Both operands are registers here even though the first is named "mem". */
+#define vec_add_rr(mem,reg) gen_vec_rr(addpd,mem,reg)
+#define vec_mul_rr(mem,reg) gen_vec_rr(mulpd,mem,reg)
+
+#define vec_mov_mr_1(mem,reg) gen_vec_mr(movsd,mem,reg)
+#define vec_mov_rm_1(reg,mem) gen_vec_rm(movsd,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movsd,reg1,reg2)
+
+#define vec_add_mr_1(mem,reg) gen_vec_mr(addsd,mem,reg)
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addsd,reg1,reg2)
+
+#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulsd,mem,reg)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulsd,reg1,reg2)
+
+/* Load mem[0] into the low element, then copy it to the high element. */
+#define vec_splat(mem,reg) vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+ __asm__ __volatile__ ("movsd %0, " #reg "\n"\
+ "unpcklpd " #reg ", " #reg \
+ : /* nothing */ \
+ : "m" ((mem)[0]))
+
+/* Hack! */
+/* To use this instruction be sure that register 7 is not in use!!! */
+/*
+ * Adds the high element of reg to its low element, using %xmm7 as scratch
+ * without declaring it as a clobber. Only the low element of the result is
+ * meaningful: the high half of %xmm7 is whatever it held before.
+ */
+#define vec_sum(reg) vec_sum_wrap(reg)
+#define vec_sum_wrap(reg) \
+ __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\
+ "addpd %%xmm7, " #reg "\n"\
+ : /* nothing */\
+ : /* nothing */)
+/*
+ * Added by RCW to improve performance and avoid xmm7 hack (replace vec_sum)
+ */
+/* vec_zero clears vd by xor-ing it with itself. */
+#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::)
+/* With SSE3 each reduction is a haddpd; the work registers are not touched. */
+#ifdef ATL_SSE3
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__("haddpd " Mstr(vr) ", " Mstr(vr) "\n" ::)
+ #define vec_red2(v0, v1, vw) \
+ __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n" ::)
+ #define vec_red4(v0, v1, v2, v3, w0, w1) \
+ __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n"\
+ "haddpd " Mstr(v3) ", " Mstr(v2) "\n"\
+ ::)
+#else
+/* Without SSE3: copy the high element into vwrk, then add it to the low one. */
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "addsd " Mstr(vwrk) ", " Mstr(vr) "\n" ::)
+/*
+ * movapd v0, vw # vw = {v0b, v0a}
+ * unpcklpd v1,v0 # v0 = {v1a, v0a}
+ * unpckhpd v1, vw # vw = {v1b, v0b}
+ * addpd vw, v0 # v0 = {v1ab,v0ab}
+ */
+ #define vec_red2(v0, v1, vw) \
+ __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(vw) "\n"\
+ "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\
+ "unpckhpd " Mstr(v1) ", " Mstr(vw) "\n"\
+ "addpd " Mstr(vw) ", " Mstr(v0) "\n"\
+ ::)
+/*
+ * movapd v0, w0 # w0 = {v0b, v0a}
+ * movapd v2, w1 # w1 = {v2b, v2a}
+ * unpcklpd v1, v0 # v0 = {v1a, v0a}
+ * unpcklpd v3, v2 # v2 = {v3a, v2a}
+ * unpckhpd v1, w0 # w0 = {v1b, v0b}
+ * unpckhpd v3, w1 # w1 = {v3b, v2b}
+ * addpd w0, v0 # v0 = {v1ab, v0ab}
+ * addpd w1, v2 # v2 = {v3ab, v2ab}
+ */
+ #define vec_red4(v0, v1, v2, v3, w0, w1) \
+ __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(w0) "\n"\
+ "movapd " Mstr(v2) ", " Mstr(w1) "\n"\
+ "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\
+ "unpcklpd " Mstr(v3) ", " Mstr(v2) "\n"\
+ "unpckhpd " Mstr(v1) ", " Mstr(w0) "\n"\
+ "unpckhpd " Mstr(v3) ", " Mstr(w1) "\n"\
+ "addpd " Mstr(w0) ", " Mstr(v0) "\n"\
+ "addpd " Mstr(w1) ", " Mstr(v2) "\n"\
+ ::)
+#endif
+
+/*
+ * Builds a copy of reg2 with its two elements swapped in empty1 (movhlps
+ * moves the high element down, movlhps moves the low element up) and adds
+ * that to reg1. reg2 itself is not modified.
+ * NOTE(review): the intended contract of the result in reg1 is not stated
+ * anywhere in this file -- confirm against the callers.
+ */
+#define vec_sum_full(reg1,reg2,empty1) vec_sum_full_wrap(reg1,reg2,empty1)
+#define vec_sum_full_wrap(reg1,reg2,empty1) \
+ __asm__ __volatile__ ("movhlps " #reg2 ", " #empty1 "\n"\
+ "movlhps " #reg2 ", " #empty1 "\n"\
+ "addpd " #empty1 ", " #reg1 "\n"\
+ : /* nothing */\
+ : /* nothing */)
+
+
+/* VECLEN doubles; the SSE and 3DNow! sections define the same name differently. */
+typedef double vector[VECLEN];
+
+#endif /* end ifdef SSE2 */
+
+
+#ifdef THREEDNOW
+
+/* Peculiarities of 3DNOW. Alignment is not an issue,
+ * all alignments are legal, however alignment gives a speed increase.
+ * The vec_acc instruction can be used to sum two registers at once more efficiently
+ * than a series of vec_sum and vec_store_one
+ * No muladd.
+ */
+
+
+/*
+ * Memory -> register form: emits "op mem, reg". Only %0 (mem[0]) appears in
+ * the instruction text; mem[1] is listed as a second memory input.
+ */
+#define gen_vec_mr(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, " #reg \
+ : /* nothing */ \
+ : "m" (((mem)[0])), "m" (((mem)[1])))
+
+/* Register -> memory form: "op reg, mem" with mem[0] and mem[1] as outputs. */
+#define gen_vec_rm(op,reg,mem) \
+ __asm__ __volatile__ (#op " " #reg ", %0" \
+ : "=m" (((mem)[0])), "=m" (((mem)[1])) \
+ : /* nothing */ )
+
+
+
+
+/* Number of elements in the "vector" type for this configuration. */
+#define VECLEN 2
+
+#define reg0 %%mm0
+#define reg1 %%mm1
+#define reg2 %%mm2
+#define reg3 %%mm3
+#define reg4 %%mm4
+#define reg5 %%mm5
+#define reg6 %%mm6
+#define reg7 %%mm7
+
+/* 3DNow! wrappers: packed-float mnemonics (pfadd/pfmul/pfacc) on %mm registers. */
+#define vec_add_mr(mem,reg) gen_vec_mr(pfadd,mem,reg)
+#define vec_mul_mr(mem,reg) gen_vec_mr(pfmul,mem,reg)
+#define vec_mov_mr(mem,reg) gen_vec_mr(movq,mem,reg)
+#define vec_mov_rm(reg,mem) gen_vec_rm(movq,reg,mem)
+#define vec_add_rr(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2)
+#define vec_mul_rr(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2)
+#define vec_acc_rr(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2)
+#define vec_mov_rr(reg1,reg2) gen_vec_rr(movq,reg1,reg2)
+
+/* Both sum forms are a single pfacc. */
+#define vec_sum(reg) gen_vec_rr(pfacc,reg,reg)
+#define vec_sum_full(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2)
+
+#define vec_mov_mr_1(mem,reg) gen_vec_mr(movd,mem,reg)
+#define vec_mov_rm_1(reg,mem) gen_vec_rm(movd,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movd,reg1,reg2)
+
+/*
+ * The _1 arithmetic forms are the ordinary packed instructions; see the
+ * header comment of this file for the restriction that follows from this.
+ */
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2)
+
+
+/* Load mem[0] into the low element, then copy it to the high element. */
+#define vec_splat(mem,reg) vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+ __asm__ __volatile__ ("movd %0, " #reg "\n"\
+ "punpckldq " #reg ", " #reg \
+ : /* nothing */ \
+ : "m" ((mem)[0]))
+
+
+/* Low element from mem1[0], high element from mem2[0]. */
+#define vec_load_apart(mem1,mem2,reg) vec_load_apart_wrap(mem1,mem2,reg)
+#define vec_load_apart_wrap(mem1,mem2,reg) \
+ __asm__ __volatile__ ("movd %0, " #reg "\n"\
+ "punpckldq %1, " #reg \
+ : /* nothing */ \
+ : "m" ((mem1)[0]), "m" (((mem2)[0])))
+
+
+#define vec_zero(reg) gen_vec_rr(pxor,reg,reg)
+
+/* Entry and exit both emit femms. */
+#define vec_enter() __asm__ __volatile__ ("femms")
+#define vec_exit() __asm__ __volatile__ ("femms")
+
+/* Textually identical to the align() defined near the top of this file. */
+#define align() __asm__ __volatile__ (".align 16")
+
+
+/* VECLEN floats; the SSE and SSE2 sections define the same name differently. */
+typedef float vector[VECLEN];
+
+#endif
+
+
+
+
+
+#ifdef ALTIVEC
+
+#define VECLEN 4
+
+#define reg0 %%vr0
+#define reg1 %%vr1
+#define reg2 %%vr2
+#define reg3 %%vr3
+#define reg4 %%vr4
+#define reg5 %%vr5
+#define reg6 %%vr6
+#define reg7 %%vr7
+#define reg8 %%vr8
+#define reg9 %%vr9
+#define reg10 %%vr10
+#define reg11 %%vr11
+#define reg12 %%vr12
+#define reg13 %%vr13
+#define reg14 %%vr14
+#define reg15 %%vr15
+#define reg16 %%vr16
+#define reg17 %%vr17
+#define reg18 %%vr18
+#define reg19 %%vr19
+#define reg20 %%vr20
+#define reg21 %%vr21
+#define reg22 %%vr22
+#define reg23 %%vr23
+#define reg24 %%vr24
+#define reg25 %%vr25
+#define reg26 %%vr26
+#define reg27 %%vr27
+#define reg28 %%vr28
+#define reg29 %%vr29
+#define reg30 %%vr30
+#define reg31 %%vr31
+
+/*
+ * Memory -> register form: emits "op mem, reg" with mem[0]..mem[3] declared
+ * as memory inputs (only %0 appears in the instruction text).
+ */
+#define gen_vec_mr(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, " #reg \
+ : /* nothing */ \
+ : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3])))
+
+
+/* Register -> memory form: "op reg, mem" with mem[0]..mem[3] as outputs. */
+#define gen_vec_rm(op,reg,mem) \
+ __asm__ __volatile__ (#op " " #reg ", %0" \
+ : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \
+ : /* nothing */ )
+
+
+/* Three-operand register instruction: "op reg1, reg2, regout". */
+#define gen_alti3(op,reg1,reg2,regout) \
+ __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout \
+ : /* nothing */ \
+ : /* nothing */)
+
+/*
+ * Four-operand form with regout repeated: "op reg1, reg2, regout, regout".
+ * No macro visible in this section references it.
+ */
+#define gen_alti_muladd(op,reg1,reg2,regout) \
+ __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout ", " #regout \
+ : /* nothing */ \
+ : /* nothing */)
+
+
+
+/*
+ * NOTE(review): "svx" does not look like an AltiVec mnemonic (the vector
+ * store is normally spelled stvx), and vec_muladd hands vmaddfp only three
+ * operands through gen_alti3 -- confirm whether this section is ever built.
+ */
+#define vec_mov_mr_a(mem,reg) gen_vec_mr(lvx,mem,reg)
+#define vec_mov_rm_a(reg,mem) gen_vec_rm(svx,reg,mem)
+#define vec_muladd(reg1,reg2,regout) gen_alti3(vmaddfp,reg1,reg2,regout)
+
+#define vec_zero(reg) gen_alti3(vxor,reg,reg,reg)
+
+
+typedef float vector[VECLEN];
+
+#endif
+
+
+#ifdef ALTIVEC_C
+
+/* These macros have been written by, or greatly inspired by,
+ * Nicholas A. Coult . Thanks.
+ */
+
+/* assumes that last four registers are not in use! */
+/*
+ * In-place transpose of the four vectors x0..x3 via two rounds of
+ * vec_mergeh/vec_mergel. reg28..reg31 are used as scratch; here they are
+ * assigned to as ordinary C identifiers that the caller must provide.
+ */
+#define transpose(x0,x1,x2,x3) \
+reg28 = vec_mergeh(x0,x2); \
+reg29 = vec_mergeh(x1,x3); \
+reg30 = vec_mergel(x0,x2); \
+reg31 = vec_mergel(x1,x3); \
+x0 = vec_mergeh(reg28,reg29); \
+x1 = vec_mergel(reg28,reg29); \
+x2 = vec_mergeh(reg30,reg31); \
+x3 = vec_mergel(reg30,reg31)
+
+/*
+ * Store v at "where" by loading the two vectors at offsets 0 and 16, merging
+ * v into them with a permute mask and storing both back. Uses the free
+ * identifiers low, high, p_vector and mask, which the caller must declare;
+ * v itself is overwritten with its permuted value.
+ */
+#define vec_mov_rm(v, where) \
+low = vec_ld(0, (where)); \
+high = vec_ld(16, (where)); \
+p_vector = vec_lvsr(0, (int *)(where)); \
+mask = vec_perm((vector unsigned char)(0), (vector unsigned char)(-1), p_vector); \
+v = vec_perm(v, v, p_vector); \
+low = vec_sel(low, v, mask); \
+high = vec_sel(v, high, mask); \
+vec_st(low, 0, (where)); \
+vec_st(high, 16, (where))
+
+/* Load through a single vec_ld at offset 0. */
+#define vec_mov_mr_a(mem,reg) reg = vec_ld(0, mem)
+
+/*
+ * Load u from v using two vec_ld calls (offsets 0 and 16) combined with
+ * vec_perm. Note the argument order: destination u first, address v second.
+ * Uses the caller-declared identifiers low, high and p_vector.
+ */
+#define vec_mov_mr(u,v) \
+p_vector = (vector unsigned char)vec_lvsl(0, (int*)(v)); \
+low = (vector unsigned char)vec_ld(0, (v)); \
+high = (vector unsigned char)vec_ld(16, (v)); \
+u=(vector float)vec_perm(low, high, p_vector)
+
+/* regout = reg1 * reg2 + regout, and reg2 = reg1 + reg2, via intrinsics. */
+#define vec_muladd(reg1,reg2,regout) regout = vec_madd(reg1,reg2,regout)
+#define vec_add_rr(reg1,reg2) reg2 = vec_add(reg1,reg2)
+
+#define vec_zero(reg) reg = vec_xor(reg,reg)
+
+/*
+ * Transpose reg0..reg3 in place, then add the four transposed vectors
+ * together into regout. empty0 and empty1 hold the two partial sums.
+ * transpose() takes exactly four arguments (it uses reg28..reg31 as its own
+ * scratch), so only reg0..reg3 are passed to it.
+ */
+#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) \
+transpose(reg0,reg1,reg2,reg3); \
+empty0 = vec_add(reg0,reg1); \
+empty1 = vec_add(reg2,reg3); \
+regout = vec_add(empty0,empty1)
+
+
+#endif /* ALTIVEC_C */
+
+
+
+
+
+
+
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h
new file mode 100644
index 0000000..af9c6b1
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h
@@ -0,0 +1,1626 @@
+#include <stdlib.h>
+#include <sys/time.h>
+#include <stdio.h>
+
+#include "camm_util.h"
+
+
+/*
+ * Compile-time sanity checks for the ALIGN variant (which makes plb use
+ * pla instead of pl further down): it is refused for the complex
+ * precisions, and for the real precisions unless NDPM is 1 or STRIDE is a
+ * multiple of 4 (SREAL) / 2 (DREAL).
+ */
+#if defined(ALIGN)
+#if( defined(SCPLX) || defined(DCPLX))
+#error Cannot align complex routines
+#endif
+#if defined(SREAL) && ( NDPM != 1 ) && ( STRIDE % 4 != 0)
+#error Can only align SREAL with NDPM 1 or STRIDE % 4 = 0
+#endif
+#if defined(DREAL) && ( NDPM != 1 ) && ( STRIDE % 2 != 0)
+#error Can only align DREAL with NDPM 1 or STRIDE % 2 = 0
+#endif
+#endif
+
+/******************************************************************************
+ * Single Precision Complex Macros
+ ******************************************************************************/
+
+#ifdef SCPLX
+
+#ifdef NO_TRANSPOSE
+
+#if NDPM > 3
+#error Max NDPM is 3 for SCPLX NO_TRANSPOSE
+#endif
+
+#undef plax
+#define plax
+
+#undef R1
+#define R1 2
+#undef R2
+#define R2 4
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#undef TREG
+#define TREG 1
+#undef SREG
+#define SREG 0
+#undef CREG
+#define CREG 0
+
+#ifdef GER
+#undef AREG
+#define AREG 0
+#undef targ
+#define targ(a_) AREG
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_) pud(AREG,a_,b_)
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#else
+#undef AREG
+#define AREG TREG
+#undef targ
+#define targ(a_) CREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef w
+#define w(a_) pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_) pud(CREG,a_ ## 0,si)
+#endif
+
+#undef src
+#define src(a_) a_
+#undef mpx
+#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(4,si,P(a_,1)) \
+ ps(0,P(a_,1),P(a_,1)) sign(a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef ulfa
+#define ulfa(a_)
+
+#else
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#undef TREG
+#define TREG 3
+#undef SREG
+#define SREG 2
+#undef CREG
+#define CREG 0
+#undef targ
+#define targ(a_) a_
+#undef src
+#define src(a_) 0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef mpx
+#define mpx(a_) px(a_)
+#ifdef BETA0
+#undef ulfa
+#define ulfa(a_) phl(a_,0) pa(0,a_) pud(a_,0,si)
+#else
+#undef ulfa
+#define ulfa(a_) pld(0,si,TREG) phl(a_,0) pa(0,a_) pa(TREG,a_) pud(a_,0,si)
+#endif
+#undef AREG
+#define AREG TREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+
+#undef plax
+#define plax pc(CREG,1) ps(160,CREG,CREG) ps(245,1,1) sign(CREG)
+
+
+
+#endif
+
+#if defined(Conj_) && ! defined(GER)
+#undef sign
+#define sign(a_) pm(SREG,a_)
+#else
+#undef sign
+#define sign(a_) pm(SREG,P(a_,1))
+#endif
+
+
+
+#undef plb
+#define plb(a_,b_) pl(a_,b_,AREG)
+#undef plbd
+#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG)
+
+#undef dpr
+#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dpi
+#define dpi(a_) pm(P(src(a_),1),TREG) ps(177,TREG,TREG) pa(TREG,targ(a_))
+
+#ifndef GER
+
+
+#undef plaa
+#define plaa(a_) pl(a_ ## 0,si,CREG) plax
+#undef wa
+#define wa(a_) w(a_)
+#undef dp
+#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax
+#undef wa1_2
+#define wa1_2(a_) w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_)
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+
+#else
+
+/* GER loaders into TREG from offset a_0 off si: lqc uses pl, lqc1_2 uses
+ * px followed by pld (presumably full- and half-width loads; the
+ * primitives come from camm_util.h).  Each #undef names the macro that is
+ * defined right after it, so re-including this header stays clean. */
+#undef lqc
+#define lqc(a_) pl(a_ ## 0,si,TREG)
+#undef lqc1_2
+#define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG)
+
+
+#undef plaa
+#define plaa(a_)
+#undef wa
+#define wa(a_)
+#undef dp
+#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \
+ lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+ lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_)
+#undef wa1_2
+#define wa1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \
+ lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+ lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_)
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#endif
+
+#endif
+
+/******************************************************************************
+ * Single Precision Real Macros
+ ******************************************************************************/
+
+#ifdef SREAL
+
+#ifdef NO_TRANSPOSE
+
+#undef mpx
+#define mpx(a_) pls(0,si,a_) ps(0,a_,a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef TREG
+#define TREG 1
+#undef targ
+#define targ(a_) 0
+#undef src
+#define src(a_) a_
+#undef ulfa
+#define ulfa(a_)
+
+#ifdef GER
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+#undef CREG
+#define CREG 2
+#undef AREG
+#define AREG 0
+#undef cp
+#define cp pc(CREG,TREG)
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_) pud(AREG,a_,b_)
+#undef wbs
+#define wbs(a_,b_) pus(AREG,a_,b_)
+#else
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+#undef w
+#define w(a_) pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_) pud(CREG,a_ ## 0,si)
+#undef w1_4
+#define w1_4(a_) pus(CREG,a_ ## 0,si)
+#endif
+
+#else
+
+#undef mpx
+#define mpx(a_) px(a_)
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#endif
+#undef TREG
+#define TREG 3
+#undef targ
+#define targ(a_) a_
+#undef src
+#define src(a_) 0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+/* ulfa(a_): finish accumulator a_ and write it to 0(si); madd is empty under BETA0, so the old value at 0(si) is only added otherwise */
+#undef ulfa
+#define ulfa(a_) phl(a_,0) pa(0,a_) pc(a_,0) ps(1,0,0) pa(0,a_) \
+ madd(0,si,a_) pus(a_,0,si)
+
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+#endif
+
+#if defined(ALIGN)
+#undef plb
+#define plb(a_,b_) pla(a_,b_,AREG)
+#else
+#undef plb
+#define plb(a_,b_) pl(a_,b_,AREG)
+#endif
+#undef plbd
+#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG)
+#undef plbs
+#define plbs(a_,b_) pls(a_,b_,AREG)
+#undef dpr
+#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprs
+#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+#undef dprps
+#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+
+#undef plaa
+#define plaa(a_) pl(a_ ## 0,si,CREG)
+#undef wa
+#define wa(a_) w(a_)
+#undef dp
+#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG)
+#undef wa1_2
+#define wa1_2(a_) w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dpr(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprp(c_,d_,e_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_)
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#undef plaa1_4
+#define plaa1_4(a_) pls(a_ ## 0,si,CREG)
+#undef wa1_4
+#define wa1_4(a_) w1_4(a_)
+#undef dp1_4
+#define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_)
+#undef dpp1_4
+#define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_)
+#undef ddp1_4
+#define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_)
+#undef ddpp1_4
+#define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_)
+
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#endif
+
+/******************************************************************************
+ * Double Precision Real Macros
+ ******************************************************************************/
+
+#ifdef DREAL
+
+#ifdef ATL_SSE2
+
+#ifdef NO_TRANSPOSE
+
+#undef mpx
+#define mpx(a_) pls(0,si,a_) ps(0,a_,a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef TREG
+#define TREG 1
+#undef targ
+#define targ(a_) 0
+#undef src
+#define src(a_) a_
+#undef ulfa
+#define ulfa(a_)
+
+#ifdef GER
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+#undef CREG
+#define CREG 2
+#undef AREG
+#define AREG 0
+#undef cp
+#define cp pc(CREG,TREG)
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_) pus(AREG,a_,b_)
+#undef wbs
+/* #define wbs(a_,b_) pus(AREG,a_,b_) */
+#else
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+/* #define wbs(a_,b_) */
+#undef w
+#define w(a_) pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_) pus(CREG,a_ ## 0,si)
+#undef w1_4
+/* #define w1_4(a_) pus(CREG,a_ ## 0,si) */
+#endif
+
+#else
+
+#undef mpx
+#define mpx(a_) px(a_)
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#endif
+#undef TREG
+#define TREG 3
+#undef targ
+#define targ(a_) a_
+#undef src
+#define src(a_) 0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+/* ulfa(a_): finish accumulator a_ and write it to 0(si); madd is empty under BETA0, so the old value at 0(si) is only added otherwise */
+#undef ulfa
+#define ulfa(a_) /* phl(a_,0) pa(0,a_) */ pc(a_,0) ps(1,0,0) pa(0,a_) \
+ madd(0,si,a_) pus(a_,0,si)
+
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+#endif
+
+#if defined(ALIGN)
+#undef plb
+#define plb(a_,b_) pla(a_,b_,AREG)
+#else
+#undef plb
+#define plb(a_,b_) pl(a_,b_,AREG)
+#endif
+#undef plbd
+#define plbd(a_,b_) /* px(AREG) */pls(a_,b_,AREG)
+#undef plbs
+/* #define plbs(a_,b_) pls(a_,b_,AREG) */
+#undef dpr
+#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprs
+#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+#undef dprps
+#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+
+#undef plaa
+#define plaa(a_) pl(a_ ## 0,si,CREG)
+#undef wa
+#define wa(a_) w(a_)
+#undef dp
+#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_) /* px(CREG) */pls(a_ ## 0,si,CREG)
+#undef wa1_2
+#define wa1_2(a_) w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dprs(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprps(c_,d_,e_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_)
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#undef plaa1_4
+/* #define plaa1_4(a_) pls(a_ ## 0,si,CREG) */
+#undef wa1_4
+/* #define wa1_4(a_) w1_4(a_) */
+#undef dp1_4
+/* #define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) */
+#undef dpp1_4
+/* #define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) */
+#undef ddp1_4
+/* #define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_) */
+#undef ddpp1_4
+/* #define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) */
+
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#else
+
+#ifdef NO_TRANSPOSE
+
+#undef t0
+#define t0(a_) 1
+#undef s0
+#define s0(a_) a_
+#undef t8
+#define t8(a_) 2
+#undef s8
+#define s8(a_) a_
+#undef w
+#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef w1_2
+#define w1_2(a_) fp(a_ ## 0,si)
+#undef mpx
+#define mpx(a_) fl(0,si) fc(M(a_,2))
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#undef ulfa
+#define ulfa(a_) fc(0)
+
+#else
+
+#undef t0
+#define t0(a_) a_
+#undef s0
+#define s0(a_) 1
+#undef t8
+#define t8(a_) a_
+#undef s8
+#define s8(a_) 2
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef mpx
+#define mpx(a_) fz
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#endif
+#undef ulfa
+#define ulfa(a_) madd(0,si,a_) fp(0,si)
+
+#endif
+
+
+#ifndef GER
+
+#undef plaa1_2
+#define plaa1_2(a_) fl(a_ ## 0,si)
+#undef wa1_2
+#define wa1_2(a_) w1_2(a_)
+#ifdef NO_TRANSPOSE
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(M(s0(c_),1),0) fap(0,t0(c_))
+#undef dp1_2
+#define dp1_2(a_,b_,c_) ddp1_2(a_,b_,c_)
+#else
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fap(0,M(t0(c_),1))
+#undef dp1_2
+#define dp1_2(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fap(0,M(t0(c_),2))
+#endif
+
+#else
+
+#undef plaa1_2
+#define plaa1_2(a_) fl(a_ ## 0,si)
+#undef wa1_2
+#define wa1_2(a_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) fd(M(s0(c_),2)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) fm(M(s0(c_),2),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_)
+
+#endif
+
+
+
+#undef plaa
+#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) fx1
+
+#ifndef GER
+
+
+#undef wa
+#define wa(a_) w(a_)
+
+
+#undef ddp
+#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \
+ fm(P(s8(c_),1),0) fx1 fap(0,P(t0(c_),1)) \
+ fap(0,t8(c_))
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \
+ fm(P(s8(c_),1),0) pf(d_,e_) fx1 fap(0,P(t0(c_),1)) \
+ fap(0,t8(c_))
+
+/* #define ddp(a_,b_,c_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */
+/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) */
+/* #define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */
+/* \ */
+/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) pf(d_,e_) */
+
+#ifdef NO_TRANSPOSE
+
+#undef dp
+#define dp(a_,b_,c_) ddp(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_)
+
+#else
+
+#undef dp
+#define dp(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \
+ fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2))
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) pf(d_ ,e_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \
+ fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2))
+
+/* #define dp(a_,b_,c_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */
+/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) */
+/* #define dpp(a_,b_,c_,d_,e_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */
+/* \ */
+/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) pf(d_,e_) */
+
+#endif
+
+
+#else
+
+#undef wa
+#define wa(a_)
+#undef ddp
+#define ddp(a_,b_,c_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+ fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+ fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_)
+
+#undef dp
+#define dp(a_,b_,c_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+ fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+ fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_)
+
+#endif
+
+
+#undef R1
+#define R1 3
+#undef R2
+#define R2 4
+#undef R3
+#define R3 5
+#undef R4
+#define R4 6
+
+#endif
+
+#endif
+
+/******************************************************************************
+ * Double Precision Complex Macros
+ ******************************************************************************/
+
+#ifdef DCPLX
+
+#ifdef ATL_SSE2
+#ifdef NO_TRANSPOSE
+
+#if NDPM > 3
+#error Max NDPM is 3 for DCPLX NO_TRANSPOSE
+#endif
+
+#undef plax
+#define plax
+
+#undef R1
+#define R1 2
+#undef R2
+#define R2 4
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#undef TREG
+#define TREG 1
+#undef SREG
+#define SREG 0
+#undef CREG
+#define CREG 0
+
+#ifdef GER
+#undef AREG
+#define AREG 0
+#undef targ
+#define targ(a_) AREG
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+/* #define wbd(a_,b_) pud(AREG,a_,b_) */
+#undef w
+#define w(a_)
+#undef w1_2
+/* #define w1_2(a_) */
+#else
+#undef AREG
+#define AREG TREG
+#undef targ
+#define targ(a_) CREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+/* #define wbd(a_,b_) */
+#undef w
+#define w(a_) pu(CREG,a_ ## 0,si)
+#undef w1_2
+/* #define w1_2(a_) pud(CREG,a_ ## 0,si) */
+#endif
+
+#undef src
+#define src(a_) a_
+#undef mpx
+#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(8,si,P(a_,1)) \
+ ps(0,P(a_,1),P(a_,1)) sign(a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef ulfa
+#define ulfa(a_)
+
+#else
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#undef TREG
+#define TREG 3
+#undef SREG
+#define SREG 2
+#undef CREG
+#define CREG 0
+#undef targ
+#define targ(a_) a_
+#undef src
+#define src(a_) 0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef mpx
+#define mpx(a_) px(a_)
+#ifdef BETA0
+#undef ulfa
+#define ulfa(a_) /* phl(a_,0) pa(0,a_) */pu(a_,0,si)
+#else
+#undef ulfa
+#define ulfa(a_) pl(0,si,TREG) /* phl(a_,0) pa(0,a_) */ pa(TREG,a_) pu(a_,0,si)
+#endif
+#undef AREG
+#define AREG TREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+
+#undef plax
+#define plax pc(CREG,1) ps(0,CREG,CREG) ps(3,1,1) sign(CREG)
+
+
+
+#endif
+
+#if defined(Conj_) && ! defined(GER)
+#undef sign
+#define sign(a_) pm(SREG,a_)
+#else
+#undef sign
+#define sign(a_) pm(SREG,P(a_,1))
+#endif
+
+
+
+#undef plb
+#define plb(a_,b_) pl(a_,b_,AREG)
+#undef plbd
+/* #define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) */
+
+#undef dpr
+#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dpi
+#define dpi(a_) pm(P(src(a_),1),TREG) ps(1,TREG,TREG) pa(TREG,targ(a_))
+
+#ifndef GER
+
+#undef plaa
+#define plaa(a_) pl(a_ ## 0,si,CREG) plax
+#undef wa
+#define wa(a_) w(a_)
+#undef dp
+#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+/* #define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax */
+#undef wa1_2
+/* #define wa1_2(a_) w1_2(a_) */
+#undef dp1_2
+/* #define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) */
+#undef dpp1_2
+/* #define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) */
+#undef ddp1_2
+/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */
+#undef ddpp1_2
+/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */
+
+
+#else
+
+/* GER loader into TREG from offset a_0 off si.  The half-width lqc1_2 is
+ * intentionally left undefined for this precision, so any definition left
+ * over from an earlier inclusion is removed under its real name. */
+#undef lqc
+#define lqc(a_) pl(a_ ## 0,si,TREG)
+#undef lqc1_2
+/* #define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG) */
+
+
+#undef plaa
+#define plaa(a_)
+#undef wa
+#define wa(a_)
+#undef dp
+#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \
+ lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+ lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+/* #define plaa1_2(a_) */
+#undef wa1_2
+/* #define wa1_2(a_) */
+#undef dp1_2
+/* #define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ */
+/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */
+#undef dpp1_2
+/* #define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ */
+/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */
+#undef ddp1_2
+/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */
+#undef ddpp1_2
+/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */
+
+#endif
+
+#else
+
+#if NDPM > 2
+#error Max NDPM is 2 for DCPLX
+#endif
+
+#undef TREG
+#define TREG 2
+
+#ifdef NO_TRANSPOSE
+
+#undef w
+#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef plax
+#define plax fx1
+#undef srr
+#define srr(a_) a_
+#undef sri
+#define sri(a_) a_
+#undef sir
+#define sir(a_) a_
+#undef sii
+#define sii(a_) a_
+#undef trr
+#define trr(a_) P(TREG,1)
+#undef tri
+#define tri(a_) M(TREG,1)
+#undef tir
+#define tir(a_) TREG
+#undef tii
+#define tii(a_) TREG
+#undef mpx
+#define mpx(a_) fl(0,si) fl(8,si) fc(M(a_,2)) fc(M(a_,2))
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#undef ulfa
+#define ulfa(a_) fc(0) fc(0)
+
+#else
+
+#undef srr
+#define srr(a_) P(TREG,1)
+#undef sri
+#define sri(a_) M(TREG,1)
+#undef sir
+#define sir(a_) TREG
+#undef sii
+#define sii(a_) TREG
+#undef trr
+#define trr(a_) a_
+#undef tri
+#define tri(a_) a_
+#undef tir
+#define tir(a_) a_
+#undef tii
+#define tii(a_) a_
+#undef w
+#define w(a_)
+#undef plax
+#define plax
+#undef mpx
+#define mpx(a_) fz fz
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#endif
+#undef ulfa
+#define ulfa(a_) madd(0,si,a_) fp(0,si) madd(8,si,a_) fp(8,si)
+
+#endif
+
+
+
+#ifdef Conj_
+#undef fapi
+#define fapi(a_,b_) fsp(b_)
+#undef fspi
+#define fspi(a_,b_) fap(a_,b_)
+#else
+#undef fapi
+#define fapi(a_,b_) fap(a_,b_)
+#undef fspi
+#define fspi(a_,b_) fsp(b_)
+#endif
+
+#ifndef GER
+
+
+#undef plaa
+#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax
+#undef wa
+#define wa(a_) w(a_)
+#undef ddp
+#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+ fm(sri(c_),0) fap(0,tri(c_))\
+ fl(a_ ## 8,b_) fd(0) fm(sir(c_),0) fspi(0,tir(c_)) \
+ fm(sii(c_),0) fapi(0,tii(c_))
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+ fm(sri(c_),0) fap(0,tri(c_))\
+ fl(a_ ## 8,b_) fd(0) pf(d_,e_) fm(sir(c_),0) fspi(0,tir(c_))\
+ fm(sii(c_),0) fapi(0,tii(c_))
+
+
+
+#ifdef NO_TRANSPOSE
+
+
+
+#undef dp
+#define dp(a_,b_,c_) ddp(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_)
+
+
+
+#else
+
+#undef dp
+#define dp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+ fm(sri(c_),0) fap(0,tri(c_))\
+ fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \
+ fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))
+
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+ pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\
+ fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \
+ fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))
+
+
+#endif
+
+#else
+
+#undef plaa
+#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax
+#undef wa
+#define wa(a_)
+
+#undef ddprr
+#define ddprr(a_,b_,c_) fl(a_ ## 0,b_) \
+ fd(tri(c_)) fm(P(sri(c_),1),0) fap(0,1) \
+ fd(M(trr(c_),1)) fm(srr(c_),0) fspi(0,1) \
+ fp(a_ ## 0,b_)
+#undef ddpri
+#define ddpri(a_,b_,c_) fl(a_ ## 8,b_) \
+ fd(tii(c_)) fm(P(sii(c_),1),0) fap(0,1) \
+ fd(M(tir(c_),1)) fm(sir(c_),0) fapi(0,1) \
+ fp(a_ ## 8,b_)
+#undef dpri
+#define dpri(a_,b_,c_) fl(a_ ## 8,b_) \
+ fx(2) fm(sir(c_),0) fap(0,2) \
+ fm(M(sii(c_),2),0) fapi(0,1) \
+ fp(a_ ## 8,b_)
+
+
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_)
+#undef ddp
+#define ddp(a_,b_,c_) ddprr(a_,b_,c_) ddpri(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) dpri(a_,b_,c_)
+#undef dp
+#define dp(a_,b_,c_) ddprr(a_,b_,c_) dpri(a_,b_,c_)
+
+#endif
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 6
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#endif
+
+#endif
+
+
+/******************************************************************************
+ * General Macros
+ ******************************************************************************/
+
+
+
+
+#undef bla1
+#define bla1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_)
+#undef blb1
+#define blb1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_)
+
+/* bla2: two-column block step (R1 via ax, R2 via bx) with explicit pf(b_,si) and pf(b_,ax) */
+#undef bla2
+#define bla2(a_,b_) pf(b_,si) plaa(a_) ddp(a_,ax,R1) pf(b_,ax) dp(a_,bx,R2) wa(a_)
+/* blb2: same two-column step, with the prefetch of (b_,bx) issued inside ddpp */
+#undef blb2
+#define blb2(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) dp(a_,bx,R2) wa(a_)
+
+#undef bla3
+#define bla3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \
+ dpp(a_,cx,R3,b_,ax) wa(a_)
+#undef blb3
+#define blb3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \
+ dpp(a_,cx,R3,b_,cx) wa(a_)
+
+#undef bla4
+#define bla4(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \
+ ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_)
+#undef blb4
+#define blb4(a_,b_) plaa(a_) ddp(a_,ax,R1) ddpp(a_,bx,R2,b_,cx) \
+ ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_)
+
+#undef bla
+#define bla(a_,b_) Mjoin(bla,NDP)(a_,b_)
+#undef blb
+#define blb(a_,b_) Mjoin(blb,NDP)(a_,b_)
+
+
+
+#undef bla11_2
+#define bla11_2(a_) plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_)
+#undef bla21_2
+#define bla21_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_)
+#undef bla31_2
+#define bla31_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \
+ dp1_2(a_,cx,R3) wa1_2(a_)
+#undef bla41_2
+#define bla41_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \
+ ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_)
+
+#undef bla1_2
+#define bla1_2(a_) Mjoin(Mjoin(bla,NDP),1_2)(a_)
+
+
+
+#undef bla11_4
+#define bla11_4(a_) plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_)
+#undef bla21_4
+#define bla21_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2) wa1_4(a_)
+#undef bla31_4
+#define bla31_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \
+ dp1_4(a_,cx,R3) wa1_4(a_)
+#undef bla41_4
+#define bla41_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \
+ ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_)
+
+#undef bla1_4
+#define bla1_4(a_) Mjoin(Mjoin(bla,NDP),1_4)(a_)
+
+
+
+/*
+ * incK(a_): apply a(a_,reg) to si and to the first K of ax, bx, cx, dx;
+ * inc(a_) picks K = NDP.  `a` comes from camm_util.h -- presumably it adds
+ * the immediate a_ to the register (TODO confirm), which would advance the
+ * movm pointer and the K column pointers together.
+ */
+#undef inc1
+#define inc1(a_) a(a_,si) a(a_,ax)
+#undef inc2
+#define inc2(a_) inc1(a_) a(a_,bx)
+#undef inc3
+#define inc3(a_) inc2(a_) a(a_,cx)
+#undef inc4
+#define inc4(a_) inc3(a_) a(a_,dx)
+
+#undef inc
+#define inc(a_) Mjoin(inc,NDP)(a_)
+
+
+/*
+ * PF1..PF8: the eight prefetch offsets handed to bla/blb by bl4..bl64.
+ * They start at PREFETCH when it is defined (64 otherwise) and go up in
+ * steps of 32.  With PREFETCH defined the sums are left as unevaluated
+ * "(a) + (b)" token sequences by S().
+ * NOTE(review): S() is deliberately not wrapped in outer parentheses here;
+ * the expansion is presumably pasted into assembler text, so check how
+ * pf/f stringify it before changing its shape.
+ */
+#ifdef PREFETCH
+/* #include "camm_arith.h" */
+#undef S
+#define S(a_,b_) (a_) + (b_)
+#undef PF1
+#define PF1 PREFETCH
+#undef PF2
+#define PF2 S(PF1,32)
+#undef PF3
+#define PF3 S(PF1,64)
+#undef PF4
+#define PF4 S(PF1,96)
+#undef PF5
+#define PF5 S(PF1,128)
+#undef PF6
+#define PF6 S(PF1,160)
+#undef PF7
+#define PF7 S(PF1,192)
+#undef PF8
+#define PF8 S(PF1,224)
+#else
+#undef PF1
+#define PF1 64
+#undef PF2
+#define PF2 96
+#undef PF3
+#define PF3 128
+#undef PF4
+#define PF4 160
+#undef PF5
+#define PF5 192
+#undef PF6
+#define PF6 224
+#undef PF7
+#define PF7 256
+#undef PF8
+#define PF8 288
+#endif
+
+
+/*
+ * pf(a_,b_): prefetch helper used throughout the block macros.  It selects
+ * the t0 flavour of f() for NO_TRANSPOSE builds that are neither SREAL nor
+ * GER, and the nta flavour otherwise.  (f is defined in camm_util.h;
+ * presumably t0/nta name the x86 prefetch hint -- TODO confirm.)
+ */
+#if defined(NO_TRANSPOSE) && !defined(SREAL) && !defined(GER)
+#undef pf
+#define pf(a_,b_) f(t0,a_,b_)
+#else
+#undef pf
+#define pf(a_,b_) f(nta,a_,b_)
+#endif
+
+/*
+ * blN: one unrolled step covering N float-sized units, followed by
+ * inc(4*N) on the pointers.  bl1 and bl2 use the quarter- and half-width
+ * block macros; bl4 and up chain bla/blb calls.  The first argument
+ * (0x0..0xf) is token-pasted with 0 or 8 inside the block macros, giving
+ * offsets 0x00, 0x10, ... in 16-byte steps.  The second argument is the
+ * prefetch offset, shared by each bla/blb pair.
+ */
+#undef bl1
+#define bl1 bla1_4(0x0) inc(4)
+#undef bl2
+#define bl2 bla1_2(0x0) inc(8)
+#undef bl4
+#define bl4 bla(0x0,PF1) inc(16)
+#undef bl8
+#define bl8 bla(0x0,PF1) blb(0x1,PF1) inc(32)
+#undef bl16
+#define bl16 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64)
+#undef bl32
+#define bl32 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \
+ bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128)
+#undef bl64
+#define bl64 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \
+ bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \
+ bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \
+ bla(0xc,PF7) blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256)
+
+/* #define in2 inc(8) */
+/* #define in4 inc(16) */
+/* #define in8 inc(32) */
+/* #define in16 inc(64) */
+
+#undef in2
+#define in2
+#undef in4
+#define in4
+#undef in8
+#define in8
+#undef in16
+#define in16
+
+#ifdef NO_TRANSPOSE
+#undef incf
+#define incf ra(di,si)
+#else
+#undef incf
+#define incf
+#endif
+
+#undef lf1
+#define lf1 mpx(R1)
+#undef lf2
+#define lf2 lf1 incf mpx(R2)
+#undef lf3
+#define lf3 lf2 incf mpx(R3)
+#undef lf4
+#define lf4 lf3 incf mpx(R4)
+
+#undef lf
+#define lf Mjoin(lf,NDP)
+
+
+#undef ulf1
+#define ulf1 ulfa(R1)
+#undef ulf2
+#define ulf2 ulf1 ra(di,si) ulfa(R2)
+#undef ulf3
+#define ulf3 ulf2 ra(di,si) ulfa(R3)
+#undef ulf4
+#define ulf4 ulf3 ra(di,si) ulfa(R4)
+
+#undef ulf
+#define ulf Mjoin(ulf,NDP)
+
+/*
+ * lpba(a_): emit "movl %esi,%e<a_>", i.e. copy esi into the named register.
+ * lpbK: record K successive values of esi in eax, ebx, ecx, edx, applying
+ * ra(di,si) between them; lpb picks K = NDP.  In the kernel esi/edi hold
+ * `a` and the byte stride between its columns at that point, so these
+ * become the column pointers (ra is from camm_util.h -- presumably
+ * esi += edi; TODO confirm).
+ */
+#undef lpba
+#define lpba(a_) "movl %%esi,%%e" #a_ "\n\t"
+
+#undef lpb1
+#define lpb1 lpba(ax)
+#undef lpb2
+#define lpb2 lpb1 ra(di,si) lpba(bx)
+#undef lpb3
+#define lpb3 lpb2 ra(di,si) lpba(cx)
+#undef lpb4
+#define lpb4 lpb3 ra(di,si) lpba(dx)
+
+#undef lpb
+#define lpb Mjoin(lpb,NDP)
+
+#undef ipf1
+#define ipf1(a_) pf(a_,si) pf(a_,ax)
+#undef ipf2
+#define ipf2(a_) ipf1(a_) pf(a_,bx)
+#undef ipf3
+#define ipf3(a_) ipf2(a_) pf(a_,cx)
+#undef ipf4
+#define ipf4(a_) ipf3(a_) pf(a_,dx)
+
+#undef ipf
+#define ipf(a_) Mjoin(ipf,NDP)(a_)
+
+/*
+ * UNROLL is the main-loop unroll factor in float-sized units.  A
+ * user-supplied LUNROLL is scaled by 1 (SREAL), 2 (DREAL, SCPLX) or 4
+ * (DCPLX); without LUNROLL the default is 16.  If LUNROLL is defined but
+ * no precision macro is, UNROLL stays undefined and the #error below fires.
+ */
+#ifdef LUNROLL
+#undef UNROLL
+#ifdef SREAL
+#undef UNROLL
+#define UNROLL LUNROLL
+#elif defined(DREAL) || defined(SCPLX)
+#undef UNROLL
+#define UNROLL LUNROLL*2
+#elif defined(DCPLX)
+#undef UNROLL
+#define UNROLL LUNROLL*4
+#endif
+#else
+#undef UNROLL
+#define UNROLL 16
+#endif
+
+/*
+ * blUNROLL is the block macro matching UNROLL.  UNROLL1_2 is the label
+ * stem the main loop branches to when fewer than UNROLL units remain:
+ * half of UNROLL, or `stop` when UNROLL is 1.  Any UNROLL outside
+ * {1,2,4,...,64} leaves UNROLL1_2 undefined and is rejected.
+ */
+#undef UNROLL1_2
+#if UNROLL == 64
+#undef blUNROLL
+#define blUNROLL bl64
+#undef UNROLL1_2
+#define UNROLL1_2 32
+#elif UNROLL == 32
+#undef blUNROLL
+#define blUNROLL bl32
+#undef UNROLL1_2
+#define UNROLL1_2 16
+#elif UNROLL == 16
+#undef blUNROLL
+#define blUNROLL bl16
+#undef UNROLL1_2
+#define UNROLL1_2 8
+#elif UNROLL == 8
+#undef blUNROLL
+#define blUNROLL bl8
+#undef UNROLL1_2
+#define UNROLL1_2 4
+#elif UNROLL == 4
+#undef blUNROLL
+#define blUNROLL bl4
+#undef UNROLL1_2
+#define UNROLL1_2 2
+#elif UNROLL == 2
+#undef blUNROLL
+#define blUNROLL bl2
+#undef UNROLL1_2
+#define UNROLL1_2 1
+#elif UNROLL == 1
+#undef blUNROLL
+#define blUNROLL bl1
+#undef UNROLL1_2
+#define UNROLL1_2 stop
+#endif
+#ifndef UNROLL1_2
+#error UNROLL must be set to power of 2 < 128
+#endif
+
+
+#ifdef GER
+#undef aconst
+#define aconst
+#undef cconst
+#define cconst const
+#else
+#undef aconst
+#define aconst const
+#undef cconst
+#define cconst
+#endif
+
+#undef MY_FUNCTION
+#define MY_FUNCTION Mjoin(dp,EXT)
+
+/*
+ * MY_FUNCTION (dp<EXT>): hand-written 32-bit x86 kernel combining NDP
+ * columns of `a` with the vectors b and c.
+ *
+ *   a, lda    matrix and the element distance between its columns;
+ *             writable only in GER builds (aconst is empty there)
+ *   b         always read-only
+ *   c         read-only in GER builds, writable otherwise
+ *   stride    element distance between consecutive entries of fixm
+ *   len       number of elements of movm to process
+ *
+ * movm is the vector the loop walks (c for NO_TRANSPOSE, b otherwise);
+ * fixm is the other one, touched only by lf before the loop
+ * (NO_TRANSPOSE) or by ulf after it (transpose).
+ *
+ * Asm operands: %0 movm, %1 fixm, %2 u1, %3 a, %4 u2, %5 u3, %6 w
+ * (%6 only for SCPLX and for DCPLX with ATL_SSE2).
+ *
+ * The asm reads and writes the arrays through these pointers, none of
+ * which is an operand the compiler can see through, so "memory" is
+ * listed as a clobber.
+ */
+static void
+MY_FUNCTION(aconst TYPE *a,int lda,
+ const TYPE *b,
+ cconst TYPE *c,int stride,int len) {
+
+ /* w points at the sign pattern loaded into SREG for the complex kernels */
+#ifdef SCPLX
+#if defined(GER) && defined(Conj_)
+ const TYPE w1[2]={{-1.0,1.0},{-1.0,1.0}},*w=w1;
+#else
+ const TYPE w1[2]={{1.0,-1.0},{1.0,-1.0}},*w=w1;
+#endif
+#endif
+
+#if defined(DCPLX) && defined(ATL_SSE2)
+#if defined(GER) && defined(Conj_)
+ const TYPE w1[1]={{-1.0,1.0}},*w=w1;
+#else
+ const TYPE w1[1]={{1.0,-1.0}},*w=w1;
+#endif
+#endif
+
+#ifdef NO_TRANSPOSE
+#undef movm
+#define movm c
+#undef fixm
+#define fixm b
+#else
+#undef movm
+#define movm b
+#undef fixm
+#define fixm c
+#endif
+ NO_INLINE
+ /* u1, u2: byte strides of fixm and of the columns of a; u3: len in float-sized units */
+ unsigned u1=stride*sizeof(*fixm),u2=lda*sizeof(*a),u3=len*sizeof(*movm)/sizeof(float);
+
+ ASM (
+
+ /* ebx is saved by hand; a(4,sp) presumably puts esp back so the
+    "m" operands keep their offsets (a is from camm_util.h) */
+ "pushl %%ebx\n\t"
+ a(4,sp)
+
+#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))
+ "movl %6,%%esi\n\t"
+ pl(0,si,SREG)
+#endif
+
+#ifdef NO_TRANSPOSE
+ "movl %1,%%esi\n\t" /* fixm */
+ "movl %2,%%edi\n\t" /* fixm2fixm */
+#endif
+
+ lf
+
+ "movl %3,%%esi\n\t" /* a */
+ "movl %4,%%edi\n\t" /* a2a */
+
+ lpb
+
+ ipf(0)
+
+ "movl %0,%%esi\n\t" /* movm */
+ "movl %5,%%edi\n\t" /* len */
+
+#if defined(ALIGN)
+
+ /* ALIGN prologue: peel 1 and/or 2 units depending on bits 4 and 8 of ax */
+#if defined(SREAL)
+
+ test(4,ax)
+ je(Mjoin(a1,EXT))
+ test(-1,di)
+ je(Mjoin(a1,EXT))
+ sub(1,di)
+ bl1
+
+ lab(Mjoin(a1,EXT))
+
+#endif
+
+#if defined(DREAL) || defined(SREAL)
+
+ test(8,ax)
+ je(Mjoin(as,EXT))
+ test(-2,di)
+ je(Mjoin(as,EXT))
+ sub(2,di)
+ bl2
+
+ lab(Mjoin(as,EXT))
+
+#endif
+
+#endif
+
+
+ ipf(32)
+
+ /* main loop: UNROLL units per pass while at least UNROLL remain */
+ lab(Mjoin(loop,EXT))
+
+ test(-UNROLL,di)
+ je(Mjoin(UNROLL1_2,EXT))
+ sub(UNROLL,di)
+
+ blUNROLL
+
+ jmp(Mjoin(loop,EXT))
+
+ /* remainder: one optional block per power of two below UNROLL */
+#if UNROLL > 32
+ lab(Mjoin(32,EXT))
+ test(32,di)
+ je(Mjoin(16,EXT))
+ bl32
+#endif
+
+#if UNROLL > 16
+ lab(Mjoin(16,EXT))
+ test(16,di)
+ je(Mjoin(8,EXT))
+ bl16
+#endif
+
+#if UNROLL > 8
+ lab(Mjoin(8,EXT))
+ test(8,di)
+ je(Mjoin(4,EXT))
+ bl8
+#endif
+
+#if UNROLL > 4
+ lab(Mjoin(4,EXT))
+ test(4,di)
+ je(Mjoin(2,EXT))
+ bl4
+#endif
+
+#if UNROLL > 2
+ lab(Mjoin(2,EXT))
+#ifndef DCPLX
+ test(2,di)
+ je(Mjoin(1,EXT))
+ bl2
+#endif
+#endif
+
+#if UNROLL > 1
+ lab(Mjoin(1,EXT))
+#ifdef SREAL
+ test(1,di)
+ je(Mjoin(stop,EXT))
+ bl1
+#endif
+#endif
+
+ lab(Mjoin(stop,EXT))
+
+#ifndef NO_TRANSPOSE
+ "movl %1,%%esi\n\t" /* fixm */
+ "movl %2,%%edi\n\t" /* fixm2fixm */
+#endif
+
+ ulf
+
+ a(-4,sp)
+ "popl %%ebx\n\t"
+
+
+ ::"m" (movm),"m" (fixm),"m" (u1),"m" (a),"m" (u2),"m" (u3)
+
+#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))
+ ,"m" (w)
+#endif
+ :"ax","bx","cx","dx","si","di","memory");
+
+
+}
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h
new file mode 100644
index 0000000..7fd1404
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h
@@ -0,0 +1,295 @@
+#include "camm_util.h"
+
+#ifndef N
+#error N must be defined in camm_pipe3.h
+#endif
+#ifndef KB
+#error KB must be defined in camm_pipe3.h
+#endif
+
+#undef p1 /* generic step names are rebound to the kernel selected by N (assumes Mjoin pastes its arguments) */
+#define p1(a_) Mjoin(p1_4_,N)(a_)
+#undef p2 /* p2 -> p1_2_<N> */
+#define p2(a_) Mjoin(p1_2_,N)(a_)
+#undef p4 /* p4 -> p1_<N> */
+#define p4(a_) Mjoin(p1_,N)(a_)
+#undef load_pipe /* load_pipe -> lp<N> */
+#define load_pipe(a_) Mjoin(lp,N)(a_)
+#undef drain_pipe /* drain_pipe -> dp<N> */
+#define drain_pipe(a_) Mjoin(dp,N)(a_)
+#undef pipe_len /* pipe_len -> pl<N>, the constant compared against KB and the stage sizes below */
+#define pipe_len Mjoin(pl,N)
+
+#undef p8
+#if pipe_len > 4
+#define p8(a_) Mjoin(p2_,N)(a_)
+#else
+#define p8(a_) p4(a_) p4(SS(a_,16))
+#endif
+
+#undef p16
+#if pipe_len > 8
+#define p16(a_) Mjoin(p4_,N)(a_)
+#else
+#define p16(a_) p8(a_) p8(SS(a_,32))
+#endif
+
+#undef p32
+#if pipe_len > 16
+#define p32(a_) Mjoin(p8_,N)(a_)
+#else
+#define p32(a_) p16(a_) p16(SS(a_,64))
+#endif
+
+#undef p64
+#if pipe_len > 32
+#define p64(a_) Mjoin(p16_,N)(a_)
+#else
+#define p64(a_) p32(a_) p32(SS(a_,128))
+#endif
+
+#undef p128
+#if pipe_len > 64
+#define p128(a_) Mjoin(p32_,N)(a_)
+#else
+#define p128(a_) p64(a_) p64(SS(a_,256))
+#endif
+
+#undef p256
+#if pipe_len > 128
+#define p256(a_) Mjoin(p64_,N)(a_)
+#else
+#define p256(a_) p128(a_) p128(SS(a_,512))
+#endif
+
+#if KB < pipe_len /* KB is shorter than the pipeline: turn the load/drain stages into no-ops */
+#undef pipe_len
+#define pipe_len 0 /* with 0 here none of the later "pipe_len == <size>" drain branches can match */
+#undef load_pipe
+#define load_pipe(a_)
+#undef drain_pipe
+#define drain_pipe(a_)
+#endif
+
+
+#undef MKB
+/* #ifdef SREAL */
+#define MKB KB
+/* #elif defined (DCPLX) */
+/* #define MKB ( KB * 4 ) */
+/* #else */
+/* #define MKB ( KB * 2 ) */
+/* #endif */
+
+#if MKB >= 512
+#error MKB must be less than 512
+#endif
+
+#undef x0
+#undef o0
+#define x0 load_pipe(0)
+#define o0 0
+
+#undef MKBB
+#define MKBB ( MKB - pipe_len )
+
+#undef xx1
+#undef oo1
+#if MKBB >= 256
+#define xx1 x0 p256(o0)
+#define oo1 SS(1024,o0)
+#else
+#define xx1 x0
+#define oo1 o0
+#endif
+
+#undef xx1a
+#undef oo1a
+#if pipe_len == 256
+#define xx1a xx1 drain_pipe(oo1)
+#define oo1a SS(1024,oo1)
+#undef MKBB
+#define MKBB MKB
+#else
+#define xx1a xx1
+#define oo1a oo1
+#endif
+
+#undef x1
+#undef o1
+#if ( MKBB / 128 ) % 2
+#define x1 xx1a p128(oo1a)
+#define o1 SS(512,oo1a)
+#else
+#define x1 xx1a
+#define o1 oo1a
+#endif
+
+#undef x1a
+#undef o1a
+#if pipe_len == 128
+#define x1a x1 drain_pipe(o1)
+#define o1a SS(512,o1)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x1a x1
+#define o1a o1
+#endif
+
+#undef x2
+#undef o2
+#if ( MKBB / 64 ) % 2
+#define x2 x1a p64(o1a)
+#define o2 SS(256,o1a)
+#else
+#define x2 x1a
+#define o2 o1a
+#endif
+
+#undef x2a
+#undef o2a
+#if pipe_len == 64
+#define x2a x2 drain_pipe(o2)
+#define o2a SS(256,o2)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x2a x2
+#define o2a o2
+#endif
+
+#undef x3
+#undef o3
+#if ( MKBB / 32 ) % 2
+#define x3 x2a p32(o2a)
+#define o3 SS(128,o2a)
+#else
+#define x3 x2a
+#define o3 o2a
+#endif
+
+#undef x3a
+#undef o3a
+#if pipe_len == 32
+#define x3a x3 drain_pipe(o3)
+#define o3a SS(128,o3)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x3a x3
+#define o3a o3
+#endif
+
+#undef x4
+#undef o4
+#if ( MKBB / 16 ) % 2
+#define x4 x3a p16(o3a)
+#define o4 SS(64,o3a)
+#else
+#define x4 x3a
+#define o4 o3a
+#endif
+
+#undef x4a
+#undef o4a
+#if pipe_len == 16
+#define x4a x4 drain_pipe(o4)
+#define o4a SS(64,o4)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x4a x4
+#define o4a o4
+#endif
+
+#undef x5
+#undef o5
+#if ( MKBB / 8 ) % 2
+#define x5 x4a p8(o4a)
+#define o5 SS(32,o4a)
+#else
+#define x5 x4a
+#define o5 o4a
+#endif
+
+#undef x5a
+#undef o5a
+#if pipe_len == 8
+#define x5a x5 drain_pipe(o5)
+#define o5a SS(32,o5)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x5a x5
+#define o5a o5
+#endif
+
+#undef x6
+#undef o6
+#if ( MKBB / 4 ) % 2
+#define x6 x5a p4(o5a)
+#define o6 SS(16,o5a)
+#else
+#define x6 x5a
+#define o6 o5a
+#endif
+
+#undef x6a
+#undef o6a
+#if pipe_len == 4
+#define x6a x6 drain_pipe(o6)
+#define o6a SS(16,o6)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x6a x6
+#define o6a o6
+#endif
+
+#undef x7 /* ladder stage for bit 1 of the remaining count: appends p2 at offset o6a when that bit is set */
+#undef o7 /* running byte offset after this stage (advanced by 8 when p2 was emitted) */
+#if ( MKBB / 2 ) % 2 /* was MKB; MKBB (as in stages x1-x6) excludes pipe_len until the drain stage resets it, so pipe_len of 1 or 2 is counted correctly */
+#define x7 x6a p2(o6a)
+#define o7 SS(8,o6a)
+#else
+#define x7 x6a
+#define o7 o6a
+#endif
+
+#undef x7a
+#undef o7a
+#if pipe_len == 2
+#define x7a x7 drain_pipe(o7)
+#define o7a SS(8,o7)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x7a x7
+#define o7a o7
+#endif
+
+#undef x8 /* ladder stage for bit 0 of the remaining count: appends p1 at offset o7a when that bit is set */
+#undef o8 /* running byte offset after this stage (advanced by 4 when p1 was emitted) */
+#if ( MKBB / 1 ) % 2 /* was MKB; MKBB (as in stages x1-x6) still excludes pipe_len here when pipe_len == 1, whose drain follows in x8a */
+#define x8 x7a p1(o7a)
+#define o8 SS(4,o7a)
+#else
+#define x8 x7a
+#define o8 o7a
+#endif
+
+#undef x8a
+#undef o8a
+#if pipe_len == 1
+#define x8a x8 drain_pipe(o8)
+#define o8a SS(4,o8)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x8a x8
+#define o8a o8
+#endif
+
+#undef KB_block
+#define KB_block x8a
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h
new file mode 100644
index 0000000..35e9e59
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h
@@ -0,0 +1,215 @@
+#ifndef CAMM_SCALE_H
+#define CAMM_SCALE_H /*+ To stop multiple inclusions. +*/
+
+#include "camm_util.h"
+
+#undef spf
+#define spf(a_,b_) f(t0,a_,b_)
+
+#ifdef SCPLX
+#ifdef BETAX
+#undef SSREG
+#define SSREG 2
+#undef lbx
+#define lbx pls(4,ax,1) ps(0,1,1) pm(SSREG,1)
+#undef cxx
+#define cxx pm(1,3) ps(177,3,3) pa(3,2)
+#undef pcx
+#define pcx pc(2,3)
+#else
+#undef lbx
+#define lbx
+#undef cxx
+#define cxx
+#undef pcx
+#define pcx
+#endif
+#undef lb
+#define lb pls(0,ax,0) ps(0,0,0) lbx
+#undef c
+#define c(a_) pl(a_ ## 0,si,2) pcx pm(0,2) cxx pu(2,a_ ## 0,si)
+#undef cp
+#define cp(a_,b_) pl(a_ ## 0,si,2) pcx pm(0,2) spf(b_,si) cxx pu(2,a_ ## 0,si)
+#undef c1_2
+#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pcx pm(0,2) cxx pud(2,a_ ## 0,si)
+#undef ub
+#define ub
+#endif
+
+#ifdef SREAL
+#undef lb
+#define lb pls(0,ax,0) ps(0,0,0)
+#undef c
+#define c(a_) pl(a_ ## 0,si,2) pm(0,2) pu(2,a_ ## 0,si)
+#undef cp
+#define cp(a_,b_) pl(a_ ## 0,si,2) spf(b_,si) pm(0,2) pu(2,a_ ## 0,si)
+#undef c1_2
+#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pm(0,2) pud(2,a_ ## 0,si)
+#undef c1_4
+#define c1_4(a_) pls(a_ ## 0,si,2) pm(0,2) pus(2,a_ ## 0,si)
+#undef ub
+#define ub
+#endif
+
+#ifdef DREAL
+#undef lb
+#define lb fl(0,ax)
+#undef c
+#define c(a_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) fm(2,0) fx1 \
+ fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef cp
+#define cp(a_,b_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) spf(b_,si) fm(2,0) fx1 \
+ fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef c1_2
+#define c1_2(a_) fl(a_ ## 0,si) fm(1,0) fp(a_ ## 0,si)
+#undef ub
+#define ub fc(0)
+#endif
+
+#ifdef DCPLX
+#undef lb
+#define lb fl(0,ax) fl(8,ax)
+#undef c
+#define c(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \
+ fm(2,0) fx(3) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) fsp(2) fx1 \
+ fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef cp
+#define cp(a_,b_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \
+ fm(2,0) fx(3) spf(b_,si) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) \
+ fsp(2) fx1 fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef ub
+#define ub fc(0) fc(0)
+#endif
+
+#undef sbl1 /* sblK: scale-block body for K 4-byte units, built from the per-precision c/cp/c1_2/c1_4 macros */
+#define sbl1 c1_4(0x0)
+#undef sbl2
+#define sbl2 c1_2(0x0)
+#undef sbl4 /* cp's second argument (0x40) is forwarded to spf, i.e. f(t0,...) - presumably a prefetch offset; confirm in camm_util.h */
+#define sbl4 cp(0x0,0x40)
+#undef sbl8
+#define sbl8 sbl4 c(0x1)
+#undef sbl16 /* the first argument is pasted in front of a 0 or 8 digit (a_ ## 0), so 0x1/0x2/0x3 yield offsets 0x10/0x20/0x30 */
+#define sbl16 sbl8 cp(0x2,0x60) c(0x3)
+
+#undef sinc16 /* sincK: a(<4*K>,si) - presumably adds 4*K bytes to si to step past the K units just processed */
+#define sinc16 a(0x40,si)
+#undef sinc8
+#define sinc8 a(0x20,si)
+#undef sinc4
+#define sinc4 a(0x10,si)
+#undef sinc2
+#define sinc2 a(0x8,si)
+#undef sinc1
+#define sinc1 a(0x4,si)
+
+#undef SCALE
+#define SCALE Mjoin(Mjoin(PREC,Mjoin(scale,BLC)),FEXT)
+
+#undef MY_FUNCTION
+#define MY_FUNCTION SCALE
+
+static void
+MY_FUNCTION(const TYPE *b,TYPE *c,int len) {
+ /* In-place scale kernel: esi walks c, eax points at b, edi counts the remaining length; each sblK block rewrites K units at (si). */
+ const TYPE *ce=c+len;
+#if defined(BETAX) && defined(SCPLX)
+ const TYPE z1[2]={{1.0,-1.0},{1.0,-1.0}},*z=z1;
+#endif
+ NO_INLINE
+ /* len is doubled for every precision except SREAL and doubled again for DCPLX; sinc1 steps 4 bytes per unit, so presumably this converts elements to 4-byte units. */
+#ifndef SREAL
+ len+=len;
+#endif
+#ifdef DCPLX
+ len+=len;
+#endif
+
+ /* ebx is saved by hand; a(4,sp)/a(-4,sp) bracket the body, presumably so "m" operands addressed off esp stay valid after the push - confirm. */
+ ASM(
+
+ "pushl %%ebx\n\t"
+ a(4,sp)
+
+
+ "movl %0,%%esi\n\t"
+
+ spf(0x00,si)
+ spf(0x20,si)
+
+ "movl %1,%%eax\n\t"
+ "movl %2,%%edi\n\t"
+
+#if defined(BETAX) && defined(SCPLX)
+ "movl %3,%%ebx\n\t"
+ pl(0,bx,SSREG)
+#endif
+
+ lb
+
+ lab(loop)
+
+ test(-16,di)
+ je(8)
+ sub(16,di)
+ align
+
+ sbl16
+ sinc16
+
+ jmp(loop)
+ align
+
+ lab(8)
+
+ test(8,di)
+ je(4)
+
+ sbl8
+ sinc8
+
+ lab(4)
+
+ test(4,di)
+ je(2)
+
+ sbl4
+ sinc4
+
+ lab(2)
+
+#ifndef DCPLX
+ test(2,di)
+ je(1)
+
+ sbl2
+ sinc2
+
+ lab(1)
+
+#ifdef SREAL
+ test(1,di)
+ je(stop)
+
+ sbl1
+ sinc1
+
+ lab(stop)
+#endif
+#endif
+
+ ub
+
+ a(-4,sp)
+ "popl %%ebx\n\t"
+
+ /* "memory" is listed because the asm reads and writes the c buffer through esi, which is not expressed by any operand. */
+ ::"m" (c),"m" (b), "m" (len)
+#if defined(BETAX) && defined(SCPLX)
+ ,"m" (z)
+#endif
+ : "si","ax","di","memory");
+
+
+}
+#endif /* CAMM_SCALE_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h
new file mode 100644
index 0000000..4a92006
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h
@@ -0,0 +1,2982 @@
+#include "camm_util.h"
+
+#undef p1_4_swap_1
+#define p1_4_swap_1(a_) \
+ pls(a_,ax,1) \
+ pls(a_,cx,0) \
+ pus(0,a_,ax) \
+ pus(1,a_,cx)
+#undef p1_2_swap_1
+#define p1_2_swap_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ px(0) \
+ pld(a_,cx,0) \
+ pud(0,a_,ax) \
+ pud(1,a_,cx)
+#undef p1_swap_1
+#define p1_swap_1(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,0) \
+ puq(0,a_,ax) \
+ pu(1,a_,cx)
+#undef p2_swap_1
+#define p2_swap_1(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ pl(SS(a_,RS4),cx,2) \
+ puq(0,a_,ax) \
+ pu(1,a_,cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,0) \
+ puq(2,SS(a_,RS4),ax) \
+ pu(3,SS(a_,RS4),cx)
+#undef lpswap_1
+#define lpswap_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0)
+#undef dpswap_1
+#define dpswap_1(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ pl(SS(a_,RS4),cx,2) \
+ puq(0,a_,ax) \
+ pu(1,a_,cx) \
+ puq(2,SS(a_,RS4),ax) \
+ pu(3,SS(a_,RS4),cx)
+#undef plswap_1
+#define plswap_1 8
+
+
+#undef p1_4_scal_3
+#define p1_4_scal_3(a_) \
+ pls(a_,ax,0) \
+ pmsr(6,0) \
+ pus(0,a_,ax)
+#undef p1_2_scal_3
+#define p1_2_scal_3(a_) \
+ pld(a_,ax,0) \
+ pm(6,0) \
+ pud(0,a_,ax)
+#undef p1_scal_3
+#define p1_scal_3(a_) \
+ plq(a_,ax,0) \
+ pm(6,0) \
+ puq(0,a_,ax)
+#undef p2_scal_3
+#define p2_scal_3(a_) \
+ plq(a_,ax,0) \
+ plq(SS(a_,RS4),ax,1) \
+ pm(6,0) \
+ pm(6,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_scal_3
+#define p4_scal_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pm(6,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpscal_3
+#define lpscal_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(6,1)
+#undef dpscal_3
+#define dpscal_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plscal_3
+#define plscal_3 16
+
+#undef p1_4_scal_3c
+#define p1_4_scal_3c(a_)
+#undef p1_2_scal_3c
+#define p1_2_scal_3c(a_) \
+ pld(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pud(0,a_,ax)
+#undef p1_scal_3c
+#define p1_scal_3c(a_) \
+ plq(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ puq(0,a_,ax)
+#undef p2_scal_3c
+#define p2_scal_3c(a_) \
+ plq(a_,ax,0) \
+ plq(SS(a_,RS4),ax,1) \
+ pc(0,2) \
+ pm(6,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ puq(0,a_,ax) \
+ pc(1,3) \
+ pm(6,1) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_scal_3c
+#define p4_scal_3c(a_) \
+ pm(7,5) \
+ pa(5,1) \
+ puq(0,a_,ax) \
+ ps(CSHUF,4,4) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pc(3,5) \
+ pm(6,3) \
+ pm(7,4) \
+ pa(4,2) \
+ puq(1,SS(a_,RS4),ax) \
+ ps(CSHUF,5,5) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pc(0,4) \
+ pm(6,0) \
+ pm(7,5) \
+ pa(5,3) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pc(1,5) \
+ pm(6,1) \
+ pm(7,4) \
+ pa(4,0) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ ps(CSHUF,5,5) \
+ plq(SS(a_,MM(7,RS4)),ax,3) \
+ pc(2,4) \
+ pm(6,2)
+#undef lpscal_3c
+#define lpscal_3c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(0,4) \
+ pm(6,0) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pc(1,5) \
+ pm(6,1) \
+ pm(7,4) \
+ pa(4,0) \
+ ps(CSHUF,5,5) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pc(2,4) \
+ pm(6,2)
+#undef dpscal_3c
+#define dpscal_3c(a_) \
+ pm(7,5) \
+ pa(5,1) \
+ ps(CSHUF,4,4) \
+ puq(0,a_,ax) \
+ pm(7,4) \
+ pa(4,2) \
+ pc(3,5) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ ps(CSHUF,5,5) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pm(7,5) \
+ pa(5,3) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plscal_3c
+#define plscal_3c 16
+
+#undef p1_4_scal_4
+#define p1_4_scal_4(a_) \
+ pls(SS(a_,MM(0,RS4)),ax,0) \
+ pmsr(6,0) \
+ pus(0,a_,ax)
+#undef p1_2_scal_4
+#define p1_2_scal_4(a_) \
+ pld(SS(a_,MM(0,RS4)),ax,0) \
+ pm(6,0) \
+ pud(0,a_,ax)
+#undef p1_scal_4
+#define p1_scal_4(a_) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pm(6,0) \
+ puq(0,a_,ax)
+#undef p2_scal_4
+#define p2_scal_4(a_) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,0) \
+ pm(6,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_scal_4
+#define p4_scal_4(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,0) \
+ pm(6,1) \
+ pm(6,2) \
+ pm(6,3) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef p8_scal_4
+#define p8_scal_4(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ plq(SS(a_,MM(4,RS4)),ax,4) \
+ plq(SS(a_,MM(5,RS4)),ax,5) \
+ plq(SS(a_,MM(6,RS4)),ax,7) \
+ pm(6,0) \
+ pm(6,1) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ pm(6,3) \
+ pm(6,4) \
+ pm(6,5) \
+ plq(SS(a_,MM(7,RS4)),ax,0) \
+ pm(6,7) \
+ pm(6,0) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ puq(4,SS(a_,MM(4,RS4)),ax) \
+ puq(5,SS(a_,MM(5,RS4)),ax) \
+ puq(7,SS(a_,MM(6,RS4)),ax) \
+ puq(0,SS(a_,MM(7,RS4)),ax)
+#undef lpscal_4
+#define lpscal_4(a_)
+#undef dpscal_4
+#define dpscal_4(a_) p4_scal_4(a_)
+#undef plscal_4
+#define plscal_4 16
+
+#undef p1_4_scal_4c
+#define p1_4_scal_4c(a_)
+#undef p1_2_scal_4c
+#define p1_2_scal_4c(a_) \
+ pld(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pud(0,a_,ax)
+#undef p1_scal_4c
+#define p1_scal_4c(a_) \
+ plq(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ puq(0,a_,ax)
+#undef p2_scal_4c
+#define p2_scal_4c(a_) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(0,4) \
+ pc(1,5) \
+ pm(6,0) \
+ pm(6,1) \
+ ps(CSHUF,4,4) \
+ ps(CSHUF,5,5) \
+ pm(7,4) \
+ pa(4,0) \
+ pm(7,5) \
+ pa(5,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_scal_4c
+#define p4_scal_4c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pc(0,4) \
+ pc(1,5) \
+ pm(6,0) \
+ pm(6,1) \
+ ps(CSHUF,4,4) \
+ ps(CSHUF,5,5) \
+ pm(7,4) \
+ pa(4,0) \
+ pc(2,4) \
+ pm(7,5) \
+ pa(5,1) \
+ pc(3,5) \
+ pm(6,2) \
+ pm(6,3) \
+ ps(CSHUF,4,4) \
+ ps(CSHUF,5,5) \
+ pm(7,4) \
+ pa(4,2) \
+ pm(7,5) \
+ pa(5,3) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef lpscal_4c
+#define lpscal_4c(a_)
+#undef dpscal_4c
+#define dpscal_4c(a_) p4_scal_4c(a_)
+#undef plscal_4c
+#define plscal_4c 16
+
+#undef p1_4_scal_1
+#define p1_4_scal_1(a_) \
+ pls(a_,ax,1) \
+ pmsr(0,1) \
+ pus(1,a_,ax)
+#undef p1_2_scal_1
+#define p1_2_scal_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pm(0,1) \
+ pud(1,a_,ax)
+#undef p1_scal_1
+#define p1_scal_1(a_) \
+ plq(a_,ax,1) \
+ pm(0,1) \
+ puq(1,a_,ax)
+#undef p2_scal_1
+#define p2_scal_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pm(0,1) \
+ pm(0,2) \
+ puq(1,a_,ax) \
+ puq(2,SS(a_,RS4),ax)
+#undef p4_scal_1
+#define p4_scal_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(0,1) \
+ puq(3,SS(a_,MM(1,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pm(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pm(0,7) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef lpscal_1
+#define lpscal_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3) \
+ pm(0,7)
+#undef dpscal_1
+#define dpscal_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(0,1) \
+ puq(3,SS(a_,MM(1,RS4)),ax) \
+ pm(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef plscal_1
+#define plscal_1 RS4
+
+
+#undef p1_4_set_1
+#define p1_4_set_1(a_) \
+ pls(a_,ax,1) \
+ pcs(0,1) \
+ pus(1,a_,ax)
+#undef p1_2_set_1
+#define p1_2_set_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pc(0,1) \
+ pud(1,a_,ax)
+#undef p1_set_1
+#define p1_set_1(a_) \
+ plq(a_,ax,1) \
+ pc(0,1) \
+ puq(1,a_,ax)
+#undef p2_set_1
+#define p2_set_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pc(0,1) \
+ pc(0,2) \
+ puq(1,a_,ax) \
+ puq(2,SS(a_,RS4),ax)
+#undef p4_set_1
+#define p4_set_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pc(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pc(0,1) \
+ puq(3,SS(a_,MM(1,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pc(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pc(0,7) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef lpset_1
+#define lpset_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3) \
+ pc(0,7)
+#undef dpset_1
+#define dpset_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pc(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pc(0,1) \
+ puq(3,SS(a_,MM(1,RS4)),ax) \
+ pc(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef plset_1
+#define plset_1 RS4
+
+
+#undef p1_4_set_2
+#define p1_4_set_2(a_) \
+ pus(0,a_,ax)
+#undef p1_2_set_2
+#define p1_2_set_2(a_) \
+ pud(0,a_,ax)
+#undef p1_set_2
+#define p1_set_2(a_) \
+ puq(0,a_,ax)
+#undef p2_set_2
+#define p2_set_2(a_) \
+ puq(0,a_,ax) \
+ puq(0,SS(a_,RS4),ax)
+#undef p4_set_2
+#define p4_set_2(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ puq(0,a_,ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ puq(0,SS(a_,MM(2,RS4)),ax) \
+ puq(0,SS(a_,MM(3,RS4)),ax)
+#undef lpset_2
+#define lpset_2(a_)
+#undef dpset_2
+#define dpset_2(a_) \
+ puq(0,a_,ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax) \
+ puq(0,SS(a_,MM(2,RS4)),ax) \
+ puq(0,SS(a_,MM(3,RS4)),ax)
+#undef plset_2
+#define plset_2 RS4
+
+
+#undef p1_4_set_3
+#define p1_4_set_3(a_) \
+ pus(0,a_,ax)
+#undef p1_2_set_3
+#define p1_2_set_3(a_) \
+ pud(0,a_,ax)
+#undef p1_set_3
+#define p1_set_3(a_) \
+ puq(0,SS(a_,MM(0,RS4)),ax)
+#undef p2_set_3
+#define p2_set_3(a_) \
+ puq(0,SS(a_,MM(0,RS4)),ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax)
+#undef p4_set_3
+#define p4_set_3(a_) \
+ puq(0,SS(a_,MM(0,RS4)),ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax) \
+ puq(0,SS(a_,MM(2,RS4)),ax) \
+ puq(0,SS(a_,MM(3,RS4)),ax)
+#undef p8_set_3
+#define p8_set_3(a_) \
+ puq(0,SS(a_,MM(0,RS4)),ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax) \
+ puq(0,SS(a_,MM(2,RS4)),ax) \
+ puq(0,SS(a_,MM(3,RS4)),ax) \
+ puq(0,SS(a_,MM(4,RS4)),ax) \
+ puq(0,SS(a_,MM(5,RS4)),ax) \
+ puq(0,SS(a_,MM(6,RS4)),ax) \
+ puq(0,SS(a_,MM(7,RS4)),ax)
+#undef lpset_3
+#define lpset_3(a_)
+#undef dpset_3
+#define dpset_3(a_) p8_set_3(a_)
+#undef plset_3
+#define plset_3 32
+
+
+#undef p1_4_0x1_nrm2_1
+#define p1_4_0x1_nrm2_1(a_) \
+ pls(a_,ax,1) \
+ pmsr(1,1) \
+ pasr(1,0)
+#undef p1_2_0x1_nrm2_1
+#define p1_2_0x1_nrm2_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pm(1,1) \
+ pa(1,0)
+#undef p1_0x1_nrm2_1
+#define p1_0x1_nrm2_1(a_) \
+ plq(a_,ax,1) \
+ pm(1,1) \
+ pa(1,0)
+#undef p2_0x1_nrm2_1 /* two-load step of the nrm2_1 kernel: square the data at a_ and a_+RS4 off ax, add both squares into reg 0 */
+#define p2_0x1_nrm2_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pm(1,1) \
+ pm(2,2) \
+ pa(1,0) \
+ pa(2,0) /* was pm(2,0), which multiplied the accumulator; p1_/p4_/dp0x1_nrm2_1 all fold squares in with pa(x,0) */
+#undef p4_0x1_nrm2_1
+#define p4_0x1_nrm2_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(3,3) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(1,1) \
+ pa(3,0) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pm(2,2) \
+ pa(1,0) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pm(7,7) \
+ pa(2,0)
+#undef lp0x1_nrm2_1
+#define lp0x1_nrm2_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3) \
+ pm(7,7)
+#undef dp0x1_nrm2_1
+#define dp0x1_nrm2_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(3,3) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(1,1) \
+ pa(3,0) \
+ pm(2,2) \
+ pa(1,0) \
+ pa(2,0)
+#undef pl0x1_nrm2_1
+#define pl0x1_nrm2_1 RS4
+
+
+#undef p1_4_nrm2_2
+#define p1_4_nrm2_2(a_) \
+ pls(a_,ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pcs(5,6) dbg(6) \
+ pcs(5,7) dbg(7) \
+ paxs(1,5) dbg(5) \
+ prps(5,2) dbg(2) \
+ px(3) \
+ pcms(0,2,3) dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pasr(3,7) dbg(7) \
+ pcs(7,5) dbg(5) \
+ pdsr(5,6) dbg(6) \
+ pdsr(5,1) dbg(1) \
+ pmsr(6,6) dbg(6) \
+ pmsr(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pasr(1,0) dbg(0)
+#undef p1_2_nrm2_2
+#define p1_2_nrm2_2(a_) \
+ px(1) pld(a_,ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef p1_nrm2_2
+#define p1_nrm2_2(a_) \
+ plq(a_,ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#define p2_nrm2_2(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef lpnrm2_2
+#define lpnrm2_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef dpnrm2_2
+#define dpnrm2_2(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef plnrm2_2
+#define plnrm2_2 8
+
+
+#undef p1_4_nrm2_3
+#define p1_4_nrm2_3(a_) \
+ pls(a_,ax,1) dbg(1) \
+ pcs(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ paxs(1,5) dbg(5) \
+ pdsr(5,6) dbg(6) \
+ pdsr(5,1) dbg(1) \
+ pmsr(6,6) dbg(6) \
+ pmsr(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pasr(1,0) dbg(0)
+#undef p1_2_nrm2_3
+#define p1_2_nrm2_3(a_) \
+ px(1) pld(a_,ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef p1_nrm2_3
+#define p1_nrm2_3(a_) \
+ plq(a_,ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#define p2_nrm2_3(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef lpnrm2_3
+#define lpnrm2_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef dpnrm2_3
+#define dpnrm2_3(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef plnrm2_3
+#define plnrm2_3 8
+
+#define block_nrm2_4(a_,b_) \
+ Mjoin(pc,a_)(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ Mjoin(pax,a_)(1,5) dbg(5) \
+ Mjoin(pc,a_)(2,7) dbg(7) \
+ Mjoin(pd,b_)(5,7) dbg(7) \
+ Mjoin(pm,b_)(7,6) dbg(6) \
+ Mjoin(pm,b_)(7,1) dbg(1) \
+ Mjoin(pm,b_)(6,6) dbg(6) \
+ Mjoin(pm,b_)(6,0) dbg(0) \
+ Mjoin(pm,b_)(1,1) dbg(1) \
+ Mjoin(pa,b_)(1,0) dbg(0)
+
+
+/* #undef p1_4_nrm2_4 */
+/* #define p1_4_nrm2_4(a_) \ */
+/* pls(a_,ax,1) dbg(1) \ */
+/* pcs(5,6) dbg(6) \ */
+/* pan(4,1) dbg(1) \ */
+/* paxs(1,5) dbg(5) \ */
+/* pcs(2,7) dbg(7) \ */
+/* pdsr(5,7) dbg(7) \ */
+/* pmsr(7,6) dbg(6) \ */
+/* pmsr(7,1) dbg(1) \ */
+/* pmsr(6,6) dbg(6) \ */
+/* pmsr(6,0) dbg(0) \ */
+/* pmsr(1,1) dbg(1) \ */
+/* pasr(1,0) dbg(0) */
+#undef p1_4_nrm2_4
+#define p1_4_nrm2_4(a_) \
+ pls(a_,ax,1) dbg(1) \
+ block_nrm2_4(s,sr)
+#undef p1_2_nrm2_4
+#define p1_2_nrm2_4(a_) \
+ px(1) pld(a_,ax,1) dbg(1) \
+ block_nrm2_4(,)
+#undef p1_nrm2_4
+#define p1_nrm2_4(a_) \
+ plq(a_,ax,1) dbg(1) \
+ block_nrm2_4(,)
+#define p2_nrm2_4(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ block_nrm2_4(,) \
+ plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ block_nrm2_4(,)
+#undef lpnrm2_4
+#define lpnrm2_4(a_) \
+ plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ block_nrm2_4(,)
+#undef dpnrm2_4
+#define dpnrm2_4(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ block_nrm2_4(,)
+#undef plnrm2_4
+#define plnrm2_4 8
+
+
+#undef p1_4_1x1_1
+#define p1_4_1x1_1(a_) \
+ pls(a_,ax,1) \
+ pls(a_,bx,0) \
+ pm(0,1) \
+ pa(1,6)
+#undef p1_2_1x1_1
+#define p1_2_1x1_1(a_) \
+ pld(a_,ax,1) \
+ pld(a_,bx,0) \
+ pm(0,1) \
+ pa(1,6)
+#undef p1_1x1_1 /* one-load step of the 1x1_1 kernel: multiply the data at a_ off ax by the data at a_ off bx, add the product into reg 6 */
+#define p1_1x1_1(a_) \
+ plq(a_,ax,1) \
+ plq(a_,bx,0) \
+ pm(0,1) \
+ pa(1,6) /* was pa(0,6), which added the raw bx operand; the product is in reg 1, as accumulated by p1_4_/p1_2_/p2_1x1_1 */
+#undef p2_1x1_1
+#define p2_1x1_1(a_) \
+ plq(a_,ax,1) \
+ plq(a_,bx,0) \
+ plq(SS(a_,RS4),ax,2) \
+ plq(SS(a_,RS4),bx,3) \
+ pm(0,1) \
+ pm(2,3) \
+ pa(1,6) \
+ pa(3,6)
+#undef p4_1x1_1 /* NOTE(review): unlike p1_/p2_1x1_1 this never touches bx or reg 6; apart from the prefetch offsets and one repeated load it matches p4_scal_1 - confirm it is intended */
+#define p4_1x1_1(a_) \
+ f(nta,SS(a_,MM(4,RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ plq(SS(a_,MM(2,RS4)),ax,1) /* NOTE(review): repeats the previous load verbatim - looks redundant */ \
+ pm(0,3) \
+ puq(7,a_,ax) /* regs 7 and 3 are expected to arrive loaded, as lp1x1_1 and the tail of this macro leave them */ \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(0,1) \
+ puq(3,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM(6,RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pm(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pm(0,7) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef lp1x1_1
+#define lp1x1_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,RS4),ax,3) \
+ pm(0,7)
+#undef dp1x1_1 /* drain step for the 1x1_1 pipeline: finishes the two loads left pending in regs 7 and 3, then handles the loads at a_+2*RS4 and a_+3*RS4 */
+#define dp1x1_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) /* was SS(,a_,...): the stray comma passed SS an extra empty argument; every other use passes two */ \
+ pm(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(0,1) \
+ puq(3,SS(a_,RS4),ax) \
+ pm(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef pl1x1_1
+#define pl1x1_1 RS4
+
+
+#undef p1_4_0x1_asum_1
+#define p1_4_0x1_asum_1(a_) \
+ pls(a_,ax,1) \
+ pan(4,1) \
+ pasr(1,0)
+#undef p1_2_0x1_asum_1
+#define p1_2_0x1_asum_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pan(4,1) \
+ pa(1,0)
+#undef p1_0x1_asum_1
+#define p1_0x1_asum_1(a_) \
+ plq(a_,ax,1) \
+ pan(4,1) \
+ pa(1,0)
+#undef p2_0x1_asum_1
+#define p2_0x1_asum_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pan(4,1) \
+ pan(4,2) \
+ pa(1,0) \
+ pa(2,0)
+#undef p4_0x1_asum_1
+#define p4_0x1_asum_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pan(4,3) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pan(4,1) \
+ pa(3,0) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pan(4,2) \
+ pa(1,0) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pan(4,7) \
+ pa(2,0)
+#undef lp0x1_asum_1
+#define lp0x1_asum_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3) \
+ pan(4,7)
+#undef dp0x1_asum_1
+#define dp0x1_asum_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pan(4,3) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pan(4,1) \
+ pa(3,0) \
+ pan(4,2) \
+ pa(1,0) \
+ pa(2,0)
+#undef pl0x1_asum_1
+#define pl0x1_asum_1 RS4
+
+
+#undef p1_4_sum_1
+#define p1_4_sum_1(a_) \
+ pls(a_,ax,1) \
+ pasr(1,0)
+#undef p1_2_sum_1
+#define p1_2_sum_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pa(1,0)
+#undef p1_sum_1
+#define p1_sum_1(a_) \
+ plq(a_,ax,1) \
+ pa(1,0)
+#undef p2_sum_1
+#define p2_sum_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pa(1,0) \
+ pa(2,0)
+#undef p4_sum_1
+#define p4_sum_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pa(3,0) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pa(1,0) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pa(2,0)
+#undef lpsum_1
+#define lpsum_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3)
+#undef dpsum_1
+#define dpsum_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pa(3,0) \
+ pa(1,0) \
+ pa(2,0)
+#undef plsum_1
+#define plsum_1 RS4
+
+
+#undef p1_4_dot_1
+#define p1_4_dot_1(a_) \
+ pls(a_,ax,1) \
+ pls(a_,cx,2) \
+ pmsr(2,1) \
+ pasr(1,0)
+#undef p1_2_dot_1
+#define p1_2_dot_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ px(2) \
+ pld(a_,cx,2) \
+ pm(2,1) \
+ pa(1,0)
+#undef p1_dot_1
+#define p1_dot_1(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,2) \
+ pm(2,1) \
+ pa(1,0)
+#undef p2_dot_1
+#define p2_dot_1(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pm(4,3) \
+ pa(3,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(2,1) \
+ pa(1,0)
+#undef lpdot_1
+#define lpdot_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(a_,ax,3) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(a_,cx,4)
+#undef dpdot_1
+#define dpdot_1(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pm(4,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,0)
+#undef pldot_1
+#define pldot_1 8
+
+#undef p1_4_dot_1c /* dot_1c family: the ax operand is duplicated with pc, the copies are shuffled with HSHUF and LSHUF, reg 7 is applied to the HSHUF copy with pm, and the products with the cx operand go to two accumulators, regs 0 and 6. This variant expands to nothing. */
+#define p1_4_dot_1c(a_)
+#undef p1_2_dot_1c /* pld variant; px(1)/px(2) precede the loads (presumably clearing the registers -- confirm) */
+#define p1_2_dot_1c(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ px(2) \
+ pld(a_,cx,2) \
+ pc(1,3) \
+ ps(HSHUF,1,1) \
+ ps(LSHUF,3,3) \
+ pm(7,1) \
+ pm(2,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef p1_dot_1c /* single plq/pl pair at a_; reg 3 is the scratch copy */
+#define p1_dot_1c(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,2) \
+ pc(1,3) \
+ ps(HSHUF,1,1) \
+ ps(LSHUF,3,3) \
+ pm(7,1) \
+ pm(2,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef p2_dot_1c /* pipelined step: first half consumes regs 3,4 (preloaded by lpdot_1c or the previous step) with reg 5 as scratch, then reloads them from the MM(2,RS4) step; second half handles regs 1,2 */
+#define p2_dot_1c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pc(3,5) \
+ ps(HSHUF,3,3) \
+ ps(LSHUF,5,5) \
+ pm(7,3) \
+ pm(4,5) \
+ pa(5,0) \
+ pm(4,3) \
+ pa(3,6) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ plq(SS(a_,MM(2,RS4)),ax,3) \
+ pc(1,5) \
+ ps(HSHUF,1,1) \
+ ps(LSHUF,5,5) \
+ pm(7,1) \
+ pm(2,5) \
+ pa(5,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef lpdot_1c /* prologue: f(nta,...) hints plus the loads of regs 3,4 from a_ that p2_dot_1c consumes first */
+#define lpdot_1c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(a_,ax,3) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(a_,cx,4)
+#undef dpdot_1c /* epilogue: the arithmetic of p2_dot_1c without the f(nta,...) hints and without reloading regs 3,4 */
+#define dpdot_1c(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pc(3,5) \
+ ps(HSHUF,3,3) \
+ ps(LSHUF,5,5) \
+ pm(7,3) \
+ pm(4,5) \
+ pa(5,0) \
+ pm(4,3) \
+ pa(3,6) \
+ pc(1,5) \
+ ps(HSHUF,1,1) \
+ ps(LSHUF,5,5) \
+ pm(7,1) \
+ pm(2,5) \
+ pa(5,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef pldot_1c /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define pldot_1c 8
+
+#undef p1_4_dot_2c /* dot_2c family: the ax operand is duplicated with pc, one copy is shuffled with CSHUF, and the products of each copy with the cx operand go to accumulators reg 0 (plain copy) and reg 6 (shuffled copy). This variant expands to nothing. */
+#define p1_4_dot_2c(a_)
+#undef p1_2_dot_2c /* pld variant; px(1)/px(2) precede the loads (presumably clearing the registers -- confirm) */
+#define p1_2_dot_2c(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ px(2) \
+ pld(a_,cx,2) \
+ pc(1,3) \
+ ps(CSHUF,1,1) \
+ pm(2,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef p1_dot_2c /* single plq/pl pair at a_; reg 3 is the scratch copy */
+#define p1_dot_2c(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,2) \
+ pc(1,3) \
+ ps(CSHUF,1,1) \
+ pm(2,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef p2_dot_2c /* pipelined step: first half consumes regs 3,4 (preloaded by lpdot_2c or the previous step) with reg 5 as scratch, then reloads them from the MM(2,RS4) step; second half handles regs 1,2 */
+#define p2_dot_2c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pc(3,5) \
+ ps(CSHUF,3,3) \
+ pm(4,5) \
+ pa(5,0) \
+ pm(4,3) \
+ pa(3,6) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ plq(SS(a_,MM(2,RS4)),ax,3) \
+ pc(1,5) \
+ ps(CSHUF,1,1) \
+ pm(2,5) \
+ pa(5,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef lpdot_2c /* prologue: f(nta,...) hints plus the loads of regs 3,4 from a_ that p2_dot_2c consumes first */
+#define lpdot_2c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(a_,ax,3) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(a_,cx,4)
+#undef dpdot_2c /* epilogue: the arithmetic of p2_dot_2c without the f(nta,...) hints and without reloading regs 3,4 */
+#define dpdot_2c(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pc(3,5) \
+ ps(CSHUF,3,3) \
+ pm(4,5) \
+ pa(5,0) \
+ pm(4,3) \
+ pa(3,6) \
+ pc(1,5) \
+ ps(CSHUF,1,1) \
+ pm(2,5) \
+ pa(5,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef pldot_2c /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define pldot_2c 8
+
+#undef p1_4_axpby_3 /* axpby_3 family: the ax operand gets pm with reg 5, the cx operand gets pm with reg 6, the two are combined with pa and the result is stored back to ax (presumably multiply/add -- confirm). This variant uses the pls/pmsr/pasr/pus forms. */
+#define p1_4_axpby_3(a_) \
+ pls(a_,ax,0) \
+ pls(a_,cx,3) \
+ pmsr(5,0) \
+ pmsr(6,3) \
+ pasr(3,0) \
+ pus(0,a_,ax)
+#undef p1_2_axpby_3 /* pld/pud variant of the same step */
+#define p1_2_axpby_3(a_) \
+ pld(a_,ax,0) \
+ pld(a_,cx,3) \
+ pm(5,0) \
+ pm(6,3) \
+ pa(3,0) \
+ pud(0,a_,ax)
+#undef p1_axpby_3 /* single plq/pl pair at a_; the store uses punt rather than puq */
+#define p1_axpby_3(a_) \
+ plq(a_,ax,0) \
+ pl(a_,cx,3) \
+ pm(5,0) \
+ pm(6,3) \
+ pa(3,0) \
+ punt(0,a_,ax)
+#undef p2_axpby_3 /* self-contained two-step version: handles a_ and SS(a_,RS4), reusing reg 3 for both cx loads */
+#define p2_axpby_3(a_) \
+ plq(a_,ax,0) \
+ pl(a_,cx,3) \
+ plq(SS(a_,RS4),ax,1) \
+ pm(5,0) \
+ pm(6,3) \
+ pa(3,0) \
+ pl(SS(a_,RS4),cx,3) \
+ punt(0,a_,ax) \
+ pm(5,1) \
+ pm(6,3) \
+ pa(3,1) \
+ punt(1,SS(a_,RS4),ax)
+#undef p4_axpby_3 /* pipelined step: stores regs 0..3 for the MM(0..3,RS4) steps while loading the MM(3..6,RS4) steps; regs 0,1,2,4,7 must already hold in-flight data from lpaxpby_3 or the previous p4 step */
+#define p4_axpby_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(5,2) \
+ pl(SS(a_,MM(3,RS4)),cx,7) \
+ pm(6,4) \
+ pa(4,2) \
+ punt(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pm(5,3) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,7) \
+ pa(7,3) \
+ punt(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pm(5,0) \
+ pl(SS(a_,MM(5,RS4)),cx,7) \
+ pm(6,4) \
+ pa(4,0) \
+ punt(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ pm(5,1) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,7) \
+ pa(7,1) \
+ punt(3,SS(a_,MM(3,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpby_3 /* prologue: loads and partially processes the MM(0..2,RS4) steps so that p4_axpby_3 or dpaxpby_3 can finish and store them */
+#define lpaxpby_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,7) \
+ pm(5,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,4) \
+ pa(4,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(5,1) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(6,7) \
+ pa(7,1)
+#undef dpaxpby_3 /* epilogue: loads only the MM(3,RS4) step, finishes regs 2 and 3, and stores regs 0..3; no f(nta,...) hints */
+#define dpaxpby_3(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,7) \
+ pm(5,2) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,4) \
+ pa(4,2) \
+ pm(5,3) \
+ punt(0,a_,ax) \
+ pm(6,7) \
+ pa(7,3) \
+ punt(1,SS(a_,RS4),ax) \
+ punt(2,SS(a_,MM(2,RS4)),ax) \
+ punt(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_3 /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plaxpby_3 16
+
+#undef p1_4_axpby_3c /* axpby_3c family: regs 5 and 4 are applied (pm) to the ax operand and its CSHUF-shuffled copy, regs 6 and 7 to the cx operand and its CSHUF-shuffled copy; all four terms are combined with pa and stored to ax. This variant expands to nothing. */
+#define p1_4_axpby_3c(a_)
+#undef p1_2_axpby_3c /* pld/pud variant; reg 3 is the scratch copy for both shuffles */
+#define p1_2_axpby_3c(a_) \
+ pld(a_,ax,0) \
+ pld(a_,cx,2) \
+ pc(0,3) \
+ pm(5,0) \
+ ps(CSHUF,3,3) \
+ pm(4,3) \
+ pa(3,0) \
+ pc(2,3) \
+ pm(6,2) \
+ pa(2,0) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,0) \
+ pud(0,a_,ax)
+#undef p1_axpby_3c /* single plq/pl pair at a_, stored with puq */
+#define p1_axpby_3c(a_) \
+ plq(a_,ax,0) \
+ pl(a_,cx,2) \
+ pc(0,3) \
+ pm(5,0) \
+ ps(CSHUF,3,3) \
+ pm(4,3) \
+ pa(3,0) \
+ pc(2,3) \
+ pm(6,2) \
+ pa(2,0) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,0) \
+ puq(0,a_,ax)
+#undef p2_axpby_3c /* pipelined step: computes the MM(1,RS4) step in reg 1, stores the reg 0 result left by lpaxpby_3c or the previous step, then computes the MM(2,RS4) step into reg 0 for the next step */
+#define p2_axpby_3c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,3) \
+ pc(1,2) \
+ pm(5,1) \
+ ps(CSHUF,2,2) \
+ pm(4,2) \
+ pa(2,1) \
+ pc(3,2) \
+ pm(6,3) \
+ pa(3,1) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,1) \
+ puq(0,a_,ax) \
+ plq(SS(a_,MM(2,RS4)),ax,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pc(0,3) \
+ pm(5,0) \
+ ps(CSHUF,3,3) \
+ pm(4,3) \
+ pa(3,0) \
+ pc(2,3) \
+ pm(6,2) \
+ pa(2,0) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,0) \
+ puq(1,SS(a_,RS4),ax)
+#undef lpaxpby_3c /* prologue: fully computes the MM(0,RS4) step into reg 0 but leaves the store to p2_axpby_3c or dpaxpby_3c */
+#define lpaxpby_3c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2) \
+ pc(0,3) \
+ pm(5,0) \
+ ps(CSHUF,3,3) \
+ pm(4,3) \
+ pa(3,0) \
+ pc(2,3) \
+ pm(6,2) \
+ pa(2,0) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,0)
+#undef dpaxpby_3c /* epilogue: computes the MM(1,RS4) step in reg 1 and stores regs 0 and 1; no f(nta,...) hints, no further loads */
+#define dpaxpby_3c(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,3) \
+ pc(1,2) \
+ pm(5,1) \
+ ps(CSHUF,2,2) \
+ pm(4,2) \
+ pa(2,1) \
+ pc(3,2) \
+ pm(6,3) \
+ pa(3,1) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef plaxpby_3c /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define plaxpby_3c 8
+
+#undef p1_4_axpby_2 /* axpby_2 family: the cx operand gets pm with reg 6 and is combined (pa) into the ax operand, which is stored back to ax; the ax operand itself is not scaled. This variant uses the pls/pmsr/pasr/pus forms. */
+#define p1_4_axpby_2(a_) \
+ pls(a_,cx,5) \
+ pls(a_,ax,0) \
+ pmsr(6,5) \
+ pasr(5,0) \
+ pus(0,a_,ax)
+#undef p1_2_axpby_2 /* pld/pud variant of the same step */
+#define p1_2_axpby_2(a_) \
+ pld(a_,cx,5) \
+ pld(a_,ax,0) \
+ pm(6,5) \
+ pa(5,0) \
+ pud(0,a_,ax)
+#undef p1_axpby_2 /* single pl/plq pair at a_, stored with puq */
+#define p1_axpby_2(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pm(6,5) \
+ pa(5,0) \
+ puq(0,a_,ax)
+#undef p2_axpby_2 /* self-contained two-step version: handles a_ (regs 5,0) and SS(a_,RS4) (regs 4,1) */
+#define p2_axpby_2(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,5) \
+ pa(5,0) \
+ plq(SS(a_,RS4),ax,1) \
+ puq(0,a_,ax) \
+ pm(6,4) \
+ pa(4,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_axpby_2 /* pipelined step: stores regs 0..3 for the MM(0..3,RS4) steps while loading the MM(3..6,RS4) steps; regs 0,1,2,4,5 must hold in-flight data from lpaxpby_2 or the previous p4 step */
+#define p4_axpby_2(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ pm(6,4) \
+ pa(4,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,5) \
+ pa(5,3) \
+ puq(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pm(6,4) \
+ pa(4,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,5) \
+ pa(5,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpby_2 /* prologue: loads the MM(0..2,RS4) steps and finishes regs 0 and 1, leaving the stores to p4_axpby_2 or dpaxpby_2 */
+#define lpaxpby_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,5) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,4) \
+ pa(4,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(6,5) \
+ pa(5,1)
+#undef dpaxpby_2 /* epilogue: loads only the MM(3,RS4) step, finishes regs 2 and 3, and stores regs 0..3; no f(nta,...) hints */
+#define dpaxpby_2(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,4) \
+ pa(4,2) \
+ puq(0,a_,ax) \
+ pm(6,5) \
+ pa(5,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_2 /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plaxpby_2 16
+
+#undef p1_4_axpby_2c /* axpby_2c family: the cx operand is duplicated with pc; the original gets pm with reg 6, the CSHUF-shuffled copy gets pm with reg 7, and both are combined (pa) into the ax operand, which is stored back to ax. This variant expands to nothing. */
+#define p1_4_axpby_2c(a_)
+#undef p1_2_axpby_2c /* pld/pud variant; reg 1 is the scratch copy of the cx operand */
+#define p1_2_axpby_2c(a_) \
+ pld(a_,cx,5) \
+ pld(a_,ax,0) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pud(0,a_,ax)
+#undef p1_axpby_2c /* single pl/plq pair at a_, stored with puq; reg 1 is the scratch copy */
+#define p1_axpby_2c(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ puq(0,a_,ax)
+#undef p2_axpby_2c /* self-contained two-step axpby_2c: for a_ and SS(a_,RS4), applies reg 6 to the cx operand and reg 7 to its CSHUF-shuffled copy, combines both into the ax operand with pa, and stores back to ax */
+#define p2_axpby_2c(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pc(5,2) /* copy into reg 2, the register shuffled and accumulated just below; reg 1 is overwritten by the later plq before it is ever read */ \
+ pm(6,5) \
+ pa(5,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ plq(SS(a_,RS4),ax,1) \
+ puq(0,a_,ax) \
+ pc(4,3) \
+ pm(6,4) \
+ pa(4,1) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_axpby_2c /* pipelined axpby_2c step: stores finished regs 0..3 for the MM(0..3,RS4) steps while loading the MM(3..7,RS4) steps; each just-stored register is reused as the scratch pc copy for the next element; relies on in-flight data from lpaxpby_2c or the previous p4 step */
+#define p4_axpby_2c(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ puq(0,a_,ax) \
+ pc(4,0) \
+ pm(6,4) \
+ pa(4,2) \
+ ps(CSHUF,0,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pm(7,0) \
+ pa(0,2) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ puq(1,SS(a_,RS4),ax) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,3) \
+ ps(CSHUF,1,1) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pm(7,1) \
+ pa(1,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ pm(7,2) \
+ pa(2,0) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pl(SS(a_,MM(7,RS4)),cx,5) \
+ pm(7,3) \
+ pa(3,1) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpby_2c /* prologue: finishes the MM(0,RS4) and MM(1,RS4) steps in regs 0 and 1 and preloads the MM(2,RS4) ax operand (reg 2) and the MM(2,RS4)/MM(3,RS4) cx operands (regs 4,5); stores are left to p4_axpby_2c or dpaxpby_2c */
+#define lpaxpby_2c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,5) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(7,2) \
+ pa(2,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ pm(7,3) \
+ pa(3,1)
+#undef dpaxpby_2c /* epilogue: loads only the MM(3,RS4) ax operand, finishes regs 2 and 3, and stores regs 0..3; no f(nta,...) hints */
+#define dpaxpby_2c(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ puq(0,a_,ax) \
+ pc(4,0) \
+ pm(6,4) \
+ pa(4,2) \
+ ps(CSHUF,0,0) \
+ puq(1,SS(a_,RS4),ax) \
+ pm(7,0) \
+ pa(0,2) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,3) \
+ ps(CSHUF,1,1) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pm(7,1) \
+ pa(1,3) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_2c /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plaxpby_2c 16
+
+#undef p1_4_axpby_1 /* axpby_1 family: the ax operand gets pm with reg 5, the cx operand gets pm with reg 6, the two are combined with pa and stored back to ax. This variant uses the pls/pmsr/pasr/pus forms. */
+#define p1_4_axpby_1(a_) \
+ pls(a_,ax,1) \
+ pls(a_,cx,2) \
+ pmsr(5,1) \
+ pmsr(6,2) \
+ pasr(2,1) \
+ pus(1,a_,ax)
+#undef p1_2_axpby_1 /* pld/pud variant of the same step */
+#define p1_2_axpby_1(a_) \
+ pld(a_,ax,1) \
+ pld(a_,cx,2) \
+ pm(5,1) \
+ pm(6,2) \
+ pa(2,1) \
+ pud(1,a_,ax)
+#undef p1_axpby_1 /* single plq/pl pair at a_, stored with puq */
+#define p1_axpby_1(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,2) \
+ pm(5,1) \
+ pm(6,2) \
+ pa(2,1) \
+ puq(1,a_,ax)
+#undef p2_axpby_1 /* pipelined step: finishes and stores regs 1,2 (loaded by lpaxpby_1 or the previous step) for a_, processes SS(a_,RS4) in regs 3,4, and reloads regs 1,2 from the MM(2,RS4) step */
+#define p2_axpby_1(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(5,1) \
+ pm(6,2) \
+ pa(2,1) \
+ puq(1,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pm(5,3) \
+ pm(6,4) \
+ pa(4,3) \
+ puq(3,SS(a_,RS4),ax)
+#undef lpaxpby_1 /* prologue: f(nta,...) hints plus the loads of regs 1,2 that p2_axpby_1 consumes first */
+#define lpaxpby_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2)
+#undef dpaxpby_1 /* epilogue: p2_axpby_1 without the f(nta,...) hints and without reloading regs 1,2 */
+#define dpaxpby_1(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(5,1) \
+ pm(6,2) \
+ pa(2,1) \
+ puq(1,a_,ax) \
+ pm(5,3) \
+ pm(6,4) \
+ pa(4,3) \
+ puq(3,SS(a_,RS4),ax)
+#undef plaxpby_1 /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define plaxpby_1 8
+
+#undef p1_4_axpy_0 /* axpy_0 family: the cx operand gets pm with reg 6 and is combined (pa) into the ax operand, which is stored back to ax. This variant uses the pls/pmsr/pasr/pus forms. */
+#define p1_4_axpy_0(a_) \
+ pls(a_,cx,2) \
+ pls(a_,ax,1) \
+ pmsr(6,2) \
+ pasr(2,1) \
+ pus(1,a_,ax)
+#undef p1_2_axpy_0 /* pld/pud variant of the same step */
+#define p1_2_axpy_0(a_) \
+ pld(a_,cx,2) \
+ pld(a_,ax,1) \
+ pm(6,2) \
+ pa(2,1) \
+ pud(1,a_,ax)
+#undef p1_axpy_0 /* single pl/plq pair at a_, stored with puq */
+#define p1_axpy_0(a_) \
+ pl(a_,cx,2) \
+ plq(a_,ax,1) \
+ pm(6,2) \
+ pa(2,1) \
+ puq(1,a_,ax)
+#undef p2_axpy_0 /* pipelined step: finishes and stores regs 2,1 (loaded by lpaxpy_0 or the previous step) for a_, processes SS(a_,RS4) in regs 4,3, and reloads regs 2,1 from the MM(2,RS4) step */
+#define p2_axpy_0(a_) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,2) \
+ pa(2,1) \
+ plq(SS(a_,RS4),ax,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ puq(1,a_,ax) \
+ pm(6,4) \
+ pa(4,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ puq(3,SS(a_,RS4),ax)
+#undef lpaxpy_0 /* prologue: f(nta,...) hints plus the loads of regs 2,1 that p2_axpy_0 consumes first */
+#define lpaxpy_0(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1)
+#undef dpaxpy_0 /* epilogue: p2_axpy_0 without the f(nta,...) hints and without reloading regs 2,1 */
+#define dpaxpy_0(a_) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,2) \
+ pa(2,1) \
+ plq(SS(a_,RS4),ax,3) \
+ puq(1,a_,ax) \
+ pm(6,4) \
+ pa(4,3) \
+ puq(3,SS(a_,RS4),ax)
+#undef plaxpy_0 /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define plaxpy_0 8
+
+#undef p1_4_axpy_1 /* axpy_1 family: the cx operand gets pm with reg 6 and is combined with the ax operand, then stored to ax. The p1/p2/p4 forms use pam(offset,ax,reg) on the ax operand instead of a separate load (presumably an add with a memory source -- confirm). This variant uses pls/pmsr/pasr/pus. */
+#define p1_4_axpy_1(a_) \
+ pls(a_,cx,2) \
+ pls(a_,ax,1) \
+ pmsr(6,2) \
+ pasr(2,1) \
+ pus(1,a_,ax)
+#undef p1_2_axpy_1 /* pld/pud variant with an explicit load of the ax operand */
+#define p1_2_axpy_1(a_) \
+ pld(a_,cx,2) \
+ pld(a_,ax,1) \
+ pm(6,2) \
+ pa(2,1) \
+ pud(1,a_,ax)
+#undef p1_axpy_1 /* single step at a_: pl, pm, pam, puq all on reg 2 */
+#define p1_axpy_1(a_) \
+ pl(a_,cx,2) \
+ pm(6,2) \
+ pam(a_,ax,2) \
+ puq(2,a_,ax)
+#undef p2_axpy_1 /* self-contained two-step version: a_ in reg 2, SS(a_,RS4) in reg 4 */
+#define p2_axpy_1(a_) \
+ pl(a_,cx,2) \
+ pm(6,2) \
+ pl(SS(a_,RS4),cx,4) \
+ pam(a_,ax,2) \
+ pm(6,4) \
+ puq(2,a_,ax) \
+ pam(SS(a_,RS4),ax,4) \
+ puq(4,SS(a_,RS4),ax)
+#undef p4_axpy_1 /* pipelined step: stores regs 0..3 for the MM(0..3,RS4) steps while loading the cx operands of the MM(3..6,RS4) steps; regs 0,1,2 must hold in-flight data from lpaxpy_1 or the previous p4 step */
+#define p4_axpy_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,2) \
+ pam(SS(a_,MM(2,RS4)),ax,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ pl(SS(a_,MM(4,RS4)),cx,0) \
+ pm(6,3) \
+ pam(SS(a_,MM(3,RS4)),ax,3) \
+ puq(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(5,RS4)),cx,1) \
+ pm(6,0) \
+ pam(SS(a_,MM(4,RS4)),ax,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ pl(SS(a_,MM(6,RS4)),cx,2) \
+ pm(6,1) \
+ pam(SS(a_,MM(5,RS4)),ax,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef lpaxpy_1 /* prologue: finishes regs 0 and 1 for a_ and SS(a_,RS4) and loads the MM(2,RS4) cx operand into reg 2; issues f(nta,...) hints at both SS(0,CL) and SS(2,CL) */
+#define lpaxpy_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(a_,cx,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ pl(SS(a_,RS4),cx,1) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pm(6,0) \
+ pam(a_,ax,0) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ pm(6,1) \
+ pam(SS(a_,RS4),ax,1)
+#undef dpaxpy_1 /* epilogue: loads only the MM(3,RS4) cx operand, finishes regs 2 and 3, and stores regs 0..3; no f(nta,...) hints */
+#define dpaxpy_1(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,2) \
+ pam(SS(a_,MM(2,RS4)),ax,2) \
+ puq(0,a_,ax) \
+ pm(6,3) \
+ pam(SS(a_,MM(3,RS4)),ax,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_1 /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plaxpy_1 16
+
+#undef p1_4_axpy_2 /* axpy_2 family: the cx operand gets pm with reg 6 and is combined (pa) into the ax operand, which is stored back to ax. This variant uses the pls/pmsr/pasr/pus forms. */
+#define p1_4_axpy_2(a_) \
+ pls(a_,cx,5) \
+ pls(a_,ax,0) \
+ pmsr(6,5) \
+ pasr(5,0) \
+ pus(0,a_,ax)
+#undef p1_2_axpy_2 /* pld/pud variant of the same step */
+#define p1_2_axpy_2(a_) \
+ pld(a_,cx,5) \
+ pld(a_,ax,0) \
+ pm(6,5) \
+ pa(5,0) \
+ pud(0,a_,ax)
+#undef p1_axpy_2 /* single pl/plq pair at a_, stored with puq */
+#define p1_axpy_2(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pm(6,5) \
+ pa(5,0) \
+ puq(0,a_,ax)
+#undef p2_axpy_2 /* self-contained two-step version: a_ in regs 5,0 and SS(a_,RS4) in regs 4,1 */
+#define p2_axpy_2(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,5) \
+ pa(5,0) \
+ plq(SS(a_,RS4),ax,1) \
+ puq(0,a_,ax) \
+ pm(6,4) \
+ pa(4,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_axpy_2 /* pipelined step: stores regs 0..3 for the MM(0..3,RS4) steps while loading the MM(3..6,RS4) steps; regs 0,1,2,4,5 must hold in-flight data from lpaxpy_2 or the previous p4 step */
+#define p4_axpy_2(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ pm(6,4) \
+ pa(4,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,5) \
+ pa(5,3) \
+ puq(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pm(6,4) \
+ pa(4,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,5) \
+ pa(5,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpy_2 /* prologue: loads the MM(0..2,RS4) steps and finishes regs 0 and 1, leaving the stores to p4_axpy_2 or dpaxpy_2 */
+#define lpaxpy_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,5) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,4) \
+ pa(4,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(6,5) \
+ pa(5,1)
+#undef dpaxpy_2 /* epilogue: loads only the MM(3,RS4) step, finishes regs 2 and 3, and stores regs 0..3; no f(nta,...) hints */
+#define dpaxpy_2(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,4) \
+ pa(4,2) \
+ puq(0,a_,ax) \
+ pm(6,5) \
+ pa(5,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_2 /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plaxpy_2 16
+
+#undef p1_4_axpy_2c /* axpy_2c family: the cx operand is duplicated with pc; the original gets pm with reg 6, the CSHUF-shuffled copy gets pm with reg 7, and both are combined (pa) into the ax operand, which is stored back to ax. This variant expands to nothing. */
+#define p1_4_axpy_2c(a_)
+#undef p1_2_axpy_2c /* pld/pud variant; reg 2 is the scratch copy of the cx operand */
+#define p1_2_axpy_2c(a_) \
+ pld(a_,cx,4) \
+ pld(a_,ax,0) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ pud(0,a_,ax)
+#undef p1_axpy_2c /* single pl/plq pair at a_, stored with puq; reg 2 is the scratch copy */
+#define p1_axpy_2c(a_) \
+ pl(a_,cx,4) \
+ plq(a_,ax,0) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ puq(0,a_,ax)
+#undef p2_axpy_2c /* self-contained two-step version: a_ uses regs 4,0 with scratch 2; SS(a_,RS4) uses regs 5,1 with scratch 3 */
+#define p2_axpy_2c(a_) \
+ pl(a_,cx,4) \
+ plq(a_,ax,0) \
+ pl(SS(a_,RS4),cx,5) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ plq(SS(a_,RS4),ax,1) \
+ puq(0,a_,ax) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_axpy_2c /* pipelined step: stores finished regs 0..3 for the MM(0..3,RS4) steps while loading the MM(3..7,RS4) steps; each just-stored register is reused as the scratch pc copy for the next element; relies on in-flight data from lpaxpy_2c or the previous p4 step */
+#define p4_axpy_2c(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ puq(0,a_,ax) \
+ pc(4,0) \
+ pm(6,4) \
+ pa(4,2) \
+ ps(CSHUF,0,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pm(7,0) \
+ pa(0,2) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ puq(1,SS(a_,RS4),ax) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,3) \
+ ps(CSHUF,1,1) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pm(7,1) \
+ pa(1,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ pm(7,2) \
+ pa(2,0) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pl(SS(a_,MM(7,RS4)),cx,5) \
+ pm(7,3) \
+ pa(3,1) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpy_2c /* prologue: finishes the MM(0,RS4) and MM(1,RS4) steps in regs 0 and 1 and preloads the MM(2,RS4) ax operand (reg 2) and the MM(2,RS4)/MM(3,RS4) cx operands (regs 4,5); stores are left to p4_axpy_2c or dpaxpy_2c */
+#define lpaxpy_2c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,5) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(7,2) \
+ pa(2,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ pm(7,3) \
+ pa(3,1)
+#undef dpaxpy_2c /* epilogue: loads only the MM(3,RS4) ax operand, finishes regs 2 and 3, and stores regs 0..3; no f(nta,...) hints */
+#define dpaxpy_2c(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ puq(0,a_,ax) \
+ pc(4,0) \
+ pm(6,4) \
+ pa(4,2) \
+ ps(CSHUF,0,0) \
+ puq(1,SS(a_,RS4),ax) \
+ pm(7,0) \
+ pa(0,2) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,3) \
+ ps(CSHUF,1,1) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pm(7,1) \
+ pa(1,3) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_2c /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plaxpy_2c 16
+
+#undef p1_4_axpy_1c /* axpy_1c family: the cx operand is copied to reg 0 with pc and shuffled with CSHUF; the original gets pm with reg 6, the shuffled copy gets pm with reg 7, and both are combined (pa) into the ax operand, which is stored back to ax. This variant expands to nothing. */
+#define p1_4_axpy_1c(a_)
+#undef p1_2_axpy_1c /* pld/pud variant */
+#define p1_2_axpy_1c(a_) \
+ pld(a_,cx,2) \
+ pc(2,0) \
+ pld(a_,ax,1) \
+ ps(CSHUF,0,0) \
+ pm(6,2) \
+ pa(2,1) \
+ pm(7,0) \
+ pa(0,1) \
+ pud(1,a_,ax)
+#undef p1_axpy_1c /* single pl/plq pair at a_, stored with puq */
+#define p1_axpy_1c(a_) \
+ pl(a_,cx,2) \
+ pc(2,0) \
+ plq(a_,ax,1) \
+ ps(CSHUF,0,0) \
+ pm(6,2) \
+ pa(2,1) \
+ pm(7,0) \
+ pa(0,1) \
+ puq(1,a_,ax)
+#undef p2_axpy_1c /* pipelined step: expects regs 2,1 loaded and reg 0 already holding the pc copy (from lpaxpy_1c or the previous step); finishes a_, processes SS(a_,RS4) in regs 4,3, and reloads regs 2,1,0 for the MM(2,RS4) step */
+#define p2_axpy_1c(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ ps(CSHUF,0,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,2) \
+ pa(2,1) \
+ pm(7,0) \
+ pa(0,1) \
+ pc(4,0) \
+ puq(1,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ ps(CSHUF,0,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pm(6,4) \
+ pa(4,3) \
+ pm(7,0) \
+ pa(0,3) \
+ pc(2,0) \
+ puq(3,SS(a_,RS4),ax)
+#undef lpaxpy_1c /* prologue: f(nta,...) hints, loads of regs 2,1, and the pc(2,0) copy that p2_axpy_1c shuffles first */
+#define lpaxpy_1c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) \
+ pc(2,0)
+#undef dpaxpy_1c /* epilogue: p2_axpy_1c without the f(nta,...) hints, the reloads of regs 2,1 and the trailing pc(2,0) */
+#define dpaxpy_1c(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ ps(CSHUF,0,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,2) \
+ pa(2,1) \
+ pm(7,0) \
+ pa(0,1) \
+ pc(4,0) \
+ puq(1,a_,ax) \
+ ps(CSHUF,0,0) \
+ pm(6,4) \
+ pa(4,3) \
+ pm(7,0) \
+ pa(0,3) \
+ puq(3,SS(a_,RS4),ax)
+#undef plaxpy_1c /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define plaxpy_1c 8
+
+#undef p1_4_copy_1 /* copy_1 family: loads from cx and stores the same register to ax at the same offset, with no arithmetic. This variant uses pls/pus. */
+#define p1_4_copy_1(a_) \
+ pls(a_,cx,2) \
+ pus(2,a_,ax)
+#undef p1_2_copy_1 /* pld/pud variant */
+#define p1_2_copy_1(a_) \
+ pld(a_,cx,2) \
+ pud(2,a_,ax)
+#undef p1_copy_1 /* pl load, puq store */
+#define p1_copy_1(a_) \
+ pl(a_,cx,2) \
+ puq(2,a_,ax)
+#undef p2_copy_1 /* pipelined step: stores reg 2 (loaded by lpcopy_1 or the previous step) at a_, moves SS(a_,RS4) through reg 4, and reloads reg 2 from the MM(2,RS4) step */
+#define p2_copy_1(a_) \
+ pl(SS(a_,RS4),cx,4) \
+ puq(2,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ puq(4,SS(a_,RS4),ax)
+#undef lpcopy_1 /* prologue: f(nta,...) hints plus the load of reg 2 that p2_copy_1 stores first */
+#define lpcopy_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2)
+#undef dpcopy_1 /* epilogue: p2_copy_1 without the f(nta,...) hints and without reloading reg 2 */
+#define dpcopy_1(a_) \
+ pl(SS(a_,RS4),cx,4) \
+ puq(2,a_,ax) \
+ puq(4,SS(a_,RS4),ax)
+#undef plcopy_1 /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define plcopy_1 8
+
+#undef p1_4_copy_2 /* copy_2 family: the reverse direction of copy_1 -- loads from ax and stores the same register to cx at the same offset. This variant uses pls/pus. */
+#define p1_4_copy_2(a_) \
+ pls(a_,ax,2) \
+ pus(2,a_,cx)
+#undef p1_2_copy_2 /* pld/pud variant */
+#define p1_2_copy_2(a_) \
+ pld(a_,ax,2) \
+ pud(2,a_,cx)
+#undef p1_copy_2 /* plq load, pu store */
+#define p1_copy_2(a_) \
+ plq(a_,ax,2) \
+ pu(2,a_,cx)
+#undef p2_copy_2 /* pipelined step: stores reg 2 (loaded by lpcopy_2 or the previous step) at a_, moves SS(a_,RS4) through reg 4, and reloads reg 2 from the MM(2,RS4) step */
+#define p2_copy_2(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pu(2,a_,cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pu(4,SS(a_,RS4),cx)
+#undef lpcopy_2 /* prologue: f(nta,...) hints plus the load of reg 2 that p2_copy_2 stores first */
+#define lpcopy_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,2)
+#undef dpcopy_2 /* epilogue: p2_copy_2 without the f(nta,...) hints and without reloading reg 2 */
+#define dpcopy_2(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pu(2,a_,cx) \
+ pu(4,SS(a_,RS4),cx)
+#undef plcopy_2 /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define plcopy_2 8
+
+#undef p1_4_copy_3 /* copy_3 family: loads from cx and stores to ax; the p1 and larger forms store with punt and are not software-pipelined (every step loads what it stores). This variant uses pls/pus. */
+#define p1_4_copy_3(a_) \
+ pls(a_,cx,2) \
+ pus(2,a_,ax)
+#undef p1_2_copy_3 /* pld/pud variant */
+#define p1_2_copy_3(a_) \
+ pld(a_,cx,2) \
+ pud(2,a_,ax)
+#undef p1_copy_3 /* single pl load, punt store */
+#define p1_copy_3(a_) \
+ pl(a_,cx,2) \
+ punt(2,a_,ax)
+#undef p2_copy_3 /* two loads (regs 0,1) followed by two punt stores */
+#define p2_copy_3(a_) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ punt(0,SS(a_,MM(0,RS4)),ax) \
+ punt(1,SS(a_,MM(1,RS4)),ax)
+#undef p4_copy_3 /* four loads (regs 0..3) followed by four punt stores */
+#define p4_copy_3(a_) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ punt(0,SS(a_,MM(0,RS4)),ax) \
+ punt(1,SS(a_,MM(1,RS4)),ax) \
+ punt(2,SS(a_,MM(2,RS4)),ax) \
+ punt(3,SS(a_,MM(3,RS4)),ax)
+#undef p8_copy_3 /* one f(nta,...) hint on cx, eight loads (regs 0..7), then eight punt stores */
+#define p8_copy_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pl(SS(a_,MM(6,RS4)),cx,6) \
+ pl(SS(a_,MM(7,RS4)),cx,7) \
+ punt(0,SS(a_,MM(0,RS4)),ax) \
+ punt(1,SS(a_,MM(1,RS4)),ax) \
+ punt(2,SS(a_,MM(2,RS4)),ax) \
+ punt(3,SS(a_,MM(3,RS4)),ax) \
+ punt(4,SS(a_,MM(4,RS4)),ax) \
+ punt(5,SS(a_,MM(5,RS4)),ax) \
+ punt(6,SS(a_,MM(6,RS4)),ax) \
+ punt(7,SS(a_,MM(7,RS4)),ax)
+#undef lpcopy_3 /* prologue expands to nothing: no registers are primed for this family */
+#define lpcopy_3(a_)
+#undef dpcopy_3 /* epilogue is simply one more p8_copy_3 step */
+#define dpcopy_3(a_) p8_copy_3(a_)
+#undef plcopy_3 /* presumably the pipeline length for the lp/p8/dp sequence -- TODO confirm units */
+#define plcopy_3 32
+
+#undef p1_4_cpsc_3 /* cpsc_3 family: loads from ax, applies pm with reg 6, and stores the result to cx. This variant uses pls/pmsr/pus. */
+#define p1_4_cpsc_3(a_) \
+ pls(a_,ax,0) \
+ pmsr(6,0) \
+ pus(0,a_,cx)
+#undef p1_2_cpsc_3 /* pld/pud variant */
+#define p1_2_cpsc_3(a_) \
+ pld(a_,ax,0) \
+ pm(6,0) \
+ pud(0,a_,cx)
+#undef p1_cpsc_3 /* single plq load, pu store */
+#define p1_cpsc_3(a_) \
+ plq(a_,ax,0) \
+ pm(6,0) \
+ pu(0,a_,cx)
+#undef p2_cpsc_3 /* self-contained two-step version in regs 0 and 1 */
+#define p2_cpsc_3(a_) \
+ plq(a_,ax,0) \
+ plq(SS(a_,RS4),ax,1) \
+ pm(6,0) \
+ pm(6,1) \
+ pu(0,a_,cx) \
+ pu(1,SS(a_,RS4),cx)
+#undef p4_cpsc_3 /* pipelined step: stores regs 0..3 for the MM(0..3,RS4) steps while loading the MM(3..6,RS4) steps; regs 0,1,2 must hold in-flight data from lpcpsc_3 or the previous p4 step */
+#define p4_cpsc_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,2) \
+ pu(0,a_,cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,3) \
+ pu(1,SS(a_,RS4),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pm(6,0) \
+ pu(2,SS(a_,MM(2,RS4)),cx) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,1) \
+ pu(3,SS(a_,MM(3,RS4)),cx) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx)
+#undef lpcpsc_3 /* prologue: loads regs 0,1,2 and applies pm to regs 0 and 1, leaving the stores to p4_cpsc_3 or dpcpsc_3 */
+#define lpcpsc_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(6,1)
+#undef dpcpsc_3 /* epilogue: loads only the MM(3,RS4) step, finishes regs 2 and 3, and stores regs 0..3; no f(nta,...) hints */
+#define dpcpsc_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,2) \
+ pu(0,a_,cx) \
+ pm(6,3) \
+ pu(1,SS(a_,RS4),cx) \
+ pu(2,SS(a_,MM(2,RS4)),cx) \
+ pu(3,SS(a_,MM(3,RS4)),cx)
+#undef plcpsc_3 /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plcpsc_3 16
+
+#undef p1_4_cpsc_3c /* cpsc_3c family: loads from ax, duplicates the value with pc, applies pm with reg 6 to the original and pm with reg 7 to the CSHUF-shuffled copy, combines them with pa and stores to cx. This variant expands to nothing. */
+#define p1_4_cpsc_3c(a_)
+#undef p1_2_cpsc_3c /* pld/pud variant; reg 1 is the scratch copy */
+#define p1_2_cpsc_3c(a_) \
+ pld(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pud(0,a_,cx)
+#undef p1_cpsc_3c /* single plq load, pu store; reg 1 is the scratch copy */
+#define p1_cpsc_3c(a_) \
+ plq(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pu(0,a_,cx)
+#undef p2_cpsc_3c /* self-contained two-step version: regs 0,1 hold the data, regs 2,3 the scratch copies */
+#define p2_cpsc_3c(a_) \
+ plq(a_,ax,0) \
+ plq(SS(a_,RS4),ax,1) \
+ pc(0,2) \
+ pm(6,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ pu(0,a_,cx) \
+ pc(1,3) \
+ pm(6,1) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,1) \
+ pu(1,SS(a_,RS4),cx)
+#undef p4_cpsc_3c /* pipelined step: stores regs 0..3 for the MM(0..3,RS4) steps while loading the MM(4..7,RS4) steps into the same registers; reg 4 is the shared scratch copy; regs 0..3 must hold in-flight data from lpcpsc_3c or the previous p4 step */
+#define p4_cpsc_3c(a_) \
+ pu(0,a_,cx) \
+ pc(2,4) \
+ pm(6,2) \
+ ps(CSHUF,4,4) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(7,4) \
+ pa(4,2) \
+ pu(1,SS(a_,RS4),cx) \
+ pc(3,4) \
+ pm(6,3) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pm(7,4) \
+ pa(4,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pu(2,SS(a_,MM(2,RS4)),cx) \
+ pc(0,4) \
+ pm(6,0) \
+ ps(CSHUF,4,4) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(7,4) \
+ pa(4,0) \
+ pu(3,SS(a_,MM(3,RS4)),cx) \
+ pc(1,4) \
+ pm(6,1) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(7,RS4)),ax,3) \
+ pm(7,4) \
+ pa(4,1) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx)
+#undef lpcpsc_3c /* prologue: loads regs 0..3 and finishes regs 0 and 1, leaving regs 2,3 and all stores to p4_cpsc_3c or dpcpsc_3c */
+#define lpcpsc_3c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(0,4) \
+ pm(6,0) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(7,4) \
+ pa(4,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pc(1,4) \
+ pm(6,1) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(7,4) \
+ pa(4,1)
+#undef dpcpsc_3c /* epilogue: finishes regs 2 and 3 and stores regs 0..3; performs no loads and issues no f(nta,...) hints */
+#define dpcpsc_3c(a_) \
+ pu(0,a_,cx) \
+ pc(2,4) \
+ pm(6,2) \
+ ps(CSHUF,4,4) \
+ pu(1,SS(a_,RS4),cx) \
+ pm(7,4) \
+ pa(4,2) \
+ pc(3,4) \
+ pm(6,3) \
+ ps(CSHUF,4,4) \
+ pu(2,SS(a_,MM(2,RS4)),cx) \
+ pm(7,4) \
+ pa(4,3) \
+ pu(3,SS(a_,MM(3,RS4)),cx)
+#undef plcpsc_3c /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plcpsc_3c 16
+
+#undef p1_4_cpsc_4 /* cpsc_4 family: loads from cx, applies pm with reg 6, and stores the result to ax (the opposite direction to cpsc_3). This variant uses pls/pmsr/pus. */
+#define p1_4_cpsc_4(a_) \
+ pls(a_,cx,0) \
+ pmsr(6,0) \
+ pus(0,a_,ax)
+#undef p1_2_cpsc_4 /* pld/pud variant */
+#define p1_2_cpsc_4(a_) \
+ pld(a_,cx,0) \
+ pm(6,0) \
+ pud(0,a_,ax)
+#undef p1_cpsc_4 /* single pl load, puq store */
+#define p1_cpsc_4(a_) \
+ pl(a_,cx,0) \
+ pm(6,0) \
+ puq(0,a_,ax)
+#undef p2_cpsc_4 /* self-contained two-step version in regs 0 and 1 */
+#define p2_cpsc_4(a_) \
+ pl(a_,cx,0) \
+ pl(SS(a_,RS4),cx,1) \
+ pm(6,0) \
+ pm(6,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_cpsc_4 /* pipelined step: stores regs 0..3 for the MM(0..3,RS4) steps while loading the MM(3..6,RS4) steps; only the cx side gets f(nta,...) hints; regs 0,1,2 must hold in-flight data from lpcpsc_4 or the previous p4 step */
+#define p4_cpsc_4(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,0) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ pl(SS(a_,MM(5,RS4)),cx,1) \
+ pm(6,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,2) \
+ pm(6,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef lpcpsc_4 /* prologue: loads regs 0,1,2 and applies pm to regs 0 and 1, leaving the stores to p4_cpsc_4 or dpcpsc_4 */
+#define lpcpsc_4(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pm(6,0) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pm(6,1)
+#undef dpcpsc_4 /* epilogue: loads only the MM(3,RS4) step, finishes regs 2 and 3, and stores regs 0..3; no f(nta,...) hints */
+#define dpcpsc_4(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plcpsc_4 /* presumably the pipeline length for the lp/p4/dp sequence -- TODO confirm units */
+#define plcpsc_4 16
+
+#undef p1_4_cpsc_5 /* cpsc_5 family: loads from cx, applies pm with reg 6, and stores to ax; unlike cpsc_4 the larger steps are not software-pipelined (each step loads what it stores). This variant uses pls/pmsr/pus. */
+#define p1_4_cpsc_5(a_) \
+ pls(a_,cx,0) \
+ pmsr(6,0) \
+ pus(0,a_,ax)
+#undef p1_2_cpsc_5 /* pld/pud variant */
+#define p1_2_cpsc_5(a_) \
+ pld(a_,cx,0) \
+ pm(6,0) \
+ pud(0,a_,ax)
+#undef p1_cpsc_5 /* single pl load, puq store */
+#define p1_cpsc_5(a_) \
+ pl(a_,cx,0) \
+ pm(6,0) \
+ puq(0,a_,ax)
+#undef p2_cpsc_5 /* two-step version in regs 0 and 1 */
+#define p2_cpsc_5(a_) \
+ pl(a_,cx,0) \
+ pl(SS(a_,RS4),cx,1) \
+ pm(6,0) \
+ pm(6,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_cpsc_5 /* four-step version in regs 0..3: all loads, then all pm, then all stores */
+#define p4_cpsc_5(a_) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,0) \
+ pm(6,1) \
+ pm(6,2) \
+ pm(6,3) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef p8_cpsc_5 /* eight-step version: reg 6 holds the pm source, so data uses regs 0..5 and 7; reg 0 is stored early and reused for the MM(7,RS4) step */
+#define p8_cpsc_5(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pl(SS(a_,MM(6,RS4)),cx,7) \
+ pm(6,0) \
+ pm(6,1) \
+ pm(6,2) \
+ pm(6,3) \
+ puq(0,a_,ax) \
+ pl(SS(a_,MM(7,RS4)),cx,0) \
+ pm(6,4) \
+ pm(6,5) \
+ pm(6,7) \
+ pm(6,0) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ puq(4,SS(a_,MM(4,RS4)),ax) \
+ puq(5,SS(a_,MM(5,RS4)),ax) \
+ puq(7,SS(a_,MM(6,RS4)),ax) \
+ puq(0,SS(a_,MM(7,RS4)),ax)
+#undef lpcpsc_5 /* prologue expands to nothing: no registers are primed for this family */
+#define lpcpsc_5(a_)
+#undef dpcpsc_5 /* epilogue is simply one more p8_cpsc_5 step */
+#define dpcpsc_5(a_) p8_cpsc_5(a_)
+#undef plcpsc_5 /* presumably the pipeline length for the lp/p8/dp sequence -- TODO confirm units */
+#define plcpsc_5 32
+
+#undef cpsc_cdp /* helper for the cpsc_5c family: copies reg a_ to scratch reg 5, applies pm with reg 6 to a_ and pm with reg 7 to the CSHUF-shuffled copy, then combines the copy into a_ with pa; here a_ is a register number, not an offset */
+#define cpsc_cdp(a_) pc(a_,5) pm(6,a_) ps(CSHUF,5,5) pm(7,5) pa(5,a_)
+#undef p1_4_cpsc_5c /* cpsc_5c family: loads from cx, runs cpsc_cdp on the register, stores to ax; not software-pipelined. This variant expands to nothing. */
+#define p1_4_cpsc_5c(a_)
+#undef p1_2_cpsc_5c /* pld/pud variant */
+#define p1_2_cpsc_5c(a_) \
+ pld(a_,cx,0) \
+ cpsc_cdp(0) \
+ pud(0,a_,ax)
+#undef p1_cpsc_5c /* single pl load, puq store */
+#define p1_cpsc_5c(a_) \
+ pl(a_,cx,0) \
+ cpsc_cdp(0) \
+ puq(0,a_,ax)
+#undef p2_cpsc_5c /* two-step version in regs 0 and 1 */
+#define p2_cpsc_5c(a_) \
+ pl(a_,cx,0) \
+ pl(SS(a_,RS4),cx,1) \
+ cpsc_cdp(0) \
+ cpsc_cdp(1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_cpsc_5c /* four-step version in regs 0..3 */
+#define p4_cpsc_5c(a_) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ cpsc_cdp(0) \
+ cpsc_cdp(1) \
+ cpsc_cdp(2) \
+ cpsc_cdp(3) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef p8_cpsc_5c /* eight-step version: regs 5,6,7 are taken by cpsc_cdp, so data uses regs 0..4 and regs 0,1,2 are stored early and reused for the MM(5..7,RS4) steps */
+#define p8_cpsc_5c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ cpsc_cdp(0) \
+ cpsc_cdp(1) \
+ puq(0,a_,ax) \
+ pl(SS(a_,MM(5,RS4)),cx,0) \
+ cpsc_cdp(2) \
+ cpsc_cdp(3) \
+ puq(1,SS(a_,RS4),ax) \
+ pl(SS(a_,MM(6,RS4)),cx,1) \
+ cpsc_cdp(4) \
+ cpsc_cdp(0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pl(SS(a_,MM(7,RS4)),cx,2) \
+ cpsc_cdp(1) \
+ cpsc_cdp(2) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ puq(4,SS(a_,MM(4,RS4)),ax) \
+ puq(0,SS(a_,MM(5,RS4)),ax) \
+ puq(1,SS(a_,MM(6,RS4)),ax) \
+ puq(2,SS(a_,MM(7,RS4)),ax)
+#undef lpcpsc_5c /* prologue expands to nothing: no registers are primed for this family */
+#define lpcpsc_5c(a_)
+#undef dpcpsc_5c /* epilogue is simply one more p8_cpsc_5c step */
+#define dpcpsc_5c(a_) p8_cpsc_5c(a_)
+#undef plcpsc_5c /* presumably the pipeline length for the lp/p8/dp sequence -- TODO confirm units */
+#define plcpsc_5c 32
+
+#undef p1_4_cpsc_1 /* cpsc_1 family: loads from ax, applies pm with reg 3 (not reg 6 as in cpsc_3/4/5), and stores to cx. This variant uses pls/pmsr/pus. */
+#define p1_4_cpsc_1(a_) \
+ pls(a_,ax,2) \
+ pmsr(3,2) \
+ pus(2,a_,cx)
+#undef p1_2_cpsc_1 /* pld/pud variant */
+#define p1_2_cpsc_1(a_) \
+ pld(a_,ax,2) \
+ pm(3,2) \
+ pud(2,a_,cx)
+#undef p1_cpsc_1 /* single plq load, pu store */
+#define p1_cpsc_1(a_) \
+ plq(a_,ax,2) \
+ pm(3,2) \
+ pu(2,a_,cx)
+#undef p2_cpsc_1 /* pipelined step: finishes and stores reg 2 (loaded by lpcpsc_1 or the previous step) at a_, processes SS(a_,RS4) in reg 4, and reloads reg 2 from the MM(2,RS4) step */
+#define p2_cpsc_1(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pm(3,2) \
+ pu(2,a_,cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(3,4) \
+ pu(4,SS(a_,RS4),cx)
+#undef lpcpsc_1 /* prologue: f(nta,...) hints plus the load of reg 2 that p2_cpsc_1 consumes first */
+#define lpcpsc_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,2)
+#undef dpcpsc_1 /* epilogue: p2_cpsc_1 without the f(nta,...) hints and without reloading reg 2 */
+#define dpcpsc_1(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pm(3,2) \
+ pu(2,a_,cx) \
+ pm(3,4) \
+ pu(4,SS(a_,RS4),cx)
+#undef plcpsc_1 /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define plcpsc_1 8
+
+#undef p1_4_cpsc_2 /* cpsc_2 family: loads from ax, applies pm with reg 3, and stores to cx; same operation as cpsc_1 but with a four-step pipelined body. This variant uses pls/pmsr/pus. */
+#define p1_4_cpsc_2(a_) \
+ pls(a_,ax,2) \
+ pmsr(3,2) \
+ pus(2,a_,cx)
+#undef p1_2_cpsc_2 /* pld/pud variant */
+#define p1_2_cpsc_2(a_) \
+ pld(a_,ax,2) \
+ pm(3,2) \
+ pud(2,a_,cx)
+#undef p1_cpsc_2 /* single plq load, pu store */
+#define p1_cpsc_2(a_) \
+ plq(a_,ax,2) \
+ pm(3,2) \
+ pu(2,a_,cx)
+#undef p2_cpsc_2 /* self-contained two-step version in regs 2 and 4 */
+#define p2_cpsc_2(a_) \
+ plq(a_,ax,2) \
+ plq(SS(a_,RS4),ax,4) \
+ pm(3,2) \
+ pm(3,4) \
+ pu(2,a_,cx) \
+ pu(4,SS(a_,RS4),cx)
+#undef p4_cpsc_2 /* pipelined step: data rotates through regs 4,6,7,2; stores the MM(0..3,RS4) steps while loading the MM(2..5,RS4) steps; regs 4 and 6 must hold in-flight data from lpcpsc_2 or the previous p4 step */
+#define p4_cpsc_2(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,7) \
+ pm(3,6) \
+ pu(4,a_,cx) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(3,7) \
+ pu(6,SS(a_,RS4),cx) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,4) \
+ pm(3,2) \
+ pu(7,SS(a_,MM(2,RS4)),cx) \
+ plq(SS(a_,MM(5,RS4)),ax,6) \
+ pm(3,4) \
+ pu(2,SS(a_,MM(3,RS4)),cx)
+#undef lpcpsc_2 /* prologue: loads regs 4 and 6 and applies pm to reg 4, leaving the stores to p4_cpsc_2 or dpcpsc_2 */
+#define lpcpsc_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,4) \
+ plq(SS(a_,MM(1,RS4)),ax,6) \
+ pm(3,4)
+#undef dpcpsc_2 /* epilogue: first half of p4_cpsc_2 (still including its f(nta,...) hints at SS(2,CL)) followed by the remaining pm and stores, with no loads beyond the MM(3,RS4) step */
+#define dpcpsc_2(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,7) \
+ pm(3,6) \
+ pu(4,a_,cx) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(3,7) \
+ pu(6,SS(a_,RS4),cx) \
+ pm(3,2) \
+ pu(7,SS(a_,MM(2,RS4)),cx) \
+ pu(2,SS(a_,MM(3,RS4)),cx)
+#undef plcpsc_2 /* presumably the pipeline length for the lp/p4/dp sequence; given as RS4 here where sibling families with a p4 body use the literal 16 -- TODO confirm the two agree */
+#define plcpsc_2 RS4
+
+
+#undef p1_4_iamax_1 /* iamax_1 family. Visible per-step sequence: load from ax into reg 4; pan(2,4); pc(3,5); pcm(6,4,5); pax/paxs(4,3); pan(5,6); pann(0,5); pa(5,6); then reg 1 is added into reg 0. NOTE(review): presumably reg 2 is an abs mask, reg 3 the running max, reg 6 the index of the max and reg 0 the running index stepped by reg 1 -- confirm against the mnemonic definitions. This variant uses pls/paxs/pasr and ends with ps(57,0,0). */
+#define p1_4_iamax_1(a_) \
+ px(4) \
+ pls(a_,ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ paxs(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pasr(5,6) \
+ pasr(1,0) \
+ ps(57,0,0)
+#undef p1_2_iamax_1 /* pld variant; the pasr(1,0)/ps(57,0,0) pair is issued twice here versus once in p1_4_iamax_1 (presumably once per element -- confirm) */
+#define p1_2_iamax_1(a_) \
+ px(4) \
+ pld(a_,ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pasr(1,0) \
+ ps(57,0,0)\
+ pasr(1,0) \
+ ps(57,0,0)
+#undef p1_iamax_1 /* single plq load at a_; reg 0 is stepped with a plain pa(1,0) and no ps */
+#define p1_iamax_1(a_) \
+ plq(a_,ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0)
+#define p2_iamax_1(a_) /* NOTE(review): unlike every sibling, this #define has no preceding #undef p2_iamax_1. Body: the p1 sequence applied to SS(a_,RS4) and then, after an f(nta,...) hint, to the MM(2,RS4) step. */ \
+ plq(SS(a_,RS4),ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0) \
+ f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0)
+#undef lpiamax_1 /* prologue: an f(nta,...) hint at MM(CL,RS4) followed by one complete p1-style step at a_ */
+#define lpiamax_1(a_) \
+ f(nta,SS(a_,MM(CL,RS4)),ax) \
+ plq(a_,ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0)
+#undef dpiamax_1 /* epilogue: one complete p1-style step at SS(a_,RS4), no f(nta,...) hint */
+#define dpiamax_1(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0)
+#undef pliamax_1 /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define pliamax_1 8
+
+#undef p1_4_iamax_1d /* iamax_1d family: the same pan/pc/pcm/pax/pan/pann/pa sequence as iamax_1, with a dbg(reg) call after each operation (dbg is defined elsewhere; presumably a debug hook -- confirm). This variant expands to nothing. */
+#define p1_4_iamax_1d(a_)
+#undef p1_2_iamax_1d /* pld variant; steps reg 0 with pasr(1,0) once and ends with ps(1,0,0) rather than the ps(57,0,0) used in iamax_1 */
+#define p1_2_iamax_1d(a_) \
+ px(4) \
+ pld(a_,ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pasr(1,0) \
+ dbg(0) \
+ ps(1,0,0)
+#undef p1_iamax_1d /* single plq load at a_; reg 0 is stepped with a plain pa(1,0) */
+#define p1_iamax_1d(a_) \
+ plq(a_,ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0)
+#define p2_iamax_1d(a_) /* NOTE(review): unlike every sibling, this #define has no preceding #undef p2_iamax_1d. Body: the p1 sequence applied to SS(a_,RS4) and then, after an f(nta,...) hint, to the MM(2,RS4) step. */ \
+ plq(SS(a_,RS4),ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0) \
+ dbg(0) \
+ f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0)
+#undef lpiamax_1d /* prologue: an f(nta,...) hint at MM(CL,RS4) followed by one complete p1-style step at a_ */
+#define lpiamax_1d(a_) \
+ f(nta,SS(a_,MM(CL,RS4)),ax) \
+ plq(a_,ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0)
+#undef dpiamax_1d /* epilogue: one complete p1-style step at SS(a_,RS4), no f(nta,...) hint */
+#define dpiamax_1d(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0)
+#undef pliamax_1d /* presumably the pipeline length for the lp/p2/dp sequence -- TODO confirm units */
+#define pliamax_1d 8
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h
new file mode 100644
index 0000000..03486cf
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h
@@ -0,0 +1,331 @@
+/***************************************
+ $Header: /cvsroot/math-atlas/AtlasBase/kernel/CammMaguire/camm_tpipe.h,v 1.2 2003/10/18 18:13:30 yycamm Exp $
+
+
+***************************************/
+
+
+/* #ifndef CAMM_TPIPE_H */
+/* #define CAMM_TPIPE_H */ /*+ To stop multiple inclusions. +*/
+
+#ifndef BITS /* log2 of the main-loop block: KB is set to (1 << BITS) there */
+#error BITS must be defined in camm_tpipe.h
+#endif
+#ifndef DIV /* gates the two smallest tail steps: 2 units unless DIV == 4, 1 unit only if DIV == 1 */
+#error DIV must be defined in camm_tpipe.h
+#endif
+#ifndef INC /* INC(n) is emitted after each KB_block; n equals 4*KB at every call site in this file */
+#error INC(a_) must be defined in camm_tpipe.h
+#endif
+#ifndef LR /* register-name suffix (spliced after %%e) of the register holding the remaining unit count */
+#error LR must be defined in camm_tpipe.h
+#endif
+
+#ifdef ALIGN /* optional prologue: peel 1 and/or 2 units depending on the low bits of %eax */
+
+#if defined(SREAL)
+
+ test(4,ax) /* testl $4,%eax: is bit 2 of eax set? */
+ je(a2) /* bit clear: skip the 1-unit peel. NOTE(review): no check here that LR is non-zero, unlike the 2-unit peel below */
+
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+
+ KB_block /* not defined in this file; presumably supplied by camm_pipe3.h for the current KB */
+ INC(4)
+ sub(1,LR) /* one unit consumed */
+
+ lab(a2)
+
+#endif
+
+#if defined(SREAL) || defined(DREAL)
+
+ test(8,ax) /* testl $8,%eax: is bit 3 of eax set? */
+ je(a4) /* bit clear: skip the 2-unit peel */
+ test(-2,LR) /* any bit of LR other than bit 0 set? (LR >= 2 for a non-negative count) */
+ je(a4) /* fewer than 2 units left: skip */
+
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(8)
+ sub(2,LR) /* two units consumed */
+
+ lab(a4)
+
+#endif
+#endif
+
+/* "movl %%edx,%%edi\n\t" */
+ push(LR) /* save the full count; it is popped again at lab(stop)/lab(stopa) for the tail tests */
+ shr(BITS,LR) /* shift right then left by BITS: clears the low BITS bits of LR ... */
+ shl(BITS,LR) /* ... i.e. rounds the count down to a multiple of (1 << BITS) */
+ m(4,LR) /* imul $4: scale the rounded count by 4 */
+ ra(ax,LR) /* LR += eax: LR is now the value eax is compared against in the main loop */
+
+#if defined(ALIGN) && ( defined(SCPLX) || defined(DCPLX) )
+ test(12,ax) /* testl $12,%eax: are bits 2 and 3 of eax both clear? */
+ je(loopa) /* yes: jump to the second copy of the loop (label loopa), which uses the aligned moves */
+#endif
+
+#if !defined(ALIGN) || defined(SCPLX) || defined(DCPLX) /* this copy of the loop uses the unaligned moves */
+#undef plq
+#define plq(a_,b_,c_) pl(a_,b_,c_) /* load via pl (movup in camm_util.h) */
+#undef puq
+#define puq(a_,b_,c_) pu(a_,b_,c_) /* store via pu (movup) */
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plx(a_,b_,c_,d_,e_) /* indexed load via plx */
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_) pux(a_,b_,c_,d_,e_) /* indexed store via pux */
+#else /* ALIGN with a real type: use the aligned / non-temporal moves */
+#undef plq
+#define plq(a_,b_,c_) pla(a_,b_,c_) /* load via pla (movap) */
+#undef puq
+#define puq(a_,b_,c_) punt(a_,b_,c_) /* store via punt (movntp) */
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_) /* indexed load via plax (movap) */
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_) /* indexed store via puax (movap) */
+#endif
+
+ align /* .align 16 for the loop head */
+ lab(loop)
+ cmp(ax,LR) /* compare eax with the end value computed into LR above */
+ je(stop) /* equal: all full blocks are done */
+
+#undef KB
+#define KB ( (1 << BITS) /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block /* one full block of KB units; KB_block is not defined in this file */
+ INC(4*KB/**DIV*/) /* presumably advances eax so the compare above eventually matches; INC is supplied by the includer */
+
+ jmp(loop)
+
+ lab(stop)
+ pop(LR) /* restore the count saved by push(LR); its low bits drive the tail steps below */
+
+#if ( 1 << BITS ) > 128 /* tail: one optional step per power of two below the main block size, selected by the matching bit of LR */
+ test(128,LR) /* bit 7 of the restored count set? */
+ je(64) /* no: skip to the next smaller step */
+#undef KB
+#define KB ( 128 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(512)
+
+ lab(64)
+#endif
+
+#if ( 1 << BITS ) > 64 /* 64-unit step */
+ test(64,LR)
+ je(32)
+#undef KB
+#define KB ( 64 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(256)
+
+ lab(32)
+#endif
+
+#if ( 1 << BITS ) > 32 /* 32-unit step */
+ test(32,LR)
+ je(16)
+#undef KB
+#define KB ( 32 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(128)
+
+ lab(16)
+#endif
+
+#if ( 1 << BITS ) > 16 /* 16-unit step */
+ test(16,LR)
+ je(8)
+#undef KB
+#define KB ( 16 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(64)
+
+ lab(8)
+#endif
+
+#if ( 1 << BITS ) > 8 /* 8-unit step */
+ test(8,LR)
+ je(4)
+#undef KB
+#define KB ( 8 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(32)
+
+ lab(4)
+#endif
+
+#if ( 1 << BITS ) > 4 /* 4-unit step */
+ test(4,LR)
+ je(2)
+#undef KB
+#define KB ( 4 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(16)
+
+ lab(2)
+#endif
+
+#if DIV != 4 && ( 1 << BITS ) > 2 /* 2-unit step: not emitted when DIV == 4 */
+ test(2,LR)
+ je(1)
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(8)
+
+ lab(1)
+#endif
+
+#if DIV == 1 && ( 1 << BITS ) > 1 /* 1-unit step: only emitted when DIV == 1 */
+ test(1,LR)
+ je(end)
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block /* last step: no INC follows */
+ lab(end)
+#endif
+
+#if defined (ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) /* second copy of loop + tail, reached via je(loopa) above; labels carry an a suffix */
+
+ jmp(tend) /* the first copy falls through to here: jump over the second copy */
+
+#undef plq
+#define plq(a_,b_,c_) pla(a_,b_,c_) /* from here on: load via pla (movap) */
+#undef puq
+#define puq(a_,b_,c_) punt(a_,b_,c_) /* store via punt (movntp) */
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_) /* indexed load via plax */
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_) /* indexed store via puax */
+
+ align
+ lab(loopa)
+ cmp(ax,LR) /* compare eax with the end value in LR */
+ je(stopa)
+
+#undef KB
+#define KB ( (1 << BITS) /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(4*KB/**DIV*/)
+
+ jmp(loopa)
+
+ lab(stopa)
+ pop(LR) /* restore the count saved by the single push(LR) shared by both copies */
+
+#if ( 1 << BITS ) > 128 /* tail steps, same structure as the first copy */
+ test(128,LR)
+ je(64a)
+#undef KB
+#define KB ( 128 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(512)
+
+ lab(64a)
+#endif
+
+#if ( 1 << BITS ) > 64
+ test(64,LR)
+ je(32a)
+#undef KB
+#define KB ( 64 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(256)
+
+ lab(32a)
+#endif
+
+#if ( 1 << BITS ) > 32
+ test(32,LR)
+ je(16a)
+#undef KB
+#define KB ( 32 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(128)
+
+ lab(16a)
+#endif
+
+#if ( 1 << BITS ) > 16
+ test(16,LR)
+ je(8a)
+#undef KB
+#define KB ( 16 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(64)
+
+ lab(8a)
+#endif
+
+#if ( 1 << BITS ) > 8
+ test(8,LR)
+ je(4a)
+#undef KB
+#define KB ( 8 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(32)
+
+ lab(4a)
+#endif
+
+#if ( 1 << BITS ) > 4
+ test(4,LR)
+ je(2a)
+#undef KB
+#define KB ( 4 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(16)
+
+ lab(2a)
+#endif
+
+#if DIV != 4 && ( 1 << BITS ) > 2 /* 2-unit step: not emitted when DIV == 4 */
+ test(2,LR)
+ je(1a)
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(8)
+
+ lab(1a)
+#endif
+
+#if DIV == 1 && ( 1 << BITS ) > 1 /* 1-unit step: only emitted when DIV == 1 */
+ test(1,LR)
+ je(enda)
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ lab(enda)
+#endif
+
+ lab(tend) /* common exit: target of jmp(tend) from the first copy */
+
+#endif
+
+/* #endif */ /* CAMM_TPIPE_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h
new file mode 100644
index 0000000..6b150d3
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h
@@ -0,0 +1,508 @@
+#ifndef CAMM_UTIL_H
+#define CAMM_UTIL_H /*+ To stop multiple inclusions. +*/
+
+typedef struct { /* pair of floats; used as TYPE when SCPLX is defined */
+ float r,i; /* presumably the real and imaginary parts, going by the type name */
+} Complex;
+
+typedef struct { /* pair of doubles; used as TYPE when DCPLX is defined */
+ double r,i; /* presumably the real and imaginary parts, going by the type name */
+} Dcomplex;
+
+#undef str
+#define str(a_) xstr(a_) /* stringify after macro expansion: the extra level lets a_ expand before # is applied */
+#undef xstr
+#define xstr(a_) #a_ /* raw stringification of the tokens as passed */
+
+#undef val
+#define val(a_) xval(a_) /* identity with one extra level of macro expansion */
+#undef xval
+#define xval(a_) a_
+
+#ifndef Mjoin /* only defined here if no earlier header provided Mjoin */
+#define Mjoin(a,b) mjoin(a,b) /* token-paste a and b after both have been macro-expanded */
+#ifdef mjoin
+ #undef mjoin
+#endif
+#define mjoin(a,b) a ## b /* raw paste, no expansion of the operands */
+#endif
+
+#undef VOLATILE
+#define VOLATILE __volatile__ /* GNU alternate keyword spelling */
+#undef ASM
+#define ASM __asm__ VOLATILE /* expands to __asm__ __volatile__: prefix for the inline-assembly statements */
+
+#ifdef BETA0 /* BL becomes the name suffix for the selected BETA case; if several BETA* are defined, the last test below wins */
+#undef BL
+#define BL b0
+#endif
+#ifdef BETA1
+#undef BL
+#define BL b1
+#endif
+#ifdef BETAX
+#undef BL
+#define BL bX
+#endif
+#ifdef BETAXI0
+#undef BL
+#define BL bXi0
+#endif
+
+#ifdef NO_TRANSPOSE /* FEXT: name suffix chosen from NO_TRANSPOSE, GER and Conj_ */
+#ifdef GER
+#ifdef Conj_
+#undef FEXT
+#define FEXT Gc /* NO_TRANSPOSE + GER + Conj_ */
+#else
+#undef FEXT
+#define FEXT Gu /* NO_TRANSPOSE + GER, no Conj_ */
+#endif
+#else
+#ifdef Conj_
+#undef FEXT
+#define FEXT Nc /* NO_TRANSPOSE + Conj_, no GER */
+#else
+#undef FEXT
+#define FEXT N /* NO_TRANSPOSE only */
+#endif
+#endif
+#else /* NO_TRANSPOSE not defined; GER is not consulted on this side */
+#ifdef Conj_
+#undef FEXT
+#define FEXT C /* Conj_ without NO_TRANSPOSE */
+#else
+#undef FEXT
+#define FEXT T /* neither NO_TRANSPOSE nor Conj_ */
+#endif
+#endif
+
+#undef BLC
+#define BLC Mjoin(FEXT,BL) /* FEXT pasted in front of BL; requires one of the BETA* macros to have defined BL */
+
+#ifdef __GNUC__
+#undef NO_INLINE
+#define NO_INLINE double sq(double x) {return x*x;} /* under GNU C this expands to a full definition of sq(); NOTE(review): how that relates to inlining is not visible here, and the function is non-static, so confirm it is expanded at most once per program */
+#else
+#undef NO_INLINE
+#define NO_INLINE /* expands to nothing on other compilers */
+#endif
+
+#undef lab
+#define lab(a_) "\n" str(MY_FUNCTION) "_" str(N) "_" str(a_) ":\n\t" /* assembler label <MY_FUNCTION>_<N>_<a_>; MY_FUNCTION and N come from the includer */
+#undef jmp
+#define jmp(a_) "jmp " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" /* unconditional jump to the label lab(a_) emits */
+#undef je
+#define je(a_) "je " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" /* jump if equal / zero */
+#undef jge
+#define jge(a_) "jge " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" /* jump if greater or equal (signed) */
+#undef jle
+#define jle(a_) "jle " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" /* jump if less or equal (signed) */
+#undef jl
+#define jl(a_) "jl " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" /* jump if less (signed) */
+#undef jne
+#define jne(a_) "jne " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" /* jump if not equal / not zero */
+#undef align
+#define align ".align 16\n\t" /* assembler alignment directive */
+#undef test
+#define test(a_,b_) "testl $" str(a_) ",%%e" str(b_) "\n\t" /* testl $a_,%e<b_>: set flags from (reg & a_) */
+#undef and
+#define and(a_,b_) "andl $" str(a_) ",%%e" str(b_) "\n\t" /* andl $a_,%e<b_>: reg &= a_ */
+#undef sub
+#define sub(a_,b_) "subl $" str(a_) ",%%e" str(b_) "\n\t" /* subl $a_,%e<b_>: reg -= a_ */
+#undef SS
+#define SS(a_,b_) a_ + b_ /* sum, deliberately token-level: no parentheses are added around the operands or the result */
+#undef MM
+#define MM(a_,b_) a_ * b_ /* product, also unparenthesised; NOTE(review): MM(SS(x,y),z) therefore reads as x + y * z */
+#undef E4
+#define E4(a_) (( a_ >> 2 ) << 2 ) /* a_ with its two low bits cleared (a_ itself is not parenthesised) */
+
+#undef TYPE /* clear every precision-dependent macro before one of SCPLX/SREAL/DREAL/DCPLX redefines it */
+#undef SCALAR
+#undef PREC
+#undef CSHUF
+#undef LSHUF
+#undef HSHUF
+#undef ISHUF
+#undef RSHUF
+#undef SINGLE
+#undef REAL
+#undef DIV
+
+#ifdef SCPLX /* single-precision complex */
+#define TYPE Complex
+#define SCALAR Complex * /* complex scalars are handled through a pointer */
+#define PREC c
+#define CSHUF 177 /* 0xB1; the *SHUF values are presumably shufp immediates (TODO confirm at the use sites) */
+#define LSHUF 160 /* 0xA0 */
+#define HSHUF 245 /* 0xF5 */
+#define ISHUF 13*17 /* 221 = 0xDD, left as an unparenthesised product */
+#define RSHUF 8*17 /* 136 = 0x88, left as an unparenthesised product */
+#define SINGLE
+#define DIV 2 /* equals sizeof(TYPE)/4 assuming a 4-byte float */
+/* #ifdef Conj_ */
+/* static const TYPE signd[2]={{-1.0,1.0},{-1.0,1.0}}; */
+/* #else */
+ static const TYPE signd[2]={{1.0,-1.0},{1.0,-1.0}}; /* two (1,-1) pairs; the Conj_ variant with flipped signs is commented out above */
+/* #endif */
+#endif
+
+#ifdef SREAL /* single-precision real */
+#define TYPE float
+#define SCALAR float
+#define PREC s
+#define SINGLE
+#define REAL
+#define DIV 1 /* equals sizeof(TYPE)/4 assuming a 4-byte float */
+#endif
+
+#ifdef DREAL /* double-precision real */
+#define TYPE double
+#define SCALAR double
+#define PREC d
+#define REAL
+#define DIV 2 /* equals sizeof(TYPE)/4 assuming an 8-byte double */
+#endif
+
+#ifdef DCPLX /* double-precision complex */
+#define TYPE Dcomplex
+#define SCALAR Dcomplex * /* complex scalars are handled through a pointer */
+#define PREC z
+#define CSHUF 1
+#define LSHUF 0
+#define HSHUF 3
+#define ISHUF 3
+#define RSHUF 0
+#define DIV 4 /* equals sizeof(TYPE)/4 assuming an 8-byte double */
+/* #ifdef Conj_ */
+/* static const TYPE signd[1]={{-1.0,1.0}}; */
+/* #else */
+ static const TYPE signd[1]={{1.0,-1.0}}; /* one (1,-1) pair; the Conj_ variant with flipped signs is commented out above */
+/* #endif */
+#endif
+
+#undef M11 /* lookup table for M(a,b): M<b><a> holds the literal a - b */
+#define M11 0
+#undef M12
+#define M12 1
+#undef M13
+#define M13 2
+#undef M14
+#define M14 3
+#undef M15
+#define M15 4
+#undef M16
+#define M16 5
+#undef M17
+#define M17 6
+#undef M18
+#define M18 7
+
+#undef M23 /* b = 2 row; it starts at a = 3, there is no M22 entry */
+#define M23 1
+#undef M24
+#define M24 2
+#undef M25
+#define M25 3
+#undef M26
+#define M26 4
+#undef M27
+#define M27 5
+#undef M28
+#define M28 6
+
+#undef M33 /* b = 3 row */
+#define M33 0
+#undef M34
+#define M34 1
+#undef M35
+#define M35 2
+#undef M36
+#define M36 3
+#undef M37
+#define M37 4
+#undef M38
+#define M38 5
+
+#undef P10 /* lookup table for P(a,b): P<b><a> holds the literal a + b; only b = 1 is provided */
+#define P10 1
+#undef P11
+#define P11 2
+#undef P12
+#define P12 3
+#undef P13
+#define P13 4
+#undef P14
+#define P14 5
+#undef P15
+#define P15 6
+#undef P16
+#define P16 7
+
+#undef XM
+#define XM(a_,b_) M ## b_ ## a_ /* paste to the table name M<b_><a_> */
+#undef M
+#define M(a_,b_) XM(a_,b_) /* a_ - b_ as a single literal token, for the (a_,b_) pairs tabulated above */
+
+#undef XP
+#define XP(a_,b_) P ## b_ ## a_ /* paste to the table name P<b_><a_> */
+#undef P
+#define P(a_,b_) XP(a_,b_) /* a_ + b_ as a single literal token, for the (a_,b_) pairs tabulated above */
+
+#undef mex
+#define mex(a_) str(%%e ## a_) /* string naming the 32-bit register e<a_>, e.g. mex(ax) gives %%eax */
+#undef msx
+#define msx(a_) "%%st(" str(a_) ")" /* string naming x87 stack register st(a_) */
+
+#undef cmp
+#define cmp(a_,b_) "cmp " mex(a_) "," mex(b_) "\n\t" /* compare two 32-bit registers */
+#undef icmpr
+#define icmpr(a_,b_) "cmp " mex(a_) ",(" mex(b_) ")\n\t" /* compare register a_ with the memory word at (reg b_) */
+#undef f
+#define f(a_,b_,c_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ")\n\t" /* prefetch<a_> b_(%e<c_>): a_ is the hint suffix, b_ the displacement */
+#undef pfx
+#define pfx(a_,b_,c_,d_,e_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ",%%e" #d_ "," str(e_) ")\n\t" /* prefetch with base c_, index d_, scale e_ */
+#undef a
+#define a(a_,b_) "addl $" str(a_) "," mex(b_) "\n\t" /* reg b_ += immediate a_ */
+#undef m
+#define m(a_,b_) "imul $" str(a_) "," mex(b_) "\n\t" /* reg b_ *= immediate a_ */
+#undef pop
+#define pop(a_) "popl %%e" str(a_) "\n\t" /* pop the stack into e<a_> */
+#undef push
+#define push(a_) "pushl %%e" str(a_) "\n\t" /* push e<a_> */
+#undef d
+#define d(a_,b_) "idiv $" str(a_) "," mex(b_) "\n\t" /* NOTE(review): emits idiv with an immediate operand, which x86 idiv does not accept; confirm this macro is unused */
+#undef shl
+#define shl(a_,b_) "shl $" str(a_) "," mex(b_) "\n\t" /* reg b_ <<= a_ */
+#undef shr
+#define shr(a_,b_) "shr $" str(a_) "," mex(b_) "\n\t" /* reg b_ >>= a_ (logical) */
+#undef mm
+#define mm(a_,b_) "mov $" str(a_) "," mex(b_) "\n\t" /* reg b_ = immediate a_ */
+#undef ra
+#define ra(a_,b_) "addl %%e" str(a_) "," mex(b_) "\n\t" /* reg b_ += reg a_ */
+#undef rs
+#define rs(a_,b_) "subl %%e" str(a_) "," mex(b_) "\n\t" /* reg b_ -= reg a_ */
+
+#undef fl
+#define fl(a_,b_) "fldl " str(a_) "(" mex(b_) ")\n\t" /* push the double at a_(reg b_) onto the x87 stack */
+#undef fp
+#define fp(a_,b_) "fstpl " str(a_) "(" mex(b_) ")\n\t" /* store st(0) as a double to a_(reg b_) and pop */
+#undef fd
+#define fd(a_) "fld " msx(a_) "\n\t" /* push a copy of st(a_) */
+#undef fap
+#define fap(a_,b_) "faddp " msx(a_) "," msx(b_) "\n\t" /* faddp st(a_),st(b_): add and pop */
+/* #define fsp(a_) fx(a_) "fsubp %%st," msx(a_) "\n\t" */
+#undef fsp
+#define fsp(a_) "fsubrp %%st," msx(a_) "\n\t" /* fsubrp %st,st(a_): reverse subtract and pop; replaces the commented-out fxch + fsubp form above */
+#undef fmp
+#define fmp(a_,b_) "fmulp " msx(a_) "," msx(b_) "\n\t" /* fmulp st(a_),st(b_): multiply and pop */
+#undef fa
+#define fa(a_,b_) "fadd " msx(a_) "," msx(b_) "\n\t" /* fadd st(a_),st(b_) */
+#undef fm
+#define fm(a_,b_) "fmul " msx(a_) "," msx(b_) "\n\t" /* fmul st(a_),st(b_) */
+#undef faa
+#define faa(a_,b_) "faddl " str(a_) "(" mex(b_) ")\n\t" /* st(0) += double at a_(reg b_) */
+#undef fma
+#define fma(a_,b_) "fmull " str(a_) "(" mex(b_) ")\n\t" /* st(0) *= double at a_(reg b_); a plain multiply, not a fused multiply-add */
+#undef fz
+#define fz "fldz\n\t" /* push +0.0 */
+#undef fx
+#define fx(a_) "fxch " msx(a_) "\n\t" /* exchange st(0) with st(a_) */
+#undef fx1
+#define fx1 "fxch\n\t" /* fxch with no operand */
+#undef fc
+#define fc(a_) "fstp " msx(a_) "\n\t" /* copy st(0) to st(a_) and pop */
+
+
+#ifndef ATHLON
+
+
+#if defined(DREAL) || defined(DCPLX)
+#undef SSESUF
+#define SSESUF "d " /* mnemonic suffix for double precision; includes the separating space */
+#undef RS4
+#define RS4 16 /* same value in both branches (16 is the xmm register size in bytes) */
+#undef RS
+#define RS 4 /* same value in both branches */
+#else
+#undef SSESUF
+#define SSESUF "s " /* mnemonic suffix for single precision */
+#undef RS4
+#define RS4 16
+#undef RS
+#define RS 4
+#endif
+
+#undef mxx
+#define mxx(a_) str(%%xmm ## a_) /* string naming register xmm<a_>; operand order below is AT&T (source, destination) */
+#undef prp
+#define prp(a_,b_) "rcpp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* rcpp: xmm b_ = packed reciprocal estimate of xmm a_ */
+#undef prps
+#define prps(a_,b_) "rcps" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* rcps: scalar form of prp */
+#undef pann
+#define pann(a_,b_) "andnp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* andnp: xmm b_ = ~xmm b_ & xmm a_ */
+#undef psqs
+#define psqs(a_,b_) "sqrts" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* sqrts: scalar square root of xmm a_ into xmm b_ */
+#undef por
+#define por(a_,b_) "orp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* orp: xmm b_ |= xmm a_ */
+#undef pan
+#define pan(a_,b_) "andp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* andp: xmm b_ &= xmm a_ */
+#undef pcm
+#define pcm(a_,b_,c_) "cmpp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" /* cmpp: xmm c_ = lane mask of (xmm c_ <predicate a_> xmm b_) */
+#undef pcms
+#define pcms(a_,b_,c_) "cmps" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" /* cmps: scalar form of pcm */
+#undef pax
+#define pax(a_,b_) "maxp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* maxp: xmm b_ = max(xmm b_, xmm a_) per lane */
+#undef paxs
+#define paxs(a_,b_) "maxs" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* maxs: scalar form of pax */
+#undef pd
+#define pd(a_,b_) "divp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* divp: xmm b_ /= xmm a_ */
+#undef pdsr
+#define pdsr(a_,b_) "divs" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* divs: scalar form of pd */
+#undef pxx
+#define pxx(a_,b_) "xorp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* xorp: xmm b_ ^= xmm a_ */
+#undef px
+#define px(a_) "xorp" SSESUF mxx(a_) "," mxx(a_) "\n\t" /* xor a register with itself: zero xmm a_ */
+#undef pm
+#define pm(a_,b_) "mulp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* mulp: xmm b_ *= xmm a_ */
+#undef pa
+#define pa(a_,b_) "addp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* addp: xmm b_ += xmm a_ */
+#undef pmm
+#define pmm(a_,b_,c_) "mulp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* mulp from memory: xmm c_ *= [a_(reg b_)] */
+#undef pam
+#define pam(a_,b_,c_) "addp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* addp from memory: xmm c_ += [a_(reg b_)] */
+#undef pl
+#define pl(a_,b_,c_) "movup" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* movup load: xmm c_ = [a_(reg b_)], unaligned form */
+#undef pla
+#define pla(a_,b_,c_) "movap" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* movap load: xmm c_ = [a_(reg b_)], aligned form */
+#undef pu
+#define pu(a_,b_,c_) "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" /* movup store: [b_(reg c_)] = xmm a_ */
+#undef punt
+#define punt(a_,b_,c_) "movntp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" /* movntp store: non-temporal store of xmm a_ */
+#undef pua
+#define pua(a_,b_,c_) "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" /* movap store: [b_(reg c_)] = xmm a_, aligned form */
+#undef pud
+#define pud(a_,b_,c_) "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" /* movlp store: low half of xmm a_ to memory */
+#undef pudr
+#define pudr(a_,b_) "movlp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* movlp with two register operands */
+#undef pc
+#define pc(a_,b_) "movap" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* register copy: xmm b_ = xmm a_ */
+#undef ps
+#define ps(a_,b_,c_) "shufp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" /* shufp $a_: shuffle xmm b_ into xmm c_ with selector a_ */
+#undef phl
+#define phl(a_,b_) "movhlp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* movhlp: high half of xmm a_ to low half of xmm b_ */
+#undef pus
+#define pus(a_,b_,c_) "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" /* movs store: low element of xmm a_ to memory */
+#undef pls
+#define pls(a_,b_,c_) "movs" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* movs load: one element from memory into xmm c_ */
+#undef pld
+#define pld(a_,b_,c_) "movlp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* movlp load: low half of xmm c_ from memory */
+#undef plh
+#define plh(a_,b_) "movlhp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* movlhp: low half of xmm a_ to high half of xmm b_ */
+#undef pas
+#define pas(a_,b_,c_) "adds" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* adds from memory: scalar xmm c_ += [a_(reg b_)] */
+#undef pms
+#define pms(a_,b_,c_) "muls" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* muls from memory: scalar xmm c_ *= [a_(reg b_)] */
+#undef pcs
+#define pcs(a_,b_) "movs" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* movs register to register: low element of xmm a_ into xmm b_ */
+#undef pasr
+#define pasr(a_,b_) "adds" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* adds: scalar xmm b_ += xmm a_ */
+#undef pmsr
+#define pmsr(a_,b_) "muls" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* muls: scalar xmm b_ *= xmm a_ */
+#undef pul
+#define pul(a_,b_) "unpcklp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* unpcklp: interleave the low elements of xmm b_ and xmm a_ */
+#undef puh
+#define puh(a_,b_) "unpckhp" SSESUF mxx(a_) "," mxx(b_) "\n\t" /* unpckhp: interleave the high elements of xmm b_ and xmm a_ */
+
+#undef plsx
+#define plsx(a_,b_,c_,d_,e_) /* indexed movs load: xmm e_ = element at a_(reg b_, reg c_, scale d_) */ \
+ "movs" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plx
+#define plx(a_,b_,c_,d_,e_) /* indexed movup load into xmm e_ */ \
+ "movup" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plax
+#define plax(a_,b_,c_,d_,e_) /* indexed movap load into xmm e_ */ \
+ "movap" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pasx
+#define pasx(a_,b_,c_,d_,e_) /* indexed adds: scalar xmm e_ += memory operand */ \
+ "adds" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pusx
+#define pusx(a_,b_,c_,d_,e_) /* indexed movs store of xmm a_ to b_(reg c_, reg d_, scale e_) */ \
+ "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pux
+#define pux(a_,b_,c_,d_,e_) /* indexed movup store of xmm a_ */ \
+ "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef puax
+#define puax(a_,b_,c_,d_,e_) /* indexed movap store of xmm a_ */ \
+ "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pudx
+#define pudx(a_,b_,c_,d_,e_) /* indexed movlp store of the low half of xmm a_ */ \
+ "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+
+#undef pldx
+#define pldx(a_,b_,c_,d_,e_) /* indexed movlp load into the low half of xmm e_ */ \
+ "movlp" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+
+#else
+
+#undef RS4
+#define RS4 8 /* ATHLON branch: half the value used in the SSE branch (8 is the mm register size in bytes) */
+#undef RS
+#define RS 2
+
+#undef mxx
+#define mxx(a_) str(%%mm ## a_) /* string naming MMX register mm<a_>; the macros below keep the SSE-branch names but emit MMX/3DNow! mnemonics */
+#undef pul
+#define pul(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t" /* punpckldq: interleave low dwords */
+#undef puh
+#define puh(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t" /* punpckhdq: interleave high dwords */
+
+#undef px
+#define px(a_) "pxor " mxx(a_) "," mxx(a_) "\n\t" /* zero mm a_ */
+#undef pm
+#define pm(a_,b_) "pfmul " mxx(a_) "," mxx(b_) "\n\t" /* pfmul: mm b_ *= mm a_ */
+#undef pa
+#define pa(a_,b_) "pfadd " mxx(a_) "," mxx(b_) "\n\t" /* pfadd: mm b_ += mm a_ */
+#undef pac
+#define pac(a_,b_) "pfacc " mxx(a_) "," mxx(b_) "\n\t" /* pfacc: accumulate; has no counterpart in the SSE branch */
+#undef pmm
+#define pmm(a_,b_,c_) "pfmul " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* pfmul from memory */
+#undef pam
+#define pam(a_,b_,c_) "pfadd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* pfadd from memory */
+#undef pl
+#define pl(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* movq load */
+#undef pla
+#define pla(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* same expansion as pl in this branch */
+#undef pu
+#define pu(a_,b_,c_) "movq " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" /* movq store */
+#undef pc
+#define pc(a_,b_) "movq " mxx(a_) "," mxx(b_) "\n\t" /* register copy: mm b_ = mm a_ */
+#undef ps
+#define ps(a_,b_,c_) "pswapd " mxx(b_) "," mxx(c_) "\n\t" /* pswapd; the selector argument a_ is ignored in this branch */
+#undef phl
+#define phl(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t" /* same expansion as puh */
+#undef plh
+#define plh(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t" /* same expansion as pul */
+#undef pus
+#define pus(a_,b_,c_) "movd " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" /* movd store of the low dword */
+#undef pls
+#define pls(a_,b_,c_) "movd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" /* movd load of one dword */
+
+#undef plsx
+#define plsx(a_,b_,c_,d_,e_) /* indexed movd load */ \
+ "movd " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plx
+#define plx(a_,b_,c_,d_,e_) /* indexed movq load */ \
+ "movq " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pasx
+#define pasx(a_,b_,c_,d_,e_) /* NOTE(review): emits the SSE mnemonic addss with an mm register operand; confirm this is unused or intended */ \
+ "addss " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pusx
+#define pusx(a_,b_,c_,d_,e_) /* indexed movd store */ \
+ "movd " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pux
+#define pux(a_,b_,c_,d_,e_) /* indexed movq store */ \
+ "movq " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#endif
+
+#endif /* CAMM_UTIL_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/f77wrap_lapack.h b/kaldi_io/src/tools/ATLAS/include/f77wrap_lapack.h
new file mode 100644
index 0000000..89417f7
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/f77wrap_lapack.h
@@ -0,0 +1,91 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Code contributers : R. Clint Whaley, Antoine P. Petitet
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef F77WRAP_LAPACK_H
+#define F77WRAP_LAPACK_H
+
+#include "atlas_misc.h"
+#include "atlas_f77.h"
+
+#ifdef UpCase /* PFW: common prefix of every wrapper name defined below */
+ #define PFW Mjoin(ATL_F77WRAP_,PREU) /* upper-case prefix; PREU is not defined in this header */
+#else
+ #define PFW Mjoin(atl_f77wrap_,PRE) /* lower-case prefix; PRE is not defined in this header */
+#endif
+
+#ifdef Add_ /* name mangling with one trailing underscore */
+ #define F77WRAP_GETRI Mjoin(PFW,getri_)
+ #define F77WRAP_LAUUM Mjoin(PFW,lauum_)
+ #define F77WRAP_TRTRI Mjoin(PFW,trtri_)
+ #define F77WRAP_GETNB Mjoin(PFW,getnb_)
+ #define F77WRAP_GETRS Mjoin(PFW,getrs_)
+ #define F77WRAP_GETRF Mjoin(PFW,getrf_)
+ #define F77WRAP_GESV Mjoin(PFW,gesv_)
+ #define F77WRAP_POTRS Mjoin(PFW,potrs_)
+ #define F77WRAP_POTRF Mjoin(PFW,potrf_)
+ #define F77WRAP_POSV Mjoin(PFW,posv_)
+#elif defined(Add__) /* two trailing underscores */
+ #define F77WRAP_GETRI Mjoin(PFW,getri__)
+ #define F77WRAP_LAUUM Mjoin(PFW,lauum__)
+ #define F77WRAP_TRTRI Mjoin(PFW,trtri__)
+ #define F77WRAP_GETNB Mjoin(PFW,getnb__)
+ #define F77WRAP_GETRS Mjoin(PFW,getrs__)
+ #define F77WRAP_GETRF Mjoin(PFW,getrf__)
+ #define F77WRAP_GESV Mjoin(PFW,gesv__)
+ #define F77WRAP_POTRS Mjoin(PFW,potrs__)
+ #define F77WRAP_POTRF Mjoin(PFW,potrf__)
+ #define F77WRAP_POSV Mjoin(PFW,posv__)
+#elif defined(NoChange) /* lower-case name, no underscore */
+ #define F77WRAP_GETRI Mjoin(PFW,getri)
+ #define F77WRAP_LAUUM Mjoin(PFW,lauum)
+ #define F77WRAP_TRTRI Mjoin(PFW,trtri)
+ #define F77WRAP_GETNB Mjoin(PFW,getnb)
+ #define F77WRAP_GETRS Mjoin(PFW,getrs)
+ #define F77WRAP_GETRF Mjoin(PFW,getrf)
+ #define F77WRAP_GESV Mjoin(PFW,gesv)
+ #define F77WRAP_POTRS Mjoin(PFW,potrs)
+ #define F77WRAP_POTRF Mjoin(PFW,potrf)
+ #define F77WRAP_POSV Mjoin(PFW,posv)
+#elif defined(UpCase) /* upper-case name, no underscore */
+ #define F77WRAP_GETRI Mjoin(PFW,GETRI)
+ #define F77WRAP_LAUUM Mjoin(PFW,LAUUM)
+ #define F77WRAP_TRTRI Mjoin(PFW,TRTRI)
+ #define F77WRAP_GETNB Mjoin(PFW,GETNB)
+ #define F77WRAP_GETRS Mjoin(PFW,GETRS)
+ #define F77WRAP_GETRF Mjoin(PFW,GETRF)
+ #define F77WRAP_GESV Mjoin(PFW,GESV)
+ #define F77WRAP_POTRS Mjoin(PFW,POTRS)
+ #define F77WRAP_POTRF Mjoin(PFW,POTRF)
+ #define F77WRAP_POSV Mjoin(PFW,POSV)
+#endif /* no fallback: if none of the four macros is defined, the F77WRAP_* names stay undefined */
+
+#endif