diff options
author | Determinant <[email protected]> | 2015-08-14 11:51:42 +0800 |
---|---|---|
committer | Determinant <[email protected]> | 2015-08-14 11:51:42 +0800 |
commit | 96a32415ab43377cf1575bd3f4f2980f58028209 (patch) | |
tree | 30a2d92d73e8f40ac87b79f6f56e227bfc4eea6e /kaldi_io/src/tools/ATLAS | |
parent | c177a7549bd90670af4b29fa813ddea32cfe0f78 (diff) |
add implementation for kaldi io (by ymz)
Diffstat (limited to 'kaldi_io/src/tools/ATLAS')
57 files changed, 31829 insertions, 0 deletions
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_altivec.h b/kaldi_io/src/tools/ATLAS/include/atlas_altivec.h new file mode 100644 index 0000000..a772448 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_altivec.h @@ -0,0 +1,27 @@ +#ifndef ATLAS_ALTIVEC_H + #define ATLAS_ALTIVEC_H + +#ifdef ATL_AltiVec + #ifdef ATL_AVgcc + #include <altivec.h> + + #define VECTOR_INIT(v0_,v1_,v2_,v3_) (vector float) {v0_,v1_,v2_,v3_} + #define VECTOR_INITI(v0_,v1_,v2_,v3_) (vector int) {v0_,v1_,v2_,v3_} + #else + #define VECTOR_INIT(v0_,v1_,v2_,v3_) (vector float)(v0_,v1_,v2_,v3_) + #define VECTOR_INITI(v0_,v1_,v2_,v3_) (vector int)(v0_,v1_,v2_,v3_) + #define VECTOR_INITL(v0_,v1_,v2_,v3_) (vector long)(v0_,v1_,v2_,v3_) + #endif + #define ATL_GetCtrl(stride, count, size) \ + (int)((stride) | ((count)<<16) | ((size)<<24)) + #define ATL_pfavR(ptr, cwrd, stream) \ + vec_dst((vector float *)(ptr), (cwrd), (stream)) + #define ATL_pfavW(ptr, cwrd, stream) \ + vec_dstst((vector float *)(ptr), (cwrd), (stream)) +#else + #define ATL_GetCtrl(stride, count, size) + #define ATL_pfavR(ptr, cwrd, stream) + #define ATL_pfavW(ptr, cwrd, stream) +#endif + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_asm.h b/kaldi_io/src/tools/ATLAS/include/atlas_asm.h new file mode 100644 index 0000000..4c4fa86 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_asm.h @@ -0,0 +1,411 @@ +#ifndef ATLAS_ASM_H + #define ATLAS_ASM_H + +#ifndef Mjoin + #define Mjoin(pre, nam) my_join(pre, nam) + #define my_join(pre, nam) pre ## nam +#endif + +#if defined(ATL_OS_WinNT) || defined(ATL_OS_Win9x) || defined(ATL_OS_OSX) + #define ATL_asmdecor(nam) Mjoin(_,nam) +#elif defined(ATL_OS_AIX) && defined(ATL_GAS_PPC) + #define ATL_asmdecor(nam) Mjoin(.,nam) +#elif !defined(ATL_OS_OSX) && defined(ATL_GAS_PPC) && defined(ATL_USE64BITS) + #define ATL_asmdecor(nam) Mjoin(.,nam) +#else + #define ATL_asmdecor(nam) nam +#endif + +#ifdef ATL_GAS_PARISC + #ifdef ATL_OS_HPUX + #define ATL_HPUX_PARISC + #else + 
#define ATL_LINUX_PARISC + #endif +#endif + +#ifdef ATL_GAS_PPC + #ifdef ATL_OS_OSX + #define ATL_AS_OSX_PPC + #elif defined(ATL_OS_AIX) + #define ATL_AS_AIX_PPC + #else + #define ATL_GAS_LINUX_PPC + #endif +#endif + +#if defined(ATL_GAS_LINUX_PPC) || defined(ATL_AS_AIX_PPC) + + #define r0 0 + #define f0 0 + #define r1 1 + #define f1 1 + #define r2 2 + #define f2 2 + #define r3 3 + #define f3 3 + #define r4 4 + #define f4 4 + #define r5 5 + #define f5 5 + #define r6 6 + #define f6 6 + #define r7 7 + #define f7 7 + #define r8 8 + #define f8 8 + #define r9 9 + #define f9 9 + #define r10 10 + #define f10 10 + #define r11 11 + #define f11 11 + #define r12 12 + #define f12 12 + #define r13 13 + #define f13 13 + #define r14 14 + #define f14 14 + #define r15 15 + #define f15 15 + #define r16 16 + #define f16 16 + #define r17 17 + #define f17 17 + #define r18 18 + #define f18 18 + #define r19 19 + #define f19 19 + #define r20 20 + #define f20 20 + #define r21 21 + #define f21 21 + #define r22 22 + #define f22 22 + #define r23 23 + #define f23 23 + #define r24 24 + #define f24 24 + #define r25 25 + #define f25 25 + #define r26 26 + #define f26 26 + #define r27 27 + #define f27 27 + #define r28 28 + #define f28 28 + #define r29 29 + #define f29 29 + #define r30 30 + #define f30 30 + #define r31 31 + #define f31 31 + #define cr0 0 + #define cr1 1 + #define cr2 2 + #define cr3 3 + #define cr4 4 + #define cr5 5 + #define cr6 6 + #define cr7 7 + +#endif + +#ifdef ATL_OS_OSX + #define ALIGN2 .align 1 + #define ALIGN4 .align 2 + #define ALIGN8 .align 3 + #define ALIGN16 .align 4 + #define ALIGN32 .align 5 + #define ALIGN64 .align 6 + #define ALIGN128 .align 7 + #define global globl +#else + #define ALIGN2 .align 2 + #define ALIGN4 .align 4 + #define ALIGN8 .align 8 + #define ALIGN16 .align 16 + #define ALIGN32 .align 32 + #define ALIGN64 .align 64 + #define ALIGN128 .align 128 +#endif + +#if defined(ATL_SSE1) && !defined(ATL_3DNow) + #define prefetchw prefetchnta +#endif +/* + * 
Solaris doesn't allow division in integer expressions in assembly, but + * many x86 kernels need to do $MB/mu; we work around this insanity with + * this kludge + */ +#if defined(ATL_DIV_NUM) && defined(ATL_DIV_DEN) + #if (ATL_DIV_NUM/ATL_DIV_DEN) == 0 + #define ATL_DivAns 0 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 1 + #define ATL_DivAns 1 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 2 + #define ATL_DivAns 2 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 3 + #define ATL_DivAns 3 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 4 + #define ATL_DivAns 4 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 5 + #define ATL_DivAns 5 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 6 + #define ATL_DivAns 6 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 7 + #define ATL_DivAns 7 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 8 + #define ATL_DivAns 8 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 9 + #define ATL_DivAns 9 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 10 + #define ATL_DivAns 10 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 11 + #define ATL_DivAns 11 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 12 + #define ATL_DivAns 12 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 13 + #define ATL_DivAns 13 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 14 + #define ATL_DivAns 14 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 15 + #define ATL_DivAns 15 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 16 + #define ATL_DivAns 16 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 17 + #define ATL_DivAns 17 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 18 + #define ATL_DivAns 18 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 19 + #define ATL_DivAns 19 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 20 + #define ATL_DivAns 20 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 21 + #define ATL_DivAns 21 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 22 + #define ATL_DivAns 22 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 23 + #define ATL_DivAns 23 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 24 + #define ATL_DivAns 24 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 25 + #define ATL_DivAns 25 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 26 + #define ATL_DivAns 26 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 27 + #define ATL_DivAns 27 + #elif 
(ATL_DIV_NUM/ATL_DIV_DEN) == 28 + #define ATL_DivAns 28 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 29 + #define ATL_DivAns 29 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 30 + #define ATL_DivAns 30 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 31 + #define ATL_DivAns 31 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 32 + #define ATL_DivAns 32 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 33 + #define ATL_DivAns 33 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 34 + #define ATL_DivAns 34 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 35 + #define ATL_DivAns 35 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 36 + #define ATL_DivAns 36 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 37 + #define ATL_DivAns 37 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 38 + #define ATL_DivAns 38 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 39 + #define ATL_DivAns 39 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 40 + #define ATL_DivAns 40 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 41 + #define ATL_DivAns 41 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 42 + #define ATL_DivAns 42 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 43 + #define ATL_DivAns 43 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 44 + #define ATL_DivAns 44 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 45 + #define ATL_DivAns 45 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 46 + #define ATL_DivAns 46 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 47 + #define ATL_DivAns 47 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 48 + #define ATL_DivAns 48 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 49 + #define ATL_DivAns 49 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 50 + #define ATL_DivAns 50 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 51 + #define ATL_DivAns 51 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 52 + #define ATL_DivAns 52 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 53 + #define ATL_DivAns 53 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 54 + #define ATL_DivAns 54 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 55 + #define ATL_DivAns 55 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 56 + #define ATL_DivAns 56 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 57 + #define ATL_DivAns 57 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 58 + #define ATL_DivAns 58 + #elif 
(ATL_DIV_NUM/ATL_DIV_DEN) == 59 + #define ATL_DivAns 59 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 60 + #define ATL_DivAns 60 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 61 + #define ATL_DivAns 61 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 62 + #define ATL_DivAns 62 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 63 + #define ATL_DivAns 63 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 64 + #define ATL_DivAns 64 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 65 + #define ATL_DivAns 65 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 66 + #define ATL_DivAns 66 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 67 + #define ATL_DivAns 67 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 68 + #define ATL_DivAns 68 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 69 + #define ATL_DivAns 69 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 70 + #define ATL_DivAns 70 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 71 + #define ATL_DivAns 71 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 72 + #define ATL_DivAns 72 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 73 + #define ATL_DivAns 73 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 74 + #define ATL_DivAns 74 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 75 + #define ATL_DivAns 75 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 76 + #define ATL_DivAns 76 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 77 + #define ATL_DivAns 77 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 78 + #define ATL_DivAns 78 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 79 + #define ATL_DivAns 79 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 80 + #define ATL_DivAns 80 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 81 + #define ATL_DivAns 81 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 82 + #define ATL_DivAns 82 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 83 + #define ATL_DivAns 83 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 84 + #define ATL_DivAns 84 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 85 + #define ATL_DivAns 85 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 86 + #define ATL_DivAns 86 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 87 + #define ATL_DivAns 87 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 88 + #define ATL_DivAns 88 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 89 + #define ATL_DivAns 89 + #elif 
(ATL_DIV_NUM/ATL_DIV_DEN) == 90 + #define ATL_DivAns 90 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 91 + #define ATL_DivAns 91 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 92 + #define ATL_DivAns 92 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 93 + #define ATL_DivAns 93 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 94 + #define ATL_DivAns 94 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 95 + #define ATL_DivAns 95 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 96 + #define ATL_DivAns 96 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 97 + #define ATL_DivAns 97 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 98 + #define ATL_DivAns 98 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 99 + #define ATL_DivAns 99 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 100 + #define ATL_DivAns 100 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 101 + #define ATL_DivAns 101 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 102 + #define ATL_DivAns 102 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 103 + #define ATL_DivAns 103 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 104 + #define ATL_DivAns 104 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 105 + #define ATL_DivAns 105 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 106 + #define ATL_DivAns 106 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 107 + #define ATL_DivAns 107 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 108 + #define ATL_DivAns 108 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 109 + #define ATL_DivAns 109 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 110 + #define ATL_DivAns 110 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 111 + #define ATL_DivAns 111 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 112 + #define ATL_DivAns 112 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 113 + #define ATL_DivAns 113 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 114 + #define ATL_DivAns 114 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 115 + #define ATL_DivAns 115 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 116 + #define ATL_DivAns 116 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 117 + #define ATL_DivAns 117 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 118 + #define ATL_DivAns 118 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 119 + #define ATL_DivAns 119 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 120 + 
#define ATL_DivAns 120 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 121 + #define ATL_DivAns 121 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 122 + #define ATL_DivAns 122 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 123 + #define ATL_DivAns 123 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 124 + #define ATL_DivAns 124 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 125 + #define ATL_DivAns 125 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 126 + #define ATL_DivAns 126 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 127 + #define ATL_DivAns 127 + #elif (ATL_DIV_NUM/ATL_DIV_DEN) == 128 + #define ATL_DivAns 128 + #endif +#endif + +/* + * For GNU/Linux, set no-execute bit for all ATLAS assembly + */ +#if defined(ATL_OS_Linux) && defined(__ELF__) && defined(__GNUC__) && \ + defined(ATL_SSE1) +.section .note.GNU-stack,"",%progbits +#endif + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_aux.h b/kaldi_io/src/tools/ATLAS/include/atlas_aux.h new file mode 100644 index 0000000..ce31eee --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_aux.h @@ -0,0 +1,785 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ +/* + * Header file for ATLAS's auxiliary routines + */ +#ifndef ATLAS_AUX_H +#define ATLAS_AUX_H +#include "atlas_misc.h" + +void ATL_xerbla(int p, char *rout, char *form, ...); +int ATL_lcm(const int M, const int N); +double ATL_walltime(); +double ATL_cputime(); + +/* + * Auxiliary routines that come in all four types + */ +void ATL_sgeadd(const int M, const int N, const float alpha, + const float *A, const int lda, const float beta, + float *C, const int ldc); +void ATL_sgemove(const int M, const int N, const float alpha, + const float *A, const int lda, float *C, const int ldc); +void ATL_sgemoveT(const int N, const int M, const float alpha, + const float *A, const int lda, float *C, const int ldc); +void ATL_ssyreflect(const enum ATLAS_UPLO Uplo, const int N, + float *C, const int ldc); +void ATL_sgecopy(const int M, const int N, const float *A, const int lda, + float *C, const int ldc); + +void ATL_sgescal(const int M, const int N, const float beta, + float *C, const int ldc); +void ATL_strscal + (const enum ATLAS_UPLO Uplo, const int M, const int N, const float alpha, + float *A, const int lda); +void ATL_shescal + (const enum ATLAS_UPLO Uplo, const int M, 
const int N, const float alpha, + float *A, const int lda); + +void ATL_sgezero(const int M, const int N, float *C, const int ldc); + +void ATL_szero(const int N, float *X, const int incX); +void ATL_sset(const int N, const float alpha, float *X, const int incX); +void ATL_sscal(const int N, const float alpha, float *X, const int incX); +void ATL_scopy(const int N, const float *X, const int incX, + float *Y, const int incY); +void ATL_scpsc(const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY); +void ATL_saxpy(const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY); +void ATL_saxpy_x1_y1(const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY); +void ATL_saxpby(const int N, const float alpha, const float *X, + const int incX, const float beta, float *Y, const int incY); + +void ATL_sgeadd_a1_b1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_a1_b1 + (const int N, const float alpha, const float *X, const int incX, + const float beta, float *Y, const int incY); +void ATL_sgeadd_a0_b1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_a0_b1 + (const int N, const float alpha, const float *X, const int incX, + const float beta, float *Y, const int incY); +void ATL_sgeadd_aX_b1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_aX_b1 + (const int N, const float alpha, const float *X, const int incX, + const float beta, float *Y, const int incY); +void ATL_sgeadd_a1_b0 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_a1_b0 + (const int N, const float alpha, const float *X, const int incX, + const float 
beta, float *Y, const int incY); +void ATL_sgeadd_a0_b0 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_a0_b0 + (const int N, const float alpha, const float *X, const int incX, + const float beta, float *Y, const int incY); +void ATL_sgeadd_aX_b0 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_aX_b0 + (const int N, const float alpha, const float *X, const int incX, + const float beta, float *Y, const int incY); +void ATL_sgeadd_a1_bX + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_a1_bX + (const int N, const float alpha, const float *X, const int incX, + const float beta, float *Y, const int incY); +void ATL_sgeadd_a0_bX + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_a0_bX + (const int N, const float alpha, const float *X, const int incX, + const float beta, float *Y, const int incY); +void ATL_sgeadd_aX_bX + (const int M, const int N, const float alpha, const float *A, + const int lda, const float beta, float *C, const int ldc); +void ATL_saxpby_aX_bX + (const int N, const float alpha, const float *X, const int incX, + const float beta, float *Y, const int incY); + +void ATL_sgemove_a1 + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_sgemove_a0 + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_sgemove_aX + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); + +void ATL_sgescal_b1 + (const int M, const int N, const float beta, float *C, const int ldc); +void ATL_sgescal_b0 + (const int M, const int N, const 
float beta, float *C, const int ldc); +void ATL_sgescal_bX + (const int M, const int N, const float beta, float *C, const int ldc); + +void ATL_dgeadd(const int M, const int N, const double alpha, + const double *A, const int lda, const double beta, + double *C, const int ldc); +void ATL_dgemove(const int M, const int N, const double alpha, + const double *A, const int lda, double *C, const int ldc); +void ATL_dgemoveT(const int N, const int M, const double alpha, + const double *A, const int lda, double *C, const int ldc); +void ATL_dsyreflect(const enum ATLAS_UPLO Uplo, const int N, + double *C, const int ldc); +void ATL_dgecopy(const int M, const int N, const double *A, const int lda, + double *C, const int ldc); + +void ATL_dgescal(const int M, const int N, const double beta, + double *C, const int ldc); +void ATL_dtrscal + (const enum ATLAS_UPLO Uplo, const int M, const int N, const double alpha, + double *A, const int lda); +void ATL_dhescal + (const enum ATLAS_UPLO Uplo, const int M, const int N, const double alpha, + double *A, const int lda); + +void ATL_dgezero(const int M, const int N, double *C, const int ldc); + +void ATL_dzero(const int N, double *X, const int incX); +void ATL_dset(const int N, const double alpha, double *X, const int incX); +void ATL_dscal(const int N, const double alpha, double *X, const int incX); +void ATL_dcopy(const int N, const double *X, const int incX, + double *Y, const int incY); +void ATL_dcpsc(const int N, const double alpha, const double *X, + const int incX, double *Y, const int incY); +void ATL_daxpy(const int N, const double alpha, const double *X, + const int incX, double *Y, const int incY); +void ATL_daxpy_x1_y1(const int N, const double alpha, const double *X, + const int incX, double *Y, const int incY); +void ATL_daxpby(const int N, const double alpha, const double *X, + const int incX, const double beta, double *Y, const int incY); + +void ATL_dgeadd_a1_b1 + (const int M, const int N, const double alpha, const 
double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_a1_b1 + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dgeadd_a0_b1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_a0_b1 + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dgeadd_aX_b1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_aX_b1 + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dgeadd_a1_b0 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_a1_b0 + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dgeadd_a0_b0 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_a0_b0 + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dgeadd_aX_b0 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_aX_b0 + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dgeadd_a1_bX + (const int M, const int N, const double alpha, const double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_a1_bX + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dgeadd_a0_bX + 
(const int M, const int N, const double alpha, const double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_a0_bX + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dgeadd_aX_bX + (const int M, const int N, const double alpha, const double *A, + const int lda, const double beta, double *C, const int ldc); +void ATL_daxpby_aX_bX + (const int N, const double alpha, const double *X, const int incX, + const double beta, double *Y, const int incY); + +void ATL_dgemove_a1 + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dgemove_a0 + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dgemove_aX + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); + +void ATL_dgescal_b1 + (const int M, const int N, const double beta, double *C, const int ldc); +void ATL_dgescal_b0 + (const int M, const int N, const double beta, double *C, const int ldc); +void ATL_dgescal_bX + (const int M, const int N, const double beta, double *C, const int ldc); + +void ATL_cgeadd(const int M, const int N, const float *alpha, + const float *A, const int lda, const float *beta, + float *C, const int ldc); +void ATL_cgemove(const int M, const int N, const float *alpha, + const float *A, const int lda, float *C, const int ldc); +void ATL_cgemoveT(const int N, const int M, const float *alpha, + const float *A, const int lda, float *C, const int ldc); +void ATL_csyreflect(const enum ATLAS_UPLO Uplo, const int N, + float *C, const int ldc); +void ATL_cgecopy(const int M, const int N, const float *A, const int lda, + float *C, const int ldc); + +void ATL_cgescal(const int M, const int N, const float *beta, + float *C, const int ldc); +void ATL_ctrscal + (const enum ATLAS_UPLO Uplo, const int M, const int N, 
const float *alpha, + float *A, const int lda); +void ATL_chescal + (const enum ATLAS_UPLO Uplo, const int M, const int N, const float alpha, + float *A, const int lda); + +void ATL_cgezero(const int M, const int N, float *C, const int ldc); + +void ATL_czero(const int N, float *X, const int incX); +void ATL_cset(const int N, const float *alpha, float *X, const int incX); +void ATL_cscal(const int N, const float *alpha, float *X, const int incX); +void ATL_ccopy(const int N, const float *X, const int incX, + float *Y, const int incY); +void ATL_ccpsc(const int N, const float *alpha, const float *X, + const int incX, float *Y, const int incY); +void ATL_caxpy(const int N, const float *alpha, const float *X, + const int incX, float *Y, const int incY); +void ATL_caxpy_x1_y1(const int N, const float *alpha, const float *X, + const int incX, float *Y, const int incY); +void ATL_caxpby(const int N, const float *alpha, const float *X, + const int incX, const float *beta, float *Y, const int incY); + +void ATL_cgeadd_a1_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_a1_b1 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_a0_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_a0_b1 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_aX_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_aX_b1 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_a1_b0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, 
float *C, const int ldc); +void ATL_caxpby_a1_b0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_a0_b0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_a0_b0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_aX_b0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_aX_b0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_a1_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_a1_bX + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_a0_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_a0_bX + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_aX_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_aX_bX + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); + +void ATL_cgemove_a1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_cgemove_a0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_cgemove_aX + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); 
+ +void ATL_cgescal_b1 + (const int M, const int N, const float *beta, float *C, const int ldc); +void ATL_cgescal_b0 + (const int M, const int N, const float *beta, float *C, const int ldc); +void ATL_cgescal_bX + (const int M, const int N, const float *beta, float *C, const int ldc); + +void ATL_zgeadd(const int M, const int N, const double *alpha, + const double *A, const int lda, const double *beta, + double *C, const int ldc); +void ATL_zgemove(const int M, const int N, const double *alpha, + const double *A, const int lda, double *C, const int ldc); +void ATL_zgemoveT(const int N, const int M, const double *alpha, + const double *A, const int lda, double *C, const int ldc); +void ATL_zsyreflect(const enum ATLAS_UPLO Uplo, const int N, + double *C, const int ldc); +void ATL_zgecopy(const int M, const int N, const double *A, const int lda, + double *C, const int ldc); + +void ATL_zgescal(const int M, const int N, const double *beta, + double *C, const int ldc); +void ATL_ztrscal + (const enum ATLAS_UPLO Uplo, const int M, const int N, const double *alpha, + double *A, const int lda); +void ATL_zhescal + (const enum ATLAS_UPLO Uplo, const int M, const int N, const double alpha, + double *A, const int lda); + +void ATL_zgezero(const int M, const int N, double *C, const int ldc); + +void ATL_zzero(const int N, double *X, const int incX); +void ATL_zset(const int N, const double *alpha, double *X, const int incX); +void ATL_zscal(const int N, const double *alpha, double *X, const int incX); +void ATL_zcopy(const int N, const double *X, const int incX, + double *Y, const int incY); +void ATL_zcpsc(const int N, const double *alpha, const double *X, + const int incX, double *Y, const int incY); +void ATL_zaxpy(const int N, const double *alpha, const double *X, + const int incX, double *Y, const int incY); +void ATL_zaxpy_x1_y1(const int N, const double *alpha, const double *X, + const int incX, double *Y, const int incY); +void ATL_zaxpby(const int N, const double 
*alpha, const double *X, + const int incX, const double *beta, double *Y, const int incY); + +void ATL_zgeadd_a1_b1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_a1_b1 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_a0_b1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_a0_b1 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_aX_b1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_aX_b1 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_a1_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_a1_b0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_a0_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_a0_b0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_aX_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_aX_b0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_a1_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double 
*beta, double *C, const int ldc); +void ATL_zaxpby_a1_bX + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_a0_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_a0_bX + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_aX_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_aX_bX + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); + +void ATL_zgemove_a1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_zgemove_a0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_zgemove_aX + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); + +void ATL_zgescal_b1 + (const int M, const int N, const double *beta, double *C, const int ldc); +void ATL_zgescal_b0 + (const int M, const int N, const double *beta, double *C, const int ldc); +void ATL_zgescal_bX + (const int M, const int N, const double *beta, double *C, const int ldc); + +/* + * Specialized complex auxiliary routines + */ + +void ATL_ccplxinvert + (const int N, float *X, const int incX, float *Y, const int incY); + +void ATL_chereflect(const enum ATLAS_UPLO Uplo, const int N, + float *C, const int ldc); +void ATL_cscalConj + (const int N, const float *alpha, float *X, const int incX); +void ATL_ccopyConj + (const int N, const float *X, const int incX, float *Y, const int incY); +void ATL_cmoveConj + (const int N, const float *alpha, const float *X, const int incX, + float *Y, const int incY); 
+void ATL_caxpyConj + (const int N, const float *alpha, const float *X, const int incX, + float *Y, const int incY); +void ATL_caxpyConj_x1_y1(const int N, const float *alpha, const float *X, + const int incX, float *Y, const int incY); +void ATL_caxpbyConj + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgemoveC(const int N, const int M, const float *alpha, + const float *A, const int lda, float *C, const int ldc); + +void ATL_cgeaddConj_aXi0_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_a1_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_a0_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aXi0_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aX_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aXi0_b0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_a1_b0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_a0_b0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aXi0_b0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aX_b0 + (const int M, const int N, const float *alpha, const float *A, + const int 
lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aXi0_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_a1_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_a0_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aXi0_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aX_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aXi0_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_a1_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_a0_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aXi0_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_cgeaddConj_aX_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_aXi0_b1 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_caxpby_aXi0_b1 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_aXi0_b1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, 
const float *beta, float *C, const int ldc); +void ATL_caxpby_aXi0_b0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_caxpby_aXi0_b0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_aXi0_b0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_aXi0_bXi0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_caxpby_aXi0_bXi0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_aXi0_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_aXi0_bX + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_caxpby_aXi0_bX + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_aXi0_bX + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_a1_bXi0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_a1_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_a0_bXi0 + (const int N, const float *alpha, const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_a0_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); +void ATL_caxpby_aX_bXi0 + (const int N, const float *alpha, 
const float *X, const int incX, + const float *beta, float *Y, const int incY); +void ATL_cgeadd_aX_bXi0 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *beta, float *C, const int ldc); + +void ATL_cgemove_aXi0 + (const int M, const int N, const float *alpha0, const float *A, + const int lda, float *C, const int ldc); + +void ATL_cgescal_bXi0 + (const int M, const int N, const float *beta, float *C, const int ldc); + +void ATL_zcplxinvert + (const int N, double *X, const int incX, double *Y, const int incY); + +void ATL_zhereflect(const enum ATLAS_UPLO Uplo, const int N, + double *C, const int ldc); +void ATL_zscalConj + (const int N, const double *alpha, double *X, const int incX); +void ATL_zcopyConj + (const int N, const double *X, const int incX, double *Y, const int incY); +void ATL_zmoveConj + (const int N, const double *alpha, const double *X, const int incX, + double *Y, const int incY); +void ATL_zaxpyConj + (const int N, const double *alpha, const double *X, const int incX, + double *Y, const int incY); +void ATL_zaxpyConj_x1_y1(const int N, const double *alpha, const double *X, + const int incX, double *Y, const int incY); +void ATL_zaxpbyConj + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgemoveC(const int N, const int M, const double *alpha, + const double *A, const int lda, double *C, const int ldc); + +void ATL_zgeaddConj_aXi0_b1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_a1_b1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_a0_b1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aXi0_b1 + (const int M, const int 
N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aX_b1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aXi0_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_a1_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_a0_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aXi0_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aX_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aXi0_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_a1_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_a0_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aXi0_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aX_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aXi0_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, 
double *C, const int ldc); +void ATL_zgeaddConj_a1_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_a0_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aXi0_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zgeaddConj_aX_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_aXi0_b1 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zaxpby_aXi0_b1 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_aXi0_b1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_aXi0_b0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zaxpby_aXi0_b0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_aXi0_b0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_aXi0_bXi0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zaxpby_aXi0_bXi0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_aXi0_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int 
ldc); +void ATL_zaxpby_aXi0_bX + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zaxpby_aXi0_bX + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_aXi0_bX + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_a1_bXi0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_a1_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_a0_bXi0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_a0_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); +void ATL_zaxpby_aX_bXi0 + (const int N, const double *alpha, const double *X, const int incX, + const double *beta, double *Y, const int incY); +void ATL_zgeadd_aX_bXi0 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *beta, double *C, const int ldc); + +void ATL_zgemove_aXi0 + (const int M, const int N, const double *alpha0, const double *A, + const int lda, double *C, const int ldc); + +void ATL_zgescal_bXi0 + (const int M, const int N, const double *beta, double *C, const int ldc); + + +#if defined(ATL_USEPTHREADS) && !defined(ATL_flushcache) + #include "atlas_pthreads.h" + #define ATL_flushcache ATL_ptflushcache + #define ATL_PTCACHEMUL * ATL_NTHREADS +#else + #define ATL_PTCACHEMUL +#endif +double ATL_flushcache(int size); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblascalias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblascalias.h new file mode 100644 index 
0000000..267b176 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblascalias.h @@ -0,0 +1,40 @@ +#ifndef ATLAS_CBLASCALIAS_H + #define ATLAS_CBLASCALIAS_H + +#define cblas_dotc_sub cblas_cdotc_sub +#define cblas_dotu_sub cblas_cdotu_sub +#define cblas_axpy cblas_caxpy +#define cblas_copy cblas_ccopy +#define cblas_scal cblas_cscal +#define cblas_swap cblas_cswap +#define cblas_hpr2 cblas_chpr2 +#define cblas_her2 cblas_cher2 +#define cblas_hpr cblas_chpr +#define cblas_her cblas_cher +#define cblas_gerc cblas_cgerc +#define cblas_geru cblas_cgeru +#define cblas_tpsv cblas_ctpsv +#define cblas_tbsv cblas_ctbsv +#define cblas_trsv cblas_ctrsv +#define cblas_tpmv cblas_ctpmv +#define cblas_tbmv cblas_ctbmv +#define cblas_trmv cblas_ctrmv +#define cblas_hpmv cblas_chpmv +#define cblas_hbmv cblas_chbmv +#define cblas_hemv cblas_chemv +#define cblas_gbmv cblas_cgbmv +#define cblas_gemv cblas_cgemv +#define cblas_trsm cblas_ctrsm +#define cblas_trmm cblas_ctrmm +#define cblas_her2k cblas_cher2k +#define cblas_syr2k cblas_csyr2k +#define cblas_herk cblas_cherk +#define cblas_syrk cblas_csyrk +#define cblas_hemm cblas_chemm +#define cblas_symm cblas_csymm +#define cblas_gemm cblas_cgemm +#define cblas_iamax cblas_icamax +#define cblas_nrm2 cblas_scnrm2 +#define cblas_asum cblas_scasum + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblasdalias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblasdalias.h new file mode 100644 index 0000000..cfc6d10 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblasdalias.h @@ -0,0 +1,39 @@ +#ifndef ATLAS_CBLASDALIAS_H + #define ATLAS_CBLASDALIAS_H + +#define cblas_asum cblas_dasum +#define cblas_nrm2 cblas_dnrm2 +#define cblas_dot cblas_ddot +#define cblas_axpy cblas_daxpy +#define cblas_copy cblas_dcopy +#define cblas_scal cblas_dscal +#define cblas_swap cblas_dswap +#define cblas_rotm cblas_drotm +#define cblas_rot cblas_drot +#define cblas_rotmg cblas_drotmg +#define cblas_rotg cblas_drotg +#define 
cblas_spr2 cblas_dspr2 +#define cblas_syr2 cblas_dsyr2 +#define cblas_spr cblas_dspr +#define cblas_syr cblas_dsyr +#define cblas_ger cblas_dger +#define cblas_tpsv cblas_dtpsv +#define cblas_tbsv cblas_dtbsv +#define cblas_trsv cblas_dtrsv +#define cblas_tpmv cblas_dtpmv +#define cblas_tbmv cblas_dtbmv +#define cblas_trmv cblas_dtrmv +#define cblas_spmv cblas_dspmv +#define cblas_sbmv cblas_dsbmv +#define cblas_symv cblas_dsymv +#define cblas_gbmv cblas_dgbmv +#define cblas_gemv cblas_dgemv +#define cblas_trsm cblas_dtrsm +#define cblas_trmm cblas_dtrmm +#define cblas_syr2k cblas_dsyr2k +#define cblas_syrk cblas_dsyrk +#define cblas_symm cblas_dsymm +#define cblas_gemm cblas_dgemm +#define cblas_iamax cblas_idamax + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblassalias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblassalias.h new file mode 100644 index 0000000..090f9de --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblassalias.h @@ -0,0 +1,39 @@ +#ifndef ATLAS_CBLASSALIAS_H + #define ATLAS_CBLASSALIAS_H + +#define cblas_asum cblas_sasum +#define cblas_nrm2 cblas_snrm2 +#define cblas_dot cblas_sdot +#define cblas_axpy cblas_saxpy +#define cblas_copy cblas_scopy +#define cblas_scal cblas_sscal +#define cblas_swap cblas_sswap +#define cblas_rotm cblas_srotm +#define cblas_rot cblas_srot +#define cblas_rotmg cblas_srotmg +#define cblas_rotg cblas_srotg +#define cblas_spr2 cblas_sspr2 +#define cblas_syr2 cblas_ssyr2 +#define cblas_spr cblas_sspr +#define cblas_syr cblas_ssyr +#define cblas_ger cblas_sger +#define cblas_tpsv cblas_stpsv +#define cblas_tbsv cblas_stbsv +#define cblas_trsv cblas_strsv +#define cblas_tpmv cblas_stpmv +#define cblas_tbmv cblas_stbmv +#define cblas_trmv cblas_strmv +#define cblas_spmv cblas_sspmv +#define cblas_sbmv cblas_ssbmv +#define cblas_symv cblas_ssymv +#define cblas_gbmv cblas_sgbmv +#define cblas_gemv cblas_sgemv +#define cblas_trsm cblas_strsm +#define cblas_trmm cblas_strmm +#define cblas_syr2k 
cblas_ssyr2k +#define cblas_syrk cblas_ssyrk +#define cblas_symm cblas_ssymm +#define cblas_gemm cblas_sgemm +#define cblas_iamax cblas_isamax + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblastypealias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblastypealias.h new file mode 100644 index 0000000..0c3e82f --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblastypealias.h @@ -0,0 +1,9 @@ +#ifdef SREAL + #include "atlas_cblassalias.h" +#elif defined(DREAL) + #include "atlas_cblasdalias.h" +#elif defined(SCPLX) + #include "atlas_cblascalias.h" +#elif defined(DCPLX) + #include "atlas_cblaszalias.h" +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_cblaszalias.h b/kaldi_io/src/tools/ATLAS/include/atlas_cblaszalias.h new file mode 100644 index 0000000..ac01436 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_cblaszalias.h @@ -0,0 +1,40 @@ +#ifndef ATLAS_CBLASZALIAS_H + #define ATLAS_CBLASZALIAS_H + +#define cblas_dotc_sub cblas_zdotc_sub +#define cblas_dotu_sub cblas_zdotu_sub +#define cblas_axpy cblas_zaxpy +#define cblas_copy cblas_zcopy +#define cblas_scal cblas_zscal +#define cblas_swap cblas_zswap +#define cblas_hpr2 cblas_zhpr2 +#define cblas_her2 cblas_zher2 +#define cblas_hpr cblas_zhpr +#define cblas_her cblas_zher +#define cblas_gerc cblas_zgerc +#define cblas_geru cblas_zgeru +#define cblas_tpsv cblas_ztpsv +#define cblas_tbsv cblas_ztbsv +#define cblas_trsv cblas_ztrsv +#define cblas_tpmv cblas_ztpmv +#define cblas_tbmv cblas_ztbmv +#define cblas_trmv cblas_ztrmv +#define cblas_hpmv cblas_zhpmv +#define cblas_hbmv cblas_zhbmv +#define cblas_hemv cblas_zhemv +#define cblas_gbmv cblas_zgbmv +#define cblas_gemv cblas_zgemv +#define cblas_trsm cblas_ztrsm +#define cblas_trmm cblas_ztrmm +#define cblas_her2k cblas_zher2k +#define cblas_syr2k cblas_zsyr2k +#define cblas_herk cblas_zherk +#define cblas_syrk cblas_zsyrk +#define cblas_hemm cblas_zhemm +#define cblas_symm cblas_zsymm +#define cblas_gemm cblas_zgemm 
+#define cblas_iamax cblas_izamax +#define cblas_nrm2 cblas_dznrm2 +#define cblas_asum cblas_dzasum + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_enum.h b/kaldi_io/src/tools/ATLAS/include/atlas_enum.h new file mode 100644 index 0000000..3d638be --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_enum.h @@ -0,0 +1,55 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1997 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#ifndef ATLAS_ENUM_H + #define ATLAS_ENUM_H + + #define CBLAS_ENUM_ONLY + #include "cblas.h" + #undef CBLAS_ENUM_ONLY + + #define ATLAS_ORDER CBLAS_ORDER + #define AtlasRowMajor CblasRowMajor + #define AtlasColMajor CblasColMajor + #define ATLAS_TRANS CBLAS_TRANSPOSE + #define AtlasNoTrans CblasNoTrans + #define AtlasTrans CblasTrans + #define AtlasConjTrans CblasConjTrans + #define ATLAS_UPLO CBLAS_UPLO + #define AtlasUpper CblasUpper + #define AtlasLower CblasLower + #define ATLAS_DIAG CBLAS_DIAG + #define AtlasNonUnit CblasNonUnit + #define AtlasUnit CblasUnit + #define ATLAS_SIDE CBLAS_SIDE + #define AtlasLeft CblasLeft + #define AtlasRight CblasRight + +#endif + diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_f77.h b/kaldi_io/src/tools/ATLAS/include/atlas_f77.h new file mode 100644 index 0000000..1586fba --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_f77.h @@ -0,0 +1,83 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1997 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef ATLAS_F77_H +#define ATLAS_F77_H + + #ifndef ATL_F77_SUBROUTINE + #define ATL_F77_SUBROUTINE void + #endif + #ifndef F77_INTEGER + #define F77_INTEGER int + #else + #define ATL_FunkyInts + #endif + #if defined(CRAY) + #define UseTransChar 1 + #include <fortran.h> + #define F77_CHAR _fcd + #define ATL_F2C_TransChar(c) (*(_fcdtocp(c) )) + #define ATL_C2F_TransChar(c) (_cptofcd(&(c), 1)) + #elif defined(StringStructVal) + typedef struct {char *cp; F77_INTEGER len;} F77_CHAR; + #define ATL_F2C_TransChar(c) (*(c.cp)) + #define UseTransChar 2 + #elif defined(StringStructPtr) + typedef struct {char *cp; F77_INTEGER len;} F77_CHAR; + #define ATL_F2C_TransChar(c) (*(c->cp)) + #define UseTransChar 3 + #else + #define ATL_DeclareSlens + #define F77_CHAR char * + #define ATL_F2C_TransChar(c) (*(c)) + #define ATL_C2F_TransChar(c) (&(c)) + #define ATL_STRLEN_1 ,F77_INTEGER ATL_Slen1 + #define ATL_STRLEN_2 ,F77_INTEGER ATL_Slen1, F77_INTEGER ATL_Slen2 + #define ATL_STRLEN_3 ,F77_INTEGER ATL_Slen1, F77_INTEGER ATL_Slen2, \ + F77_INTEGER ATL_Slen3 + #define ATL_STRLEN_4 ,F77_INTEGER ATL_Slen1, F77_INTEGER ATL_Slen2, \ + F77_INTEGER ATL_Slen3, F77_INTEGER ATL_Slen4 + #define ATL_STRLEN_1_para ,ATL_Slen1 + #define ATL_STRLEN_2_para ,ATL_Slen1, ATL_Slen2 + #define ATL_STRLEN_3_para ,ATL_Slen1, ATL_Slen2, ATL_Slen3 + #define ATL_STRLEN_4_para ,ATL_Slen1, ATL_Slen2, ATL_Slen3, ATL_Slen4 + #endif + + #ifndef ATL_STRLEN_1 + #define 
ATL_STRLEN_1 + #define ATL_STRLEN_2 + #define ATL_STRLEN_3 + #define ATL_STRLEN_4 + #define ATL_STRLEN_1_para + #define ATL_STRLEN_2_para + #define ATL_STRLEN_3_para + #define ATL_STRLEN_4_para + #endif + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_f77blas.h b/kaldi_io/src/tools/ATLAS/include/atlas_f77blas.h new file mode 100644 index 0000000..a7c109d --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_f77blas.h @@ -0,0 +1,849 @@ +#ifndef ATLAS_F77_LVLS +#define ATLAS_F77_LVLS + +#include "atlas_f77.h" + +#if defined( StringSunStyle ) +#define F77_CHAR_DECL F77_CHAR /* input character*1 */ +#define F77_1_CHAR , F77_INTEGER +#define F77_2_CHAR F77_1_CHAR F77_1_CHAR +#define F77_3_CHAR F77_2_CHAR F77_1_CHAR +#define F77_4_CHAR F77_3_CHAR F77_1_CHAR +#elif defined( StringCrayStyle ) +#define F77_CHAR_DECL F77_CHAR /* input character*1 */ +#elif defined( StringStructVal ) +#define F77_CHAR_DECL F77_CHAR /* input character*1 */ +#elif defined( StringStructPtr ) +#define F77_CHAR_DECL F77_CHAR * /* input character*1 */ +#endif + +#ifndef F77_1_CHAR +#define F77_1_CHAR +#define F77_2_CHAR +#define F77_3_CHAR +#define F77_4_CHAR +#endif + +#ifndef F77_CHAR_DECL + #define F77_CHAR_DECL F77_CHAR * /* input character*1 */ +#endif + +#define F77_INT_DECL const F77_INTEGER * /* input integer */ + +#ifdef TREAL +#define F77_SIN_DECL const TYPE * /* input scalar */ +#define F77_SINOUT_DECL TYPE * /* input/output scalar */ +#define F77_RIN_DECL const TYPE * /* input real scalar */ +#define F77_RINOUT_DECL TYPE * /* input/output real scalar */ +#else +#define F77_SIN_DECL const TYPE * /* input scalar */ +#define F77_SINOUT_DECL TYPE * /* input/output scalar */ +#define F77_RIN_DECL const TYPE * /* input real scalar */ +#define F77_RINOUT_DECL TYPE * /* input/output real scalar */ +#endif + +#define F77_VIN_DECL const TYPE * /* input vector */ +#define F77_VINOUT_DECL TYPE * /* input/output matrix */ + +#define F77_MIN_DECL const TYPE * /* input matrix */ 
+#define F77_MINOUT_DECL TYPE * /* input/output matrix */ + +#if defined( CRAY ) +#define F77_VOID_FUN extern fortran void /* subroutine */ +#define F77_INT_FUN extern fortran int /* integer function */ +#define F77_TYPE_FUN extern fortran TYPE /* real function */ +#define F77_DBLE_FUN extern fortran double /* dble function */ +#else +#define F77_VOID_FUN extern void /* subroutine */ +#define F77_INT_FUN extern int /* integer function */ +#define F77_TYPE_FUN extern TYPE /* real function */ +#define F77_DBLE_FUN extern double /* dble function */ +#endif + +#if defined( NoChange ) +/* + * These defines set up the naming scheme required to have a FORTRAN + * routine called by a C routine with the following FORTRAN to C inter- + * face: + * + * FORTRAN DECLARATION C CALL + * SUBROUTINE DGEMM(...) dgemm(...) + */ +#if defined( SREAL ) + +#define F77rotg srotg +#define F77rotmg srotmg +#define F77nrm2 swrapnrm2 +#define F77asum swrapasum +#define F77amax isamax +#define F77scal sscal +#define F77axpy saxpy +#define F77axpby fatlas_saxpby +#define F77set fatlas_sset +#define F77copy scopy +#define F77swap sswap +#define F77rot srot +#define F77rotm srotm +#define F77dot swrapdot +#define F77dsdot dswrapdot +#define F77sdsdot sdswrapdot + +#define F77gemv sgemv +#define F77gbmv sgbmv +#define F77sbmv ssbmv +#define F77spmv sspmv +#define F77symv ssymv +#define F77tbmv stbmv +#define F77tpmv stpmv +#define F77trmv strmv +#define F77tbsv stbsv +#define F77tpsv stpsv +#define F77trsv strsv +#define F77ger sger +#define F77spr sspr +#define F77syr ssyr +#define F77spr2 sspr2 +#define F77syr2 ssyr2 + +#define F77gemm sgemm +#define F77symm ssymm +#define F77syrk ssyrk +#define F77syr2k ssyr2k +#define F77trmm strmm +#define F77trsm strsm + +#elif defined( DREAL ) + +#define F77rotg drotg +#define F77rotmg drotmg +#define F77nrm2 dwrapnrm2 +#define F77asum dwrapasum +#define F77amax idamax +#define F77scal dscal +#define F77axpy daxpy +#define F77axpby fatlas_daxpby +#define 
F77set fatlas_dset +#define F77copy dcopy +#define F77swap dswap +#define F77rot drot +#define F77rotm drotm +#define F77dot dwrapdot + +#define F77gemv dgemv +#define F77gbmv dgbmv +#define F77sbmv dsbmv +#define F77spmv dspmv +#define F77symv dsymv +#define F77tbmv dtbmv +#define F77tpmv dtpmv +#define F77trmv dtrmv +#define F77tbsv dtbsv +#define F77tpsv dtpsv +#define F77trsv dtrsv +#define F77ger dger +#define F77spr dspr +#define F77syr dsyr +#define F77spr2 dspr2 +#define F77syr2 dsyr2 + +#define F77gemm dgemm +#define F77symm dsymm +#define F77syrk dsyrk +#define F77syr2k dsyr2k +#define F77trmm dtrmm +#define F77trsm dtrsm + +#elif defined( SCPLX ) + +#define F77rotg crotg +#define F77nrm2 scwrapnrm2 +#define F77asum scwrapasum +#define F77amax icamax +#define F77scal cscal +#define F77rscal csscal +#define F77axpy caxpy +#define F77axpby fatlas_caxpby +#define F77set fatlas_cset +#define F77copy ccopy +#define F77swap cswap +#define F77rot csrot +#define F77dotc cwrapdotc +#define F77dotu cwrapdotu + +#define F77gbmv cgbmv +#define F77gemv cgemv +#define F77hbmv chbmv +#define F77hpmv chpmv +#define F77hemv chemv +#define F77tbmv ctbmv +#define F77tpmv ctpmv +#define F77trmv ctrmv +#define F77tbsv ctbsv +#define F77tpsv ctpsv +#define F77trsv ctrsv +#define F77gerc cgerc +#define F77geru cgeru +#define F77hpr chpr +#define F77her cher +#define F77hpr2 chpr2 +#define F77her2 cher2 + +#define F77gemm cgemm +#define F77hemm chemm +#define F77herk cherk +#define F77her2k cher2k +#define F77symm csymm +#define F77syrk csyrk +#define F77syr2k csyr2k +#define F77trmm ctrmm +#define F77trsm ctrsm + +#elif defined( DCPLX ) + +#define F77rotg zrotg +#define F77nrm2 dzwrapnrm2 +#define F77asum dzwrapasum +#define F77amax izamax +#define F77scal zscal +#define F77rscal zdscal +#define F77axpy zaxpy +#define F77axpby fatlas_zaxpby +#define F77set fatlas_zset +#define F77copy zcopy +#define F77swap zswap +#define F77rot zdrot +#define F77dotc zwrapdotc +#define F77dotu 
zwrapdotu + +#define F77gbmv zgbmv +#define F77gemv zgemv +#define F77hbmv zhbmv +#define F77hpmv zhpmv +#define F77hemv zhemv +#define F77tbmv ztbmv +#define F77tpmv ztpmv +#define F77trmv ztrmv +#define F77tbsv ztbsv +#define F77tpsv ztpsv +#define F77trsv ztrsv +#define F77gerc zgerc +#define F77geru zgeru +#define F77hpr zhpr +#define F77her zher +#define F77hpr2 zhpr2 +#define F77her2 zher2 + +#define F77gemm zgemm +#define F77hemm zhemm +#define F77herk zherk +#define F77her2k zher2k +#define F77symm zsymm +#define F77syrk zsyrk +#define F77syr2k zsyr2k +#define F77trmm ztrmm +#define F77trsm ztrsm + +#endif + +#elif defined( UpCase ) +/* + * These defines set up the naming scheme required to have a FORTRAN + * routine called by a C routine with the following FORTRAN to C inter- + * face: + * + * FORTRAN DECLARATION C CALL + * SUBROUTINE DGEMM(...) DGEMM(...) + */ +#if defined( SREAL ) + +#define F77rotg SROTG +#define F77rotmg SROTMG +#define F77nrm2 SWRAPNRM2 +#define F77asum SWRAPASUM +#define F77amax ISAMAX +#define F77scal SSCAL +#define F77axpy SAXPY +#define F77axpby FATLAS_SAXPBY +#define F77set FATLAS_SSET +#define F77copy SCOPY +#define F77swap SSWAP +#define F77rot SROT +#define F77rotm SROTM +#define F77dot SWRAPDOT +#define F77dsdot DSWRAPDOT +#define F77sdsdot SDSWRAPDOT + +#define F77gemv SGEMV +#define F77gbmv SGBMV +#define F77sbmv SSBMV +#define F77spmv SSPMV +#define F77symv SSYMV +#define F77tbmv STBMV +#define F77tpmv STPMV +#define F77trmv STRMV +#define F77tbsv STBSV +#define F77tpsv STPSV +#define F77trsv STRSV +#define F77ger SGER +#define F77spr SSPR +#define F77syr SSYR +#define F77spr2 SSPR2 +#define F77syr2 SSYR2 + +#define F77gemm SGEMM +#define F77symm SSYMM +#define F77syrk SSYRK +#define F77syr2k SSYR2K +#define F77trmm STRMM +#define F77trsm STRSM + +#elif defined( DREAL ) + +#define F77rotg DROTG +#define F77rotmg DROTMG +#define F77nrm2 DWRAPNRM2 +#define F77asum DWRAPASUM +#define F77amax IDAMAX +#define F77scal DSCAL 
+#define F77axpy DAXPY +#define F77axpby FATLAS_DAXPBY +#define F77set FATLAS_DSET +#define F77copy DCOPY +#define F77swap DSWAP +#define F77rot DROT +#define F77rotm DROTM +#define F77dot DWRAPDOT + +#define F77gemv DGEMV +#define F77gbmv DGBMV +#define F77sbmv DSBMV +#define F77spmv DSPMV +#define F77symv DSYMV +#define F77tbmv DTBMV +#define F77tpmv DTPMV +#define F77trmv DTRMV +#define F77tbsv DTBSV +#define F77tpsv DTPSV +#define F77trsv DTRSV +#define F77ger DGER +#define F77spr DSPR +#define F77syr DSYR +#define F77spr2 DSPR2 +#define F77syr2 DSYR2 + +#define F77gemm DGEMM +#define F77symm DSYMM +#define F77syrk DSYRK +#define F77syr2k DSYR2K +#define F77trmm DTRMM +#define F77trsm DTRSM + +#elif defined( SCPLX ) + +#define F77rotg CROTG +#define F77nrm2 SCWRAPNRM2 +#define F77asum SCWRAPASUM +#define F77amax ICAMAX +#define F77scal CSCAL +#define F77rscal CSSCAL +#define F77axpy CAXPY +#define F77axpby FATLAS_CAXPBY +#define F77set FATLAS_CSET +#define F77copy CCOPY +#define F77swap CSWAP +#define F77rot CSROT +#define F77dotc CWRAPDOTC +#define F77dotu CWRAPDOTU + +#define F77gbmv CGBMV +#define F77gemv CGEMV +#define F77hbmv CHBMV +#define F77hpmv CHPMV +#define F77hemv CHEMV +#define F77tbmv CTBMV +#define F77tpmv CTPMV +#define F77trmv CTRMV +#define F77tbsv CTBSV +#define F77tpsv CTPSV +#define F77trsv CTRSV +#define F77gerc CGERC +#define F77geru CGERU +#define F77hpr CHPR +#define F77her CHER +#define F77hpr2 CHPR2 +#define F77her2 CHER2 + +#define F77gemm CGEMM +#define F77hemm CHEMM +#define F77herk CHERK +#define F77her2k CHER2K +#define F77symm CSYMM +#define F77syrk CSYRK +#define F77syr2k CSYR2K +#define F77trmm CTRMM +#define F77trsm CTRSM + +#elif defined( DCPLX ) + +#define F77rotg ZROTG +#define F77nrm2 DZWRAPNRM2 +#define F77asum DZWRAPASUM +#define F77amax IZAMAX +#define F77scal ZSCAL +#define F77rscal ZDSCAL +#define F77axpy ZAXPY +#define F77axpby FATLAS_ZAXPBY +#define F77set FATLAS_ZSET +#define F77copy ZCOPY +#define F77swap ZSWAP 
+#define F77rot ZDROT +#define F77dotc ZWRAPDOTC +#define F77dotu ZWRAPDOTU + +#define F77gbmv ZGBMV +#define F77gemv ZGEMV +#define F77hbmv ZHBMV +#define F77hpmv ZHPMV +#define F77hemv ZHEMV +#define F77tbmv ZTBMV +#define F77tpmv ZTPMV +#define F77trmv ZTRMV +#define F77tbsv ZTBSV +#define F77tpsv ZTPSV +#define F77trsv ZTRSV +#define F77gerc ZGERC +#define F77geru ZGERU +#define F77hpr ZHPR +#define F77her ZHER +#define F77hpr2 ZHPR2 +#define F77her2 ZHER2 + +#define F77gemm ZGEMM +#define F77hemm ZHEMM +#define F77herk ZHERK +#define F77her2k ZHER2K +#define F77symm ZSYMM +#define F77syrk ZSYRK +#define F77syr2k ZSYR2K +#define F77trmm ZTRMM +#define F77trsm ZTRSM + +#endif + +#elif defined( Add_ ) || defined( Add__ ) +/* + * These defines set up the naming scheme required to have a FORTRAN + * routine called by a C routine with the following FORTRAN to C inter- + * face: + * + * FORTRAN DECLARATION C CALL + * SUBROUTINE DGEMM(...) dgemm_(...) + */ +#if defined( SREAL ) + +#define F77rotg srotg_ +#define F77rotmg srotmg_ +#define F77nrm2 swrapnrm2_ +#define F77asum swrapasum_ +#define F77amax isamax_ +#define F77scal sscal_ +#define F77axpy saxpy_ +#ifdef Add_ + #define F77axpby fatlas_saxpby_ + #define F77set fatlas_sset_ +#else + #define F77axpby fatlas_saxpby__ + #define F77set fatlas_sset__ +#endif +#define F77copy scopy_ +#define F77swap sswap_ +#define F77rot srot_ +#define F77rotm srotm_ +#define F77dot swrapdot_ +#define F77dsdot dswrapdot_ +#define F77sdsdot sdswrapdot_ + +#define F77gemv sgemv_ +#define F77gbmv sgbmv_ +#define F77sbmv ssbmv_ +#define F77spmv sspmv_ +#define F77symv ssymv_ +#define F77tbmv stbmv_ +#define F77tpmv stpmv_ +#define F77trmv strmv_ +#define F77tbsv stbsv_ +#define F77tpsv stpsv_ +#define F77trsv strsv_ +#define F77ger sger_ +#define F77spr sspr_ +#define F77syr ssyr_ +#define F77spr2 sspr2_ +#define F77syr2 ssyr2_ + +#define F77gemm sgemm_ +#define F77symm ssymm_ +#define F77syrk ssyrk_ +#define F77syr2k ssyr2k_ +#define 
F77trmm strmm_ +#define F77trsm strsm_ + +#elif defined( DREAL ) + +#define F77rotg drotg_ +#define F77rotmg drotmg_ +#define F77nrm2 dwrapnrm2_ +#define F77asum dwrapasum_ +#define F77amax idamax_ +#define F77scal dscal_ +#define F77axpy daxpy_ +#ifdef Add_ + #define F77axpby fatlas_daxpby_ + #define F77set fatlas_dset_ +#else + #define F77axpby fatlas_daxpby__ + #define F77set fatlas_dset__ +#endif +#define F77copy dcopy_ +#define F77swap dswap_ +#define F77rot drot_ +#define F77rotm drotm_ +#define F77dot dwrapdot_ + +#define F77gemv dgemv_ +#define F77gbmv dgbmv_ +#define F77sbmv dsbmv_ +#define F77spmv dspmv_ +#define F77symv dsymv_ +#define F77tbmv dtbmv_ +#define F77tpmv dtpmv_ +#define F77trmv dtrmv_ +#define F77tbsv dtbsv_ +#define F77tpsv dtpsv_ +#define F77trsv dtrsv_ +#define F77ger dger_ +#define F77spr dspr_ +#define F77syr dsyr_ +#define F77spr2 dspr2_ +#define F77syr2 dsyr2_ + +#define F77gemm dgemm_ +#define F77symm dsymm_ +#define F77syrk dsyrk_ +#define F77syr2k dsyr2k_ +#define F77trmm dtrmm_ +#define F77trsm dtrsm_ + +#elif defined( SCPLX ) + +#define F77rotg crotg_ +#define F77nrm2 scwrapnrm2_ +#define F77asum scwrapasum_ +#define F77amax icamax_ +#define F77scal cscal_ +#define F77rscal csscal_ +#define F77axpy caxpy_ +#ifdef Add_ + #define F77axpby fatlas_caxpby_ + #define F77set fatlas_cset_ +#else + #define F77axpby fatlas_caxpby__ + #define F77set fatlas_cset__ +#endif +#define F77copy ccopy_ +#define F77swap cswap_ +#define F77rot csrot_ +#define F77dotc cwrapdotc_ +#define F77dotu cwrapdotu_ + +#define F77gbmv cgbmv_ +#define F77gemv cgemv_ +#define F77hbmv chbmv_ +#define F77hpmv chpmv_ +#define F77hemv chemv_ +#define F77tbmv ctbmv_ +#define F77tpmv ctpmv_ +#define F77trmv ctrmv_ +#define F77tbsv ctbsv_ +#define F77tpsv ctpsv_ +#define F77trsv ctrsv_ +#define F77gerc cgerc_ +#define F77geru cgeru_ +#define F77hpr chpr_ +#define F77her cher_ +#define F77hpr2 chpr2_ +#define F77her2 cher2_ + +#define F77gemm cgemm_ +#define F77hemm 
chemm_ +#define F77herk cherk_ +#define F77her2k cher2k_ +#define F77symm csymm_ +#define F77syrk csyrk_ +#define F77syr2k csyr2k_ +#define F77trmm ctrmm_ +#define F77trsm ctrsm_ + +#elif defined( DCPLX ) + +#define F77rotg zrotg_ +#define F77nrm2 dzwrapnrm2_ +#define F77asum dzwrapasum_ +#define F77amax izamax_ +#define F77scal zscal_ +#define F77rscal zdscal_ +#define F77axpy zaxpy_ +#ifdef Add_ + #define F77axpby fatlas_zaxpby_ + #define F77set fatlas_zset_ +#else + #define F77axpby fatlas_zaxpby__ + #define F77set fatlas_zset__ +#endif +#define F77copy zcopy_ +#define F77swap zswap_ +#define F77rot zdrot_ +#define F77dotc zwrapdotc_ +#define F77dotu zwrapdotu_ + +#define F77gbmv zgbmv_ +#define F77gemv zgemv_ +#define F77hbmv zhbmv_ +#define F77hpmv zhpmv_ +#define F77hemv zhemv_ +#define F77tbmv ztbmv_ +#define F77tpmv ztpmv_ +#define F77trmv ztrmv_ +#define F77tbsv ztbsv_ +#define F77tpsv ztpsv_ +#define F77trsv ztrsv_ +#define F77gerc zgerc_ +#define F77geru zgeru_ +#define F77hpr zhpr_ +#define F77her zher_ +#define F77hpr2 zhpr2_ +#define F77her2 zher2_ + +#define F77gemm zgemm_ +#define F77hemm zhemm_ +#define F77herk zherk_ +#define F77her2k zher2k_ +#define F77symm zsymm_ +#define F77syrk zsyrk_ +#define F77syr2k zsyr2k_ +#define F77trmm ztrmm_ +#define F77trsm ztrsm_ + +#endif + +#endif + +#ifdef TREAL +F77_VOID_FUN F77rotg +( F77_SINOUT_DECL, F77_SINOUT_DECL, F77_SINOUT_DECL, F77_SINOUT_DECL ); +F77_VOID_FUN F77rotmg +( F77_SINOUT_DECL, F77_SINOUT_DECL, F77_SINOUT_DECL, F77_SIN_DECL, + F77_VINOUT_DECL ); +#else +F77_VOID_FUN F77rotg +( F77_SINOUT_DECL, F77_SIN_DECL, F77_SINOUT_DECL, F77_SINOUT_DECL ); +#endif +F77_VOID_FUN F77nrm2 +( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_RINOUT_DECL ); +F77_VOID_FUN F77asum +( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_RINOUT_DECL ); +F77_INT_FUN F77amax +( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL ); +F77_VOID_FUN F77scal +( F77_INT_DECL, F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL ); +#ifdef TCPLX 
+F77_VOID_FUN F77rscal +( F77_INT_DECL, F77_RIN_DECL, F77_VINOUT_DECL, F77_INT_DECL ); +#endif +void F77set +( F77_INT_DECL, F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL ); +void F77axpby +( F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL ); +F77_VOID_FUN F77axpy +( F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_VINOUT_DECL, F77_INT_DECL ); +F77_VOID_FUN F77copy +( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VINOUT_DECL, + F77_INT_DECL ); +F77_VOID_FUN F77swap +( F77_INT_DECL, F77_VINOUT_DECL, F77_INT_DECL, F77_VINOUT_DECL, + F77_INT_DECL ); +F77_VOID_FUN F77rot +( F77_INT_DECL, F77_VINOUT_DECL, F77_INT_DECL, F77_VINOUT_DECL, + F77_INT_DECL, F77_SIN_DECL, F77_SIN_DECL ); +#ifdef TREAL +F77_VOID_FUN F77rotm +( F77_INT_DECL, F77_VINOUT_DECL, F77_INT_DECL, F77_VINOUT_DECL, + F77_INT_DECL, F77_VIN_DECL ); +#endif +#ifdef TREAL +F77_VOID_FUN F77dot +( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_SINOUT_DECL ); +#ifdef SREAL +F77_VOID_FUN F77dsdot +( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VIN_DECL, + F77_INT_DECL, double * ); +F77_VOID_FUN F77sdsdot +( F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_VIN_DECL, F77_INT_DECL, F77_SINOUT_DECL ); +#endif +#else +F77_VOID_FUN F77dotc +( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_SINOUT_DECL ); +F77_VOID_FUN F77dotu +( F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_SINOUT_DECL ); +#endif + +F77_VOID_FUN F77gbmv +( F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, F77_INT_DECL, + F77_INT_DECL, F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, + F77_VIN_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VINOUT_DECL, + F77_INT_DECL F77_1_CHAR ); +F77_VOID_FUN F77gemv +( F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, + F77_VIN_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VINOUT_DECL, + F77_INT_DECL F77_1_CHAR ); +#ifdef TREAL +F77_VOID_FUN F77ger 
+( F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL, + F77_INT_DECL ); +F77_VOID_FUN F77sbmv +( F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, + F77_MIN_DECL, F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR ); +F77_VOID_FUN F77spmv +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, + F77_MIN_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR ); +F77_VOID_FUN F77symv +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, + F77_MIN_DECL, F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR ); +F77_VOID_FUN F77spr +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_MINOUT_DECL F77_1_CHAR ); +F77_VOID_FUN F77syr +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_MINOUT_DECL, F77_INT_DECL F77_1_CHAR ); +F77_VOID_FUN F77spr2 +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL + F77_1_CHAR ); +F77_VOID_FUN F77syr2 +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL, + F77_INT_DECL F77_1_CHAR ); +#else +F77_VOID_FUN F77gerc +( F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL, + F77_INT_DECL ); +F77_VOID_FUN F77geru +( F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL, + F77_INT_DECL ); +F77_VOID_FUN F77hbmv +( F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, + F77_MIN_DECL, F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR ); +F77_VOID_FUN F77hpmv +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, + F77_MIN_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR ); 
+F77_VOID_FUN F77hemv +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, + F77_MIN_DECL, F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_VINOUT_DECL, F77_INT_DECL F77_1_CHAR ); +F77_VOID_FUN F77hpr +( F77_CHAR_DECL, F77_INT_DECL, F77_RIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_MINOUT_DECL F77_1_CHAR ); +F77_VOID_FUN F77her +( F77_CHAR_DECL, F77_INT_DECL, F77_RIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_MINOUT_DECL, F77_INT_DECL F77_1_CHAR ); +F77_VOID_FUN F77hpr2 +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL + F77_1_CHAR ); +F77_VOID_FUN F77her2 +( F77_CHAR_DECL, F77_INT_DECL, F77_SIN_DECL, F77_VIN_DECL, + F77_INT_DECL, F77_VIN_DECL, F77_INT_DECL, F77_MINOUT_DECL, + F77_INT_DECL F77_1_CHAR ); +#endif +F77_VOID_FUN F77tbmv +( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, + F77_INT_DECL, F77_MIN_DECL, F77_INT_DECL, F77_VINOUT_DECL, + F77_INT_DECL F77_3_CHAR ); +F77_VOID_FUN F77tpmv +( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, + F77_MIN_DECL, F77_VINOUT_DECL, + F77_INT_DECL F77_3_CHAR ); +F77_VOID_FUN F77trmv +( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, + F77_MIN_DECL, F77_INT_DECL, F77_VINOUT_DECL, + F77_INT_DECL F77_3_CHAR ); +F77_VOID_FUN F77tbsv +( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, + F77_INT_DECL, F77_MIN_DECL, F77_INT_DECL, F77_VINOUT_DECL, + F77_INT_DECL F77_3_CHAR ); +F77_VOID_FUN F77tpsv +( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, + F77_MIN_DECL, F77_VINOUT_DECL, + F77_INT_DECL F77_3_CHAR ); +F77_VOID_FUN F77trsv +( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, + F77_MIN_DECL, F77_INT_DECL, F77_VINOUT_DECL, + F77_INT_DECL F77_3_CHAR ); + +F77_VOID_FUN F77gemm +( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, + F77_INT_DECL, F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, + F77_MIN_DECL, F77_INT_DECL, F77_SIN_DECL, F77_MINOUT_DECL, + F77_INT_DECL F77_2_CHAR ); +F77_VOID_FUN 
F77hemm +( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_MIN_DECL, + F77_INT_DECL, F77_SIN_DECL, F77_MINOUT_DECL, F77_INT_DECL + F77_2_CHAR ); +F77_VOID_FUN F77her2k +( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_MIN_DECL, + F77_INT_DECL, F77_RIN_DECL, F77_MINOUT_DECL, F77_INT_DECL + F77_2_CHAR ); +F77_VOID_FUN F77herk +( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, + F77_RIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_RIN_DECL, + F77_MINOUT_DECL, F77_INT_DECL F77_2_CHAR ); +F77_VOID_FUN F77symm +( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_MIN_DECL, + F77_INT_DECL, F77_SIN_DECL, F77_MINOUT_DECL, F77_INT_DECL + F77_2_CHAR ); +F77_VOID_FUN F77syr2k +( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_MIN_DECL, + F77_INT_DECL, F77_SIN_DECL, F77_MINOUT_DECL, F77_INT_DECL + F77_2_CHAR ); +F77_VOID_FUN F77syrk +( F77_CHAR_DECL, F77_CHAR_DECL, F77_INT_DECL, F77_INT_DECL, + F77_SIN_DECL, F77_MIN_DECL, F77_INT_DECL, F77_SIN_DECL, + F77_MINOUT_DECL, F77_INT_DECL F77_2_CHAR ); +F77_VOID_FUN F77trmm +( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, + F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_MIN_DECL, + F77_INT_DECL, F77_MINOUT_DECL, F77_INT_DECL F77_4_CHAR ); +F77_VOID_FUN F77trsm +( F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, F77_CHAR_DECL, + F77_INT_DECL, F77_INT_DECL, F77_SIN_DECL, F77_MIN_DECL, + F77_INT_DECL, F77_MINOUT_DECL, F77_INT_DECL F77_4_CHAR ); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_f77wrap.h b/kaldi_io/src/tools/ATLAS/include/atlas_f77wrap.h new file mode 100644 index 0000000..db6099c --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_f77wrap.h @@ -0,0 +1,1088 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned 
Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. + * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_F77WRAP_H +#define ATLAS_F77WRAP_H +/* + * ===================================================================== + * Include Files + * ===================================================================== + */ +#include "atlas_misc.h" +#include "atlas_f77.h" +/* + * ===================================================================== + * Multi-threaded/reference implementation function names re-definition + * ===================================================================== + * + * Uncomment the following definition macros to call the multi-threaded + * implementation or define those macros at compile time. + * + * #define USE_L1_PTHREADS + * #define USE_L2_PTHREADS + * #define USE_L3_PTHREADS + * + * Uncomment the following definition macros to call the reference im- + * plementation or define those macros at compile time. + * + * #define USE_L1_REFERENCE + * #define USE_L2_REFERENCE + * #define USE_L3_REFERENCE + * + * ===================================================================== + */ + +#ifdef ATL_USEPTHREADS +#define USE_L3_PTHREADS +#endif + +/* + * ===================================================================== + * ATLAS Levels 1, 2 and 3 Prototypes + * ===================================================================== + */ +#if defined( USE_L1_PTHREADS ) +#include "atlas_ptalias1.h" +#include "atlas_ptlevel1.h" +#elif defined( USE_L1_REFERENCE ) +#include "atlas_refalias1.h" +#include "atlas_reflevel1.h" +#else +#include "atlas_level1.h" +#endif + +#if defined( USE_L2_PTHREADS ) +#include "atlas_ptalias2.h" +#include "atlas_ptlevel2.h" +#elif defined( USE_L2_REFERENCE ) +#include "atlas_refalias2.h" +#include "atlas_reflevel2.h" +#else +#include "atlas_level2.h" +#endif + +#if defined( USE_L3_PTHREADS ) +#include "atlas_ptalias3.h" +#include "atlas_ptlevel3.h" +#elif defined( USE_L3_REFERENCE ) +#include "atlas_refalias3.h" +#include 
"atlas_reflevel3.h" +#else +#include "atlas_level3.h" +#endif +/* + * ===================================================================== + * #define macro constants + * ===================================================================== + */ +#define PATLF77WRAP Mjoin( ATL_F77wrap_, PRE ) + +#ifdef TREAL +#define ATLPUF77WRAP Mjoin( ATL_F77wrap_, PRE ) +#define ATLUPF77WRAP Mjoin( ATL_F77wrap_, PRE ) +#else +#define ATLPUF77WRAP Mjoin( Mjoin( ATL_F77wrap_, PRE ), UPR ) +#define ATLUPF77WRAP Mjoin( Mjoin( ATL_F77wrap_, UPR ), PRE ) +#endif + +#define F77_INOTRAN 111 +#define F77_ITRAN 112 +#define F77_ICOTRAN 113 + +#define F77_IUPPER 121 +#define F77_ILOWER 122 + +#define F77_INONUNIT 131 +#define F77_IUNIT 132 + +#define F77_ILEFT 141 +#define F77_IRIGHT 142 +/* + * ===================================================================== + * #define macro functions + * ===================================================================== + * + * n_ and incx_ are pointers to an integer count and an integer stride; + * x_ is the base pointer the offset is applied to. + * + * V1N: x_ offset by (1-n)*incx when n > 0, otherwise x_ unchanged. + * VN1: x_ offset by (n-1)*incx when n > 0, otherwise x_ unchanged. + * W1N: V1N when incx < 0, otherwise x_ unchanged. + * + * In the non-TREAL branch the offset is doubled (presumably because a + * complex element spans two TYPE values -- confirm against callers). + * The doubling is written as "* 2" rather than "<< 1" because the + * offset can be negative and left-shifting a negative value is + * undefined. Every macro parameter is parenthesized so that expression + * arguments expand correctly. + */ +#ifdef TREAL +#define V1N( n_, x_, incx_ ) \ + ( (*(n_)) > 0 ? (x_)+(1-(*(n_)))*(*(incx_)) : (x_) ) +#define VN1( n_, x_, incx_ ) \ + ( (*(n_)) > 0 ? (x_)+((*(n_))-1)*(*(incx_)) : (x_) ) +#define W1N( n_, x_, incx_ ) \ + ( (*(incx_)) < 0 ? V1N( n_, x_, incx_ ) : (x_) ) +#else +#define V1N( n_, x_, incx_ ) \ + ( (*(n_)) > 0 ? (x_)+( ( (1-(*(n_)))*(*(incx_)) ) * 2 ) : (x_) ) +#define VN1( n_, x_, incx_ ) \ + ( (*(n_)) > 0 ? (x_)+( ( ((*(n_))-1)*(*(incx_)) ) * 2 ) : (x_) ) +#define W1N( n_, x_, incx_ ) \ + ( (*(incx_)) < 0 ? V1N( n_, x_, incx_ ) : (x_) ) +#endif +/* + * ===================================================================== + * FORTRAN <-> C interface + * ===================================================================== + * + * These macros identify how these wrappers will be called, as follows: + * + * Add_: the FORTRAN compiler expects the name of C functions to be + * in all lower case and to have an underscore postfixed to it (Suns, Intel + * compilers expect this). 
+ * + * NoChange: the FORTRAN compiler expects the name of C functions to be + * in all lower case (IBM RS6K compilers do this). + * + * UpCase: the FORTRAN compiler expects the name of C functions to be + * in all upcase. (Cray compilers expect this). + * + * Add__: the FORTRAN compiler in use is f2c, a FORTRAN to C conver- + * ter. + */ +#if defined( Add_ ) /* Add_ branch: each ATL_F77wrap_* macro in this branch expands to the lower-case name with one trailing underscore */ +/* + * These defines set up the naming scheme required to have a FORTRAN + * routine calling a C routine. + * + * FORTRAN CALL C declaration + * CALL ATL_F77WRAP_SGEMM(...) void atl_f77wrap_sgemm_(...) + * + * This is the default. + */ +#if defined( SREAL ) /* SREAL: s-prefixed names plus isamax, dsdot and sdsdot */ + +#define ATL_F77wrap_srotg atl_f77wrap_srotg_ +#define ATL_F77wrap_srotmg atl_f77wrap_srotmg_ +#define ATL_F77wrap_snrm2 atl_f77wrap_snrm2_ +#define ATL_F77wrap_sasum atl_f77wrap_sasum_ +#define ATL_F77wrap_sscal atl_f77wrap_sscal_ +#define ATL_F77wrap_isamax atl_f77wrap_isamax_ +#define ATL_F77wrap_saxpy atl_f77wrap_saxpy_ +#define ATL_F77wrap_scopy atl_f77wrap_scopy_ +#define ATL_F77wrap_sswap atl_f77wrap_sswap_ +#define ATL_F77wrap_srot atl_f77wrap_srot_ +#define ATL_F77wrap_srotm atl_f77wrap_srotm_ +#define ATL_F77wrap_sdot atl_f77wrap_sdot_ +#define ATL_F77wrap_dsdot atl_f77wrap_dsdot_ +#define ATL_F77wrap_sdsdot atl_f77wrap_sdsdot_ + +#define ATL_F77wrap_sgbmv atl_f77wrap_sgbmv_ +#define ATL_F77wrap_sgemv atl_f77wrap_sgemv_ +#define ATL_F77wrap_sger atl_f77wrap_sger_ +#define ATL_F77wrap_ssbmv atl_f77wrap_ssbmv_ +#define ATL_F77wrap_sspmv atl_f77wrap_sspmv_ +#define ATL_F77wrap_ssymv atl_f77wrap_ssymv_ +#define ATL_F77wrap_sspr atl_f77wrap_sspr_ +#define ATL_F77wrap_ssyr atl_f77wrap_ssyr_ +#define ATL_F77wrap_sspr2 atl_f77wrap_sspr2_ +#define ATL_F77wrap_ssyr2 atl_f77wrap_ssyr2_ +#define ATL_F77wrap_stbmv atl_f77wrap_stbmv_ +#define ATL_F77wrap_stpmv atl_f77wrap_stpmv_ +#define ATL_F77wrap_strmv atl_f77wrap_strmv_ +#define ATL_F77wrap_stbsv atl_f77wrap_stbsv_ +#define ATL_F77wrap_stpsv atl_f77wrap_stpsv_ +#define ATL_F77wrap_strsv
atl_f77wrap_strsv_ + +#define ATL_F77wrap_sgemm atl_f77wrap_sgemm_ +#define ATL_F77wrap_ssymm atl_f77wrap_ssymm_ +#define ATL_F77wrap_ssyrk atl_f77wrap_ssyrk_ +#define ATL_F77wrap_ssyr2k atl_f77wrap_ssyr2k_ +#define ATL_F77wrap_strmm atl_f77wrap_strmm_ +#define ATL_F77wrap_strsm atl_f77wrap_strsm_ + +#elif defined( DREAL ) /* DREAL: d-prefixed names plus idamax */ + +#define ATL_F77wrap_drotg atl_f77wrap_drotg_ +#define ATL_F77wrap_drotmg atl_f77wrap_drotmg_ +#define ATL_F77wrap_dnrm2 atl_f77wrap_dnrm2_ +#define ATL_F77wrap_dasum atl_f77wrap_dasum_ +#define ATL_F77wrap_dscal atl_f77wrap_dscal_ +#define ATL_F77wrap_idamax atl_f77wrap_idamax_ +#define ATL_F77wrap_daxpy atl_f77wrap_daxpy_ +#define ATL_F77wrap_dcopy atl_f77wrap_dcopy_ +#define ATL_F77wrap_dswap atl_f77wrap_dswap_ +#define ATL_F77wrap_drot atl_f77wrap_drot_ +#define ATL_F77wrap_drotm atl_f77wrap_drotm_ +#define ATL_F77wrap_ddot atl_f77wrap_ddot_ + +#define ATL_F77wrap_dgbmv atl_f77wrap_dgbmv_ +#define ATL_F77wrap_dgemv atl_f77wrap_dgemv_ +#define ATL_F77wrap_dger atl_f77wrap_dger_ +#define ATL_F77wrap_dsbmv atl_f77wrap_dsbmv_ +#define ATL_F77wrap_dspmv atl_f77wrap_dspmv_ +#define ATL_F77wrap_dsymv atl_f77wrap_dsymv_ +#define ATL_F77wrap_dspr atl_f77wrap_dspr_ +#define ATL_F77wrap_dsyr atl_f77wrap_dsyr_ +#define ATL_F77wrap_dspr2 atl_f77wrap_dspr2_ +#define ATL_F77wrap_dsyr2 atl_f77wrap_dsyr2_ +#define ATL_F77wrap_dtbmv atl_f77wrap_dtbmv_ +#define ATL_F77wrap_dtpmv atl_f77wrap_dtpmv_ +#define ATL_F77wrap_dtrmv atl_f77wrap_dtrmv_ +#define ATL_F77wrap_dtbsv atl_f77wrap_dtbsv_ +#define ATL_F77wrap_dtpsv atl_f77wrap_dtpsv_ +#define ATL_F77wrap_dtrsv atl_f77wrap_dtrsv_ + +#define ATL_F77wrap_dgemm atl_f77wrap_dgemm_ +#define ATL_F77wrap_dsymm atl_f77wrap_dsymm_ +#define ATL_F77wrap_dsyrk atl_f77wrap_dsyrk_ +#define ATL_F77wrap_dsyr2k atl_f77wrap_dsyr2k_ +#define ATL_F77wrap_dtrmm atl_f77wrap_dtrmm_ +#define ATL_F77wrap_dtrsm atl_f77wrap_dtrsm_ + +#elif defined( SCPLX ) /* SCPLX: c-prefixed names plus scnrm2, scasum and icamax */ + +#define ATL_F77wrap_crotg atl_f77wrap_crotg_ +#define ATL_F77wrap_scnrm2
atl_f77wrap_scnrm2_ +#define ATL_F77wrap_scasum atl_f77wrap_scasum_ +#define ATL_F77wrap_cscal atl_f77wrap_cscal_ +#define ATL_F77wrap_csscal atl_f77wrap_csscal_ +#define ATL_F77wrap_icamax atl_f77wrap_icamax_ +#define ATL_F77wrap_caxpy atl_f77wrap_caxpy_ +#define ATL_F77wrap_ccopy atl_f77wrap_ccopy_ +#define ATL_F77wrap_cswap atl_f77wrap_cswap_ +#define ATL_F77wrap_csrot atl_f77wrap_csrot_ +#define ATL_F77wrap_cdotc atl_f77wrap_cdotc_ +#define ATL_F77wrap_cdotu atl_f77wrap_cdotu_ + +#define ATL_F77wrap_cgbmv atl_f77wrap_cgbmv_ +#define ATL_F77wrap_cgemv atl_f77wrap_cgemv_ +#define ATL_F77wrap_cgerc atl_f77wrap_cgerc_ +#define ATL_F77wrap_cgeru atl_f77wrap_cgeru_ +#define ATL_F77wrap_chbmv atl_f77wrap_chbmv_ +#define ATL_F77wrap_chpmv atl_f77wrap_chpmv_ +#define ATL_F77wrap_chemv atl_f77wrap_chemv_ +#define ATL_F77wrap_chpr atl_f77wrap_chpr_ +#define ATL_F77wrap_cher atl_f77wrap_cher_ +#define ATL_F77wrap_chpr2 atl_f77wrap_chpr2_ +#define ATL_F77wrap_cher2 atl_f77wrap_cher2_ +#define ATL_F77wrap_ctbmv atl_f77wrap_ctbmv_ +#define ATL_F77wrap_ctpmv atl_f77wrap_ctpmv_ +#define ATL_F77wrap_ctrmv atl_f77wrap_ctrmv_ +#define ATL_F77wrap_ctbsv atl_f77wrap_ctbsv_ +#define ATL_F77wrap_ctpsv atl_f77wrap_ctpsv_ +#define ATL_F77wrap_ctrsv atl_f77wrap_ctrsv_ + +#define ATL_F77wrap_cgemm atl_f77wrap_cgemm_ +#define ATL_F77wrap_chemm atl_f77wrap_chemm_ +#define ATL_F77wrap_cherk atl_f77wrap_cherk_ +#define ATL_F77wrap_cher2k atl_f77wrap_cher2k_ +#define ATL_F77wrap_csymm atl_f77wrap_csymm_ +#define ATL_F77wrap_csyrk atl_f77wrap_csyrk_ +#define ATL_F77wrap_csyr2k atl_f77wrap_csyr2k_ +#define ATL_F77wrap_ctrmm atl_f77wrap_ctrmm_ +#define ATL_F77wrap_ctrsm atl_f77wrap_ctrsm_ + +#elif defined( DCPLX ) /* DCPLX: z-prefixed names plus dznrm2, dzasum and izamax */ + +#define ATL_F77wrap_zrotg atl_f77wrap_zrotg_ +#define ATL_F77wrap_dznrm2 atl_f77wrap_dznrm2_ +#define ATL_F77wrap_dzasum atl_f77wrap_dzasum_ +#define ATL_F77wrap_zscal atl_f77wrap_zscal_ +#define ATL_F77wrap_zdscal atl_f77wrap_zdscal_ +#define ATL_F77wrap_izamax atl_f77wrap_izamax_
+#define ATL_F77wrap_zaxpy atl_f77wrap_zaxpy_ +#define ATL_F77wrap_zcopy atl_f77wrap_zcopy_ +#define ATL_F77wrap_zswap atl_f77wrap_zswap_ +#define ATL_F77wrap_zdrot atl_f77wrap_zdrot_ +#define ATL_F77wrap_zdotc atl_f77wrap_zdotc_ +#define ATL_F77wrap_zdotu atl_f77wrap_zdotu_ + +#define ATL_F77wrap_zgbmv atl_f77wrap_zgbmv_ +#define ATL_F77wrap_zgemv atl_f77wrap_zgemv_ +#define ATL_F77wrap_zgerc atl_f77wrap_zgerc_ +#define ATL_F77wrap_zgeru atl_f77wrap_zgeru_ +#define ATL_F77wrap_zhbmv atl_f77wrap_zhbmv_ +#define ATL_F77wrap_zhpmv atl_f77wrap_zhpmv_ +#define ATL_F77wrap_zhemv atl_f77wrap_zhemv_ +#define ATL_F77wrap_zhpr atl_f77wrap_zhpr_ +#define ATL_F77wrap_zher atl_f77wrap_zher_ +#define ATL_F77wrap_zhpr2 atl_f77wrap_zhpr2_ +#define ATL_F77wrap_zher2 atl_f77wrap_zher2_ +#define ATL_F77wrap_ztbmv atl_f77wrap_ztbmv_ +#define ATL_F77wrap_ztpmv atl_f77wrap_ztpmv_ +#define ATL_F77wrap_ztrmv atl_f77wrap_ztrmv_ +#define ATL_F77wrap_ztbsv atl_f77wrap_ztbsv_ +#define ATL_F77wrap_ztpsv atl_f77wrap_ztpsv_ +#define ATL_F77wrap_ztrsv atl_f77wrap_ztrsv_ + +#define ATL_F77wrap_zgemm atl_f77wrap_zgemm_ +#define ATL_F77wrap_zhemm atl_f77wrap_zhemm_ +#define ATL_F77wrap_zherk atl_f77wrap_zherk_ +#define ATL_F77wrap_zher2k atl_f77wrap_zher2k_ +#define ATL_F77wrap_zsymm atl_f77wrap_zsymm_ +#define ATL_F77wrap_zsyrk atl_f77wrap_zsyrk_ +#define ATL_F77wrap_zsyr2k atl_f77wrap_zsyr2k_ +#define ATL_F77wrap_ztrmm atl_f77wrap_ztrmm_ +#define ATL_F77wrap_ztrsm atl_f77wrap_ztrsm_ + +#endif /* closes the SREAL / DREAL / SCPLX / DCPLX chain of the Add_ branch */ + +#elif defined( UpCase ) +/* + * These defines set up the naming scheme required to have a FORTRAN + * routine calling a C routine with the following interface: + * + * FORTRAN CALL C declaration + * CALL ATL_F77WRAP_SGEMM(...) void ATL_F77WRAP_SGEMM(...)
+ * + */ +#if defined( SREAL ) /* UpCase branch, SREAL: s-prefixed names plus isamax, dsdot and sdsdot; each macro in this branch expands to its own name in all upper case */ + +#define ATL_F77wrap_srotg ATL_F77WRAP_SROTG +#define ATL_F77wrap_srotmg ATL_F77WRAP_SROTMG +#define ATL_F77wrap_snrm2 ATL_F77WRAP_SNRM2 +#define ATL_F77wrap_sasum ATL_F77WRAP_SASUM +#define ATL_F77wrap_sscal ATL_F77WRAP_SSCAL +#define ATL_F77wrap_isamax ATL_F77WRAP_ISAMAX +#define ATL_F77wrap_saxpy ATL_F77WRAP_SAXPY +#define ATL_F77wrap_scopy ATL_F77WRAP_SCOPY +#define ATL_F77wrap_sswap ATL_F77WRAP_SSWAP +#define ATL_F77wrap_srot ATL_F77WRAP_SROT +#define ATL_F77wrap_srotm ATL_F77WRAP_SROTM +#define ATL_F77wrap_sdot ATL_F77WRAP_SDOT +#define ATL_F77wrap_dsdot ATL_F77WRAP_DSDOT +#define ATL_F77wrap_sdsdot ATL_F77WRAP_SDSDOT + +#define ATL_F77wrap_sgbmv ATL_F77WRAP_SGBMV +#define ATL_F77wrap_sgemv ATL_F77WRAP_SGEMV +#define ATL_F77wrap_sger ATL_F77WRAP_SGER +#define ATL_F77wrap_ssbmv ATL_F77WRAP_SSBMV +#define ATL_F77wrap_sspmv ATL_F77WRAP_SSPMV +#define ATL_F77wrap_ssymv ATL_F77WRAP_SSYMV +#define ATL_F77wrap_sspr ATL_F77WRAP_SSPR +#define ATL_F77wrap_ssyr ATL_F77WRAP_SSYR +#define ATL_F77wrap_sspr2 ATL_F77WRAP_SSPR2 +#define ATL_F77wrap_ssyr2 ATL_F77WRAP_SSYR2 +#define ATL_F77wrap_stbmv ATL_F77WRAP_STBMV +#define ATL_F77wrap_stpmv ATL_F77WRAP_STPMV +#define ATL_F77wrap_strmv ATL_F77WRAP_STRMV +#define ATL_F77wrap_stbsv ATL_F77WRAP_STBSV +#define ATL_F77wrap_stpsv ATL_F77WRAP_STPSV +#define ATL_F77wrap_strsv ATL_F77WRAP_STRSV + +#define ATL_F77wrap_sgemm ATL_F77WRAP_SGEMM +#define ATL_F77wrap_ssymm ATL_F77WRAP_SSYMM +#define ATL_F77wrap_ssyrk ATL_F77WRAP_SSYRK +#define ATL_F77wrap_ssyr2k ATL_F77WRAP_SSYR2K +#define ATL_F77wrap_strmm ATL_F77WRAP_STRMM +#define ATL_F77wrap_strsm ATL_F77WRAP_STRSM + +#elif defined( DREAL ) /* DREAL: d-prefixed names plus idamax */ + +#define ATL_F77wrap_drotg ATL_F77WRAP_DROTG +#define ATL_F77wrap_drotmg ATL_F77WRAP_DROTMG +#define ATL_F77wrap_dnrm2 ATL_F77WRAP_DNRM2 +#define ATL_F77wrap_dasum ATL_F77WRAP_DASUM +#define ATL_F77wrap_dscal ATL_F77WRAP_DSCAL +#define ATL_F77wrap_idamax ATL_F77WRAP_IDAMAX +#define ATL_F77wrap_daxpy
ATL_F77WRAP_DAXPY +#define ATL_F77wrap_dcopy ATL_F77WRAP_DCOPY +#define ATL_F77wrap_dswap ATL_F77WRAP_DSWAP +#define ATL_F77wrap_drot ATL_F77WRAP_DROT +#define ATL_F77wrap_drotm ATL_F77WRAP_DROTM +#define ATL_F77wrap_ddot ATL_F77WRAP_DDOT + +#define ATL_F77wrap_dgbmv ATL_F77WRAP_DGBMV +#define ATL_F77wrap_dgemv ATL_F77WRAP_DGEMV +#define ATL_F77wrap_dger ATL_F77WRAP_DGER +#define ATL_F77wrap_dsbmv ATL_F77WRAP_DSBMV +#define ATL_F77wrap_dspmv ATL_F77WRAP_DSPMV +#define ATL_F77wrap_dsymv ATL_F77WRAP_DSYMV +#define ATL_F77wrap_dspr ATL_F77WRAP_DSPR +#define ATL_F77wrap_dsyr ATL_F77WRAP_DSYR +#define ATL_F77wrap_dspr2 ATL_F77WRAP_DSPR2 +#define ATL_F77wrap_dsyr2 ATL_F77WRAP_DSYR2 +#define ATL_F77wrap_dtbmv ATL_F77WRAP_DTBMV +#define ATL_F77wrap_dtpmv ATL_F77WRAP_DTPMV +#define ATL_F77wrap_dtrmv ATL_F77WRAP_DTRMV +#define ATL_F77wrap_dtbsv ATL_F77WRAP_DTBSV +#define ATL_F77wrap_dtpsv ATL_F77WRAP_DTPSV +#define ATL_F77wrap_dtrsv ATL_F77WRAP_DTRSV + +#define ATL_F77wrap_dgemm ATL_F77WRAP_DGEMM +#define ATL_F77wrap_dsymm ATL_F77WRAP_DSYMM +#define ATL_F77wrap_dsyrk ATL_F77WRAP_DSYRK +#define ATL_F77wrap_dsyr2k ATL_F77WRAP_DSYR2K +#define ATL_F77wrap_dtrmm ATL_F77WRAP_DTRMM +#define ATL_F77wrap_dtrsm ATL_F77WRAP_DTRSM + +#elif defined( SCPLX ) /* SCPLX: c-prefixed names plus scnrm2, scasum and icamax */ + +#define ATL_F77wrap_crotg ATL_F77WRAP_CROTG +#define ATL_F77wrap_scnrm2 ATL_F77WRAP_SCNRM2 +#define ATL_F77wrap_scasum ATL_F77WRAP_SCASUM +#define ATL_F77wrap_cscal ATL_F77WRAP_CSCAL +#define ATL_F77wrap_csscal ATL_F77WRAP_CSSCAL +#define ATL_F77wrap_icamax ATL_F77WRAP_ICAMAX +#define ATL_F77wrap_caxpy ATL_F77WRAP_CAXPY +#define ATL_F77wrap_ccopy ATL_F77WRAP_CCOPY +#define ATL_F77wrap_cswap ATL_F77WRAP_CSWAP +#define ATL_F77wrap_csrot ATL_F77WRAP_CSROT +#define ATL_F77wrap_cdotc ATL_F77WRAP_CDOTC +#define ATL_F77wrap_cdotu ATL_F77WRAP_CDOTU + +#define ATL_F77wrap_cgbmv ATL_F77WRAP_CGBMV +#define ATL_F77wrap_cgemv ATL_F77WRAP_CGEMV +#define ATL_F77wrap_cgerc ATL_F77WRAP_CGERC +#define ATL_F77wrap_cgeru ATL_F77WRAP_CGERU +#define
ATL_F77wrap_chbmv ATL_F77WRAP_CHBMV +#define ATL_F77wrap_chpmv ATL_F77WRAP_CHPMV +#define ATL_F77wrap_chemv ATL_F77WRAP_CHEMV +#define ATL_F77wrap_chpr ATL_F77WRAP_CHPR +#define ATL_F77wrap_cher ATL_F77WRAP_CHER +#define ATL_F77wrap_chpr2 ATL_F77WRAP_CHPR2 +#define ATL_F77wrap_cher2 ATL_F77WRAP_CHER2 +#define ATL_F77wrap_ctbmv ATL_F77WRAP_CTBMV +#define ATL_F77wrap_ctpmv ATL_F77WRAP_CTPMV +#define ATL_F77wrap_ctrmv ATL_F77WRAP_CTRMV +#define ATL_F77wrap_ctbsv ATL_F77WRAP_CTBSV +#define ATL_F77wrap_ctpsv ATL_F77WRAP_CTPSV +#define ATL_F77wrap_ctrsv ATL_F77WRAP_CTRSV + +#define ATL_F77wrap_cgemm ATL_F77WRAP_CGEMM +#define ATL_F77wrap_chemm ATL_F77WRAP_CHEMM +#define ATL_F77wrap_cherk ATL_F77WRAP_CHERK +#define ATL_F77wrap_cher2k ATL_F77WRAP_CHER2K +#define ATL_F77wrap_csymm ATL_F77WRAP_CSYMM +#define ATL_F77wrap_csyrk ATL_F77WRAP_CSYRK +#define ATL_F77wrap_csyr2k ATL_F77WRAP_CSYR2K +#define ATL_F77wrap_ctrmm ATL_F77WRAP_CTRMM +#define ATL_F77wrap_ctrsm ATL_F77WRAP_CTRSM + +#elif defined( DCPLX ) /* DCPLX: z-prefixed names plus dznrm2, dzasum and izamax */ + +#define ATL_F77wrap_zrotg ATL_F77WRAP_ZROTG +#define ATL_F77wrap_dznrm2 ATL_F77WRAP_DZNRM2 +#define ATL_F77wrap_dzasum ATL_F77WRAP_DZASUM +#define ATL_F77wrap_zscal ATL_F77WRAP_ZSCAL +#define ATL_F77wrap_zdscal ATL_F77WRAP_ZDSCAL +#define ATL_F77wrap_izamax ATL_F77WRAP_IZAMAX +#define ATL_F77wrap_zaxpy ATL_F77WRAP_ZAXPY +#define ATL_F77wrap_zcopy ATL_F77WRAP_ZCOPY +#define ATL_F77wrap_zswap ATL_F77WRAP_ZSWAP +#define ATL_F77wrap_zdrot ATL_F77WRAP_ZDROT +#define ATL_F77wrap_zdotc ATL_F77WRAP_ZDOTC +#define ATL_F77wrap_zdotu ATL_F77WRAP_ZDOTU + +#define ATL_F77wrap_zgbmv ATL_F77WRAP_ZGBMV +#define ATL_F77wrap_zgemv ATL_F77WRAP_ZGEMV +#define ATL_F77wrap_zgerc ATL_F77WRAP_ZGERC +#define ATL_F77wrap_zgeru ATL_F77WRAP_ZGERU +#define ATL_F77wrap_zhbmv ATL_F77WRAP_ZHBMV +#define ATL_F77wrap_zhpmv ATL_F77WRAP_ZHPMV +#define ATL_F77wrap_zhemv ATL_F77WRAP_ZHEMV +#define ATL_F77wrap_zhpr ATL_F77WRAP_ZHPR +#define ATL_F77wrap_zher ATL_F77WRAP_ZHER +#define ATL_F77wrap_zhpr2
ATL_F77WRAP_ZHPR2 +#define ATL_F77wrap_zher2 ATL_F77WRAP_ZHER2 +#define ATL_F77wrap_ztbmv ATL_F77WRAP_ZTBMV +#define ATL_F77wrap_ztpmv ATL_F77WRAP_ZTPMV +#define ATL_F77wrap_ztrmv ATL_F77WRAP_ZTRMV +#define ATL_F77wrap_ztbsv ATL_F77WRAP_ZTBSV +#define ATL_F77wrap_ztpsv ATL_F77WRAP_ZTPSV +#define ATL_F77wrap_ztrsv ATL_F77WRAP_ZTRSV + +#define ATL_F77wrap_zgemm ATL_F77WRAP_ZGEMM +#define ATL_F77wrap_zhemm ATL_F77WRAP_ZHEMM +#define ATL_F77wrap_zherk ATL_F77WRAP_ZHERK +#define ATL_F77wrap_zher2k ATL_F77WRAP_ZHER2K +#define ATL_F77wrap_zsymm ATL_F77WRAP_ZSYMM +#define ATL_F77wrap_zsyrk ATL_F77WRAP_ZSYRK +#define ATL_F77wrap_zsyr2k ATL_F77WRAP_ZSYR2K +#define ATL_F77wrap_ztrmm ATL_F77WRAP_ZTRMM +#define ATL_F77wrap_ztrsm ATL_F77WRAP_ZTRSM + +#endif /* closes the SREAL / DREAL / SCPLX / DCPLX chain of the UpCase branch */ + +#elif defined( NoChange ) +/* + * These defines set up the naming scheme required to have a FORTRAN + * routine calling a C routine with the following interface: + * + * FORTRAN CALL C declaration + * CALL ATL_F77WRAP_SGEMM(...) void atl_f77wrap_sgemm(...)
+ */ +#if defined( SREAL ) /* NoChange branch, SREAL: s-prefixed names plus isamax, dsdot and sdsdot; each macro in this branch expands to its own name in all lower case, with no suffix */ + +#define ATL_F77wrap_srotg atl_f77wrap_srotg +#define ATL_F77wrap_srotmg atl_f77wrap_srotmg +#define ATL_F77wrap_snrm2 atl_f77wrap_snrm2 +#define ATL_F77wrap_sasum atl_f77wrap_sasum +#define ATL_F77wrap_sscal atl_f77wrap_sscal +#define ATL_F77wrap_isamax atl_f77wrap_isamax +#define ATL_F77wrap_saxpy atl_f77wrap_saxpy +#define ATL_F77wrap_scopy atl_f77wrap_scopy +#define ATL_F77wrap_sswap atl_f77wrap_sswap +#define ATL_F77wrap_srot atl_f77wrap_srot +#define ATL_F77wrap_srotm atl_f77wrap_srotm +#define ATL_F77wrap_sdot atl_f77wrap_sdot +#define ATL_F77wrap_dsdot atl_f77wrap_dsdot +#define ATL_F77wrap_sdsdot atl_f77wrap_sdsdot + +#define ATL_F77wrap_sgbmv atl_f77wrap_sgbmv +#define ATL_F77wrap_sgemv atl_f77wrap_sgemv +#define ATL_F77wrap_sger atl_f77wrap_sger +#define ATL_F77wrap_ssbmv atl_f77wrap_ssbmv +#define ATL_F77wrap_sspmv atl_f77wrap_sspmv +#define ATL_F77wrap_ssymv atl_f77wrap_ssymv +#define ATL_F77wrap_sspr atl_f77wrap_sspr +#define ATL_F77wrap_ssyr atl_f77wrap_ssyr +#define ATL_F77wrap_sspr2 atl_f77wrap_sspr2 +#define ATL_F77wrap_ssyr2 atl_f77wrap_ssyr2 +#define ATL_F77wrap_stbmv atl_f77wrap_stbmv +#define ATL_F77wrap_stpmv atl_f77wrap_stpmv +#define ATL_F77wrap_strmv atl_f77wrap_strmv +#define ATL_F77wrap_stbsv atl_f77wrap_stbsv +#define ATL_F77wrap_stpsv atl_f77wrap_stpsv +#define ATL_F77wrap_strsv atl_f77wrap_strsv + +#define ATL_F77wrap_sgemm atl_f77wrap_sgemm +#define ATL_F77wrap_ssymm atl_f77wrap_ssymm +#define ATL_F77wrap_ssyrk atl_f77wrap_ssyrk +#define ATL_F77wrap_ssyr2k atl_f77wrap_ssyr2k +#define ATL_F77wrap_strmm atl_f77wrap_strmm +#define ATL_F77wrap_strsm atl_f77wrap_strsm + +#elif defined( DREAL ) /* DREAL: d-prefixed names plus idamax */ + +#define ATL_F77wrap_drotg atl_f77wrap_drotg +#define ATL_F77wrap_drotmg atl_f77wrap_drotmg +#define ATL_F77wrap_dnrm2 atl_f77wrap_dnrm2 +#define ATL_F77wrap_dasum atl_f77wrap_dasum +#define ATL_F77wrap_dscal atl_f77wrap_dscal +#define ATL_F77wrap_idamax atl_f77wrap_idamax +#define ATL_F77wrap_daxpy atl_f77wrap_daxpy
+#define ATL_F77wrap_dcopy atl_f77wrap_dcopy +#define ATL_F77wrap_dswap atl_f77wrap_dswap +#define ATL_F77wrap_drot atl_f77wrap_drot +#define ATL_F77wrap_drotm atl_f77wrap_drotm +#define ATL_F77wrap_ddot atl_f77wrap_ddot + +#define ATL_F77wrap_dgbmv atl_f77wrap_dgbmv +#define ATL_F77wrap_dgemv atl_f77wrap_dgemv +#define ATL_F77wrap_dger atl_f77wrap_dger +#define ATL_F77wrap_dsbmv atl_f77wrap_dsbmv +#define ATL_F77wrap_dspmv atl_f77wrap_dspmv +#define ATL_F77wrap_dsymv atl_f77wrap_dsymv +#define ATL_F77wrap_dspr atl_f77wrap_dspr +#define ATL_F77wrap_dsyr atl_f77wrap_dsyr +#define ATL_F77wrap_dspr2 atl_f77wrap_dspr2 +#define ATL_F77wrap_dsyr2 atl_f77wrap_dsyr2 +#define ATL_F77wrap_dtbmv atl_f77wrap_dtbmv +#define ATL_F77wrap_dtpmv atl_f77wrap_dtpmv +#define ATL_F77wrap_dtrmv atl_f77wrap_dtrmv +#define ATL_F77wrap_dtbsv atl_f77wrap_dtbsv +#define ATL_F77wrap_dtpsv atl_f77wrap_dtpsv +#define ATL_F77wrap_dtrsv atl_f77wrap_dtrsv + +#define ATL_F77wrap_dgemm atl_f77wrap_dgemm +#define ATL_F77wrap_dsymm atl_f77wrap_dsymm +#define ATL_F77wrap_dsyrk atl_f77wrap_dsyrk +#define ATL_F77wrap_dsyr2k atl_f77wrap_dsyr2k +#define ATL_F77wrap_dtrmm atl_f77wrap_dtrmm +#define ATL_F77wrap_dtrsm atl_f77wrap_dtrsm + +#elif defined( SCPLX ) /* SCPLX: c-prefixed names plus scnrm2, scasum and icamax */ + +#define ATL_F77wrap_crotg atl_f77wrap_crotg +#define ATL_F77wrap_scnrm2 atl_f77wrap_scnrm2 +#define ATL_F77wrap_scasum atl_f77wrap_scasum +#define ATL_F77wrap_cscal atl_f77wrap_cscal +#define ATL_F77wrap_csscal atl_f77wrap_csscal +#define ATL_F77wrap_icamax atl_f77wrap_icamax +#define ATL_F77wrap_caxpy atl_f77wrap_caxpy +#define ATL_F77wrap_ccopy atl_f77wrap_ccopy +#define ATL_F77wrap_cswap atl_f77wrap_cswap +#define ATL_F77wrap_csrot atl_f77wrap_csrot +#define ATL_F77wrap_cdotc atl_f77wrap_cdotc +#define ATL_F77wrap_cdotu atl_f77wrap_cdotu + +#define ATL_F77wrap_cgbmv atl_f77wrap_cgbmv +#define ATL_F77wrap_cgemv atl_f77wrap_cgemv +#define ATL_F77wrap_cgerc atl_f77wrap_cgerc +#define ATL_F77wrap_cgeru atl_f77wrap_cgeru +#define ATL_F77wrap_chbmv
atl_f77wrap_chbmv +#define ATL_F77wrap_chpmv atl_f77wrap_chpmv +#define ATL_F77wrap_chemv atl_f77wrap_chemv +#define ATL_F77wrap_chpr atl_f77wrap_chpr +#define ATL_F77wrap_cher atl_f77wrap_cher +#define ATL_F77wrap_chpr2 atl_f77wrap_chpr2 +#define ATL_F77wrap_cher2 atl_f77wrap_cher2 +#define ATL_F77wrap_ctbmv atl_f77wrap_ctbmv +#define ATL_F77wrap_ctpmv atl_f77wrap_ctpmv +#define ATL_F77wrap_ctrmv atl_f77wrap_ctrmv +#define ATL_F77wrap_ctbsv atl_f77wrap_ctbsv +#define ATL_F77wrap_ctpsv atl_f77wrap_ctpsv +#define ATL_F77wrap_ctrsv atl_f77wrap_ctrsv + +#define ATL_F77wrap_cgemm atl_f77wrap_cgemm +#define ATL_F77wrap_chemm atl_f77wrap_chemm +#define ATL_F77wrap_cherk atl_f77wrap_cherk +#define ATL_F77wrap_cher2k atl_f77wrap_cher2k +#define ATL_F77wrap_csymm atl_f77wrap_csymm +#define ATL_F77wrap_csyrk atl_f77wrap_csyrk +#define ATL_F77wrap_csyr2k atl_f77wrap_csyr2k +#define ATL_F77wrap_ctrmm atl_f77wrap_ctrmm +#define ATL_F77wrap_ctrsm atl_f77wrap_ctrsm + +#elif defined( DCPLX ) /* DCPLX: z-prefixed names plus dznrm2, dzasum and izamax */ + +#define ATL_F77wrap_zrotg atl_f77wrap_zrotg +#define ATL_F77wrap_dznrm2 atl_f77wrap_dznrm2 +#define ATL_F77wrap_dzasum atl_f77wrap_dzasum +#define ATL_F77wrap_zscal atl_f77wrap_zscal +#define ATL_F77wrap_zdscal atl_f77wrap_zdscal +#define ATL_F77wrap_izamax atl_f77wrap_izamax +#define ATL_F77wrap_zaxpy atl_f77wrap_zaxpy +#define ATL_F77wrap_zcopy atl_f77wrap_zcopy +#define ATL_F77wrap_zswap atl_f77wrap_zswap +#define ATL_F77wrap_zdrot atl_f77wrap_zdrot +#define ATL_F77wrap_zdotc atl_f77wrap_zdotc +#define ATL_F77wrap_zdotu atl_f77wrap_zdotu + +#define ATL_F77wrap_zgbmv atl_f77wrap_zgbmv +#define ATL_F77wrap_zgemv atl_f77wrap_zgemv +#define ATL_F77wrap_zgerc atl_f77wrap_zgerc +#define ATL_F77wrap_zgeru atl_f77wrap_zgeru +#define ATL_F77wrap_zhbmv atl_f77wrap_zhbmv +#define ATL_F77wrap_zhpmv atl_f77wrap_zhpmv +#define ATL_F77wrap_zhemv atl_f77wrap_zhemv +#define ATL_F77wrap_zhpr atl_f77wrap_zhpr +#define ATL_F77wrap_zher atl_f77wrap_zher +#define ATL_F77wrap_zhpr2 atl_f77wrap_zhpr2 +#define
ATL_F77wrap_zher2 atl_f77wrap_zher2 +#define ATL_F77wrap_ztbmv atl_f77wrap_ztbmv +#define ATL_F77wrap_ztpmv atl_f77wrap_ztpmv +#define ATL_F77wrap_ztrmv atl_f77wrap_ztrmv +#define ATL_F77wrap_ztbsv atl_f77wrap_ztbsv +#define ATL_F77wrap_ztpsv atl_f77wrap_ztpsv +#define ATL_F77wrap_ztrsv atl_f77wrap_ztrsv + +#define ATL_F77wrap_zgemm atl_f77wrap_zgemm +#define ATL_F77wrap_zhemm atl_f77wrap_zhemm +#define ATL_F77wrap_zherk atl_f77wrap_zherk +#define ATL_F77wrap_zher2k atl_f77wrap_zher2k +#define ATL_F77wrap_zsymm atl_f77wrap_zsymm +#define ATL_F77wrap_zsyrk atl_f77wrap_zsyrk +#define ATL_F77wrap_zsyr2k atl_f77wrap_zsyr2k +#define ATL_F77wrap_ztrmm atl_f77wrap_ztrmm +#define ATL_F77wrap_ztrsm atl_f77wrap_ztrsm + +#endif /* closes the SREAL / DREAL / SCPLX / DCPLX chain of the NoChange branch */ + +#elif defined( Add__ ) +/* + * These defines set up the naming scheme required to have a FORTRAN + * routine calling a C routine with the following interface: + * + * FORTRAN CALL C declaration + * CALL ATL_F77WRAP_SGEMM(...) void atl_f77wrap_sgemm__(...) + */ +#if defined( SREAL ) /* Add__ branch, SREAL: s-prefixed names plus isamax, dsdot and sdsdot; replacements are lower case with two trailing underscores */ + +#define ATL_F77wrap_srotg atl_f77wrap_srotg__ +#define ATL_F77wrap_srotmg atl_f77wrap_srotmg__ +#define ATL_F77wrap_snrm2 atl_f77wrap_snrm2__ +#define ATL_F77wrap_sasum atl_f77wrap_sasum__ +#define ATL_F77wrap_sscal atl_f77wrap_sscal__ +#define ATL_F77wrap_isamax atl_f77wrap_isamax__ +#define ATL_F77wrap_saxpy atl_f77wrap_saxpy__ +#define ATL_F77wrap_scopy atl_f77wrap_scopy__ +#define ATL_F77wrap_sswap atl_f77wrap_sswap__ +#define ATL_F77wrap_srot atl_f77wrap_srot__ +#define ATL_F77wrap_srotm atl_f77wrap_srotm__ +#define ATL_F77wrap_sdot atl_f77wrap_sdot__ +#define ATL_F77wrap_dsdot atl_f77wrap_dsdot__ +#define ATL_F77wrap_sdsdot atl_f77wrap_sdsdot__ + +#define ATL_F77wrap_sgbmv atl_f77wrap_sgbmv__ +#define ATL_F77wrap_sgemv atl_f77wrap_sgemv__ +#define ATL_F77wrap_sger atl_f77wrap_sger__ +#define ATL_F77wrap_ssbmv atl_f77wrap_ssbmv__ +#define ATL_F77wrap_sspmv atl_f77wrap_sspmv__ +#define ATL_F77wrap_ssymv atl_f77wrap_ssymv__ +#define ATL_F77wrap_sspr
atl_f77wrap_sspr__ +#define ATL_F77wrap_ssyr atl_f77wrap_ssyr__ +#define ATL_F77wrap_sspr2 atl_f77wrap_sspr2__ +#define ATL_F77wrap_ssyr2 atl_f77wrap_ssyr2__ +#define ATL_F77wrap_stbmv atl_f77wrap_stbmv__ +#define ATL_F77wrap_stpmv atl_f77wrap_stpmv__ +#define ATL_F77wrap_strmv atl_f77wrap_strmv__ +#define ATL_F77wrap_stbsv atl_f77wrap_stbsv__ +#define ATL_F77wrap_stpsv atl_f77wrap_stpsv__ +#define ATL_F77wrap_strsv atl_f77wrap_strsv__ + +#define ATL_F77wrap_sgemm atl_f77wrap_sgemm__ +#define ATL_F77wrap_ssymm atl_f77wrap_ssymm__ +#define ATL_F77wrap_ssyrk atl_f77wrap_ssyrk__ +#define ATL_F77wrap_ssyr2k atl_f77wrap_ssyr2k__ +#define ATL_F77wrap_strmm atl_f77wrap_strmm__ +#define ATL_F77wrap_strsm atl_f77wrap_strsm__ + +#elif defined( DREAL ) /* Add__ branch, DREAL: d-prefixed names plus idamax; replacements in this branch are lower case with two trailing underscores */ + +#define ATL_F77wrap_drotg atl_f77wrap_drotg__ +#define ATL_F77wrap_drotmg atl_f77wrap_drotmg__ +#define ATL_F77wrap_dnrm2 atl_f77wrap_dnrm2__ +#define ATL_F77wrap_dasum atl_f77wrap_dasum__ +#define ATL_F77wrap_dscal atl_f77wrap_dscal__ +#define ATL_F77wrap_idamax atl_f77wrap_idamax__ +#define ATL_F77wrap_daxpy atl_f77wrap_daxpy__ +#define ATL_F77wrap_dcopy atl_f77wrap_dcopy__ +#define ATL_F77wrap_dswap atl_f77wrap_dswap__ +#define ATL_F77wrap_drot atl_f77wrap_drot__ +#define ATL_F77wrap_drotm atl_f77wrap_drotm__ +#define ATL_F77wrap_ddot atl_f77wrap_ddot__ + +#define ATL_F77wrap_dgbmv atl_f77wrap_dgbmv__ +#define ATL_F77wrap_dgemv atl_f77wrap_dgemv__ +#define ATL_F77wrap_dger atl_f77wrap_dger__ +#define ATL_F77wrap_dsbmv atl_f77wrap_dsbmv__ +#define ATL_F77wrap_dspmv atl_f77wrap_dspmv__ +#define ATL_F77wrap_dsymv atl_f77wrap_dsymv__ +#define ATL_F77wrap_dspr atl_f77wrap_dspr__ +#define ATL_F77wrap_dsyr atl_f77wrap_dsyr__ +#define ATL_F77wrap_dspr2 atl_f77wrap_dspr2__ +#define ATL_F77wrap_dsyr2 atl_f77wrap_dsyr2__ +#define ATL_F77wrap_dtbmv atl_f77wrap_dtbmv__ +#define ATL_F77wrap_dtpmv atl_f77wrap_dtpmv__ +#define ATL_F77wrap_dtrmv atl_f77wrap_dtrmv__ +#define ATL_F77wrap_dtbsv atl_f77wrap_dtbsv__ +#define ATL_F77wrap_dtpsv
atl_f77wrap_dtpsv__ +#define ATL_F77wrap_dtrsv atl_f77wrap_dtrsv__ + +#define ATL_F77wrap_dgemm atl_f77wrap_dgemm__ +#define ATL_F77wrap_dsymm atl_f77wrap_dsymm__ +#define ATL_F77wrap_dsyrk atl_f77wrap_dsyrk__ +#define ATL_F77wrap_dsyr2k atl_f77wrap_dsyr2k__ +#define ATL_F77wrap_dtrmm atl_f77wrap_dtrmm__ +#define ATL_F77wrap_dtrsm atl_f77wrap_dtrsm__ + +#elif defined( SCPLX ) /* SCPLX: c-prefixed names plus scnrm2, scasum and icamax */ + +#define ATL_F77wrap_crotg atl_f77wrap_crotg__ +#define ATL_F77wrap_scnrm2 atl_f77wrap_scnrm2__ +#define ATL_F77wrap_scasum atl_f77wrap_scasum__ +#define ATL_F77wrap_cscal atl_f77wrap_cscal__ +#define ATL_F77wrap_csscal atl_f77wrap_csscal__ +#define ATL_F77wrap_icamax atl_f77wrap_icamax__ +#define ATL_F77wrap_caxpy atl_f77wrap_caxpy__ +#define ATL_F77wrap_ccopy atl_f77wrap_ccopy__ +#define ATL_F77wrap_cswap atl_f77wrap_cswap__ +#define ATL_F77wrap_csrot atl_f77wrap_csrot__ +#define ATL_F77wrap_cdotc atl_f77wrap_cdotc__ +#define ATL_F77wrap_cdotu atl_f77wrap_cdotu__ + +#define ATL_F77wrap_cgbmv atl_f77wrap_cgbmv__ +#define ATL_F77wrap_cgemv atl_f77wrap_cgemv__ +#define ATL_F77wrap_cgerc atl_f77wrap_cgerc__ +#define ATL_F77wrap_cgeru atl_f77wrap_cgeru__ +#define ATL_F77wrap_chbmv atl_f77wrap_chbmv__ +#define ATL_F77wrap_chpmv atl_f77wrap_chpmv__ +#define ATL_F77wrap_chemv atl_f77wrap_chemv__ +#define ATL_F77wrap_chpr atl_f77wrap_chpr__ +#define ATL_F77wrap_cher atl_f77wrap_cher__ +#define ATL_F77wrap_chpr2 atl_f77wrap_chpr2__ +#define ATL_F77wrap_cher2 atl_f77wrap_cher2__ +#define ATL_F77wrap_ctbmv atl_f77wrap_ctbmv__ +#define ATL_F77wrap_ctpmv atl_f77wrap_ctpmv__ +#define ATL_F77wrap_ctrmv atl_f77wrap_ctrmv__ +#define ATL_F77wrap_ctbsv atl_f77wrap_ctbsv__ +#define ATL_F77wrap_ctpsv atl_f77wrap_ctpsv__ +#define ATL_F77wrap_ctrsv atl_f77wrap_ctrsv__ + +#define ATL_F77wrap_cgemm atl_f77wrap_cgemm__ +#define ATL_F77wrap_chemm atl_f77wrap_chemm__ +#define ATL_F77wrap_cherk atl_f77wrap_cherk__ +#define ATL_F77wrap_cher2k atl_f77wrap_cher2k__ +#define ATL_F77wrap_csymm atl_f77wrap_csymm__ +#define
ATL_F77wrap_csyrk atl_f77wrap_csyrk__ +#define ATL_F77wrap_csyr2k atl_f77wrap_csyr2k__ +#define ATL_F77wrap_ctrmm atl_f77wrap_ctrmm__ +#define ATL_F77wrap_ctrsm atl_f77wrap_ctrsm__ + +#elif defined( DCPLX ) /* DCPLX: z-prefixed names plus dznrm2, dzasum and izamax */ + +#define ATL_F77wrap_zrotg atl_f77wrap_zrotg__ +#define ATL_F77wrap_dznrm2 atl_f77wrap_dznrm2__ +#define ATL_F77wrap_dzasum atl_f77wrap_dzasum__ +#define ATL_F77wrap_zscal atl_f77wrap_zscal__ +#define ATL_F77wrap_zdscal atl_f77wrap_zdscal__ +#define ATL_F77wrap_izamax atl_f77wrap_izamax__ +#define ATL_F77wrap_zaxpy atl_f77wrap_zaxpy__ +#define ATL_F77wrap_zcopy atl_f77wrap_zcopy__ +#define ATL_F77wrap_zswap atl_f77wrap_zswap__ +#define ATL_F77wrap_zdrot atl_f77wrap_zdrot__ +#define ATL_F77wrap_zdotc atl_f77wrap_zdotc__ +#define ATL_F77wrap_zdotu atl_f77wrap_zdotu__ + +#define ATL_F77wrap_zgbmv atl_f77wrap_zgbmv__ +#define ATL_F77wrap_zgemv atl_f77wrap_zgemv__ +#define ATL_F77wrap_zgerc atl_f77wrap_zgerc__ +#define ATL_F77wrap_zgeru atl_f77wrap_zgeru__ +#define ATL_F77wrap_zhbmv atl_f77wrap_zhbmv__ +#define ATL_F77wrap_zhpmv atl_f77wrap_zhpmv__ +#define ATL_F77wrap_zhemv atl_f77wrap_zhemv__ +#define ATL_F77wrap_zhpr atl_f77wrap_zhpr__ +#define ATL_F77wrap_zher atl_f77wrap_zher__ +#define ATL_F77wrap_zhpr2 atl_f77wrap_zhpr2__ +#define ATL_F77wrap_zher2 atl_f77wrap_zher2__ +#define ATL_F77wrap_ztbmv atl_f77wrap_ztbmv__ +#define ATL_F77wrap_ztpmv atl_f77wrap_ztpmv__ +#define ATL_F77wrap_ztrmv atl_f77wrap_ztrmv__ +#define ATL_F77wrap_ztbsv atl_f77wrap_ztbsv__ +#define ATL_F77wrap_ztpsv atl_f77wrap_ztpsv__ +#define ATL_F77wrap_ztrsv atl_f77wrap_ztrsv__ + +#define ATL_F77wrap_zgemm atl_f77wrap_zgemm__ +#define ATL_F77wrap_zhemm atl_f77wrap_zhemm__ +#define ATL_F77wrap_zherk atl_f77wrap_zherk__ +#define ATL_F77wrap_zher2k atl_f77wrap_zher2k__ +#define ATL_F77wrap_zsymm atl_f77wrap_zsymm__ +#define ATL_F77wrap_zsyrk atl_f77wrap_zsyrk__ +#define ATL_F77wrap_zsyr2k atl_f77wrap_zsyr2k__ +#define ATL_F77wrap_ztrmm atl_f77wrap_ztrmm__ +#define ATL_F77wrap_ztrsm
atl_f77wrap_ztrsm__ + +#endif /* closes the SREAL / DREAL / SCPLX / DCPLX chain of the Add__ branch */ + +#endif /* closes the Add_ / UpCase / NoChange / Add__ chain */ +/* + * ===================================================================== + * Prototypes for the F77 interface wrappers of the ATLAS BLAS routines + * ===================================================================== + */ /* NOTE(review): Mjoin, PATLF77WRAP, ATLUPF77WRAP, ATLPUF77WRAP, PRE, TYPE and F77_INTEGER are not defined in this part of the file; confirm their expansions where they are defined */ +void Mjoin( PATLF77WRAP, rotg ) +( TYPE *, TYPE *, TYPE *, TYPE * ); +#ifdef TREAL /* rotmg is declared only when TREAL is defined */ +void Mjoin( PATLF77WRAP, rotmg ) +( TYPE *, TYPE *, TYPE *, TYPE *, + TYPE * ); +#endif +void Mjoin( ATLUPF77WRAP, nrm2 ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE * ); +void Mjoin( ATLUPF77WRAP, asum ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE * ); +void Mjoin( PATLF77WRAP, scal ) +( F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * ); +#ifdef TCPLX /* second scal prototype, named via ATLPUF77WRAP, declared only when TCPLX is defined */ +void Mjoin( ATLPUF77WRAP, scal ) +( F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * ); +#endif +void Mjoin( Mjoin( ATL_F77wrap_i, PRE ), amax ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, axpy ) +( F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER *, + TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, copy ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, swap ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +void Mjoin( ATLPUF77WRAP, rot ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER *, TYPE *, TYPE * ); +#ifdef TREAL /* rotm is declared only when TREAL is defined */ +void Mjoin( PATLF77WRAP, rotm ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER *, TYPE * ); +#endif +#ifdef TREAL /* TREAL: a single dot prototype */ +void Mjoin( PATLF77WRAP, dot ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER *, TYPE * ); +#else /* TREAL not defined: dotc and dotu are declared instead of dot */ +void Mjoin( PATLF77WRAP, dotc ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER *, TYPE * ); +void Mjoin( PATLF77WRAP, dotu ) +( F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER *, TYPE * ); +#endif /* NOTE(review): the next two prototypes use fixed float/double types and are not guarded by any precision macro, while the dsdot/sdsdot name mappings earlier in the file appear only in the SREAL sections; confirm this is intended */ +void ATL_F77wrap_dsdot +( F77_INTEGER *, float *, F77_INTEGER *, float *, + F77_INTEGER *, double * ); +void ATL_F77wrap_sdsdot +( F77_INTEGER *, float *, float *, F77_INTEGER *, +
float *, F77_INTEGER *, float * ); + +void Mjoin( PATLF77WRAP, gbmv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, + F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER *, + TYPE *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, gemv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, TYPE *, + TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER *, + TYPE *, TYPE *, F77_INTEGER * ); +#ifdef TREAL /* TREAL: ger, sbmv, spmv, symv, spr, syr, spr2 and syr2 are declared */ +void Mjoin( PATLF77WRAP, ger ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, sbmv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, TYPE *, + TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER *, + TYPE *, TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, spmv ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + TYPE *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, symv ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, spr ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE * ); +void Mjoin( PATLF77WRAP, syr ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, spr2 ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE * ); +void Mjoin( PATLF77WRAP, syr2 ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +#else /* TREAL not defined */ +void Mjoin( PATLF77WRAP, gerc ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, geru ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, hbmv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, TYPE *, + TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER *,
+ TYPE *, TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, hpmv ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + TYPE *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, hemv ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, hpr ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE * ); +void Mjoin( PATLF77WRAP, her ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, hpr2 ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE * ); +void Mjoin( PATLF77WRAP, her2 ) +( F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +#endif +void Mjoin( PATLF77WRAP, tbmv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, tpmv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, + TYPE *, TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, trmv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, + TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, tbsv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, + F77_INTEGER *, TYPE *, F77_INTEGER *, TYPE *, + F77_INTEGER * ); +void Mjoin( PATLF77WRAP, tpsv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, + TYPE *, TYPE *, F77_INTEGER * ); +void Mjoin( PATLF77WRAP, trsv ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, + TYPE *, F77_INTEGER *, TYPE *, F77_INTEGER * ); + +void Mjoin( PATLF77WRAP, gemm ) +( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, + F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER *, + TYPE *, F77_INTEGER *, TYPE *, TYPE *, + F77_INTEGER * ); +#ifdef TCPLX +void Mjoin( PATLF77WRAP, hemm ) +( 
F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+  TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+  F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, herk )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+  TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+  TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, her2k )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+  TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+  F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+#endif
+void Mjoin( PATLF77WRAP, symm )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+  TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+  F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, syrk )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+  TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+  TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, syr2k )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+  TYPE *, TYPE *, F77_INTEGER *, TYPE *,
+  F77_INTEGER *, TYPE *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, trmm )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+  F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+  F77_INTEGER *, TYPE *, F77_INTEGER * );
+void Mjoin( PATLF77WRAP, trsm )
+( F77_INTEGER *, F77_INTEGER *, F77_INTEGER *, F77_INTEGER *,
+  F77_INTEGER *, F77_INTEGER *, TYPE *, TYPE *,
+  F77_INTEGER *, TYPE *, F77_INTEGER * );
+
+#endif
+/*
+ * End of atlas_f77wrap.h
+ */
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_fopen.h b/kaldi_io/src/tools/ATLAS/include/atlas_fopen.h
new file mode 100644
index 0000000..aaed713
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_fopen.h
@@ -0,0 +1,40 @@
+#ifndef ATLAS_FOPEN_H
+#define ATLAS_FOPEN_H
+/*
+ * This header includes nothing itself: FILE, fopen, fclose, snprintf and
+ * system must already be declared by the including file
+ * (<stdio.h> and <stdlib.h>).
+ */
+
+static int FileExists(const char *path)
+/*
+ * RETURNS: 1 if path can be opened with fopen(path, "r"), 0 otherwise.
+ * The probe stream is closed again before returning.
+ */
+{
+   FILE *fp;
+   int iret=0;
+   fp = fopen(path, "r");
+   if (fp)
+   {
+      fclose(fp);
+      iret = 1;
+   }
+   return(iret);
+}
+
+#ifdef ATL_FOPENDELAY
+static FILE *ATL_fopen(const char *path, const char *mode)
+/*
+ * Overload fopen so it waits for NFS propagation upon first read failure:
+ * when the first fopen fails and mode starts with 'r', the command
+ * "make waitfile waitfile=<path>" is run through system(), and the open
+ * is retried once if that command returns 0.
+ * RETURNS: the opened stream, or NULL if the file still cannot be opened.
+ * NOTE: path is pasted unquoted into a shell command line, so it must not
+ *       come from untrusted input.
+ */
+{
+   FILE *fp;
+   char ln[256];
+   int len;
+
+   fp = fopen(path, mode);
+   if (fp == NULL)
+   {
+      if (*mode == 'r')  /* give NFS time to produce file */
+      {
+/*
+ *       Bounded write: the former sprintf overran ln for long paths.
+ *       A command that does not fit is not run at all, because a
+ *       truncated command line would name a different file.
+ */
+         len = snprintf(ln, sizeof(ln), "make waitfile waitfile=%s\n", path);
+         if (len > 0 && (size_t)len < sizeof(ln))
+         {
+            if (system(ln) == 0) fp = fopen(path, mode);
+         }
+      }
+   }
+   return(fp);
+}
+/* placed after ATL_fopen so the fopen calls above are not rewritten by it */
+#define fopen ATL_fopen
+#endif
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_kern3.h b/kaldi_io/src/tools/ATLAS/include/atlas_kern3.h
new file mode 100644
index 0000000..97e8bcc
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_kern3.h
@@ -0,0 +1,110 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.3
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef ATLAS_KERN3_H
+#define ATLAS_KERN3_H
+
+#include "atlas_misc.h"
+#include Mstr(Mjoin(Mjoin(atlas_,PRE),NCmm.h))  /* name pasted together from atlas_, PRE and NCmm.h; Mstr is defined elsewhere (presumably stringizes) */
+#include "atlas_lvl3.h"
+#include "atlas_kernel3.h"
+#include "atlas_reflevel3.h"
+/*
+ * Gemm entry points: Cgemm* alias the PATL-joined gemm* names, CAgemm* the aliased_gemm* ones
+ */
+#define CgemmNN Mjoin(PATL,gemmNN)
+#define CgemmNT Mjoin(PATL,gemmNT)
+#define CgemmTN Mjoin(PATL,gemmTN)
+#define CgemmNC Mjoin(PATL,gemmNC)
+#define CgemmCN Mjoin(PATL,gemmCN)
+
+#define CAgemmNN Mjoin(PATL,aliased_gemmNN)
+#define CAgemmTN Mjoin(PATL,aliased_gemmTN)
+
+#ifdef Left_  /* side selector: sets Side_ and its one-letter name SideNM; neither is defined if no selector is set */
+   #define Side_ AtlasLeft
+   #define SideNM L
+#elif defined(Right_)
+   #define Side_ AtlasRight
+   #define SideNM R
+#endif
+
+#ifdef Upper_  /* triangle selector: sets Uplo_ and UploNM */
+   #define Uplo_ AtlasUpper
+   #define UploNM U
+#elif defined(Lower_)
+   #define Uplo_ AtlasLower
+   #define UploNM L
+#endif
+
+#ifdef UnitDiag_  /* diagonal selector: sets Unit_ and UnitNM */
+   #define Unit_ AtlasUnit
+   #define UnitNM U
+#elif defined(NonUnitDiag_)
+   #define Unit_ AtlasNonUnit
+   #define UnitNM N
+#endif
+
+#ifdef Transpose_  /* transpose selector: sets Trans_ and TransNM; first match of Transpose_, Notranspose_, ConjTrans_ wins */
+   #define Trans_ AtlasTrans
+   #define TransNM T
+#elif defined(Notranspose_)
+   #define Trans_ AtlasNoTrans
+   #define TransNM N
+#elif defined(ConjTrans_)
+   #define Trans_ AtlasConjTrans
+   #define TransNM C
+#endif
+
+#ifndef TRSM_Xover  /* this and every *_Xover below falls back to NB unless defined before this point */
+   #define TRSM_Xover NB
+#endif
+#ifndef TRMM_Xover
+   #define TRMM_Xover NB
+#endif
+#ifndef HER2K_Xover
+   #define HER2K_Xover NB
+#endif
+#ifndef SYR2K_Xover
+   #define SYR2K_Xover NB
+#endif
+#ifndef HERK_Xover
+   #define HERK_Xover NB
+#endif
+#ifndef SYRK_Xover
+   #define SYRK_Xover NB
+#endif
+#ifndef HEMM_Xover
+   #define HEMM_Xover NB
+#endif
+#ifndef SYMM_Xover
+   #define SYMM_Xover NB
+#endif
+
+#endif
diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_kernel2.h b/kaldi_io/src/tools/ATLAS/include/atlas_kernel2.h
new file mode 100644
index 0000000..4663def
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/atlas_kernel2.h
@@ -0,0 +1,5408 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ * (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author : Antoine P. Petitet
+ * Contributor(s) : R. Clint Whaley
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in
+ * the documentation and/or other materials provided with the distri-
+ * bution.
+ * 3. The name of the University, the ATLAS group, or the names of its
+ * contributors may not be used to endorse or promote products deri-
+ * ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_KERNEL2_H +#define ATLAS_KERNEL2_H +/* + * ===================================================================== + * Macro function definitions + * ===================================================================== + */ +#define ATL_GetPartSBMV ATL_GetPartSYMV +#define ATL_GetPartSPMV ATL_GetPartSYMV +#define ATL_GetPartP1 ATL_GetPartR1 + +#define MLpprev( n_, a_, lda_ ) \ + { a_ -= ( (((n_) * (lda_)) + (((n_)*((n_)+1)) >> 1)) SHIFT ); lda_ += (n_); } +#define MUpprev( n_, a_, lda_ ) \ + { a_ -= ( (((n_) * (lda_)) - (((n_)*((n_)-1)) >> 1)) SHIFT ); lda_ -= (n_); } +#define MLpnext( n_, a_, lda_ ) \ + { a_ += ( (((n_) * (lda_)) - (((n_)*((n_)-1)) >> 1)) SHIFT ); lda_ -= (n_); } +#define MUpnext( n_, a_, lda_ ) \ + { a_ += ( (((n_) * (lda_)) + (((n_)*((n_)+1)) >> 1)) SHIFT ); lda_ += (n_); } + +#define MLrprev( n_, a_, lda_ ) \ + { a_ -= ( ((n_) * ((lda_)+1)) SHIFT ); } +#define MUrprev( n_, a_, lda_ ) \ + { a_ -= ( ((n_) * ((lda_)+1)) SHIFT ); } +#define MLrnext( n_, a_, lda_ ) \ + { a_ += ( ((n_) * ((lda_)+1)) SHIFT ); } +#define MUrnext( n_, a_, lda_ ) \ + { a_ += ( ((n_) * ((lda_)+1)) SHIFT ); } +/* + * ===================================================================== + * Recursive Level 2 BLAS function prototypes + * ===================================================================== + */ +void ATL_strsvLTU +( + const int, + 
const float *, const int, + float * +); + +void ATL_strsvLNU +( + const int, + const float *, const int, + float * +); + +void ATL_strsvLTN +( + const int, + const float *, const int, + float * +); + +void ATL_strsvLNN +( + const int, + const float *, const int, + float * +); + +void ATL_strsvUTU +( + const int, + const float *, const int, + float * +); + +void ATL_strsvUNU +( + const int, + const float *, const int, + float * +); + +void ATL_strsvUTN +( + const int, + const float *, const int, + float * +); + +void ATL_strsvUNN +( + const int, + const float *, const int, + float * +); + +void ATL_strsvLT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_strsvLN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_strsvUT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_strsvUN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stpsvLTU +( + const int, + const float *, const int, + float * +); + +void ATL_stpsvLNU +( + const int, + const float *, const int, + float * +); + +void ATL_stpsvLTN +( + const int, + const float *, const int, + float * +); + +void ATL_stpsvLNN +( + const int, + const float *, const int, + float * +); + +void ATL_stpsvUTU +( + const int, + const float *, const int, + float * +); + +void ATL_stpsvUNU +( + const int, + const float *, const int, + float * +); + +void ATL_stpsvUTN +( + const int, + const float *, const int, + float * +); + +void ATL_stpsvUNN +( + const int, + const float *, const int, + float * +); + +void ATL_stpsvLT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stpsvLN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stpsvUT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stpsvUN +( + const enum ATLAS_DIAG, + 
const int, + const float *, const int, + float * +); + +void ATL_stbsvLTU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvLNU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvLTN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvLNN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvUTU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvUNU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvUTN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvUNN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvLT +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvLN +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvUT +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_stbsvUN +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_strmvLTU +( + const int, + const float *, const int, + float * +); + +void ATL_strmvLNU +( + const int, + const float *, const int, + float * +); + +void ATL_strmvLTN +( + const int, + const float *, const int, + float * +); + +void ATL_strmvLNN +( + const int, + const float *, const int, + float * +); + +void ATL_strmvUTU +( + const int, + const float *, const int, + float * +); + +void ATL_strmvUNU +( + const int, + const float *, const int, + float * +); + +void ATL_strmvUTN +( + const int, + const float *, const int, + float * +); + +void ATL_strmvUNN +( + const int, + const float *, const int, + float * +); + +void ATL_strmvLT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_strmvLN +( + const enum 
ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_strmvUT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_strmvUN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stpmvLTU +( + const int, + const float *, const int, + float * +); + +void ATL_stpmvLNU +( + const int, + const float *, const int, + float * +); + +void ATL_stpmvLTN +( + const int, + const float *, const int, + float * +); + +void ATL_stpmvLNN +( + const int, + const float *, const int, + float * +); + +void ATL_stpmvUTU +( + const int, + const float *, const int, + float * +); + +void ATL_stpmvUNU +( + const int, + const float *, const int, + float * +); + +void ATL_stpmvUTN +( + const int, + const float *, const int, + float * +); + +void ATL_stpmvUNN +( + const int, + const float *, const int, + float * +); + +void ATL_stpmvLT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stpmvLN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stpmvUT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stpmvUN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_stbmvLTU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvLNU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvLTN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvLNN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvUTU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvUNU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvUTN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvUNN +( + const int, const int, + 
const float *, const int, + float * +); + +void ATL_stbmvLT +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvLN +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvUT +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_stbmvUN +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ssyr2U +( + const int, + const float *, + const float *, + float *, const int +); + +void ATL_ssyr2L +( + const int, + const float *, + const float *, + float *, const int +); + +void ATL_sspr2U +( + const int, + const float *, + const float *, + float *, const int +); + +void ATL_sspr2L +( + const int, + const float *, + const float *, + float *, const int +); + +void ATL_ssyrU +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_ssyrL +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_ssprU +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_ssprL +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_ssymvU +( + const int, + const float *, const int, + const float *, + const float, + float * +); + +void ATL_ssymvL +( + const int, + const float *, const int, + const float *, + const float, + float * +); + +void ATL_sspmvU +( + const int, + const float *, const int, + const float *, + const float, + float * +); + +void ATL_sspmvL +( + const int, + const float *, const int, + const float *, + const float, + float * +); + +void ATL_ssbmvU +( + const int, const int, + const float *, const int, + const float *, + const float, + float * +); + +void ATL_ssbmvL +( + const int, const int, + const float *, const int, + const float *, + const float, + float * +); + +void ATL_sgpmv +( + const enum 
ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgprU +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_sgprL +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_sgpr +( + const enum ATLAS_UPLO, + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_sgpr1U_a1_x1_yX +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_sgpr1L_a1_x1_yX +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_sgpmvUT_a1_x1_bX_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvUN_a1_x1_bX_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvUT_a1_x1_b1_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvUN_a1_x1_b1_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvUT_a1_x1_b0_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvUN_a1_x1_b0_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvLT_a1_x1_bX_y1 +( + const int, const int, 
+ const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvLN_a1_x1_bX_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvLT_a1_x1_b1_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvLN_a1_x1_b1_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvLT_a1_x1_b0_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgpmvLN_a1_x1_b0_y1 +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgbmvT_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgbmvN_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgbmvT_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgbmvN_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgbmvT_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sgbmvN_a1_x1_b0_y1 +( + const int, const int, + const int, 
const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_dtrsvLTU +( + const int, + const double *, const int, + double * +); + +void ATL_dtrsvLNU +( + const int, + const double *, const int, + double * +); + +void ATL_dtrsvLTN +( + const int, + const double *, const int, + double * +); + +void ATL_dtrsvLNN +( + const int, + const double *, const int, + double * +); + +void ATL_dtrsvUTU +( + const int, + const double *, const int, + double * +); + +void ATL_dtrsvUNU +( + const int, + const double *, const int, + double * +); + +void ATL_dtrsvUTN +( + const int, + const double *, const int, + double * +); + +void ATL_dtrsvUNN +( + const int, + const double *, const int, + double * +); + +void ATL_dtrsvLT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtrsvLN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtrsvUT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtrsvUN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtpsvLTU +( + const int, + const double *, const int, + double * +); + +void ATL_dtpsvLNU +( + const int, + const double *, const int, + double * +); + +void ATL_dtpsvLTN +( + const int, + const double *, const int, + double * +); + +void ATL_dtpsvLNN +( + const int, + const double *, const int, + double * +); + +void ATL_dtpsvUTU +( + const int, + const double *, const int, + double * +); + +void ATL_dtpsvUNU +( + const int, + const double *, const int, + double * +); + +void ATL_dtpsvUTN +( + const int, + const double *, const int, + double * +); + +void ATL_dtpsvUNN +( + const int, + const double *, const int, + double * +); + +void ATL_dtpsvLT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtpsvLN +( + const enum ATLAS_DIAG, + const 
int, + const double *, const int, + double * +); + +void ATL_dtpsvUT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtpsvUN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtbsvLTU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvLNU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvLTN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvLNN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvUTU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvUNU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvUTN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvUNN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvLT +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvLN +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvUT +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbsvUN +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_dtrmvLTU +( + const int, + const double *, const int, + double * +); + +void ATL_dtrmvLNU +( + const int, + const double *, const int, + double * +); + +void ATL_dtrmvLTN +( + const int, + const double *, const int, + double * +); + +void ATL_dtrmvLNN +( + const int, + const double *, const int, + double * +); + +void ATL_dtrmvUTU +( + const int, + const double *, const int, + double * +); + +void ATL_dtrmvUNU +( + const int, + const double *, const int, + double * +); + +void ATL_dtrmvUTN +( + const int, + const double *, 
const int, + double * +); + +void ATL_dtrmvUNN +( + const int, + const double *, const int, + double * +); + +void ATL_dtrmvLT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtrmvLN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtrmvUT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtrmvUN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtpmvLTU +( + const int, + const double *, const int, + double * +); + +void ATL_dtpmvLNU +( + const int, + const double *, const int, + double * +); + +void ATL_dtpmvLTN +( + const int, + const double *, const int, + double * +); + +void ATL_dtpmvLNN +( + const int, + const double *, const int, + double * +); + +void ATL_dtpmvUTU +( + const int, + const double *, const int, + double * +); + +void ATL_dtpmvUNU +( + const int, + const double *, const int, + double * +); + +void ATL_dtpmvUTN +( + const int, + const double *, const int, + double * +); + +void ATL_dtpmvUNN +( + const int, + const double *, const int, + double * +); + +void ATL_dtpmvLT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtpmvLN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtpmvUT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtpmvUN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_dtbmvLTU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvLNU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvLTN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvLNN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvUTU +( + const int, 
const int, + const double *, const int, + double * +); + +void ATL_dtbmvUNU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvUTN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvUNN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvLT +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvLN +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvUT +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_dtbmvUN +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_dsyr2U +( + const int, + const double *, + const double *, + double *, const int +); + +void ATL_dsyr2L +( + const int, + const double *, + const double *, + double *, const int +); + +void ATL_dspr2U +( + const int, + const double *, + const double *, + double *, const int +); + +void ATL_dspr2L +( + const int, + const double *, + const double *, + double *, const int +); + +void ATL_dsyrU +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_dsyrL +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_dsprU +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_dsprL +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_dsymvU +( + const int, + const double *, const int, + const double *, + const double, + double * +); + +void ATL_dsymvL +( + const int, + const double *, const int, + const double *, + const double, + double * +); + +void ATL_dspmvU +( + const int, + const double *, const int, + const double *, + const double, + double * +); + +void ATL_dspmvL +( + const int, + const double 
*, const int, + const double *, + const double, + double * +); + +void ATL_dsbmvU +( + const int, const int, + const double *, const int, + const double *, + const double, + double * +); + +void ATL_dsbmvL +( + const int, const int, + const double *, const int, + const double *, + const double, + double * +); + +void ATL_dgpmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgprU +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_dgprL +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_dgpr +( + const enum ATLAS_UPLO, + const int, const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_dgpr1U_a1_x1_yX +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_dgpr1L_a1_x1_yX +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_dgpmvUT_a1_x1_bX_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvUN_a1_x1_bX_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvUT_a1_x1_b1_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvUN_a1_x1_b1_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void 
ATL_dgpmvUT_a1_x1_b0_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvUN_a1_x1_b0_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvLT_a1_x1_bX_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvLN_a1_x1_bX_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvLT_a1_x1_b1_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvLN_a1_x1_b1_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvLT_a1_x1_b0_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgpmvLN_a1_x1_b0_y1 +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgbmvT_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgbmvN_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgbmvT_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void 
ATL_dgbmvN_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgbmvT_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dgbmvN_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_ctrsvLHU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLCU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLTU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLNU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLHN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLCN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLTN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLNN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUHU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUCU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUTU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUNU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUHN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUCN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUTN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUNN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLH +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLC +( + const enum 
ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrsvLN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUH +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUC +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrsvUN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLHU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLCU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLTU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLNU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLHN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLCN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLTN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLNN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUHU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUCU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUTU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUNU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUHN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUCN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUTN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUNN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLH +( + const enum ATLAS_DIAG, + 
const int, + const float *, const int, + float * +); + +void ATL_ctpsvLC +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpsvLN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUH +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUC +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpsvUN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctbsvLHU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLCU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLTU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLNU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLHN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLCN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLTN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLNN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUHU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUCU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUTU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUNU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUHN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUCN +( + const int, const int, + const 
float *, const int, + float * +); + +void ATL_ctbsvUTN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUNN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLH +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLC +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLT +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvLN +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUH +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUC +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUT +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbsvUN +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctrmvLHU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLCU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLTU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLNU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLHN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLCN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLTN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLNN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUHU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUCU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUTU +( + const int, + const float *, const int, + float * +); + 
+void ATL_ctrmvUNU +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUHN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUCN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUTN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUNN +( + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLH +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLC +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrmvLN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUH +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUC +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctrmvUN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLHU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLCU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLTU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLNU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLHN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLCN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLTN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLNN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUHU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUCU +( + const int, + const float *, const int, + 
float * +); + +void ATL_ctpmvUTU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUNU +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUHN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUCN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUTN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUNN +( + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLH +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLC +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpmvLN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUH +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUC +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUT +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctpmvUN +( + const enum ATLAS_DIAG, + const int, + const float *, const int, + float * +); + +void ATL_ctbmvLHU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLCU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLTU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLNU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLHN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLCN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLTN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLNN +( + const int, const int, + const 
float *, const int, + float * +); + +void ATL_ctbmvUHU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUCU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUTU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUNU +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUHN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUCN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUTN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUNN +( + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLH +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLC +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLT +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvLN +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUH +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUC +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUT +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_ctbmvUN +( + const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float * +); + +void ATL_cher2U +( + const int, + const float *, + const float *, + float *, const int +); + +void ATL_cher2L +( + const int, + const float *, + const float *, + float *, const int +); + +void ATL_chpr2U +( + const int, + const float *, + const float *, + float *, const int +); + +void ATL_chpr2L +( + const int, + const float *, 
+ const float *, + float *, const int +); + +void ATL_cherU +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_cherL +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_chprU +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_chprL +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_chemvU +( + const int, + const float *, const int, + const float *, + const float *, + float * +); + +void ATL_chemvL +( + const int, + const float *, const int, + const float *, + const float *, + float * +); + +void ATL_chpmvU +( + const int, + const float *, const int, + const float *, + const float *, + float * +); + +void ATL_chpmvL +( + const int, + const float *, const int, + const float *, + const float *, + float * +); + +void ATL_chbmvU +( + const int, const int, + const float *, const int, + const float *, + const float *, + float * +); + +void ATL_chbmvL +( + const int, const int, + const float *, const int, + const float *, + const float *, + float * +); + +void ATL_cgpmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpruU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgpruL +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgpru +( + const enum ATLAS_UPLO, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgprcU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgprcL +( + const int, const 
int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgprc +( + const enum ATLAS_UPLO, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgpr1uU_a1_x1_yX +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgpr1uL_a1_x1_yX +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgpr1cU_a1_x1_yX +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgpr1cL_a1_x1_yX +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_cgpmvUNc_a1_x1_bX_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUC_a1_x1_bX_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUT_a1_x1_bX_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUN_a1_x1_bX_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUNc_a1_x1_b1_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUC_a1_x1_b1_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUT_a1_x1_b1_y1 +( + const int, const int, + const 
float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUN_a1_x1_b1_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUNc_a1_x1_bXi0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUC_a1_x1_bXi0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUT_a1_x1_bXi0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUN_a1_x1_bXi0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUNc_a1_x1_b0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUC_a1_x1_b0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUT_a1_x1_b0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvUN_a1_x1_b0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLNc_a1_x1_bX_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLC_a1_x1_bX_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const 
float *, + float *, const int +); + +void ATL_cgpmvLT_a1_x1_bX_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLN_a1_x1_bX_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLNc_a1_x1_b1_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLC_a1_x1_b1_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLT_a1_x1_b1_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLN_a1_x1_b1_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLNc_a1_x1_bXi0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLC_a1_x1_bXi0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLT_a1_x1_bXi0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLN_a1_x1_bXi0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLNc_a1_x1_b0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLC_a1_x1_b0_y1 +( + 
const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLT_a1_x1_b0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgpmvLN_a1_x1_b0_y1 +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvNc_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvC_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvT_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvN_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvNc_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvC_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvT_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvN_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + 
const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvNc_a1_x1_bXi0_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvC_a1_x1_bXi0_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvT_a1_x1_bXi0_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvN_a1_x1_bXi0_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvNc_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvC_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvT_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_cgbmvN_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_ztrsvLHU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLCU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLTU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLNU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLHN 
+( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLCN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLTN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLNN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUHU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUCU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUTU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUNU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUHN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUCN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUTN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUNN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLH +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLC +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrsvLN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUH +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUC +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrsvUN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLHU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLCU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLTU +( + const int, + const 
double *, const int, + double * +); + +void ATL_ztpsvLNU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLHN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLCN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLTN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLNN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUHU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUCU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUTU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUNU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUHN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUCN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUTN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUNN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLH +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLC +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpsvLN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUH +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUC +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpsvUN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztbsvLHU +( + const int, const int, + const double *, 
const int, + double * +); + +void ATL_ztbsvLCU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLTU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLNU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLHN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLCN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLTN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLNN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUHU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUCU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUTU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUNU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUHN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUCN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUTN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUNN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLH +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLC +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLT +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvLN +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUH +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void 
ATL_ztbsvUC +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUT +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbsvUN +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztrmvLHU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLCU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLTU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLNU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLHN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLCN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLTN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLNN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUHU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUCU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUTU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUNU +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUHN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUCN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUTN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUNN +( + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLH +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLC +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrmvLN +( + const enum ATLAS_DIAG, 
+ const int, + const double *, const int, + double * +); + +void ATL_ztrmvUH +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUC +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztrmvUN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLHU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLCU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLTU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLNU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLHN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLCN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLTN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLNN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUHU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUCU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUTU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUNU +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUHN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUCN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUTN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUNN +( + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLH +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLC +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void 
ATL_ztpmvLT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpmvLN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUH +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUC +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUT +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztpmvUN +( + const enum ATLAS_DIAG, + const int, + const double *, const int, + double * +); + +void ATL_ztbmvLHU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLCU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLTU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLNU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLHN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLCN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLTN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLNN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUHU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUCU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUTU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUNU +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUHN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUCN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUTN +( + const int, const int, + const double *, const int, + double * 
+); + +void ATL_ztbmvUNN +( + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLH +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLC +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLT +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvLN +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUH +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUC +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUT +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_ztbmvUN +( + const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double * +); + +void ATL_zher2U +( + const int, + const double *, + const double *, + double *, const int +); + +void ATL_zher2L +( + const int, + const double *, + const double *, + double *, const int +); + +void ATL_zhpr2U +( + const int, + const double *, + const double *, + double *, const int +); + +void ATL_zhpr2L +( + const int, + const double *, + const double *, + double *, const int +); + +void ATL_zherU +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zherL +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zhprU +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zhprL +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zhemvU +( + const int, + const double *, const int, + const double *, + const double *, + double * +); + +void ATL_zhemvL +( + const int, + const 
double *, const int, + const double *, + const double *, + double * +); + +void ATL_zhpmvU +( + const int, + const double *, const int, + const double *, + const double *, + double * +); + +void ATL_zhpmvL +( + const int, + const double *, const int, + const double *, + const double *, + double * +); + +void ATL_zhbmvU +( + const int, const int, + const double *, const int, + const double *, + const double *, + double * +); + +void ATL_zhbmvL +( + const int, const int, + const double *, const int, + const double *, + const double *, + double * +); + +void ATL_zgpmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpruU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgpruL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgpru +( + const enum ATLAS_UPLO, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgprcU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgprcL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgprc +( + const enum ATLAS_UPLO, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgpr1uU_a1_x1_yX +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgpr1uL_a1_x1_yX +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + 
double *, const int +); + +void ATL_zgpr1cU_a1_x1_yX +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgpr1cL_a1_x1_yX +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zgpmvUNc_a1_x1_bX_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUC_a1_x1_bX_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUT_a1_x1_bX_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUN_a1_x1_bX_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUNc_a1_x1_b1_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUC_a1_x1_b1_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUT_a1_x1_b1_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUN_a1_x1_b1_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUNc_a1_x1_bXi0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUC_a1_x1_bXi0_y1 +( + 
const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUT_a1_x1_bXi0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUN_a1_x1_bXi0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUNc_a1_x1_b0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUC_a1_x1_b0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUT_a1_x1_b0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvUN_a1_x1_b0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLNc_a1_x1_bX_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLC_a1_x1_bX_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLT_a1_x1_bX_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLN_a1_x1_bX_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLNc_a1_x1_b1_y1 +( + const int, const int, + 
const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLC_a1_x1_b1_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLT_a1_x1_b1_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLN_a1_x1_b1_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLNc_a1_x1_bXi0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLC_a1_x1_bXi0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLT_a1_x1_bXi0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLN_a1_x1_bXi0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLNc_a1_x1_b0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLC_a1_x1_b0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLT_a1_x1_b0_y1 +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgpmvLN_a1_x1_b0_y1 +( + const int, const int, + const double *, + 
const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvNc_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvC_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvT_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvN_a1_x1_bX_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvNc_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvC_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvT_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvN_a1_x1_b1_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvNc_a1_x1_bXi0_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvC_a1_x1_bXi0_y1 +( + const int, const int, + const int, 
const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvT_a1_x1_bXi0_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvN_a1_x1_bXi0_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvNc_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvC_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvT_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zgbmvN_a1_x1_b0_y1 +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + + +#endif +/* + * End of atlas_kernel2.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_kernel3.h b/kaldi_io/src/tools/ATLAS/include/atlas_kernel3.h new file mode 100644 index 0000000..a929c2d --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_kernel3.h @@ -0,0 +1,1393 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#ifndef ATLAS_KERNEL3_H +#define ATLAS_KERNEL3_H + +/* + * Real level 3 kernels + */ +void ATL_ssymmRU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_ssymmLU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_ssymmRL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_ssymmLL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_strsmLLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmLLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmLLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmLLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmLLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmLLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmLLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmLLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmLUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmLUTN + (const int M, const int N, const void *valpha, const void *A, 
const int lda, + void *C, const int ldc); +void ATL_strsmLUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmLUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmLUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmLUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmLUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmLUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmRLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmRLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmRLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmRLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmRLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmRLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmRLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmRLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmRUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmRUTN + (const int M, const int N, const void 
*valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmRUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmRUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmRUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmRUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strsmRUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_strmmRUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ssyrkLT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_ssyrkUT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_ssyrkLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_ssyrkUN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +int ATL_ssyr2kLT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_ssyr2kUT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_ssyr2kLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_ssyr2kUN + (const int N, const int K, const void *valpha, const void *A, 
const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +void ATL_dsymmRU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_dsymmLU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_dsymmRL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_dsymmLL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_dtrsmLLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmLLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmLLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmLLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmLLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmLLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmLLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmLLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmLUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmLUTN + (const int M, const int N, const void *valpha, const void *A, const 
int lda, + void *C, const int ldc); +void ATL_dtrsmLUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmLUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmLUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmLUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmLUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmLUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmRLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmRLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmRLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmRLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmRLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmRLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmRLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmRLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmRUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmRUTN + (const int M, const int N, const void *valpha, 
const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmRUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmRUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmRUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmRUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrsmRUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dtrmmRUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_dsyrkLT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_dsyrkUT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_dsyrkLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_dsyrkUN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +int ATL_dsyr2kLT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_dsyr2kUT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_dsyr2kLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_dsyr2kUN + (const int N, const int K, const void *valpha, const void *A, const int 
lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); + +/* + * Complex level 3 kernels + */ +void ATL_chemmRU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_chemmLU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_chemmRL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_chemmLL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_csymmRU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_csymmLU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_csymmRL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_csymmLL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_ctrsmLLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void 
ATL_ctrsmLLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLLCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLLCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLLCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLLCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const 
int ldc); +void ATL_ctrsmLUCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLUCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmLUCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmLUCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRLCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRLCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRLCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRLCU + (const int M, const int N, const void *valpha, const void *A, const int lda, 
+ void *C, const int ldc); +void ATL_ctrsmRUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRUCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRUCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrsmRUCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ctrmmRUCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_cherkLC + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_cherkUC + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_cherkLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_cherkUN + (const int 
N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_csyrkLT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_csyrkUT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_csyrkLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_csyrkUN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +int ATL_cher2kLC + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_cher2kUC + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_cher2kLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_cher2kUN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_csyr2kLT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_csyr2kUT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_csyr2kLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_csyr2kUN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const 
void *B, const int ldb, const void *vbeta, void *C, const int ldc); +void ATL_zhemmRU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_zhemmLU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_zhemmRL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_zhemmLL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_zsymmRU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_zsymmLU + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_zsymmRL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_zsymmLL + (const int M, const int N, const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, void *C, const int ldc); +void ATL_ztrsmLLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLLNN + (const int M, const int N, const void 
*valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLLCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLLCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLLCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLLCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLUCN + (const int M, const 
int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLUCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmLUCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmLUCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRLTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRLTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRLNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRLNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRLCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRLCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRLCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRLCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRUTN + 
(const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRUTN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRUTU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRUNN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRUNU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRUCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRUCN + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrsmRUCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_ztrmmRUCU + (const int M, const int N, const void *valpha, const void *A, const int lda, + void *C, const int ldc); +void ATL_zherkLC + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_zherkUC + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_zherkLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_zherkUN + (const int N, const int K, const void *valpha, const void 
*A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_zsyrkLT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_zsyrkUT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_zsyrkLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +void ATL_zsyrkUN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *vbeta, void *C, const int ldc); +int ATL_zher2kLC + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_zher2kUC + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_zher2kLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_zher2kUN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_zsyr2kLT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_zsyr2kUT + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_zsyr2kLN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, void *C, const int ldc); +int ATL_zsyr2kUN + (const int N, const int K, const void *valpha, const void *A, const int lda, + const void *B, const int ldb, const void *vbeta, 
void *C, const int ldc); + +/* + * Real level 3 kernel auxiliaries + */ +void ATL_ssycopyU_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_ssycopyL_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2L_N_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2L_U_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2U_N_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2U_U_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2L_N_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2L_U_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2U_N_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2U_U_a0 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_ssycopyU_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_ssycopyL_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2L_N_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2L_U_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2U_N_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2U_U_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2L_N_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2L_U_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2U_N_a1 + (const int N, const float alpha, const float *A, const int 
lda, float *C); +void ATL_strcopyL2U_U_a1 + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_ssycopyU_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_ssycopyL_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2L_N_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2L_U_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2U_N_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyU2U_U_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2L_N_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2L_U_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2U_N_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strcopyL2U_U_aX + (const int N, const float alpha, const float *A, const int lda, float *C); +void ATL_strinvertUU(const int N, float *A, const int lda); +void ATL_strinvertLU(const int N, float *A, const int lda); +void ATL_strinvertUN(const int N, float *A, const int lda); +void ATL_strinvertLN(const int N, float *A, const int lda); +void ATL_ssyr2k_putU_bX + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_ssyr2k_putL_bX + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_strputU_bX + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_strputL_bX + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_ssyr2k_putU_b1 + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_ssyr2k_putL_b1 + (const int N, const float *v, const float beta, float *A, const int lda); +void 
ATL_strputU_b1 + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_strputL_b1 + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_ssyr2k_putU_b0 + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_ssyr2k_putL_b0 + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_strputU_b0 + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_strputL_b0 + (const int N, const float *v, const float beta, float *A, const int lda); +void ATL_strsmKLLTN + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKLLTU + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKLLNN + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKLLNU + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKLUTN + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKLUTU + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKLUNN + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKLUNU + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKRLTN + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKRLTU + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKRLNN + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void 
ATL_strsmKRLNU + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKRUTN + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKRUTU + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKRUNN + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_strsmKRUNU + (const int M, const int N, const float alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_dsycopyU_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dsycopyL_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2L_N_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2L_U_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2U_N_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2U_U_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2L_N_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2L_U_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2U_N_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2U_U_a0 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dsycopyU_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dsycopyL_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2L_N_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); 
+void ATL_dtrcopyU2L_U_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2U_N_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2U_U_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2L_N_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2L_U_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2U_N_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2U_U_a1 + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dsycopyU_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dsycopyL_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2L_N_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2L_U_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2U_N_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyU2U_U_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2L_N_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2L_U_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2U_N_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrcopyL2U_U_aX + (const int N, const double alpha, const double *A, const int lda, double *C); +void ATL_dtrinvertUU(const int N, double *A, const int lda); +void ATL_dtrinvertLU(const int N, double *A, const int lda); +void ATL_dtrinvertUN(const int N, double *A, const int lda); +void 
ATL_dtrinvertLN(const int N, double *A, const int lda); +void ATL_dsyr2k_putU_bX + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dsyr2k_putL_bX + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dtrputU_bX + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dtrputL_bX + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dsyr2k_putU_b1 + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dsyr2k_putL_b1 + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dtrputU_b1 + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dtrputL_b1 + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dsyr2k_putU_b0 + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dsyr2k_putL_b0 + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dtrputU_b0 + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dtrputL_b0 + (const int N, const double *v, const double beta, double *A, const int lda); +void ATL_dtrsmKLLTN + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKLLTU + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKLLNN + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKLLNU + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKLUTN + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKLUTU + (const int M, const int N, const double 
alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKLUNN + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKLUNU + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKRLTN + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKRLTU + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKRLNN + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKRLNU + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKRUTN + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKRUTU + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKRUNN + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_dtrsmKRUNU + (const int M, const int N, const double alpha, const double *A, + const int lda, double *C, const int ldc); + +/* + * Complex level 3 kernel auxiliaries + */ +void ATL_cCtrsmKL + (enum ATLAS_UPLO Uplo, enum ATLAS_TRANS Trans, enum ATLAS_DIAG Diag, + const int M, const int N, const float *alpha, const float *A, + const int lda, float *B, const int ldb); +void ATL_checopy + (const int N, const float *A, const int lda, float *C); +void ATL_csycopy + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyU2L_N + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyU2Lc_N + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyU2L_U + (const int N, const float *A, 
const int lda, float *C); +void ATL_ctrcopyU2Lc_U + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyU2U_N + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyU2Uc_N + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyU2U_U + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyU2Uc_U + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyL2L_N + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyL2Lc_N + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyL2L_U + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyL2Lc_U + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyL2U_N + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyL2Uc_N + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyL2U_U + (const int N, const float *A, const int lda, float *C); +void ATL_ctrcopyL2Uc_U + (const int N, const float *A, const int lda, float *C); +void ATL_ctrmv_scalLNU_an1 + (const int N, const float *alpha, const float *A, const int lda, float *X); +void ATL_ctrmv_scalLNN_aX + (const int N, const float *alpha, const float *A, const int lda, float *X); +void ATL_ctrmv_scalUNU_an1 + (const int N, const float *alpha, const float *A, const int lda, float *X); +void ATL_ctrmv_scalUNN_aX + (const int N, const float *alpha, const float *A, const int lda, float *X); +void ATL_ctrinvertUU(const int N, float *A, const int lda); +void ATL_ctrinvertLU(const int N, float *A, const int lda); +void ATL_ctrinvertUN(const int N, float *A, const int lda); +void ATL_ctrinvertLN(const int N, float *A, const int lda); +void ATL_ctrputU_b0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputL_b0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putU_b0 + (const int N, const 
float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putL_b0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputU_b1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputL_b1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putU_b1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putL_b1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputU_bX + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputL_bX + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putU_bX + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putL_bX + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputU_bXi0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputL_bXi0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putU_bXi0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putL_bXi0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputU_bn1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrputL_bn1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putU_bn1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_csyr2k_putL_bn1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cher2k_putU_b0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cher2k_putL_b0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cheputU_b0 + 
(const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cheputL_b0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cher2k_putU_b1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cher2k_putL_b1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cheputU_b1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cheputL_b1 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cher2k_putU_bXi0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cher2k_putL_bXi0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cheputU_bXi0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_cheputL_bXi0 + (const int N, const float *v, const float *beta, float *A, const int lda); +void ATL_ctrsm0LLTN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LLTU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LLNN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LLNU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LLCN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LLCU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LUTN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LUTU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, 
const int ldc); +void ATL_ctrsm0LUNN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LUNU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LUCN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0LUCU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RLTN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RLTU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RLNN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RLNU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RLCN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RLCU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RUTN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RUTU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RUNN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RUNU + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RUCN + (const int M, const int N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_ctrsm0RUCU + (const int M, const int 
N, const float *alpha, const float *A, + const int lda, float *C, const int ldc); +void ATL_zCtrsmKL + (enum ATLAS_UPLO Uplo, enum ATLAS_TRANS Trans, enum ATLAS_DIAG Diag, + const int M, const int N, const double *alpha, const double *A, + const int lda, double *B, const int ldb); +void ATL_zhecopy + (const int N, const double *A, const int lda, double *C); +void ATL_zsycopy + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyU2L_N + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyU2Lc_N + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyU2L_U + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyU2Lc_U + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyU2U_N + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyU2Uc_N + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyU2U_U + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyU2Uc_U + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyL2L_N + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyL2Lc_N + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyL2L_U + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyL2Lc_U + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyL2U_N + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyL2Uc_N + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyL2U_U + (const int N, const double *A, const int lda, double *C); +void ATL_ztrcopyL2Uc_U + (const int N, const double *A, const int lda, double *C); +void ATL_ztrmv_scalLNU_an1 + (const int N, const double *alpha, const double *A, const int lda, double *X); +void ATL_ztrmv_scalLNN_aX + (const int N, const double *alpha, const double *A, const int lda, double *X); +void 
ATL_ztrmv_scalUNU_an1 + (const int N, const double *alpha, const double *A, const int lda, double *X); +void ATL_ztrmv_scalUNN_aX + (const int N, const double *alpha, const double *A, const int lda, double *X); +void ATL_ztrinvertUU(const int N, double *A, const int lda); +void ATL_ztrinvertLU(const int N, double *A, const int lda); +void ATL_ztrinvertUN(const int N, double *A, const int lda); +void ATL_ztrinvertLN(const int N, double *A, const int lda); +void ATL_ztrputU_b0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputL_b0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putU_b0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putL_b0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputU_b1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputL_b1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putU_b1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putL_b1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputU_bX + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputL_bX + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putU_bX + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putL_bX + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputU_bXi0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputL_bXi0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putU_bXi0 + (const int N, const double *v, const double *beta, double *A, const 
int lda); +void ATL_zsyr2k_putL_bXi0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputU_bn1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrputL_bn1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putU_bn1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zsyr2k_putL_bn1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zher2k_putU_b0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zher2k_putL_b0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zheputU_b0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zheputL_b0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zher2k_putU_b1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zher2k_putL_b1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zheputU_b1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zheputL_b1 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zher2k_putU_bXi0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zher2k_putL_bXi0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zheputU_bXi0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_zheputL_bXi0 + (const int N, const double *v, const double *beta, double *A, const int lda); +void ATL_ztrsm0LLTN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LLTU + (const int M, const int N, const double *alpha, const double *A, + 
const int lda, double *C, const int ldc); +void ATL_ztrsm0LLNN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LLNU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LLCN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LLCU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LUTN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LUTU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LUNN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LUNU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LUCN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0LUCU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RLTN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RLTU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RLNN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RLNU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RLCN + (const int M, const int N, const double *alpha, const double *A, + const int lda, 
double *C, const int ldc); +void ATL_ztrsm0RLCU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RUTN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RUTU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RUNN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RUNU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RUCN + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); +void ATL_ztrsm0RUCU + (const int M, const int N, const double *alpha, const double *A, + const int lda, double *C, const int ldc); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_lapack.h b/kaldi_io/src/tools/ATLAS/include/atlas_lapack.h new file mode 100644 index 0000000..4b370b8 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_lapack.h @@ -0,0 +1,239 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef ATLAS_LAPACK_H + #define ATLAS_LAPACK_H + +#include "atlas_misc.h" +#include "cblas.h" + +#ifdef PATL + +#include "atlas_cblastypealias.h" +/* + * predefined type macro names + */ +#define ATL_getriR Mjoin(PATL,getriR) +#define ATL_getriC Mjoin(PATL,getriC) +#define ATL_getri Mjoin(PATL,getri) +#define ATL_lauumRL Mjoin(PATL,lauumRL) +#define ATL_lauumRU Mjoin(PATL,lauumRU) +#define ATL_lauumCL Mjoin(PATL,lauumCL) +#define ATL_lauumCU Mjoin(PATL,lauumCU) +#define ATL_lauum Mjoin(PATL,lauum) +#define ATL_trtriRL Mjoin(PATL,trtriRL) +#define ATL_trtriRU Mjoin(PATL,trtriRU) +#define ATL_trtriCL Mjoin(PATL,trtriCL) +#define ATL_trtriCU Mjoin(PATL,trtriCU) +#define ATL_trtri Mjoin(PATL,trtri) +#define ATL_potrfU Mjoin(PATL,potrfU) +#define ATL_potrfL Mjoin(PATL,potrfL) +#define ATL_potrs Mjoin(PATL,potrs) +#define ATL_potrf Mjoin(PATL,potrf) +#define ATL_getrfR Mjoin(PATL,getrfR) +#define ATL_getrfC Mjoin(PATL,getrfC) +#define ATL_getrs Mjoin(PATL,getrs) +#define ATL_getrf 
Mjoin(PATL,getrf) +#define ATL_laswp Mjoin(PATL,laswp) + +#endif +/* Single-precision prototypes; the getri family names float explicitly, consistent with the other ATL_s* declarations that follow. */ +int ATL_sgetri(const enum CBLAS_ORDER Order, const int N, float *A, const int lda, + const int *ipiv, float *wrk, int *lwrk); +int ATL_sgetriR(const int N, float *A, const int lda, const int *ipiv, + float *wrk, const int lwrk); +int ATL_sgetriC(const int N, float *A, const int lda, const int *ipiv, + float *wrk, const int lwrk); +void ATL_slauum(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, float *A, const int lda); +int ATL_spotrf(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, float *A, const int lda); +void ATL_spotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int NRHS, const float *A, const int lda, + float *B, const int ldb); +int ATL_sgetrf(const enum CBLAS_ORDER Order, const int M, const int N, + float *A, const int lda, int *ipiv); +void ATL_sgetrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, + const int N, const int NRHS, const float *A, const int lda, + const int *ipiv, float *B, const int ldb); +void ATL_slaswp(const int N, float *A, const int lda0, const int K1, + const int K2, const int *ipiv, const int inci); +int ATL_sgetrfC(const int M, const int N, float *A, const int lda, + int *ipiv); +int ATL_sgetrfR(const int M, const int N, float *A, const int lda, + int *ipiv); +void ATL_slauumRU(const int N, float *A, const int lda); +void ATL_slauumRL(const int N, float *A, const int lda); +void ATL_slauumCU(const int N, float *A, const int lda); +void ATL_slauumCL(const int N, float *A, const int lda); +int ATL_spotrfU(const int N, float *A, const int lda); +int ATL_spotrfL(const int N, float *A, const int lda); +int ATL_strtri(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_DIAG Diag, const int N, + float *A, const int lda); +int ATL_strtriRU(const enum CBLAS_DIAG Diag, const int N, float *A, + const int lda); +int ATL_strtriRL(const enum CBLAS_DIAG Diag,
const int N, float *A, + const int lda); +int ATL_strtriCU(const enum CBLAS_DIAG Diag, const int N, float *A, + const int lda); +int ATL_strtriCL(const enum CBLAS_DIAG Diag, const int N, float *A, + const int lda); +/* Double-precision prototypes; the getri family names double explicitly, consistent with the other ATL_d* declarations that follow. */ +int ATL_dgetri(const enum CBLAS_ORDER Order, const int N, double *A, const int lda, + const int *ipiv, double *wrk, int *lwrk); +int ATL_dgetriR(const int N, double *A, const int lda, const int *ipiv, + double *wrk, const int lwrk); +int ATL_dgetriC(const int N, double *A, const int lda, const int *ipiv, + double *wrk, const int lwrk); +void ATL_dlauum(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, double *A, const int lda); +int ATL_dpotrf(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, double *A, const int lda); +void ATL_dpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int NRHS, const double *A, const int lda, + double *B, const int ldb); +int ATL_dgetrf(const enum CBLAS_ORDER Order, const int M, const int N, + double *A, const int lda, int *ipiv); +void ATL_dgetrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, + const int N, const int NRHS, const double *A, const int lda, + const int *ipiv, double *B, const int ldb); +void ATL_dlaswp(const int N, double *A, const int lda0, const int K1, + const int K2, const int *ipiv, const int inci); +int ATL_dgetrfC(const int M, const int N, double *A, const int lda, + int *ipiv); +int ATL_dgetrfR(const int M, const int N, double *A, const int lda, + int *ipiv); +void ATL_dlauumRU(const int N, double *A, const int lda); +void ATL_dlauumRL(const int N, double *A, const int lda); +void ATL_dlauumCU(const int N, double *A, const int lda); +void ATL_dlauumCL(const int N, double *A, const int lda); +int ATL_dpotrfU(const int N, double *A, const int lda); +int ATL_dpotrfL(const int N, double *A, const int lda); +int ATL_dtrtri(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_DIAG Diag, const int N,
+ double *A, const int lda); +int ATL_dtrtriRU(const enum CBLAS_DIAG Diag, const int N, double *A, + const int lda); +int ATL_dtrtriRL(const enum CBLAS_DIAG Diag, const int N, double *A, + const int lda); +int ATL_dtrtriCU(const enum CBLAS_DIAG Diag, const int N, double *A, + const int lda); +int ATL_dtrtriCL(const enum CBLAS_DIAG Diag, const int N, double *A, + const int lda); +/* Single-precision complex prototypes; the getri family names float explicitly, consistent with the other ATL_c* declarations that follow. */ +int ATL_cgetri(const enum CBLAS_ORDER Order, const int N, float *A, const int lda, + const int *ipiv, float *wrk, int *lwrk); +int ATL_cgetriR(const int N, float *A, const int lda, const int *ipiv, + float *wrk, const int lwrk); +int ATL_cgetriC(const int N, float *A, const int lda, const int *ipiv, + float *wrk, const int lwrk); +void ATL_clauum(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, float *A, const int lda); +int ATL_cpotrf(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, float *A, const int lda); +void ATL_cpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int NRHS, const float *A, const int lda, + float *B, const int ldb); +int ATL_cgetrf(const enum CBLAS_ORDER Order, const int M, const int N, + float *A, const int lda, int *ipiv); +void ATL_cgetrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, + const int N, const int NRHS, const float *A, const int lda, + const int *ipiv, float *B, const int ldb); +void ATL_claswp(const int N, float *A, const int lda0, const int K1, + const int K2, const int *ipiv, const int inci); +int ATL_cgetrfC(const int M, const int N, float *A, const int lda, + int *ipiv); +int ATL_cgetrfR(const int M, const int N, float *A, const int lda, + int *ipiv); +void ATL_clauumRU(const int N, float *A, const int lda); +void ATL_clauumRL(const int N, float *A, const int lda); +void ATL_clauumCU(const int N, float *A, const int lda); +void ATL_clauumCL(const int N, float *A, const int lda); +int ATL_cpotrfRU(const int N, float *A, const int lda); +int
ATL_cpotrfRL(const int N, float *A, const int lda); +int ATL_cpotrfU(const int N, float *A, const int lda); +int ATL_cpotrfL(const int N, float *A, const int lda); +int ATL_ctrtri(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_DIAG Diag, const int N, + float *A, const int lda); +int ATL_ctrtriRU(const enum CBLAS_DIAG Diag, const int N, float *A, + const int lda); +int ATL_ctrtriRL(const enum CBLAS_DIAG Diag, const int N, float *A, + const int lda); +int ATL_ctrtriCU(const enum CBLAS_DIAG Diag, const int N, float *A, + const int lda); +int ATL_ctrtriCL(const enum CBLAS_DIAG Diag, const int N, float *A, + const int lda); +/* Double-precision complex prototypes; the getri family names double explicitly, consistent with the other ATL_z* declarations that follow. */ +int ATL_zgetri(const enum CBLAS_ORDER Order, const int N, double *A, const int lda, + const int *ipiv, double *wrk, int *lwrk); +int ATL_zgetriR(const int N, double *A, const int lda, const int *ipiv, + double *wrk, const int lwrk); +int ATL_zgetriC(const int N, double *A, const int lda, const int *ipiv, + double *wrk, const int lwrk); +void ATL_zlauum(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, double *A, const int lda); +int ATL_zpotrf(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, double *A, const int lda); +void ATL_zpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int NRHS, const double *A, const int lda, + double *B, const int ldb); +int ATL_zgetrf(const enum CBLAS_ORDER Order, const int M, const int N, + double *A, const int lda, int *ipiv); +void ATL_zgetrs(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, + const int N, const int NRHS, const double *A, const int lda, + const int *ipiv, double *B, const int ldb); +void ATL_zlaswp(const int N, double *A, const int lda0, const int K1, + const int K2, const int *ipiv, const int inci); +int ATL_zgetrfC(const int M, const int N, double *A, const int lda, + int *ipiv); +int ATL_zgetrfR(const int M, const int N, double *A, const int lda, + int *ipiv); +void ATL_zlauumRU(const
int N, double *A, const int lda); +void ATL_zlauumRL(const int N, double *A, const int lda); +void ATL_zlauumCU(const int N, double *A, const int lda); +void ATL_zlauumCL(const int N, double *A, const int lda); +int ATL_zpotrfRU(const int N, double *A, const int lda); +int ATL_zpotrfRL(const int N, double *A, const int lda); +int ATL_zpotrfU(const int N, double *A, const int lda); +int ATL_zpotrfL(const int N, double *A, const int lda); +int ATL_ztrtri(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_DIAG Diag, const int N, + double *A, const int lda); +int ATL_ztrtriRU(const enum CBLAS_DIAG Diag, const int N, double *A, + const int lda); +int ATL_ztrtriRL(const enum CBLAS_DIAG Diag, const int N, double *A, + const int lda); +int ATL_ztrtriCU(const enum CBLAS_DIAG Diag, const int N, double *A, + const int lda); +int ATL_ztrtriCL(const enum CBLAS_DIAG Diag, const int N, double *A, + const int lda); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_level1.h b/kaldi_io/src/tools/ATLAS/include/atlas_level1.h new file mode 100644 index 0000000..d4d61d8 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_level1.h @@ -0,0 +1,127 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * Prototypes ATLAS Level 1 functions not defined in atlas_aux.h + */ +#ifndef ATLAS_LEVEL1_H +#define ATLAS_LEVEL1_H + +/* + * Many level one blas routines actually taken care of by atlas auxiliary + */ +#include "atlas_aux.h" + +float ATL_sdsdot(const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY); +double ATL_dsdot(const int N, const float *X, const int incX, + const float *Y, const int incY); +/* + * Routines with all four types + */ +void ATL_sswap(const int N, float *X, const int incX, + float *Y, const int incY); +int ATL_isamax(const int N, const float *X, const int incX); + +void ATL_dswap(const int N, double *X, const int incX, + double *Y, const int incY); +int ATL_idamax(const int N, const double *X, const int incX); + +void ATL_cswap(const int N, float *X, const int incX, + float *Y, const int incY); +int ATL_icamax(const int N, const float *X, const int incX); + +void ATL_zswap(const int N, double *X, const int incX, + double *Y, const int incY); +int ATL_izamax(const int N, const double *X, const int incX); + +/* + * Routines with real types + */ +void ATL_srotg(float *a, float *b, float *c, float 
*s); +void ATL_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void ATL_srot(const int N, float *X, const int incX, + float *Y, const int incY, const float c, const float s); +void ATL_srotm(const int N, float *X, const int incX, + float *Y, const int incY, const float *P); +float ATL_sdot(const int N, const float *X, const int incX, + const float *Y, const int incY); +void ATL_sssq(const int N, const float *X, const int incX, + float *scal0, float *ssq0); +float ATL_snrm2(const int N, const float *X, const int incX); +float ATL_sasum(const int N, const float *X, const int incX); + +void ATL_drotg(double *a, double *b, double *c, double *s); +void ATL_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); +void ATL_drot(const int N, double *X, const int incX, + double *Y, const int incY, const double c, const double s); +void ATL_drotm(const int N, double *X, const int incX, + double *Y, const int incY, const double *P); +double ATL_ddot(const int N, const double *X, const int incX, + const double *Y, const int incY); +void ATL_dssq(const int N, const double *X, const int incX, + double *scal0, double *ssq0); +double ATL_dnrm2(const int N, const double *X, const int incX); +double ATL_dasum(const int N, const double *X, const int incX); + +/* + * Routines with complex types + */ +void ATL_csrot(const int N, float *X, const int incX, + float *Y, const int incY, const float c, const float s); +void ATL_crotg(float *a, const float *b, float *c, float *s); +void ATL_cdotu_sub(const int N, const float *X, const int incX, + const float *Y, const int incY, float *dot); +void ATL_cdotc_sub(const int N, const float *X, const int incX, + const float *Y, const int incY, float *dot); +void ATL_cssq(const int N, const float *X, const int incX, + float *scal0, float *ssq0); +float ATL_scnrm2(const int N, const float *X, const int incX); +float ATL_scasum(const int N, const float *X, const int incX); + +void ATL_zdrot(const int N, double *X, 
const int incX, + double *Y, const int incY, const double c, const double s); +void ATL_zrotg(double *a, const double *b, double *c, double *s); +void ATL_zdotu_sub(const int N, const double *X, const int incX, + const double *Y, const int incY, double *dot); +void ATL_zdotc_sub(const int N, const double *X, const int incX, + const double *Y, const int incY, double *dot); +void ATL_zssq(const int N, const double *X, const int incX, + double *scal0, double *ssq0); +double ATL_dznrm2(const int N, const double *X, const int incX); +double ATL_dzasum(const int N, const double *X, const int incX); + + +#define ATL_casum ATL_scasum +#define ATL_zasum ATL_dzasum +#define ATL_cnrm2 ATL_scnrm2 +#define ATL_znrm2 ATL_dznrm2 + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_level2.h b/kaldi_io/src/tools/ATLAS/include/atlas_level2.h new file mode 100644 index 0000000..d05f6d5 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_level2.h @@ -0,0 +1,267 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * =========================================================================== + * Prototypes for level 2 BLAS + * =========================================================================== + */ +#ifndef ATLAS_LEVEL2_H +#define ATLAS_LEVEL2_H + +/* + * Routines with standard 4 prefixes (S, D, C, Z) + */ +void ATL_sgemv(const enum ATLAS_TRANS TransA, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sgbmv(const enum ATLAS_TRANS TransA, const int M, const int N, + const int KL, const int KU, const float alpha, + const float *A, const int lda, const float *X, + const int incX, const float beta, float *Y, const int incY); +void ATL_strmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const float *A, const int lda, float *X, const int incX); +void ATL_stbmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const int K, + const float *A, const int lda, float *X, const int incX); +void ATL_stpmv(const enum ATLAS_UPLO Uplo, const enum 
ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const float *Ap, + float *X, const int incX); +void ATL_strsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const float *A, const int lda, float *X, const int incX); +void ATL_stbsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const int K, + const float *A, const int lda, float *X, const int incX); +void ATL_stpsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const float *Ap, float *X, const int incX); + +void ATL_dgemv(const enum ATLAS_TRANS TransA, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgbmv(const enum ATLAS_TRANS TransA, const int M, const int N, + const int KL, const int KU, const double alpha, + const double *A, const int lda, const double *X, + const int incX, const double beta, double *Y, const int incY); +void ATL_dtrmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const double *A, const int lda, double *X, const int incX); +void ATL_dtbmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const int K, + const double *A, const int lda, double *X, const int incX); +void ATL_dtpmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const double *Ap, + double *X, const int incX); +void ATL_dtrsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const double *A, const int lda, double *X, const int incX); +void ATL_dtbsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const int K, + const double *A, const int lda, double *X, const int 
incX); +void ATL_dtpsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const double *Ap, double *X, const int incX); + +void ATL_cgemv(const enum ATLAS_TRANS TransA, const int M, const int N, + const float *alpha, const float *A, const int lda, + const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgbmv(const enum ATLAS_TRANS TransA, const int M, const int N, + const int KL, const int KU, const float *alpha, + const float *A, const int lda, const float *X, + const int incX, const float *beta, float *Y, const int incY); +void ATL_ctrmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const float *A, const int lda, float *X, const int incX); +void ATL_ctbmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const int K, + const float *A, const int lda, float *X, const int incX); +void ATL_ctpmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const float *Ap, + float *X, const int incX); +void ATL_ctrsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const float *A, const int lda, float *X, const int incX); +void ATL_ctbsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const int K, + const float *A, const int lda, float *X, const int incX); +void ATL_ctpsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const float *Ap, float *X, const int incX); + +void ATL_zgemv(const enum ATLAS_TRANS TransA, const int M, const int N, + const double *alpha, const double *A, const int lda, + const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgbmv(const enum ATLAS_TRANS TransA, const int M, const int N, + const int KL, const int 
KU, const double *alpha, + const double *A, const int lda, const double *X, + const int incX, const double *beta, double *Y, const int incY); +void ATL_ztrmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const double *A, const int lda, double *X, const int incX); +void ATL_ztbmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const int K, + const double *A, const int lda, double *X, const int incX); +void ATL_ztpmv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const double *Ap, + double *X, const int incX); +void ATL_ztrsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const double *A, const int lda, double *X, const int incX); +void ATL_ztbsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, const int K, + const double *A, const int lda, double *X, const int incX); +void ATL_ztpsv(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS TransA, + const enum ATLAS_DIAG Diag, const int N, + const double *Ap, double *X, const int incX); + + +/* + * Routines with S and D prefixes only + */ +void ATL_ssymv(const enum ATLAS_UPLO Uplo, const int N, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_ssbmv(const enum ATLAS_UPLO Uplo, const int N, const int K, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sspmv(const enum ATLAS_UPLO Uplo, const int N, const float alpha, + const float *Ap, const float *X, const int incX, + const float beta, float *Y, const int incY); +void ATL_sger(const int M, const int N, const float alpha, + const float *X, const int incX, const float *Y, const int incY, + float *A, const int lda); +void 
ATL_ssyr(const enum ATLAS_UPLO Uplo, const int N, const float alpha, + const float *X, const int incX, float *A, const int lda); +void ATL_sspr(const enum ATLAS_UPLO Uplo, const int N, const float alpha, + const float *X, const int incX, float *Ap); +void ATL_ssyr2(const enum ATLAS_UPLO Uplo, const int N, const float alpha, + const float *X, const int incX, const float *Y, const int incY, + float *A, const int lda); +void ATL_sspr2(const enum ATLAS_UPLO Uplo, const int N, const float alpha, + const float *X, const int incX, const float *Y, const int incY, + float *A); + +void ATL_dsymv(const enum ATLAS_UPLO Uplo, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dsbmv(const enum ATLAS_UPLO Uplo, const int N, const int K, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dspmv(const enum ATLAS_UPLO Uplo, const int N, const double alpha, + const double *Ap, const double *X, const int incX, + const double beta, double *Y, const int incY); +void ATL_dger(const int M, const int N, const double alpha, + const double *X, const int incX, const double *Y, const int incY, + double *A, const int lda); +void ATL_dsyr(const enum ATLAS_UPLO Uplo, const int N, const double alpha, + const double *X, const int incX, double *A, const int lda); +void ATL_dspr(const enum ATLAS_UPLO Uplo, const int N, const double alpha, + const double *X, const int incX, double *Ap); +void ATL_dsyr2(const enum ATLAS_UPLO Uplo, const int N, const double alpha, + const double *X, const int incX, const double *Y, const int incY, + double *A, const int lda); +void ATL_dspr2(const enum ATLAS_UPLO Uplo, const int N, const double alpha, + const double *X, const int incX, const double *Y, const int incY, + double *A); + + +/* + * Routines with C and Z prefixes only + */ +void ATL_chemv(const enum 
ATLAS_UPLO Uplo, const int N, + const float *alpha, const float *A, const int lda, + const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_chbmv(const enum ATLAS_UPLO Uplo, const int N, const int K, + const float *alpha, const float *A, const int lda, + const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_chpmv(const enum ATLAS_UPLO Uplo, const int N, + const float *alpha, const float *Ap, + const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgeru(const int M, const int N, const float *alpha, + const float *X, const int incX, const float *Y, const int incY, + float *A, const int lda); +void ATL_cgerc(const int M, const int N, const float *alpha, + const float *X, const int incX, const float *Y, const int incY, + float *A, const int lda); +void ATL_cher(const enum ATLAS_UPLO Uplo, const int N, const float alpha, + const float *X, const int incX, float *A, const int lda); +void ATL_chpr(const enum ATLAS_UPLO Uplo, const int N, const float alpha, + const float *X, const int incX, float *A); +void ATL_cher2(const enum ATLAS_UPLO Uplo, const int N, + const float *alpha, const float *X, const int incX, + const float *Y, const int incY, float *A, const int lda); +void ATL_chpr2(const enum ATLAS_UPLO Uplo, const int N, + const float *alpha, const float *X, const int incX, + const float *Y, const int incY, float *Ap); + +void ATL_zhemv(const enum ATLAS_UPLO Uplo, const int N, + const double *alpha, const double *A, const int lda, + const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zhbmv(const enum ATLAS_UPLO Uplo, const int N, const int K, + const double *alpha, const double *A, const int lda, + const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zhpmv(const enum ATLAS_UPLO Uplo, const int N, + const double *alpha, const double *Ap, + const double *X, const int incX, const double 
*beta, + double *Y, const int incY); +void ATL_zgeru(const int M, const int N, const double *alpha, + const double *X, const int incX, const double *Y, const int incY, + double *A, const int lda); +void ATL_zgerc(const int M, const int N, const double *alpha, + const double *X, const int incX, const double *Y, const int incY, + double *A, const int lda); +void ATL_zher(const enum ATLAS_UPLO Uplo, const int N, const double alpha, + const double *X, const int incX, double *A, const int lda); +void ATL_zhpr(const enum ATLAS_UPLO Uplo, const int N, const double alpha, + const double *X, const int incX, double *A); +void ATL_zher2(const enum ATLAS_UPLO Uplo, const int N, + const double *alpha, const double *X, const int incX, + const double *Y, const int incY, double *A, const int lda); +void ATL_zhpr2(const enum ATLAS_UPLO Uplo, const int N, + const double *alpha, const double *X, const int incX, + const double *Y, const int incY, double *Ap); + + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_level3.h b/kaldi_io/src/tools/ATLAS/include/atlas_level3.h new file mode 100644 index 0000000..023c63c --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_level3.h @@ -0,0 +1,181 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1997 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ +/* + * =========================================================================== + * Prototypes for level 3 BLAS + * =========================================================================== + */ +#ifndef ATLAS_LEVEL3_H +#define ATLAS_LEVEL3_H + + +/* + * Routines with standard 4 prefixes (S, D, C, Z) + */ +int ATL_sGetNB(void); +int ATL_sGetNCNB(void); +void ATL_sgemm(const enum ATLAS_TRANS TransA, const enum ATLAS_TRANS TransB, + const int M, const int N, const int K, const float alpha, + const float *A, const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc); +void ATL_ssymm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const int M, const int N, const float alpha, + const float *A, const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc); +void ATL_ssyrk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const float alpha, + const float *A, const int lda, const float 
beta, + float *C, const int ldc); +void ATL_ssyr2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const float alpha, + const float *A, const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc); +void ATL_strmm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag, + const int M, const int N, const float alpha, + const float *A, const int lda, float *B, const int ldb); +void ATL_strsm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag, + const int M, const int N, const float alpha, + const float *A, const int lda, float *B, const int ldb); + +int ATL_dGetNB(void); +int ATL_dGetNCNB(void); +void ATL_dgemm(const enum ATLAS_TRANS TransA, const enum ATLAS_TRANS TransB, + const int M, const int N, const int K, const double alpha, + const double *A, const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc); +void ATL_dsymm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const int M, const int N, const double alpha, + const double *A, const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc); +void ATL_dsyrk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const double alpha, + const double *A, const int lda, const double beta, + double *C, const int ldc); +void ATL_dsyr2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const double alpha, + const double *A, const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc); +void ATL_dtrmm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag, + const int M, const int N, const double alpha, + const double *A, const int lda, double *B, const int ldb); +void ATL_dtrsm(const enum 
ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag, + const int M, const int N, const double alpha, + const double *A, const int lda, double *B, const int ldb); + +int ATL_cGetNB(void); +int ATL_cGetNCNB(void); +void ATL_cgemm(const enum ATLAS_TRANS TransA, const enum ATLAS_TRANS TransB, + const int M, const int N, const int K, const float *alpha, + const float *A, const int lda, const float *B, const int ldb, + const float *beta, float *C, const int ldc); +void ATL_csymm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const int M, const int N, const float *alpha, + const float *A, const int lda, const float *B, const int ldb, + const float *beta, float *C, const int ldc); +void ATL_csyrk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const float *alpha, + const float *A, const int lda, const float *beta, + float *C, const int ldc); +void ATL_csyr2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const float *alpha, + const float *A, const int lda, const float *B, const int ldb, + const float *beta, float *C, const int ldc); +void ATL_ctrmm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag, + const int M, const int N, const float *alpha, + const float *A, const int lda, float *B, const int ldb); +void ATL_ctrsm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag, + const int M, const int N, const float *alpha, + const float *A, const int lda, float *B, const int ldb); + +int ATL_zGetNB(void); +int ATL_zGetNCNB(void); +void ATL_zgemm(const enum ATLAS_TRANS TransA, const enum ATLAS_TRANS TransB, + const int M, const int N, const int K, const double *alpha, + const double *A, const int lda, const double *B, const int ldb, + const double *beta, double *C, const int ldc); +void ATL_zsymm(const enum 
ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const int M, const int N, const double *alpha, + const double *A, const int lda, const double *B, const int ldb, + const double *beta, double *C, const int ldc); +void ATL_zsyrk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const double *alpha, + const double *A, const int lda, const double *beta, + double *C, const int ldc); +void ATL_zsyr2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const double *alpha, + const double *A, const int lda, const double *B, const int ldb, + const double *beta, double *C, const int ldc); +void ATL_ztrmm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag, + const int M, const int N, const double *alpha, + const double *A, const int lda, double *B, const int ldb); +void ATL_ztrsm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const enum ATLAS_TRANS TransA, const enum ATLAS_DIAG Diag, + const int M, const int N, const double *alpha, + const double *A, const int lda, double *B, const int ldb); + + +/* + * Routines with prefixes C and Z only + */ +void ATL_chemm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const int M, const int N, const float *alpha, + const float *A, const int lda, const float *B, const int ldb, + const float *beta, float *C, const int ldc); +void ATL_cherk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const float alpha, + const float *A, const int lda, const float beta, + float *C, const int ldc); +void ATL_cher2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const float *alpha, + const float *A, const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc); + +void ATL_zhemm(const enum ATLAS_SIDE Side, const enum ATLAS_UPLO Uplo, + const int M, const int N, const double *alpha, + const double *A, 
const int lda, const double *B, const int ldb, + const double *beta, double *C, const int ldc); +void ATL_zherk(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const double alpha, + const double *A, const int lda, const double beta, + double *C, const int ldc); +void ATL_zher2k(const enum ATLAS_UPLO Uplo, const enum ATLAS_TRANS Trans, + const int N, const int K, const double *alpha, + const double *A, const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc); + + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_lvl2.h b/kaldi_io/src/tools/ATLAS/include/atlas_lvl2.h new file mode 100644 index 0000000..b09a021 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_lvl2.h @@ -0,0 +1,294 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "atlas_level2.h" +#include "atlas_kernel2.h" +#ifndef ATLAS_LVL2_H +#define ATLAS_LVL2_H + +/* + * Real kernels + */ +void ATL_sger1_a1_x1_yX + (const int M, const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A, const int lda); +void ATL_sgemvS_a1_x1_bX_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sgemvT_a1_x1_bX_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sgemvN_a1_x1_bX_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sgemvS_a1_x1_b1_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sgemvT_a1_x1_b1_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sgemvN_a1_x1_b1_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void 
ATL_sgemvS_a1_x1_b0_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sgemvT_a1_x1_b0_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_sgemvN_a1_x1_b0_y1 + (const int M, const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, const float beta, + float *Y, const int incY); +void ATL_dger1_a1_x1_yX + (const int M, const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A, const int lda); +void ATL_dgemvS_a1_x1_bX_y1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgemvT_a1_x1_bX_y1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgemvN_a1_x1_bX_y1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgemvS_a1_x1_b1_y1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgemvT_a1_x1_b1_y1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgemvN_a1_x1_b1_y1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgemvS_a1_x1_b0_y1 + (const int M, const int N, const double alpha, const double *A, + const 
int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgemvT_a1_x1_b0_y1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); +void ATL_dgemvN_a1_x1_b0_y1 + (const int M, const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, const double beta, + double *Y, const int incY); + +/* + * Complex kernels + */ +void ATL_cger1u_a1_x1_yX + (const int M, const int N, const float *alpha, const float *X, + const int incX, const float *Y, const int incY, float *A, const int lda); +void ATL_cger1c_a1_x1_yX + (const int M, const int N, const float *alpha, const float *X, + const int incX, const float *Y, const int incY, float *A, const int lda); +void ATL_cgemvS_a1_x1_bXi0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvC_a1_x1_bXi0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvNc_a1_x1_bXi0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvT_a1_x1_bXi0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvN_a1_x1_bXi0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvS_a1_x1_bX_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int 
incY); +void ATL_cgemvC_a1_x1_bX_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvNc_a1_x1_bX_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvT_a1_x1_bX_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvN_a1_x1_bX_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvS_a1_x1_b1_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvC_a1_x1_b1_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvNc_a1_x1_b1_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvT_a1_x1_b1_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvN_a1_x1_b1_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvS_a1_x1_b0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvC_a1_x1_b0_y1 + (const int M, const int N, const float *alpha, 
const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvNc_a1_x1_b0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvT_a1_x1_b0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_cgemvN_a1_x1_b0_y1 + (const int M, const int N, const float *alpha, const float *A, + const int lda, const float *X, const int incX, const float *beta, + float *Y, const int incY); +void ATL_zger1u_a1_x1_yX + (const int M, const int N, const double *alpha, const double *X, + const int incX, const double *Y, const int incY, double *A, const int lda); +void ATL_zger1c_a1_x1_yX + (const int M, const int N, const double *alpha, const double *X, + const int incX, const double *Y, const int incY, double *A, const int lda); +void ATL_zgemvS_a1_x1_bXi0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvC_a1_x1_bXi0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvNc_a1_x1_bXi0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvT_a1_x1_bXi0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvN_a1_x1_bXi0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double 
*Y, const int incY); +void ATL_zgemvS_a1_x1_bX_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvC_a1_x1_bX_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvNc_a1_x1_bX_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvT_a1_x1_bX_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvN_a1_x1_bX_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvS_a1_x1_b1_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvC_a1_x1_b1_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvNc_a1_x1_b1_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvT_a1_x1_b1_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvN_a1_x1_b1_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void 
ATL_zgemvS_a1_x1_b0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvC_a1_x1_b0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvNc_a1_x1_b0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvT_a1_x1_b0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); +void ATL_zgemvN_a1_x1_b0_y1 + (const int M, const int N, const double *alpha, const double *A, + const int lda, const double *X, const int incX, const double *beta, + double *Y, const int incY); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_lvl3.h b/kaldi_io/src/tools/ATLAS/include/atlas_lvl3.h new file mode 100644 index 0000000..eab93c0 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_lvl3.h @@ -0,0 +1,512 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1997 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef ATLAS_LVL3_H +#define ATLAS_LVL3_H + +#include "atlas_misc.h" +#include "atlas_f77.h" +#include "atlas_level3.h" +#if defined(SREAL) + #include "smm.h" + #include "sXover.h" +#elif defined(DREAL) + #include "dmm.h" + #include "dXover.h" +#elif defined(QREAL) + #include "qmm.h" + #include "qXover.h" +#elif defined(SCPLX) + #ifdef ATL_NCMM + #include "atlas_cNCmm.h" + #else + #include "cmm.h" + #endif + #include "cXover.h" +#elif defined(DCPLX) + #ifdef ATL_NCMM + #include "atlas_zNCmm.h" + #else + #include "zmm.h" + #endif + #include "zmm.h" + #include "zXover.h" +#endif +#ifndef ATL_3NB + #define ATL_3NB 3*NB + + #define NN_MNK_M NBNB*NB + #define NN_MNK_N NBNB*NB + #define NN_MNK_K NBNB*NB + #define NN_MNK_MN NBNB*NB + #define NN_MNK_GE NBNB*NB + + #define NT_MNK_M NBNB*NB + #define NT_MNK_N NBNB*NB + #define NT_MNK_K NBNB*NB + #define NT_MNK_MN NBNB*NB + #define NT_MNK_GE NBNB*NB + + #define TN_MNK_M NBNB*NB + #define TN_MNK_N NBNB*NB + #define TN_MNK_K NBNB*NB + #define 
TN_MNK_MN NBNB*NB + #define TN_MNK_GE NBNB*NB + + #define TT_MNK_M NBNB*NB + #define TT_MNK_N NBNB*NB + #define TT_MNK_K NBNB*NB + #define TT_MNK_MN NBNB*NB + #define TT_MNK_GE NBNB*NB +#endif + +#ifndef CN_MNK_M + #define CN_MNK_M TN_MNK_M + #define CN_MNK_N TN_MNK_N + #define CN_MNK_K TN_MNK_K + #define CN_MNK_MN TN_MNK_MN + #define CN_MNK_GE TN_MNK_GE +#endif +#ifndef NC_MNK_M + #define NC_MNK_M NT_MNK_M + #define NC_MNK_N NT_MNK_N + #define NC_MNK_K NT_MNK_K + #define NC_MNK_MN NT_MNK_MN + #define NC_MNK_GE NT_MNK_GE +#endif +#ifndef CT_MNK_M + #define CT_MNK_M TT_MNK_M + #define CT_MNK_N TT_MNK_N + #define CT_MNK_K TT_MNK_K + #define CT_MNK_MN TT_MNK_MN + #define CT_MNK_GE TT_MNK_GE +#endif +#ifndef TC_MNK_M + #define TC_MNK_M TT_MNK_M + #define TC_MNK_N TT_MNK_N + #define TC_MNK_K TT_MNK_K + #define TC_MNK_MN TT_MNK_MN + #define TC_MNK_GE TT_MNK_GE +#endif +#ifndef CC_MNK_M + #define CC_MNK_M TT_MNK_M + #define CC_MNK_N TT_MNK_N + #define CC_MNK_K TT_MNK_K + #define CC_MNK_MN TT_MNK_MN + #define CC_MNK_GE TT_MNK_GE +#endif + +#define CPAT Mjoin(C_ATL_, PRE); + +#ifndef ATL_MaxMalloc + #define ATL_MaxMalloc 67108864 +#endif + +typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR); +typedef void (*MAT2BLK2)(const int, const int, const SCALAR, const TYPE*, + const int, TYPE*, const int); +typedef void (*MATSCAL)(const int, const int, const SCALAR, TYPE*, const int); +typedef void (*PUTBLK)(int, int, TYPE*, TYPE*, int, const SCALAR); +typedef void (*NBCLEANUP)(const TYPE*, const TYPE*, TYPE*, const int); +typedef int (*MMINTR)(const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const SCALAR, + const TYPE *, const int, const TYPE *, const int, + const SCALAR, TYPE *, const int); +typedef void (*NBMM0)(const int, const int, const int, const TYPE, + const TYPE*, const int, const TYPE*, const int, + const TYPE, TYPE*, const int); + +void ATL_xerbla(int p, char *rout, char *form, ...); +int Mjoin(PATL,GetNB)(void); 
+int Mjoin(PATL,GetNCNB)(void); + +void Mjoin(PATL, gescal_bX)(const int, const int, const SCALAR, TYPE*, + const int); +void Mjoin(PATL, gescal_bn1)(const int, const int, const SCALAR, TYPE*, + const int); +void Mjoin(PATL, gescal_b0)(const int, const int, const SCALAR, TYPE*, + const int); + +void Mjoin(PATL,pKBmm_bX)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,pNBmm_bX)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,pMBmm_bX)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,pKBmm_b1)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,pNBmm_b1)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,pMBmm_b1)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,pKBmm_b0)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,pNBmm_b0)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,pMBmm_b0)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + 
TYPE *C, const int ldc); +void Mjoin(PATL,pKBmm)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); + +void Mjoin(PATL,MBJBmm)(const int N, const int K, const TYPE *A, const TYPE *B, + const TYPE beta, TYPE *C, const int ldc); +void Mjoin(PATL,IBJBmm)(int IB, int JB, int K, const TYPE *A, const TYPE *B, + const TYPE beta, TYPE *C, const int ldc); +void Mjoin(PATL,IBNBmm)(const int M, const int K, const TYPE *A, const TYPE *B, + const TYPE beta, TYPE *C, const int ldc); +#ifdef TCPLX + +void Mjoin(PATL,CNBmm_b0)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,CNBmm_b1)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL,CNBmm_bX)(const int M, const int N, const int K, + const TYPE alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const TYPE beta, + TYPE *C, const int ldc); +void Mjoin(PATL, gescal_bXi0)(const int, const int, const SCALAR, TYPE*, + const int); + +void Mjoin(PATL,row2blkT_aXi0) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkT2_aXi0) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blk_aXi0) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blk2_aXi0) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); + +void Mjoin(PATL,row2blkC_aX) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkC2_aX) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blkConj_aX) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void 
Mjoin(PATL,col2blkConj2_aX) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkC_a1) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkC2_a1) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blkConj_a1) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blkConj2_a1) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkC_aXi0) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkC2_aXi0) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blkConj_aXi0) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blkConj2_aXi0) + (const int, const int, const TYPE*, const int, TYPE*, const SCALAR); + +void Mjoin(PATL,mmJIK2) + (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb, + const SCALAR alpha, const TYPE *pA0, const TYPE *B, int ldb, TYPE *pB0, + int incB, MAT2BLK B2blk, const SCALAR beta, TYPE *C, int ldc, + MATSCAL gescal, NBMM0 NBmm0); + +void Mjoin(PATL,mmIJK2) + (int K, int nMb, int nNb, int nKb, int ib, int jb, int kb, + const SCALAR alpha, const TYPE *A, const int lda, TYPE *pA0, const int incA, + MAT2BLK A2blk, TYPE *pB0, const SCALAR beta, TYPE *C, int ldc, + MATSCAL gescal, NBMM0 NBmm0); + +#else /* real */ + +void Mjoin(PATL,putblk_bX)(int M, int N, TYPE *V, TYPE *C, int ldc, const SCALAR beta); +void Mjoin(PATL,putblk_bn1)(int M, int N, TYPE *V, TYPE *C, int ldc, const SCALAR beta); +void Mjoin(PATL,putblk_b1)(int M, int N, TYPE *V, TYPE *C, int ldc, const SCALAR beta); +void Mjoin(PATL,putblk_b0)(int M, int N, TYPE *V, TYPE *C, int ldc, const SCALAR beta); +void ATL_gereal2cplx(const int M, const int N, TYPE *alpha, TYPE *R, int ldr, + TYPE *I, int ldi, TYPE *beta, TYPE *C, int ldc); + +void NBmm_b1(const int M, const 
int N, const int K, const TYPE alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const TYPE beta, TYPE *C, const int ldc); +void NBmm_b0(const int M, const int N, const int K, const TYPE alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const TYPE beta, TYPE *C, const int ldc); +void NBmm_bX(const int M, const int N, const int K, const TYPE alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const TYPE beta, TYPE *C, const int ldc); +void Mjoin(PATL,mmJIK2)(int K, int nMb, int nNb, int nKb, int ib, int jb, + int kb, const SCALAR alpha, const TYPE *pA0, + const TYPE *B, int ldb, TYPE *pB0, int incB, + MAT2BLK B2blk, const SCALAR beta, TYPE *C, int ldc, + TYPE *pC, PUTBLK putblk, NBMM0 NBmm0); + +void Mjoin(PATL,mmIJK2)(int K, int nMb, int nNb, int nKb, int ib, int jb, + int kb, const SCALAR alpha, const TYPE *A, int lda, + TYPE *pA0, int incA, MAT2BLK A2blk, const TYPE *pB0, + const SCALAR beta, TYPE *C, int ldc, TYPE *pC, + PUTBLK putblk, NBMM0 NBmm0); + + +void Mjoin(PATL,aliased_gemm) + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,f77gemm) + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,gemm) + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,small_mm) + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + 
const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,big_mm) + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +#endif + +#ifdef USERGEMM +int Mjoin(PATU,usergemm)(const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const SCALAR, + const TYPE*, const int, const TYPE*, + const int, const SCALAR, TYPE*, const int); +#endif +int Mjoin(PATL,NCmmJIK)(const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const SCALAR, + const TYPE*, const int, const TYPE*, + const int, const SCALAR, TYPE*, const int); +int Mjoin(PATL,NCmmIJK)(const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const SCALAR, + const TYPE*, const int, const TYPE*, + const int, const SCALAR, TYPE*, const int); +int Mjoin(PATL,NCmmJIK_c)(const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const SCALAR, + const TYPE*, const int, const TYPE*, + const int, const SCALAR, TYPE*, const int); +int Mjoin(PATL,NCmmIJK_c)(const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const SCALAR, + const TYPE*, const int, const TYPE*, + const int, const SCALAR, TYPE*, const int); + +void Mjoin(PATL,row2blkT2_aX)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkT_aX)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blk2_aX)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blk_aX)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkT2_an1)(int, int, const TYPE*, int, TYPE*, + const SCALAR); +void Mjoin(PATL,row2blkT_an1)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blk2_an1)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blk_an1)(int, int, const TYPE*, int, 
TYPE*, const SCALAR); +void Mjoin(PATL,row2blkT2_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,row2blkT_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blk2_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR); +void Mjoin(PATL,col2blk_a1)(int, int, const TYPE*, int, TYPE*, const SCALAR); + +int Mjoin(PATL,mmJITcp)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, + const SCALAR alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const SCALAR beta, + TYPE *C, const int ldc); +int Mjoin(PATL,mmJIK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, + const SCALAR alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const SCALAR beta, + TYPE *C, const int ldc); +int Mjoin(PATL,mmIJK)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, + const SCALAR alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const SCALAR beta, + TYPE *C, const int ldc); +int Mjoin(PATL,mmJKI)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, + const SCALAR alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const SCALAR beta, + TYPE *C, const int ldc); + +void Mjoin(PATL,mmK) + (int M, int m, int N, int n, int nblk, int kr, int KR, const SCALAR alphaA, + const SCALAR alphaB, const SCALAR beta, const TYPE *A, const int lda, + const int incA, TYPE *pA, const int incAW, const TYPE *B, const int ldb, + const int incB, TYPE *pB, const int incBW, TYPE *C, const int ldc, + MAT2BLK2 A2blk, MAT2BLK2 B2blk, NBMM0 NBmm0, NBMM0 NBmm1); + +int Mjoin(PATL,mmBPP)(const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, + const SCALAR alpha, const TYPE *A, const int lda, + const TYPE *B, const int ldb, const SCALAR beta, + TYPE *C, const int ldc); + + +void Mjoin(PATL,gemmTT) + 
(const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,aliased_gemmTT) + (const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,gemmTN) + (const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,aliased_gemmTN) + (const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,gemmNT) + (const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,aliased_gemmNT) + (const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,gemmNN) + (const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void Mjoin(PATL,aliased_gemmNN) + (const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); + + +void NCmmNNIJK_c + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmNTIJK_c + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, 
const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmTNIJK_c + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmTTIJK_c + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmNNIJK + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmNTIJK + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmTNIJK + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmTTIJK + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmNNJIK_c + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmNTJIK_c + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int 
ldc); +void NCmmTNJIK_c + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmTTJIK_c + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmNNJIK + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmNTJIK + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmTNJIK + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); +void NCmmTTJIK + (const enum ATLAS_TRANS TA, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, TYPE *C, const int ldc); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_misc.h b/kaldi_io/src/tools/ATLAS/include/atlas_misc.h new file mode 100644 index 0000000..88f754d --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_misc.h @@ -0,0 +1,416 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1997 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include "atlas_enum.h" + +#ifndef ATLAS_MISC_H +#define ATLAS_MISC_H +#include "atlas_type.h" +#ifdef ATL_PROFILE + extern int ATL_ProfGemmCameFrom; +#endif +/* + * Some useful macro functions + */ +#if (defined(PentiumCPS) || defined(ATL_USEPTHREADS)) && !defined(WALL) + #define WALL +#endif +#ifndef time00 + #if defined(WALL) + #define time00 ATL_walltime + #else + #define time00 ATL_cputime + #endif +#endif +#define Mabs(x) ( (x) >= 0 ? (x) : -(x) ) +#define Mmax(x, y) ( (x) > (y) ? (x) : (y) ) +#define Mmin(x, y) ( (x) > (y) ? (y) : (x) ) +#define Mlowcase(C) ( ((C) > 64 && (C) < 91) ? 
(C) | 32 : (C) ) +#define Mupcase(C) ( ((C) > 96 && (C) < 123) ? (C) & 0xDF : (C) ) +/* + * packed indexing functions (upper & lower) + */ + +#define Mjoin(pre, nam) my_join(pre, nam) +#define my_join(pre, nam) pre ## nam +#define Mstr2(m) # m +#define Mstr(m) Mstr2(m) + +#define ATL_assert(n_) \ +{ \ + if (!(n_)) \ + { \ + ATL_xerbla(0, __FILE__, "assertion %s failed, line %d of file %s\n", \ + Mstr(n_), __LINE__, __FILE__); \ + } \ +} + +/* + * Define some C99 features that we use when we know the compiler supports them + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__/100 >= 1999) + #define INLINE inline + #define RESTRICT restrict +#else + #define INLINE + #define RESTRICT +#endif + +#if defined(SREAL) + #define EPS 5.0e-7 + #define TYPE float + #define PRE s + #define UPR s + #define PREU S + #define PATL ATL_s + #define PATU ATLU_s + #define UATL ATLU_s + #define CBLA cblas_s + #define PATLU ATL_s + #define ATL_rone 1.0f + #define ATL_rnone -1.0f + #define ATL_rzero 0.0f + #define ATL_typify(m_) Mjoin(m_,f) + #include "atlas_ssysinfo.h" +#elif defined(DREAL) + #define EPS 1.0e-15 + #define TYPE double + #define PRE d + #define UPR d + #define PREU D + #define PATL ATL_d + #define PATU ATLU_d + #define UATL ATLU_d + #define CBLA cblas_d + #define PATLU ATL_d + #define ATL_rone 1.0 + #define ATL_rnone -1.0 + #define ATL_rzero 0.0 + #define ATL_typify(m_) m_ + #include "atlas_dsysinfo.h" +#elif defined (QREAL) + #define EPS 1.9259299443872358530559779425849273E-34L + #define TYPE long double + #define PRE q + #define UPR q + #define PREU Q + #define PATL ATL_q + #define PATU ATLU_q + #define CBLA cblas_q +#elif defined(SCPLX) + #define EPS 5.0e-7 + #define TYPE float + #define PRE c + #define UPR s + #define PREU C + #define PATL ATL_c + #define PATLU ATL_s + #define PATU ATLU_c + #define UATL ATLU_s + #define ATL_rone 1.0f + #define ATL_rnone -1.0f + #define ATL_rzero 0.0f + #define ATL_typify(m_) Mjoin(m_,f) + #define CBLA cblas_c + #include 
"atlas_csysinfo.h" +#elif defined(DCPLX) + #define TYPE double + #define PRE z + #define UPR d + #define PREU Z + #define PATL ATL_z + #define PATLU ATL_d + #define PATU ATLU_z + #define UATL ATLU_d + #define EPS 1.0e-15 + #define ATL_rone 1.0 + #define ATL_rnone -1.0 + #define ATL_rzero 0.0 + #define ATL_typify(m_) m_ + #define CBLA cblas_z + #include "atlas_zsysinfo.h" +#endif + +#if defined (SREAL) || defined (DREAL) || defined (SCPLX) || defined (DCPLX) + #define ATL_sizeof Mjoin(PATL,size) + #define ATL_MulBySize Mjoin(PATL,MulBySize) + #define ATL_DivBySize Mjoin(PATL,DivBySize) +#endif + +#if ( defined(SREAL) || defined(DREAL) || defined(QREAL) ) + #define TREAL + #define SHIFT + #define SCALAR TYPE + #define SADD & + #define SVAL + #define SVVAL * + #define SCALAR_IS_ONE(M_scalar) ((M_scalar) == ATL_rone) + #define SCALAR_IS_NONE(M_scalar) ((M_scalar) == ATL_rnone) + #define SCALAR_IS_ZERO(M_scalar) ((M_scalar) == ATL_rzero) +#elif defined(SCPLX) || defined(DCPLX) + #define TCPLX +/* + * c = b*c + v; + */ + #define CMULT2(v, a, b, tmp) \ + { \ + tmp = *(a) * *(b) - *(a+1) * *(b+1); \ + *(b+1) = *(a) * *(b+1) + *(a+1) * *(b) + *(v+1); \ + *(b) = tmp + *v; \ + } + #define SHIFT << 1 + #define SCALAR TYPE * + #define SADD + #define SVAL * + #define SVVAL + #define SCALAR_IS_ONE(M_scalar) \ + ( (*(M_scalar) == ATL_rone) && ((M_scalar)[1] == ATL_rzero) ) + #define SCALAR_IS_NONE(M_scalar) \ + ( (*(M_scalar) == ATL_rnone) && ((M_scalar)[1] == ATL_rzero) ) + #define SCALAR_IS_ZERO(M_scalar) \ + ( (*(M_scalar) == ATL_rzero) && ((M_scalar)[1] == ATL_rzero) ) +#endif + +#if defined(ALPHA1) + #define ATL_MulByALPHA(x_) (x_) + #define NM _a1 +#elif defined (ALPHA0) + #define ATL_MulByALPHA(x_) ATL_rzero + #define NM _a0 +#elif defined (ALPHAN1) + #define ATL_MulByALPHA(x_) (-(x_)) + #define NM _an1 +#elif defined (ALPHAXI0) + #define ATL_MulByALPHA(x_) (ralpha*(x_)) + #define NM _aXi0 +#elif defined (ALPHA1C) + #define NM _a1c +#elif defined (ALPHAN1C) + #define NM 
_an1c +#elif defined (ALPHAXI0C) + #define NM _aXi0c +#elif defined (ALPHAXC) + #define NM _aXc +#elif defined (ALPHAX) + #define ATL_MulByALPHA(x_) (alpha*(x_)) + #define NM _aX +#endif + +#if defined(BETA1) + #define ATL_MulByBETA(x_) (x_) + #define MSTAT A[i] += v[i] + #define BNM _b1 +#elif defined(BETA1C) + #define BNM _b1c +#elif defined(BETAN1) + #define ATL_MulByBETA(x_) (-(x_)) + #define MSTAT A[i] = v[i] - A[i] + #define BNM _bn1 +#elif defined(BETAN1C) + #define BNM _bn1c +#elif defined(BETA0) + #define ATL_MulByBETA(x_) ATL_rzero + #define MSTAT A[i] = v[i] + #define BNM _b0 +#elif defined (BETAXI0) + #define BNM _bXi0 + #define ATL_MulByBETA(x_) (rbeta*(x_)) +#elif defined (BETAXI0C) + #define BNM _bXi0c +#elif defined (BETAX) + #define ATL_MulByBETA(x_) (beta*(x_)) + #define MSTAT A[i] = beta*A[i] + v[i] + #define BNM _bX +#elif defined (BETAXC) + #define BNM _bXc +#endif + +/* any alignment below this forces data copy in gemm */ +#ifndef ATL_MinMMAlign + #define ATL_MinMMAlign 16 +#endif +#if (ATL_MinMMAlign == 1 || ATL_MinMMAlign == 0) + #define ATL_DataIsMinAligned(ptr) 1 +#elif (ATL_MinMMAlign == 2) + #define ATL_DataIsMinAligned(ptr) \ + ( (((size_t) (ptr))>>1)<<1 == (size_t) (ptr) ) +#elif (ATL_MinMMAlign == 4) + #define ATL_DataIsMinAligned(ptr) \ + ( (((size_t) (ptr))>>2)<<2 == (size_t) (ptr) ) +#elif (ATL_MinMMAlign == 8) + #define ATL_DataIsMinAligned(ptr) \ + ( (((size_t) (ptr))>>3)<<3 == (size_t) (ptr) ) +#elif (ATL_MinMMAlign == 16) + #define ATL_DataIsMinAligned(ptr) \ + ( (((size_t) (ptr))>>4)<<4 == (size_t) (ptr) ) +#elif (ATL_MinMMAlign == 32) + #define ATL_DataIsMinAligned(ptr) \ + ( (((size_t) (ptr))>>5)<<5 == (size_t) (ptr) ) +#elif (ATL_MinMMAlign == 64) + #define ATL_DataIsMinAligned(ptr) \ + ( (((size_t) (ptr))>>6)<<6 == (size_t) (ptr) ) +#elif (ATL_MinMMAlign == 128) + #define ATL_DataIsMinAligned(ptr) \ + ( (((size_t) (ptr))>>7)<<7 == (size_t) (ptr) ) +#else + #define ATL_DataIsMinAligned(ptr) \ + ( (((size_t) 
(ptr))/ATL_MinMMAlign)*ATL_MinMMAlign == (size_t) (ptr) ) +#endif + +#define ATL_Cachelen 32 +#if (ATL_Cachelen == 4) + #define ATL_MulByCachelen(N_) ( (N_) << 2 ) + #define ATL_DivByCachelen(N_) ( (N_) >> 2 ) +#elif (ATL_Cachelen == 8) + #define ATL_MulByCachelen(N_) ( (N_) << 3 ) + #define ATL_DivByCachelen(N_) ( (N_) >> 3 ) +#elif (ATL_Cachelen == 16) + #define ATL_MulByCachelen(N_) ( (N_) << 4 ) + #define ATL_DivByCachelen(N_) ( (N_) >> 4 ) +#elif (ATL_Cachelen == 32) + #define ATL_MulByCachelen(N_) ( (N_) << 5 ) + #define ATL_DivByCachelen(N_) ( (N_) >> 5 ) +#elif (ATL_Cachelen == 64) + #define ATL_MulByCachelen(N_) ( (N_) << 6 ) + #define ATL_DivByCachelen(N_) ( (N_) >> 6 ) +#elif (ATL_Cachelen == 128) + #define ATL_MulByCachelen(N_) ( (N_) << 7 ) + #define ATL_DivByCachelen(N_) ( (N_) >> 7 ) +#elif (ATL_Cachelen == 256) + #define ATL_MulByCachelen(N_) ( (N_) << 8 ) + #define ATL_DivByCachelen(N_) ( (N_) >> 8 ) +#else + #define ATL_MulByCachelen(N_) ( (N_) * ATL_Cachelen ) + #define ATL_DivByCachelen(N_) ( (N_) / ATL_Cachelen ) +#endif + +#if (ATL_Cachelen < ATL_MinMMAlign) + Force a compilation error if our required alignment is not at least the + minimum!!@^ +#endif + +#define ATL_AlignPtr(vp) \ + (void*) (ATL_Cachelen + ATL_MulByCachelen(ATL_DivByCachelen((size_t) (vp)))) + +#define ATL_FindPtrAdjust(vp, iadj_) \ +{ \ + (iadj_) = ((size_t)(vp))-ATL_MulByCachelen(ATL_DivByCachelen((size_t)(vp)));\ + if (iadj_) \ + { \ + if ( (iadj_) == ATL_MulBySize(ATL_DivBySize(iadj_)) ) \ + (iadj_) = ATL_DivBySize(iadj_); \ + else (iadj_) = 0; \ + }\ +} +#define ATL_FindMatAdjust(vp_, lda_, iadj_) \ +{ \ + if (ATL_MulByCachelen(ATL_DivByCachelen(ATL_MulBySize(lda_))) \ + == ATL_MulBySize(lda_)) \ + { \ + ATL_FindPtrAdjust(vp_, iadj_); \ + } \ + else (iadj_) = 0; \ +} + +#define ATL_sqrtLL(x, res) \ + asm ("fsqrt" : "=t" (res) : "0" (x)); + +/* + * Find N necessary for alignment.
Written as function for optimization, + * declared static to encourage inlining + */ +static int ATL_AlignOffset +(const int N, /* max return value */ + const void *vp, /* pointer to be aligned */ + const int inc, /* size of each elt, in bytes */ + const int align) /* required alignment, in bytes */ +{ + const int p = align/inc; + const size_t k=(size_t)vp, j=k/inc; + int iret; + if (k == (j)*inc && p*inc == align) + { + iret = ((j+p-1) / p)*p - j; + if (iret <= N) return(iret); + } + return(N); +} + +/* + * Gcc links in crap that MSVC++ and DVF can't handle if you use stdout + * or stderr, so use this beautiful kludge to avoid this problem -- RCW + */ +#ifdef GCCWIN + +#include <stdarg.h> +static int WINFPRINTF(FILE *fpout, char *form, ...) +{ + int ierr=0; + va_list argptr; + + va_start(argptr, form); + if (fpout == NULL) ierr = vprintf(form, argptr); + else ierr = vfprintf(fpout, form, argptr); + va_end(argptr); + + return(ierr); +} + +#ifdef stdout + #undef stdout +#endif +#ifdef stderr + #undef stderr +#endif +#ifdef assert + #undef assert +#endif + +#define stdout NULL +#define stderr NULL +#define fprintf WINFPRINTF +#define assert WINASSERT +#define WINASSERT(n_) \ +{ \ + if (!(n_)) \ + { \ + printf("assertion %s failed, line %d of file %s\n", \ + Mstr(n_), __LINE__, __FILE__); \ + exit(1); \ + } \ +} + +#endif + +#include "atlas_aux.h" + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_mv.h b/kaldi_io/src/tools/ATLAS/include/atlas_mv.h new file mode 100644 index 0000000..f26da5f --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_mv.h @@ -0,0 +1,45 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef ATLAS_MV_H + #define ATLAS_MV_H + +#include "atlas_misc.h" +#if defined(SREAL) + #include "atlas_smv.h" +#elif defined(DREAL) + #include "atlas_dmv.h" +#elif defined(SCPLX) + #include "atlas_cmv.h" +#elif defined(DCPLX) + #include "atlas_zmv.h" +#endif + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_pkblas.h b/kaldi_io/src/tools/ATLAS/include/atlas_pkblas.h new file mode 100644 index 0000000..b9c7d82 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_pkblas.h @@ -0,0 +1,569 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 2003 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#ifndef ATLAS_PKBLAS_H +#define ATLAS_PKBLAS_H + +#include "atlas_misc.h" +#ifndef ATL_NOL3 +#include "atlas_lvl3.h" +#endif + +#define CBLAS_ENUM_ONLY +#include "cblas.h" +#undef CBLAS_ENUM_ONLY + +enum PACK_UPLO {PackUpper=121, PackLower=122, PackGen=123}; + +#define PACK_ORDER CBLAS_ORDER + #define PackRowMajor CblasRowMajor + #define PackColMajor CblasColMajor +#define PACK_TRANS CBLAS_TRANSPOSE + #define PackNoTrans CblasNoTrans + #define PackTrans CblasTrans + #define PackConjTrans CblasConjTrans + #define PackConj AtlasConj +#define PACK_DIAG CBLAS_DIAG + #define PackNonUnit CblasNonUnit + #define PackUnit CblasUnit +#define PACK_SIDE CBLAS_SIDE + #define PackLeft CblasLeft + #define PackRight CblasRight + +#ifndef ATL_pkMaxMalloc + #define ATL_pkMaxMalloc ATL_MaxMalloc +#endif + +#ifdef TCPLX + #define MindexPL(I_,J_,lda_) ( (((J_)*((lda_)+(lda_)-(J_)-1))) + (I_)+(I_) ) + #define MindexPU(I_,J_,lda_) ( ((((lda_)+(lda_)+(J_)-1)*(J_))) + (I_)+(I_) ) +#else + #define MindexPL(I_,J_,lda_) ( (((J_)*((lda_)+(lda_)-(J_)-1))>>1) + (I_) ) + #define MindexPU(I_,J_,lda_) ( ((((lda_)+(lda_)+(J_)-1)*(J_))>>1) + (I_) ) +#endif +#define MindexP(uplo_,I_,J_,lda_) \ + ( (uplo_) == PackUpper ? MindexPU(I_,J_,lda_) : \ + ( (uplo_) == PackLower ? MindexPL(I_,J_,lda_) : \ + (((J_)*(lda_)+(I_))SHIFT) ) ) +/* + * Mpld evaluates to (lda_)+(J_) for PackUpper, (lda_)-(J_) for PackLower, + * and (lda_) otherwise; the whole expansion is parenthesized so the macro + * can be used safely inside a larger expression + */ +#define Mpld(uplo_,J_,lda_) ( (uplo_) == PackUpper ? (lda_)+(J_) : \ + ( (uplo_) == PackLower ?
(lda_)-(J_) : (lda_) ) ) + + +void ATL_sgpmm(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum PACK_UPLO UB, const enum PACK_TRANS TB, + const enum PACK_UPLO UC, + const int M, const int N, const int K, const float alpha, + const float *A, const int IA, const int JA, const int lda, + const float *B, const int IB, const int JB, const int ldb, + const float beta, float *C, const int IC, const int JC, + const int ldc); +void ATL_sprankK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, int R, + const SCALAR alpha, const TYPE *A, int lda, + const TYPE *B, int ldb, const SCALAR beta, + const enum PACK_UPLO UC, TYPE *C, int ldc); +int ATL_spmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, const enum PACK_UPLO UC, + TYPE *C, const int ldc); +int ATL_spmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const float alpha, + const float *A, const int lda, const float *B, const int ldb, + const float beta, const enum PACK_UPLO UC, + float *C, const int ldc); +void ATL_spcol2blkF(const int M, const int N, const float alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_sprow2blkTF(const int M, const int N, const float alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_spcol2blk_a1(const int M, const int N, const float alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_spcol2blk_aX(const int M, const int N, const float alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_sprow2blkT_a1(const int M, const int N, const float alpha, + const float *A, int lda, const int ldainc, float *V);
+void ATL_sprow2blkT_aX(const int M, const int N, const float alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_spputblk(const int M, const int N, const TYPE *V, TYPE *C, + int ldc, int ldcinc, const SCALAR beta); +void ATL_spputblk_diag + (const int M, const int N, const float *V, const enum ATLAS_UPLO UC, + float *C, int ldc, int ldcinc, const float alpha, const float beta); +void ATL_spputblk_aX + (const int M, const int N, const float *V, float *C, int ldc, int ldcinc, + const float alpha, const float beta); +void ATL_ssprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, const float alpha, + const float *A, const int IA, const int JA, const int lda, + const float beta, + float *C, const int IC, const int JC, const int ldc); +void ATL_shprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, const float alpha, + const float *A, const int IA, const int JA, const int lda, + const float beta, + float *C, const int IC, const int JC, const int ldc); +void ATL_shprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, int R, const float alpha, + const float *A, int lda, const float beta, + float *C, const int ldc); +int ATL_sphk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA, + const enum ATLAS_TRANS TA, const int N, const int K, + const float alpha, const float *A, const int lda, + const float beta, const int CP, float *C, const int ldc); +void ATL_ssprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, int R, const float alpha, + const float *A, int lda, const float beta, + float *C, const int ldc); +int ATL_sprk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA, + const enum ATLAS_TRANS TA, const int N, const int K, + const float alpha, const float 
*A, const int lda, + const float beta, const int CP, float *C, const int ldc); + +void ATL_dgpmm(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum PACK_UPLO UB, const enum PACK_TRANS TB, + const enum PACK_UPLO UC, + const int M, const int N, const int K, const double alpha, + const double *A, const int IA, const int JA, const int lda, + const double *B, const int IB, const int JB, const int ldb, + const double beta, double *C, const int IC, const int JC, + const int ldc); +void ATL_dprankK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, int R, + const SCALAR alpha, const TYPE *A, int lda, + const TYPE *B, int ldb, const SCALAR beta, + const enum PACK_UPLO UC, TYPE *C, int ldc); +int ATL_dpmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, const enum PACK_UPLO UC, + TYPE *C, const int ldc); +int ATL_dpmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const double alpha, + const double *A, const int lda, const double *B, const int ldb, + const double beta, const enum PACK_UPLO UC, + double *C, const int ldc); +void ATL_dpcol2blkF(const int M, const int N, const double alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_dprow2blkTF(const int M, const int N, const double alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_dpcol2blk_a1(const int M, const int N, const double alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_dpcol2blk_aX(const int M, const int N, const double alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_dprow2blkT_a1(const int M, const int N, 
const double alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_dprow2blkT_aX(const int M, const int N, const double alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_dpputblk(const int M, const int N, const TYPE *V, TYPE *C, + int ldc, int ldcinc, const SCALAR beta); +void ATL_dpputblk_diag + (const int M, const int N, const double *V, const enum ATLAS_UPLO UC, + double *C, int ldc, int ldcinc, const double alpha, const double beta); +void ATL_dpputblk_aX + (const int M, const int N, const double *V, double *C, int ldc, int ldcinc, + const double alpha, const double beta); +void ATL_dsprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, const double alpha, + const double *A, const int IA, const int JA, const int lda, + const double beta, + double *C, const int IC, const int JC, const int ldc); +void ATL_dhprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, const double alpha, + const double *A, const int IA, const int JA, const int lda, + const double beta, + double *C, const int IC, const int JC, const int ldc); +void ATL_dhprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, int R, const double alpha, + const double *A, int lda, const double beta, + double *C, const int ldc); +int ATL_dphk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA, + const enum ATLAS_TRANS TA, const int N, const int K, + const double alpha, const double *A, const int lda, + const double beta, const int CP, double *C, const int ldc); +void ATL_dsprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, int R, const double alpha, + const double *A, int lda, const double beta, + double *C, const int ldc); +int ATL_dprk_kmm(const enum ATLAS_UPLO UC, const 
enum PACK_UPLO UA, + const enum ATLAS_TRANS TA, const int N, const int K, + const double alpha, const double *A, const int lda, + const double beta, const int CP, double *C, const int ldc); + +void ATL_cgpmm(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum PACK_UPLO UB, const enum PACK_TRANS TB, + const enum PACK_UPLO UC, + const int M, const int N, const int K, const float* alpha, + const float *A, const int IA, const int JA, const int lda, + const float *B, const int IB, const int JB, const int ldb, + const float* beta, float *C, const int IC, const int JC, + const int ldc); +void ATL_cprankK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, int R, + const SCALAR alpha, const TYPE *A, int lda, + const TYPE *B, int ldb, const SCALAR beta, + const enum PACK_UPLO UC, TYPE *C, int ldc); +int ATL_cpmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, const enum PACK_UPLO UC, + TYPE *C, const int ldc); +int ATL_cpmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const float* alpha, + const float *A, const int lda, const float *B, const int ldb, + const float* beta, const enum PACK_UPLO UC, + float *C, const int ldc); +void ATL_cpcol2blkF(const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkTF(const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blk_a1(const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blk_aX(const int M, const int N, const float* alpha, + const 
float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkT_a1(const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkT_aX(const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpputblk(const int M, const int N, const TYPE *V, TYPE *C, + int ldc, int ldcinc, const SCALAR beta); +void ATL_cpputblk_diag + (const int M, const int N, const float *V, const enum ATLAS_UPLO UC, + float *C, int ldc, int ldcinc, const float* alpha, const float* beta); +void ATL_cpputblk_aX + (const int M, const int N, const float *V, float *C, int ldc, int ldcinc, + const float* alpha, const float* beta); +void ATL_csprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, const float* alpha, + const float *A, const int IA, const int JA, const int lda, + const float* beta, + float *C, const int IC, const int JC, const int ldc); +void ATL_chprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, const float alpha, + const float *A, const int IA, const int JA, const int lda, + const float beta, + float *C, const int IC, const int JC, const int ldc); +void ATL_chprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, int R, const float* alpha, + const float *A, int lda, const float* beta, + float *C, const int ldc); +int ATL_cphk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA, + const enum ATLAS_TRANS TA, const int N, const int K, + const float* alpha, const float *A, const int lda, + const float* beta, const int CP, float *C, const int ldc); +void ATL_csprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, int R, const float* alpha, + const float *A, int lda, const float* beta, 
+ float *C, const int ldc); +int ATL_cprk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA, + const enum ATLAS_TRANS TA, const int N, const int K, + const float* alpha, const float *A, const int lda, + const float* beta, const int CP, float *C, const int ldc); + +void ATL_zgpmm(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum PACK_UPLO UB, const enum PACK_TRANS TB, + const enum PACK_UPLO UC, + const int M, const int N, const int K, const double* alpha, + const double *A, const int IA, const int JA, const int lda, + const double *B, const int IB, const int JB, const int ldb, + const double* beta, double *C, const int IC, const int JC, + const int ldc); +void ATL_zprankK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, int R, + const SCALAR alpha, const TYPE *A, int lda, + const TYPE *B, int ldb, const SCALAR beta, + const enum PACK_UPLO UC, TYPE *C, int ldc); +int ATL_zpmmJIKF(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const SCALAR alpha, + const TYPE *A, const int lda, const TYPE *B, const int ldb, + const SCALAR beta, const enum PACK_UPLO UC, + TYPE *C, const int ldc); +int ATL_zpmmJIK(const enum PACK_UPLO UA, const enum ATLAS_TRANS TA, + const enum PACK_UPLO UB, const enum ATLAS_TRANS TB, + const int M, const int N, const int K, const double* alpha, + const double *A, const int lda, const double *B, const int ldb, + const double* beta, const enum PACK_UPLO UC, + double *C, const int ldc); +void ATL_zpcol2blkF(const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkTF(const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blk_a1(const int M, const int N, const double* alpha, + const double *A, int lda, const int 
ldainc, double *V); +void ATL_zpcol2blk_aX(const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkT_a1(const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkT_aX(const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpputblk(const int M, const int N, const TYPE *V, TYPE *C, + int ldc, int ldcinc, const SCALAR beta); +void ATL_zpputblk_diag + (const int M, const int N, const double *V, const enum ATLAS_UPLO UC, + double *C, int ldc, int ldcinc, const double* alpha, const double* beta); +void ATL_zpputblk_aX + (const int M, const int N, const double *V, double *C, int ldc, int ldcinc, + const double* alpha, const double* beta); +void ATL_zsprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, const double* alpha, + const double *A, const int IA, const int JA, const int lda, + const double* beta, + double *C, const int IC, const int JC, const int ldc); +void ATL_zhprk(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, const double alpha, + const double *A, const int IA, const int JA, const int lda, + const double beta, + double *C, const int IC, const int JC, const int ldc); +void ATL_zhprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum ATLAS_UPLO UC, const int CP, + const int N, const int K, int R, const double* alpha, + const double *A, int lda, const double* beta, + double *C, const int ldc); +int ATL_zphk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA, + const enum ATLAS_TRANS TA, const int N, const int K, + const double* alpha, const double *A, const int lda, + const double* beta, const int CP, double *C, const int ldc); +void ATL_zsprk_rK(const enum PACK_UPLO UA, const enum PACK_TRANS TA, + const enum 
ATLAS_UPLO UC, const int CP, + const int N, const int K, int R, const double* alpha, + const double *A, int lda, const double* beta, + double *C, const int ldc); +int ATL_zprk_kmm(const enum ATLAS_UPLO UC, const enum PACK_UPLO UA, + const enum ATLAS_TRANS TA, const int N, const int K, + const double* alpha, const double *A, const int lda, + const double* beta, const int CP, double *C, const int ldc); + +void ATL_cpcol2blk_aX_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkT_aX_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blk_a1_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkT_a1_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blkConjF + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blkConj_a1 + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blkConj_aX + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blk_aXi0 + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blkConj_aXi0 + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc,float*V); +void ATL_cprow2blkHF + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkH_a1 + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkH_aX + (const int M, const int N, const float* alpha, + const float *A, 
int lda, const int ldainc, float *V); +void ATL_cprow2blkH_aXi0 + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkT_aXi0 + (const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blkConjF_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blkConj_a1_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blkConj_aX_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blk_aXi0_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cpcol2blkConj_aXi0_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc,float*V); +void ATL_cprow2blkHF_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkH_a1_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkH_aX_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkH_aXi0_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); +void ATL_cprow2blkT_aXi0_blk + (const int blk, const int M, const int N, const float* alpha, + const float *A, int lda, const int ldainc, float *V); + +void ATL_cprow2blkT_KB_aXi0 + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_cprow2blkT_KB_aX + (const int mb, const int nb, const SCALAR 
alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_cprow2blkT_KB_a1 + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_cprow2blkH_KB_aXi0 + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_cprow2blkH_KB_aX + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_cprow2blkH_KB_a1 + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_zpcol2blk_aX_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkT_aX_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blk_a1_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkT_a1_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blkConjF + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blkConj_a1 + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blkConj_aX + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blk_aXi0 + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blkConj_aXi0 + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc,double*V); +void ATL_zprow2blkHF + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void 
ATL_zprow2blkH_a1 + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkH_aX + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkH_aXi0 + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkT_aXi0 + (const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blkConjF_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blkConj_a1_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blkConj_aX_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blk_aXi0_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zpcol2blkConj_aXi0_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc,double*V); +void ATL_zprow2blkHF_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkH_a1_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkH_aX_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkH_aXi0_blk + (const int blk, const int M, const int N, const double* alpha, + const double *A, int lda, const int ldainc, double *V); +void ATL_zprow2blkT_aXi0_blk + (const int blk, const int M, const int N, const double* alpha, + const 
double *A, int lda, const int ldainc, double *V); + +void ATL_zprow2blkT_KB_aXi0 + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_zprow2blkT_KB_aX + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_zprow2blkT_KB_a1 + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_zprow2blkH_KB_aXi0 + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_zprow2blkH_KB_aX + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); +void ATL_zprow2blkH_KB_a1 + (const int mb, const int nb, const SCALAR alpha, const TYPE *A, int lda, + const int ldainc, TYPE *V); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_prefetch.h b/kaldi_io/src/tools/ATLAS/include/atlas_prefetch.h new file mode 100644 index 0000000..83ee2df --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_prefetch.h @@ -0,0 +1,197 @@ +#ifndef ATLAS_PREFETCH_H +#define ATLAS_PREFETCH_H +/* + * The AltiVec prefetch model is not well served by SSE-like prefetch, so + * atlas_altivec.h (included below when ATL_AltiVec is defined) has special commands for it. 
+ */ +#if defined(ATL_AltiVec) + #include "atlas_altivec.h" +#endif +/* + * L1 prefetch macros chosen by the tests below (left empty when no test matches): + * ATL_pfl1R(mem) : fetch location mem to L1, with intent to read *only* + * ATL_pfl1W(mem) : fetch location mem to L1, with intent to read/write + * ATL_pfl1WO(mem) : fetch location mem to L1, with intent to write ONLY + */ + +#if defined(ATL_3DNow) /* 3DNow: prefetch/prefetchw for L1; L2 macros only when ATL_SSE1 is also defined */ + #ifdef __GNUC__ + #define ATL_pfl1R(mem) \ + __asm__ __volatile__ ("prefetch %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1W(mem) \ + __asm__ __volatile__ ("prefetchw %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1WO ATL_pfl1W + #define ATL_GOT_L1PREFETCH + #ifdef ATL_SSE1 + #define ATL_pfl2R(mem) \ + __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl2W(mem) \ + __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl2WO ATL_pfl2W + #define ATL_GOT_L2PREFETCH + #endif + #endif +#elif defined(ATL_SSE1) || defined (ATL_SSE2) /* SSE prefetch is available: prefetchnta for L1, prefetcht1 for L2 */ + #ifdef __GNUC__ + #define ATL_pfl1R(mem) \ + __asm__ __volatile__ ("prefetchnta %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1W(mem) \ + __asm__ __volatile__ ("prefetchnta %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1WO ATL_pfl1W + #define ATL_GOT_L1PREFETCH + + #define ATL_pfl2R(mem) \ + __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl2W(mem) \ + __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl2WO ATL_pfl2W + #define ATL_GOT_L2PREFETCH + #endif +#elif defined(__SUNPRO_C) && defined(__sparc) /* && __SUNPRO_CC > 0x600 */ + #include <sun_prefetch.h> + #define ATL_pfl1R(mem) sparc_prefetch_read_many((void*)(mem)) + #define ATL_pfl1W(mem) sparc_prefetch_write_many((void*)(mem)) + #define ATL_GOT_L1PREFETCH + #define ATL_pfl2R(mem) sparc_prefetch_read_many((void*)(mem)) + #define ATL_pfl2W(mem) sparc_prefetch_write_many((void*)(mem)) + #define ATL_GOT_L2PREFETCH +#elif defined(ATL_ARCH_21264) + #ifdef __GNUC__ + #define ATL_pfl1R(mem) \ + __asm__ 
__volatile__ ("ldt $f31, %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1W(mem) \ + __asm__ __volatile__ ("lds $f31, %0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1WO(mem) \ + __asm__ __volatile__ ("wh64 %0" : : "m" (*((char *)(mem)))) + #define ATL_GOT_L1PREFETCH + #elif defined(__DECC) + #include "c_asm.h" + #define ATL_pfl1R(mem) asm ("ldt %f31,(%a0) ;", mem) + #define ATL_pfl1W(mem) asm ("lds %f31,(%a0) ;", mem) + #define ATL_pfl1WO(mem) asm ("wh64 (%a0) ;", mem) + #define ATL_GOT_L1PREFETCH + #endif +/* + * Note: SunUS5/10 seems to get no benefit from prefetch, so it is not among the archs enabled below + */ +#elif defined(ATL_ARCH_USIV) || defined(ATL_ARCH_SunUSIII) || \ + defined(ATL_ARCH_SunUSII) || defined(ATL_ARCH_SunUSI) + #ifdef __GNUC__ + #define ATL_pfl1R(mem) \ + __asm__ __volatile__ ("prefetch %0,0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1W(mem) \ + __asm__ __volatile__ ("prefetch %0,2" : : "m" (*((char *)(mem)))) + #define ATL_GOT_L1PREFETCH + #define ATL_pfl2R(mem) \ + __asm__ __volatile__ ("prefetch %0,3" : : "m" (*((char *)(mem)))) + #define ATL_pfl2W(mem) \ + __asm__ __volatile__ ("prefetch %0,2" : : "m" (*((char *)(mem)))) + #define ATL_GOT_L2PREFETCH + #endif +/* + * Gives gigantic slowdown on POWER4, so do not enable there, just use gcc + * builtin. NOTE(review): the #elif below tests ATL_ARCH_PPCG5 twice; the repeat is redundant, confirm whether another arch was meant + */ +#elif defined(ATL_ARCH_PPCG5) || defined(ATL_ARCH_PPCG5) || \ + defined(ATL_ARCH_POWER5) + #if defined(__GNUC__) || defined(__IBM_GCC_ASM) + #define ATL_pfl1R(mem) \ + __asm__ __volatile__ ("dcbt 0, %0, 0" : : "r" ((mem))) + #define ATL_pfl1W(mem) \ + __asm__ __volatile__ ("dcbtst 0, %0" : : "r" ((mem))) + #define ATL_pfST(mem) \ + __asm__ __volatile__ ("dcbt 0, %0, 1" : : "r" ((mem))) + #define ATL_pfl1STi(mem, str) \ + __asm__ __volatile__ ("rlwinm %0, %0, 0, 0, 24\n\t" \ + "ori %0, %0, 96+%2\n\t" \ + "dcbt 0, %0, 8" \ + : "=r" (mem) \ + : "0" (mem), "i" (str)) + + #define ATL_GOT_L1PREFETCH + #define ATL_L1LS 128 /* overrides the default L1 line size of 64 set at the end of this header */ + #endif +#elif defined(ATL_ARCH_IA64Itan) || defined(ATL_ARCH_IA64Itan2) +/* + * Have to use 
nt2, because the fpu ignored L1. + * NOTE: just let icc do the prefetching (hence the && 0 below); instructions kept here for reference + */ + #if defined(__ECC) && 0 + #include "ia64intrin.h" + #define ATL_pfl1R(mem) __lfetch(2, (mem)) + #define ATL_pfl1W(mem) __lfetch_excl(2, (mem)) + #define ATL_GOT_L1PREFETCH + #elif defined(__GNUC__) && !defined(__ECC) + #define ATL_pfl1R(mem) \ + __asm__ (" lfetch.nt2 [%0]": : "r"((void *)(mem))) + #define ATL_pfl1W(mem) \ + __asm__ (" lfetch.excl [%0]": : "r"((void *)(mem))) + #define ATL_GOT_L1PREFETCH + #endif +#elif defined(ATL_ARCH_HPPA20) && defined(__GNUC__) + #define ATL_pfl1R(mem) \ + __asm__ __volatile__ ("ldw %0, %%r0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1W(mem) \ + __asm__ __volatile__ ("ldd %0, %%r0" : : "m" (*((char *)(mem)))) + #define ATL_GOT_L1PREFETCH +#elif defined(ATL_AltiVec) && !defined(ATL_pfl1R) + #ifndef ATL_NoFakePF + /* 33619968 is ATL_GetCtrl(0, 1, 2) = (1<<16) | (2<<24), i.e. stride 0, count 1, size 2, or fetch 1 32-byte block; the last macro argument is the stream */ + #define ATL_pfl1R(mem) ATL_pfavR(mem, 33619968, 3) + #define ATL_pfl1W(mem) ATL_pfavW(mem, 33619968, 2) + #define ATL_GOT_L1PREFETCH + #endif +#elif defined(ATL_ARCH_MIPSICE9) && defined(__GNUC__) + #define ATL_pfl1R(mem) \ + __asm__ __volatile__ ("pref 6,%0" : : "m" (*((char *)(mem)))) + #define ATL_pfl1W(mem) \ + __asm__ __volatile__ ("pref 7,%0" : : "m" (*((char *)(mem)))) + #define ATL_GOT_L1PREFETCH + #define ATL_L1LS 32 + #define ATL_L2LS 64 +#elif defined(__GNUC__) /* last resort: use the gcc builtin, called as (addr, rw, locality) with rw 0 for read and 1 for write */ + #define ATL_pfl1R(mem) __builtin_prefetch(mem, 0, 3) + #define ATL_pfl1W(mem) __builtin_prefetch(mem, 1, 3) + #define ATL_GOT_L1PREFETCH +#endif +#if defined(ATL_pfl1W) && !defined(ATL_pfl1WO) /* branches without a write-only form reuse the read/write one */ + #define ATL_pfl1WO ATL_pfl1W +#endif + +#ifdef ATL_NOL1PREFETCH /* when defined, drop whichever L1 macros were selected above */ + #ifdef ATL_GOT_L1PREFETCH + #undef ATL_pfl1R + #undef ATL_pfl1W + #undef ATL_pfl1WO + #undef ATL_GOT_L1PREFETCH + #endif +#endif +#ifdef ATL_NOL2PREFETCH /* when defined, drop whichever L2 macros were selected above */ + #ifdef ATL_GOT_L2PREFETCH + #undef ATL_pfl2R + #undef ATL_pfl2W + #undef ATL_pfl2WO + #undef ATL_GOT_L2PREFETCH + #endif +#endif 
+#ifndef ATL_GOT_L1PREFETCH /* no L1 prefetch selected: empty macros, so cpp takes the calls out of the code */ + #define ATL_pfl1R(mem) + #define ATL_pfl1W(mem) + #define ATL_pfl1WO(mem) +#endif +#ifndef ATL_GOT_L2PREFETCH /* no L2 prefetch selected: empty macros; NOTE(review): unlike the L1 set, no empty ATL_pfl2WO is defined here -- confirm intended */ + #define ATL_pfl2R(mem) + #define ATL_pfl2W(mem) +#endif + +/* + * Define cache line sizes for L1 and L2 unless an arch branch already did; L2 defaults to the L1 value + */ +#ifndef ATL_L1LS + #define ATL_L1LS 64 +#endif +#ifndef ATL_L2LS + #define ATL_L2LS ATL_L1LS +#endif + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptalias1.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias1.h new file mode 100644 index 0000000..2a45eda --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias1.h @@ -0,0 +1,60 @@ +#define ATLAS_PTALIAS1_H /* no threaded routines for Level 1 and 2 yet: defining the guard first makes the #ifndef below false, so these aliases stay inactive */ +#ifndef ATLAS_PTALIAS1_H +#define ATLAS_PTALIAS1_H +/* + * Real BLAS: each ATL_ name is aliased to its pt-infixed counterpart + */ + #define ATL_dsdot ATL_dsptdot + #define ATL_sdsdot ATL_sdsptdot + #define ATL_sasum ATL_sptasum + #define ATL_snrm2 ATL_sptnrm2 + #define ATL_sdot ATL_sptdot + #define ATL_saxpy ATL_sptaxpy + #define ATL_scopy ATL_sptcopy + #define ATL_sscal ATL_sptscal + #define ATL_sswap ATL_sptswap + #define ATL_srotm ATL_sptrotm + #define ATL_srot ATL_sptrot + #define ATL_srotmg ATL_sptrotmg + #define ATL_srotg ATL_sptrotg + #define ATL_isamax ATL_isptamax + + #define ATL_dasum ATL_dptasum + #define ATL_dnrm2 ATL_dptnrm2 + #define ATL_ddot ATL_dptdot + #define ATL_daxpy ATL_dptaxpy + #define ATL_dcopy ATL_dptcopy + #define ATL_dscal ATL_dptscal + #define ATL_dswap ATL_dptswap + #define ATL_drotm ATL_dptrotm + #define ATL_drot ATL_dptrot + #define ATL_drotmg ATL_dptrotmg + #define ATL_drotg ATL_dptrotg + #define ATL_idamax ATL_idptamax + +/* + * Complex BLAS + */ + #define ATL_cdotc_sub ATL_cptdotc_sub + #define ATL_cdotu_sub ATL_cptdotu_sub + #define ATL_caxpy ATL_cptaxpy + #define ATL_ccopy ATL_cptcopy + #define ATL_cscal ATL_cptscal + #define ATL_cswap ATL_cptswap + #define ATL_icamax ATL_icptamax + #define ATL_csscal ATL_csptscal + #define ATL_scnrm2 
ATL_scptnrm2 + #define ATL_scasum ATL_scptasum + + #define ATL_zdotc_sub ATL_zptdotc_sub + #define ATL_zdotu_sub ATL_zptdotu_sub + #define ATL_zaxpy ATL_zptaxpy + #define ATL_zcopy ATL_zptcopy + #define ATL_zscal ATL_zptscal + #define ATL_zswap ATL_zptswap + #define ATL_izamax ATL_izptamax + #define ATL_zdscal ATL_zdptscal + #define ATL_dznrm2 ATL_dzptnrm2 + #define ATL_dzasum ATL_dzptasum + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptalias2.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias2.h new file mode 100644 index 0000000..66b1e0e --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias2.h @@ -0,0 +1,80 @@ +#define ATLAS_PTALIAS2_H /* no threaded routines for Level 1 and 2 yet: defining the guard first makes the #ifndef below false, so these aliases stay inactive */ +#ifndef ATLAS_PTALIAS2_H +#define ATLAS_PTALIAS2_H +/* + * Real BLAS: each ATL_ name is aliased to its pt-infixed counterpart + */ + #define ATL_sspr2 ATL_sptspr2 + #define ATL_ssyr2 ATL_sptsyr2 + #define ATL_sspr ATL_sptspr + #define ATL_ssyr ATL_sptsyr + #define ATL_sger ATL_sptger + #define ATL_stpsv ATL_spttpsv + #define ATL_stbsv ATL_spttbsv + #define ATL_strsv ATL_spttrsv + #define ATL_stpmv ATL_spttpmv + #define ATL_stbmv ATL_spttbmv + #define ATL_strmv ATL_spttrmv + #define ATL_sspmv ATL_sptspmv + #define ATL_ssbmv ATL_sptsbmv + #define ATL_ssymv ATL_sptsymv + #define ATL_sgbmv ATL_sptgbmv + #define ATL_sgemv ATL_sptgemv + + #define ATL_dspr2 ATL_dptspr2 + #define ATL_dsyr2 ATL_dptsyr2 + #define ATL_dspr ATL_dptspr + #define ATL_dsyr ATL_dptsyr + #define ATL_dger ATL_dptger + #define ATL_dtpsv ATL_dpttpsv + #define ATL_dtbsv ATL_dpttbsv + #define ATL_dtrsv ATL_dpttrsv + #define ATL_dtpmv ATL_dpttpmv + #define ATL_dtbmv ATL_dpttbmv + #define ATL_dtrmv ATL_dpttrmv + #define ATL_dspmv ATL_dptspmv + #define ATL_dsbmv ATL_dptsbmv + #define ATL_dsymv ATL_dptsymv + #define ATL_dgbmv ATL_dptgbmv + #define ATL_dgemv ATL_dptgemv + +/* + * Complex BLAS + */ + #define ATL_chpr2 ATL_cpthpr2 + #define ATL_cher2 ATL_cpther2 + #define ATL_chpr ATL_cpthpr + #define ATL_cher ATL_cpther + #define ATL_cgerc ATL_cptgerc + 
#define ATL_cgeru ATL_cptgeru + #define ATL_ctpsv ATL_cpttpsv + #define ATL_ctbsv ATL_cpttbsv + #define ATL_ctrsv ATL_cpttrsv + #define ATL_ctpmv ATL_cpttpmv + #define ATL_ctbmv ATL_cpttbmv + #define ATL_ctrmv ATL_cpttrmv + #define ATL_chpmv ATL_cpthpmv + #define ATL_chbmv ATL_cpthbmv + #define ATL_chemv ATL_cpthemv + #define ATL_cgbmv ATL_cptgbmv + #define ATL_cgemv ATL_cptgemv + + #define ATL_zhpr2 ATL_zpthpr2 + #define ATL_zher2 ATL_zpther2 + #define ATL_zhpr ATL_zpthpr + #define ATL_zher ATL_zpther + #define ATL_zgerc ATL_zptgerc + #define ATL_zgeru ATL_zptgeru + #define ATL_ztpsv ATL_zpttpsv + #define ATL_ztbsv ATL_zpttbsv + #define ATL_ztrsv ATL_zpttrsv + #define ATL_ztpmv ATL_zpttpmv + #define ATL_ztbmv ATL_zpttbmv + #define ATL_ztrmv ATL_zpttrmv + #define ATL_zhpmv ATL_zpthpmv + #define ATL_zhbmv ATL_zpthbmv + #define ATL_zhemv ATL_zpthemv + #define ATL_zgbmv ATL_zptgbmv + #define ATL_zgemv ATL_zptgemv + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptalias3.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias3.h new file mode 100644 index 0000000..2a25d23 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptalias3.h @@ -0,0 +1,43 @@ +#ifndef ATLAS_PTALIAS3_H +#define ATLAS_PTALIAS3_H +/* + * Real BLAS: Level 3 names are aliased to their pt-infixed counterparts (this guard is an ordinary one, so the aliases are active) + */ + #define ATL_strsm ATL_spttrsm + #define ATL_strmm ATL_spttrmm + #define ATL_ssyr2k ATL_sptsyr2k + #define ATL_ssyrk ATL_sptsyrk + #define ATL_ssymm ATL_sptsymm + #define ATL_sgemm ATL_sptgemm + + #define ATL_dtrsm ATL_dpttrsm + #define ATL_dtrmm ATL_dpttrmm + #define ATL_dsyr2k ATL_dptsyr2k + #define ATL_dsyrk ATL_dptsyrk + #define ATL_dsymm ATL_dptsymm + #define ATL_dgemm ATL_dptgemm + +/* + * Complex BLAS + */ + #define ATL_ctrsm ATL_cpttrsm + #define ATL_ctrmm ATL_cpttrmm + #define ATL_cher2k ATL_cpther2k + #define ATL_csyr2k ATL_cptsyr2k + #define ATL_cherk ATL_cptherk + #define ATL_csyrk ATL_cptsyrk + #define ATL_chemm ATL_cpthemm + #define ATL_csymm ATL_cptsymm + #define ATL_cgemm ATL_cptgemm + + #define ATL_ztrsm ATL_zpttrsm 
+ #define ATL_ztrmm ATL_zpttrmm + #define ATL_zher2k ATL_zpther2k + #define ATL_zsyr2k ATL_zptsyr2k + #define ATL_zherk ATL_zptherk + #define ATL_zsyrk ATL_zptsyrk + #define ATL_zhemm ATL_zpthemm + #define ATL_zsymm ATL_zptsymm + #define ATL_zgemm ATL_zptgemm + +#endif /* ATLAS_PTALIAS3_H */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptlevel3.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptlevel3.h new file mode 100644 index 0000000..d1bded3 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptlevel3.h @@ -0,0 +1,284 @@ + +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. 
+ * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_PTLEVEL3_H +#define ATLAS_PTLEVEL3_H +/* + * ===================================================================== + * Include files + * ===================================================================== + */ +#include "atlas_enum.h" +#include "atlas_pthreads.h" +/* + * ===================================================================== + * Prototypes for single precision real Level 3 multi-threaded ATLAS + * BLAS routines. 
+ * ===================================================================== + */ +void ATL_sptgeadd +( const int, const int, const float, const float *, + const int, const float, float *, const int ); +void ATL_sptgezero +( const int, const int, float *, const int ); +void ATL_sptgescal +( const int, const int, const float, float *, + const int ); +void ATL_spttrscal +( const enum ATLAS_UPLO, const int, const int, + const float, float *, const int ); + +void ATL_sptgemm +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const float, + const float *, const int, const float *, const int, + const float, float *, const int ); +void ATL_sptsymm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const float, const float *, + const int, const float *, const int, const float, + float *, const int ); +void ATL_sptsyrk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float, const float *, + const int, const float, float *, const int ); +void ATL_sptsyr2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float, const float *, + const int, const float *, const int, const float, + float *, const int ); +void ATL_spttrmm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const float, const float *, + const int, float *, const int ); +void ATL_spttrsm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const float, const float *, + const int, float *, const int ); +/* + * ===================================================================== + * Prototypes for double precision real Level 3 multi-threaded ATLAS + * BLAS routines. 
+ * ===================================================================== + */ +void ATL_dptgeadd +( const int, const int, const double, const double *, + const int, const double, double *, const int ); +void ATL_dptgezero +( const int, const int, double *, const int ); +void ATL_dptgescal +( const int, const int, const double, double *, + const int ); +void ATL_dpttrscal +( const enum ATLAS_UPLO, const int, const int, + const double, double *, const int ); + +void ATL_dptgemm +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const double, + const double *, const int, const double *, const int, + const double, double *, const int ); +void ATL_dptsymm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const double, const double *, + const int, const double *, const int, const double, + double *, const int ); +void ATL_dptsyrk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double, const double *, + const int, const double, double *, const int ); +void ATL_dptsyr2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double, const double *, + const int, const double *, const int, const double, + double *, const int ); +void ATL_dpttrmm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const double, const double *, + const int, double *, const int ); +void ATL_dpttrsm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const double, const double *, + const int, double *, const int ); +/* + * ===================================================================== + * Prototypes for single precision complex Level 3 multi-threaded ATLAS + * BLAS routines. 
+ * ===================================================================== + */ +void ATL_cptgeadd +( const int, const int, const float *, const float *, + const int, const float *, float *, const int ); +void ATL_cptgezero +( const int, const int, float *, const int ); +void ATL_cptgescal +( const int, const int, const float *, float *, + const int ); +void ATL_cpttrscal +( const enum ATLAS_UPLO, const int, const int, + const float *, float *, const int ); +void ATL_cpthescal +( const enum ATLAS_UPLO, const int, const int, + const float, float *, const int ); + +void ATL_cptgemm +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const float *, + const float *, const int, const float *, const int, + const float *, float *, const int ); +void ATL_cptsymm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const float *, const float *, + const int, const float *, const int, const float *, + float *, const int ); +void ATL_cptsyrk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float *, const float *, + const int, const float *, float *, const int ); +void ATL_cptsyr2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float *, const float *, + const int, const float *, const int, const float *, + float *, const int ); +void ATL_cpttrmm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const float *, const float *, + const int, float *, const int ); +void ATL_cpttrsm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const float *, const float *, + const int, float *, const int ); +/* + * ===================================================================== + * Prototypes for double precision complex Level 3 multi-threaded ATLAS + * BLAS routines. 
+ * ===================================================================== + */ +void ATL_zptgeadd +( const int, const int, const double *, const double *, + const int, const double *, double *, const int ); +void ATL_zptgezero +( const int, const int, double *, const int ); +void ATL_zptgescal +( const int, const int, const double *, double *, + const int ); +void ATL_zpttrscal +( const enum ATLAS_UPLO, const int, const int, + const double *, double *, const int ); +void ATL_zpthescal +( const enum ATLAS_UPLO, const int, const int, + const double, double *, const int ); + +void ATL_zptgemm +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const double *, + const double *, const int, const double *, const int, + const double *, double *, const int ); +void ATL_zptsymm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const double *, const double *, + const int, const double *, const int, const double *, + double *, const int ); +void ATL_zptsyrk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double *, const double *, + const int, const double *, double *, const int ); +void ATL_zptsyr2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double *, const double *, + const int, const double *, const int, const double *, + double *, const int ); +void ATL_zpttrmm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const double *, const double *, + const int, double *, const int ); +void ATL_zpttrsm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const double *, const double *, + const int, double *, const int ); + +void ATL_cpthemm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const float *, const float *, + const int, const float *, const int, const float *, + float *, const int ); 
+void ATL_cptherk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float, const float *, + const int, const float, float *, const int ); +void ATL_cpther2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float *, const float *, + const int, const float *, const int, const float, + float *, const int ); + +void ATL_zpthemm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const double *, const double *, + const int, const double *, const int, const double *, + double *, const int ); +void ATL_zptherk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double, const double *, + const int, const double, double *, const int ); +void ATL_zpther2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double *, const double *, + const int, const double *, const int, const double, + double *, const int ); + +#endif +/* + * End of atlas_ptlevel3.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptlvl3.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptlvl3.h new file mode 100644 index 0000000..916afd0 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptlvl3.h @@ -0,0 +1,389 @@ + +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. + * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_PTLVL3_H +#define ATLAS_PTLVL3_H +/* + * ===================================================================== + * Include files + * ===================================================================== + */ +#include "atlas_ptmisc.h" +#include "atlas_level3.h" +#include "atlas_rblas3.h" +/* + * ===================================================================== + * macro constants + * ===================================================================== + */ +#ifdef TREAL +#define ATL_XOVER_L3_DEFAULT 8 /* number of NB x NB blocks */ +#else +#define ATL_XOVER_L3_DEFAULT 4 +#endif +/* + * ===================================================================== + * macro functions + * ===================================================================== + */ +#define Mpt3( a_, i_, siz_ ) ( ( (char*)(a_) + ( (i_) * (siz_) ) ) ) +#define Mvpt3( a_, i_, siz_ ) ( (void *)(Mpt3( (a_), (i_), (siz_) ))) +/* + * ===================================================================== + * typedef definitions + * ===================================================================== + */ +typedef PT_TREE_T (*PT_GEMM_FUN_T) +( + const unsigned int, pthread_attr_t *, + const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int +); + +typedef PT_TREE_T (*PT_TRMM_FUN_T) +( + const unsigned int, pthread_attr_t *, + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const void *, const void *, + const int, void *, const int +); + +typedef int (*PT_SYR2K_FUN_T) +( + const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_TRANS, const int, const int, + const void *, const void *, const int, const void *, + const int, const void *, void *, const int +); + 
+ +typedef struct +{ + size_t size; + void * negone, * one, * zero; + PT_FUN_T geadd0, gemm0, symm0, hemm0, syrk0, syr2k0, + herk0, her2k0, trmm0, trsm0; + PT_GEMM_FUN_T ptgemm; + PT_TRMM_FUN_T pttrmm; + PT_SYR2K_FUN_T ptsyr2k0, pther2k0; +} PT_LVL3_TYPE_T; + +typedef struct +{ + const void * a, * al, * b, * be; + void * c; + enum ATLAS_TRANS ta, tb; + int k, la, lb, lc, m, n; +} PT_GEMM_ARGS_T; + +typedef struct +{ + const void * a, * al, * b, * be; + void * c; + enum ATLAS_SIDE si; + enum ATLAS_UPLO up; + int la, lb, lc, m, n; +} PT_SYMM_ARGS_T; + +typedef struct +{ + const void * a, * al, * be; + void * c; + enum ATLAS_UPLO up; + enum ATLAS_TRANS tr; + int l, la, lc, m, n, k; +} PT_SYRK_ARGS_T; + +typedef struct +{ + const void * a, * al, * ac, * b, * be; + void * c; + enum ATLAS_UPLO up; + enum ATLAS_TRANS tr; + int l, la, lb, lc, m, n, k; +} PT_SYR2K_ARGS_T; + +typedef struct +{ + const void * a, * al; + void * b; + enum ATLAS_SIDE si; + enum ATLAS_UPLO up; + enum ATLAS_TRANS tr; + enum ATLAS_DIAG di; + int la, lb, m, n; +} PT_TRMM_ARGS_T; + +typedef struct +{ + const void * a, * al; + void * b; + enum ATLAS_SIDE si; + enum ATLAS_UPLO up; + enum ATLAS_TRANS tr; + enum ATLAS_DIAG di; + int la, lb, m, n; +} PT_TRSM_ARGS_T; + +/* + * ===================================================================== + * Function prototypes + * ===================================================================== + */ +PT_TREE_T ATL_Sgemm +( const PT_LVL3_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +PT_TREE_T ATL_Ssymm +( const PT_LVL3_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const enum ATLAS_TRANS, const enum ATLAS_SIDE, + const enum ATLAS_UPLO, const int, const int, + const void *, const void *, const int, const 
void *, + const int, const void *, void *, const int ); +PT_TREE_T ATL_Ssyrk +( const PT_LVL3_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_TRANS, const int, const int, + const int, const int, const void *, const void *, + const int, const void *, void *, const int ); +PT_TREE_T ATL_Ssyr2k +( const PT_LVL3_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_TRANS, const int, const int, + const int, const int, const void *, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +PT_TREE_T ATL_Strmm +( const PT_LVL3_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const void *, const void *, + const int, void *, const int ); +PT_TREE_T ATL_Strsm +( const PT_LVL3_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const void *, const void *, + const int, void *, const int ); + +#if defined( TREAL ) || defined( TCPLX ) + +int Mjoin( PATL, GetNB ) ( void ); + +void Mjoin( PATL, ptl3settype ) ( PT_LVL3_TYPE_T * ); + +void Mjoin( PATL, gemmNN ) +( const int, const int, const int, const SCALAR, + const TYPE *, const int, const TYPE *, const int, + const SCALAR, TYPE *, const int ); +void Mjoin( PATL, gemmNT ) +( const int, const int, const int, const SCALAR, + const TYPE *, const int, const TYPE *, const int, + const SCALAR, TYPE *, const int ); +void Mjoin( PATL, gemmTN ) +( const int, const int, const int, const SCALAR, + const TYPE *, const int, const TYPE *, const int, + const SCALAR, TYPE *, const int ); + +#if defined( TCPLX ) 
+void Mjoin( PATL, gemmNC ) +( const int, const int, const int, const SCALAR, + const TYPE *, const int, const TYPE *, const int, + const SCALAR, TYPE *, const int ); +void Mjoin( PATL, gemmCN ) +( const int, const int, const int, const SCALAR, + const TYPE *, const int, const TYPE *, const int, + const SCALAR, TYPE *, const int ); +#endif + +PT_FUN_ARG_T Mjoin( PATL, ptgemm0 ) ( PT_FUN_ARG_T ); +PT_FUN_ARG_T Mjoin( PATL, ptsymm0 ) ( PT_FUN_ARG_T ); +PT_FUN_ARG_T Mjoin( PATL, ptsyr2k0 ) ( PT_FUN_ARG_T ); +PT_FUN_ARG_T Mjoin( PATL, ptsyrk0 ) ( PT_FUN_ARG_T ); +PT_FUN_ARG_T Mjoin( PATL, pttrmm0 ) ( PT_FUN_ARG_T ); +PT_FUN_ARG_T Mjoin( PATL, pttrsm0 ) ( PT_FUN_ARG_T ); + +#if defined( TCPLX ) +PT_FUN_ARG_T Mjoin( PATL, pthemm0 ) ( PT_FUN_ARG_T ); +PT_FUN_ARG_T Mjoin( PATL, pther2k0 ) ( PT_FUN_ARG_T ); +PT_FUN_ARG_T Mjoin( PATL, ptherk0 ) ( PT_FUN_ARG_T ); +#endif +/* + * ===================================================================== + * Prototypes for the Level 3 multi-threaded ATLAS BLAS routines + * ===================================================================== + */ +PT_TREE_T Mjoin( PATL, ptgemm_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +PT_TREE_T Mjoin( PATL, ptsymm_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const void *, const void *, + const int, const void *, const int, const void *, + void *, const int ); +PT_TREE_T Mjoin( PATL, ptsyr2k_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const void *, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +int Mjoin( PATL, ptsyr2k0_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const enum 
ATLAS_TRANS, + const enum ATLAS_TRANS, const int, const int, + const void *, const void *, const int, const void *, + const int, const void *, void *, const int ); +PT_TREE_T Mjoin( PATL, ptsyrk_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const void *, const void *, + const int, const void *, void *, const int ); +PT_TREE_T Mjoin( PATL, pttrmm_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const void *, const void *, + const int, void *, const int ); +PT_TREE_T Mjoin( PATL, pttrsm_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const void *, const void *, + const int, void *, const int ); + +void Mjoin( PATL, ptgemm ) +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const SCALAR, + const TYPE *, const int, const TYPE *, const int, + const SCALAR, TYPE *, const int ); +void Mjoin( PATL, ptsymm ) +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const SCALAR, const TYPE *, + const int, const TYPE *, const int, const SCALAR, + TYPE *, const int ); +void Mjoin( PATL, ptsyr2k ) +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const SCALAR, const TYPE *, + const int, const TYPE *, const int, const SCALAR, + TYPE *, const int ); +void Mjoin( PATL, ptsyrk ) +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const SCALAR, const TYPE *, + const int, const SCALAR, TYPE *, const int ); +void Mjoin( PATL, pttrmm ) +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const SCALAR, const TYPE *, + const int, TYPE *, const int ); +void Mjoin( PATL, pttrsm ) +( const enum ATLAS_SIDE, const enum 
ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const SCALAR, const TYPE *, + const int, TYPE *, const int ); + +#if defined( TCPLX ) +PT_TREE_T Mjoin( PATL, pthemm_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const void *, const void *, + const int, const void *, const int, const void *, + void *, const int ); +PT_TREE_T Mjoin( PATL, pther2k_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const void *, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +int Mjoin( PATL, pther2k0_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_TRANS, const int, const int, + const void *, const void *, const int, const void *, + const int, const void *, void *, const int ); +PT_TREE_T Mjoin( PATL, ptherk_nt ) +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const void *, const void *, + const int, const void *, void *, const int ); + +void Mjoin( PATL, pthemm ) +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const SCALAR, const TYPE *, + const int, const TYPE *, const int, const SCALAR, + TYPE *, const int ); +void Mjoin( PATL, pther2k ) +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const SCALAR, const TYPE *, + const int, const TYPE *, const int, const TYPE, + TYPE *, const int ); +void Mjoin( PATL, ptherk ) +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const TYPE, const TYPE *, + const int, const TYPE, TYPE *, const int ); +#endif + +#endif + +#endif +/* + * End of atlas_ptlvl3.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_ptmisc.h b/kaldi_io/src/tools/ATLAS/include/atlas_ptmisc.h new file mode 100644 index 0000000..4c3db23 --- 
/dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_ptmisc.h @@ -0,0 +1,410 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. + * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_PTMISC_H +#define ATLAS_PTMISC_H +/* + * ===================================================================== + * Include Files + * ===================================================================== + */ +#include <math.h> +#include <pthread.h> + +#include "atlas_misc.h" +#include "atlas_pthreads.h" +/* + * ===================================================================== + * #define macro constants + * ===================================================================== + * + * ATL_XOVER_MI_DEFAULT is the smallest number of NB-by-NB blocks for + * which threading is enabled, where NB is the value returned by the + * ATLAS function Mjoin( PATL, GetNB ). 
+ */ +#ifdef TREAL +#define ATL_XOVER_MI_DEFAULT 8 /* number of NB x NB blocks */ +#else +#define ATL_XOVER_MI_DEFAULT 4 +#endif + +#define NOSPLIT 0 /* For convenience */ +#define SPLIT_M 1 +#define SPLIT_N 2 +#define SPLIT_K 3 + +/* + * ===================================================================== + * macro functions + * ===================================================================== + */ +#define Mptm( a_, i_, siz_ ) ( ( (char*)(a_) + ( (i_) * (siz_) ) ) ) +#define Mvptm( a_, i_, siz_ ) ( (void *)(Mptm( (a_), (i_), (siz_) ))) +/* + * ===================================================================== + * typedef definitions + * ===================================================================== + * + * Definition of the Binary (recursive) task tree: Each node of the tree + * mainly consists of a node number, a reference counter to enforce depen- + * dencies, an argument structure and a function to be applied. + */ +typedef void * PT_DATA_T; +typedef void * PT_FUN_VAL_T; +typedef void * PT_FUN_ARG_T; +typedef PT_FUN_VAL_T (*PT_FUN_T) ( PT_FUN_ARG_T ); + +typedef struct PT_node_T +{ + pthread_t pid; + pthread_mutex_t mutex; + pthread_cond_t cond; + struct PT_node_T * left; + struct PT_node_T * right; + PT_DATA_T data; + PT_FUN_VAL_T * val; + PT_FUN_T fun; + PT_FUN_ARG_T arg; + unsigned int node; + unsigned int count; +} PT_NODE_T; + +typedef PT_NODE_T * PT_TREE_T; +typedef void (*PT_APPLY_FUN_T)( PT_TREE_T ); + +enum DIM_1DSPLIT_E +{ + Atlas1dSplit = 100, + Atlas1dNoSplit = 199 +}; + +enum DIM_TZSPLIT_E +{ + AtlasTzSplitMrow = 200, + AtlasTzSplitKrow = 201, + AtlasTzSplitKcol = 202, + AtlasTzSplitNcol = 203, + AtlasTzNoSplit = 299 +}; + +typedef enum DIM_1DSPLIT_E DIM_1DSPLIT_T; +typedef enum DIM_TZSPLIT_E DIM_TZSPLIT_T; + +/* + * Type definitions for some auxiliaries that have been multi-threaded + * as well. 
+ */ +typedef struct +{ + size_t size; + PT_FUN_T fun; +} PT_MISC_TYPE_T; + +typedef struct +{ + const void * al, * be; + const void * a; + void * c; + int la, lc, m, n; +} PT_GEADD_ARGS_T; + +typedef struct +{ + void * a; + int la, m, n; +} PT_GEZERO_ARGS_T; + +typedef struct +{ + const void * al; + void * a; + int la, m, n; +} PT_GESCAL_ARGS_T; + +typedef struct +{ + enum ATLAS_UPLO up; + const void * al; + void * a; + int k, la, m, n; +} PT_TZSCAL_ARGS_T; + +/* + * ===================================================================== + * Function prototypes + * ===================================================================== + */ +int ATL_sGetNB ( void ); +int ATL_dGetNB ( void ); +int ATL_cGetNB ( void ); +int ATL_zGetNB ( void ); + +DIM_1DSPLIT_T ATL_1dsplit +( + const unsigned int, + const int, + const int, + unsigned int *, + unsigned int *, + int *, + int *, + double * +); + +DIM_TZSPLIT_T ATL_tzsplit +( + const enum ATLAS_UPLO, + const unsigned int, + const int, + const int, + const int, + const int, + unsigned int *, + unsigned int *, + int *, + int * +); +/* + * Task tree management + */ +PT_TREE_T ATL_init_node +( unsigned int, PT_TREE_T, PT_TREE_T, PT_DATA_T, + PT_FUN_VAL_T *, PT_FUN_T, PT_FUN_ARG_T ); + +void ATL_traverse_tree ( PT_TREE_T ); +void ATL_apply_tree ( PT_TREE_T, PT_APPLY_FUN_T ); +void ATL_free_tree ( PT_TREE_T ); +void ATL_free_node ( PT_TREE_T ); +void ATL_print_node_id ( PT_TREE_T ); + +void ATL_thread_init ( pthread_attr_t * ); +void ATL_thread_exit ( pthread_attr_t * ); +void ATL_wait_tree ( PT_TREE_T ); +void ATL_signal_tree ( PT_TREE_T ); +void ATL_thread_tree ( PT_TREE_T, pthread_attr_t * ); +void ATL_join_tree ( PT_TREE_T ); + +PT_TREE_T ATL_create_tree +( unsigned int *, const int, const int ); +/* + * Typeless auxiliary functions + */ +PT_TREE_T ATL_Sgeadd +( const PT_MISC_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const int, const int, const void *, const void *, + const int, const 
void *, void *, const int ); +PT_TREE_T ATL_Sgescal +( const PT_MISC_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const int, const int, const void *, void *, + const int ); +PT_TREE_T ATL_Sgezero +( const PT_MISC_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const int, const int, void *, const int ); +PT_TREE_T ATL_Stzscal +( const PT_MISC_TYPE_T *, const unsigned int, + const unsigned int, pthread_attr_t *, const int, + const enum ATLAS_UPLO, const int, const int, + const int, const void *, void *, const int ); +/* + * Single precision real auxiliary functions + */ +PT_FUN_ARG_T ATL_sptgeadd0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_sptgescal0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_sptgezero0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_spttzscal0 ( PT_FUN_ARG_T ); + +PT_TREE_T ATL_sptgeadd_nt +( const unsigned int, pthread_attr_t *, const int, + const int, const void *, const void *, const int, + const void *, void *, const int ); +PT_TREE_T ATL_sptgescal_nt +( const unsigned int, pthread_attr_t *, const int, + const int, const void *, void *, const int ); +PT_TREE_T ATL_sptgezero_nt +( const unsigned int, pthread_attr_t *, const int, + const int, void *, const int ); +PT_TREE_T ATL_spttrscal_nt +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const int, const int, + const void *, void *, const int ); + +void ATL_sptgeadd +( const int, const int, const float, const float *, + const int, const float, float *, const int ); +void ATL_sptgescal +( const int, const int, const float, float *, + const int ); +void ATL_sptgezero +( const int, const int, float *, const int ); +void ATL_spttrscal +( const enum ATLAS_UPLO, const int, const int, + const float, float *, const int ); + +/* + * Double precision real auxiliary functions + */ +PT_FUN_ARG_T ATL_dptgeadd0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_dptgescal0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_dptgezero0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T 
ATL_dpttzscal0 ( PT_FUN_ARG_T ); + +PT_TREE_T ATL_dptgeadd_nt +( const unsigned int, pthread_attr_t *, const int, + const int, const void *, const void *, const int, + const void *, void *, const int ); +PT_TREE_T ATL_dptgescal_nt +( const unsigned int, pthread_attr_t *, const int, + const int, const void *, void *, const int ); +PT_TREE_T ATL_dptgezero_nt +( const unsigned int, pthread_attr_t *, const int, + const int, void *, const int ); +PT_TREE_T ATL_dpttrscal_nt +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const int, const int, + const void *, void *, const int ); + +void ATL_dptgeadd +( const int, const int, const double, const double *, + const int, const double, double *, const int ); +void ATL_dptgescal +( const int, const int, const double, double *, + const int ); +void ATL_dptgezero +( const int, const int, double *, const int ); +void ATL_dpttrscal +( const enum ATLAS_UPLO, const int, const int, + const double, double *, const int ); +/* + * Single precision complex auxiliary functions + */ +PT_FUN_ARG_T ATL_cptgeadd0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_cptgescal0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_cptgezero0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_cpthescal0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_cpttzscal0 ( PT_FUN_ARG_T ); + +PT_TREE_T ATL_cptgeadd_nt +( const unsigned int, pthread_attr_t *, const int, + const int, const void *, const void *, const int, + const void *, void *, const int ); +PT_TREE_T ATL_cptgescal_nt +( const unsigned int, pthread_attr_t *, const int, + const int, const void *, void *, const int ); +PT_TREE_T ATL_cptgezero_nt +( const unsigned int, pthread_attr_t *, const int, + const int, void *, const int ); +PT_TREE_T ATL_cpttrscal_nt +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const int, const int, + const void *, void *, const int ); +PT_TREE_T ATL_cpthescal_nt +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const int, const int, + const void *, void *, const int ); + 
+void ATL_cptgeadd +( const int, const int, const float *, const float *, + const int, const float *, float *, const int ); +void ATL_cptgezero +( const int, const int, float *, const int ); +void ATL_cptgescal +( const int, const int, const float *, float *, + const int ); +void ATL_cpttrscal +( const enum ATLAS_UPLO, const int, const int, + const float *, float *, const int ); +void ATL_cpthescal +( const enum ATLAS_UPLO, const int, const int, + const float, float *, const int ); +/* + * Double precision complex auxiliary functions + */ +PT_FUN_ARG_T ATL_zptgeadd0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_zptgescal0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_zptgezero0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_zpthescal0 ( PT_FUN_ARG_T ); +PT_FUN_ARG_T ATL_zpttzscal0 ( PT_FUN_ARG_T ); + +PT_TREE_T ATL_zptgeadd_nt +( const unsigned int, pthread_attr_t *, const int, + const int, const void *, const void *, const int, + const void *, void *, const int ); +PT_TREE_T ATL_zptgescal_nt +( const unsigned int, pthread_attr_t *, const int, + const int, const void *, void *, const int ); +PT_TREE_T ATL_zptgezero_nt +( const unsigned int, pthread_attr_t *, const int, + const int, void *, const int ); +PT_TREE_T ATL_zpttrscal_nt +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const int, const int, + const void *, void *, const int ); +PT_TREE_T ATL_zpthescal_nt +( const unsigned int, pthread_attr_t *, + const enum ATLAS_UPLO, const int, const int, + const void *, void *, const int ); + +void ATL_zptgeadd +( const int, const int, const double *, const double *, + const int, const double *, double *, const int ); +void ATL_zptgezero +( const int, const int, double *, const int ); +void ATL_zptgescal +( const int, const int, const double *, double *, + const int ); +void ATL_zpttrscal +( const enum ATLAS_UPLO, const int, const int, + const double *, double *, const int ); +void ATL_zpthescal +( const enum ATLAS_UPLO, const int, const int, + const double, double *, const int ); + 
+#endif +/* + * End of atlas_ptmisc.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_r1.h b/kaldi_io/src/tools/ATLAS/include/atlas_r1.h new file mode 100644 index 0000000..dc49fe2 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_r1.h @@ -0,0 +1,39 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifdef SREAL + #include "atlas_sr1.h" +#elif defined(DREAL) + #include "atlas_dr1.h" +#elif defined(SCPLX) + #include "atlas_cr1.h" +#elif defined(DCPLX) + #include "atlas_zr1.h" +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_rblas3.h b/kaldi_io/src/tools/ATLAS/include/atlas_rblas3.h new file mode 100644 index 0000000..9ad27e7 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_rblas3.h @@ -0,0 +1,474 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Contributor(s) : R. Clint Whaley + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. 
+ * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_RBLAS3_H +#define ATLAS_RBLAS3_H +/* + * ===================================================================== + * Include files + * ===================================================================== + */ +#include "atlas_misc.h" +/* + * ===================================================================== + * #define macros definitions + * ===================================================================== + */ +#define Mrc3( a_, i_, j_, lda_, siz_ ) \ + ( (void*) ( (char*)(a_) + ( ( (i_)+(j_)*(lda_) )*(siz_) ) ) ) +/* + * ===================================================================== + * #typedef definitions + * ===================================================================== + */ +typedef void (*KR3_FUN_GEMM_T) +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +typedef void (*KR3_FUN_HEMM_T) +( const int, const int, const void *, const void *, + const int, const void *, const int, const void *, + void *, const int ); +typedef int (*KR3_FUN_HER2K_T) 
+( const int, const int, const void *, const void *, + const int, const void *, const int, const void *, + void *, const int ); +typedef void (*KR3_FUN_HERK_T) +( const int, const int, const void *, const void *, + const int, const void *, void *, const int ); +typedef void (*KR3_FUN_SYMM_T) +( const int, const int, const void *, const void *, + const int, const void *, const int, const void *, + void *, const int ); +typedef int (*KR3_FUN_SYR2K_T) +( const int, const int, const void *, const void *, + const int, const void *, const int, const void *, + void *, const int ); +typedef void (*KR3_FUN_SYRK_T) +( const int, const int, const void *, const void *, + const int, const void *, void *, const int ); +typedef void (*KR3_FUN_TRMM_T) +( const int, const int, const void *, const void *, + const int, void *, const int ); +typedef void (*KR3_FUN_TRSM_T) +( const int, const int, const void *, const void *, + const int, void *, const int ); + +typedef struct +{ + size_t size; + void * one; + KR3_FUN_GEMM_T TgemmNN; + KR3_FUN_GEMM_T Tgemm; + KR3_FUN_SYMM_T Tsymm; +} RC3_SYMM_T; + +typedef struct +{ + size_t size; + void * one; + KR3_FUN_GEMM_T TgemmNN; + KR3_FUN_GEMM_T Tgemm; + KR3_FUN_HEMM_T Themm; +} RC3_HEMM_T; + +typedef struct +{ + size_t size; + KR3_FUN_GEMM_T Tgemm; + KR3_FUN_SYRK_T Tsyrk; +} RC3_SYRK_T; + +typedef struct +{ + size_t size; + KR3_FUN_GEMM_T Tgemm; + KR3_FUN_HERK_T Therk; +} RC3_HERK_T; + +typedef struct +{ + size_t size; + void * one; + KR3_FUN_GEMM_T Tgemm; + KR3_FUN_SYR2K_T Tsyr2k; +} RC3_SYR2K_T; + +typedef struct +{ + size_t size; + void * one; + KR3_FUN_GEMM_T Tgemm; + KR3_FUN_HER2K_T Ther2k; +} RC3_HER2K_T; + +typedef struct +{ + size_t size; + void * one; + KR3_FUN_GEMM_T Tgemm; + KR3_FUN_TRMM_T Ttrmm; +} RC3_TRMM_T; + +typedef struct +{ + size_t size; + void * one, * negone; + KR3_FUN_GEMM_T Tgemm; + KR3_FUN_TRSM_T Ttrsm; +} RC3_TRSM_T; + +typedef void (*RC3_FUN_HEMM_T) +( RC3_HEMM_T *, const int, const int, const void *, + const void *, 
const int, const void *, const int, + const void *, void *, const int, const int ); +typedef void (*RC3_FUN_HER2K_T) +( RC3_HER2K_T *, const int, const int, const void *, + const void *, const void *, const int, const void *, + const int, const void *, void *, const int, + const int ); +typedef void (*RC3_FUN_HERK_T) +( RC3_HERK_T *, const int, const int, const void *, + const void *, const int, const void *, void *, + const int, const int ); +typedef void (*RC3_FUN_SYMM_T) +( RC3_SYMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +typedef void (*RC3_FUN_SYR2K_T) +( RC3_SYR2K_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +typedef void (*RC3_FUN_SYRK_T) +( RC3_SYRK_T *, const int, const int, const void *, + const void *, const int, const void *, void *, + const int, const int ); +typedef void (*RC3_FUN_TRMM_T) +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +typedef void (*RC3_FUN_TRSM_T) +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +/* + * ===================================================================== + * Level 3 recursive BLAS internal function prototypes + * ===================================================================== + */ +void ATL_sgemmTN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_sgemmNT_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_sgemmNN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_dgemmTN_RB +( const int, 
const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_dgemmNT_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_dgemmNN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_cgemmCN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_cgemmNC_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_cgemmTN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_cgemmNT_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_cgemmNN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_zgemmCN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_zgemmNC_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_zgemmTN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_zgemmNT_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int ); +void ATL_zgemmNN_RB +( const int, const int, const int, const void *, + const void *, const int, const void *, 
const int, + const void *, void *, const int ); +/* + * ===================================================================== + * Recursive BLAS function prototypes + * ===================================================================== + */ +void ATL_rsymmRU +( RC3_SYMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rhemmRU +( RC3_HEMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rsymmRL +( RC3_SYMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rhemmRL +( RC3_HEMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rsymmLU +( RC3_SYMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rhemmLU +( RC3_HEMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rsymmLL +( RC3_SYMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rhemmLL +( RC3_HEMM_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); + +void ATL_rsyrkUT +( RC3_SYRK_T *, const int, const int, const void *, + const void *, const int, const void *, void *, + const int, const int ); +void ATL_rsyr2kUT +( RC3_SYR2K_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rsyrkUN +( RC3_SYRK_T *, const int, const int, const 
void *, + const void *, const int, const void *, void *, + const int, const int ); +void ATL_rsyr2kUN +( RC3_SYR2K_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rsyrkLT +( RC3_SYRK_T *, const int, const int, const void *, + const void *, const int, const void *, void *, + const int, const int ); +void ATL_rsyr2kLT +( RC3_SYR2K_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); +void ATL_rsyrkLN +( RC3_SYRK_T *, const int, const int, const void *, + const void *, const int, const void *, void *, + const int, const int ); +void ATL_rsyr2kLN +( RC3_SYR2K_T *, const int, const int, const void *, + const void *, const int, const void *, const int, + const void *, void *, const int, const int ); + +void ATL_rherkUC +( RC3_HERK_T *, const int, const int, const void *, + const void *, const int, const void *, void *, + const int, const int ); +void ATL_rher2kUC +( RC3_HER2K_T *, const int, const int, const void *, + const void *, const void *, const int, const void *, + const int, const void *, void *, const int, + const int ); +void ATL_rherkUN +( RC3_HERK_T *, const int, const int, const void *, + const void *, const int, const void *, void *, + const int, const int ); +void ATL_rher2kUN +( RC3_HER2K_T *, const int, const int, const void *, + const void *, const void *, const int, const void *, + const int, const void *, void *, const int, + const int ); +void ATL_rherkLC +( RC3_HERK_T *, const int, const int, const void *, + const void *, const int, const void *, void *, + const int, const int ); +void ATL_rher2kLC +( RC3_HER2K_T *, const int, const int, const void *, + const void *, const void *, const int, const void *, + const int, const void *, void *, const int, + const int ); +void ATL_rherkLN +( RC3_HERK_T *, const int, const int, const void *, + const void *, const int, 
const void *, void *, + const int, const int ); +void ATL_rher2kLN +( RC3_HER2K_T *, const int, const int, const void *, + const void *, const void *, const int, const void *, + const int, const void *, void *, const int, + const int ); + +void ATL_rtrmmRUC +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmRUC +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmRLC +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmRLC +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmRUT +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmRUT +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmRLT +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmRLT +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmRUN +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmRUN +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmRLN +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmRLN +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmLUC +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmLUC +( 
RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmLLC +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmLLC +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmLUT +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmLUT +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmLLT +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmLLT +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmLUN +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmLUN +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrmmLLN +( RC3_TRMM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); +void ATL_rtrsmLLN +( RC3_TRSM_T *, const int, const int, const void *, + const void *, const int, void *, const int, + const int ); + +#endif +/* + * End of atlas_rblas3.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_refalias1.h b/kaldi_io/src/tools/ATLAS/include/atlas_refalias1.h new file mode 100644 index 0000000..7dcac8a --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_refalias1.h @@ -0,0 +1,59 @@ +#ifndef ATLAS_REFALIAS1_H +#define ATLAS_REFALIAS1_H +/* + * Real BLAS + */ + #define ATL_dsdot ATL_dsrefdot + #define ATL_sdsdot ATL_sdsrefdot + #define ATL_sasum ATL_srefasum + #define ATL_snrm2 ATL_srefnrm2 + #define ATL_sdot ATL_srefdot + #define ATL_saxpy 
ATL_srefaxpy + #define ATL_scopy ATL_srefcopy + #define ATL_sscal ATL_srefscal + #define ATL_sswap ATL_srefswap + #define ATL_srotm ATL_srefrotm + #define ATL_srot ATL_srefrot + #define ATL_srotmg ATL_srefrotmg + #define ATL_srotg ATL_srefrotg + #define ATL_isamax ATL_isrefamax + + #define ATL_dasum ATL_drefasum + #define ATL_dnrm2 ATL_drefnrm2 + #define ATL_ddot ATL_drefdot + #define ATL_daxpy ATL_drefaxpy + #define ATL_dcopy ATL_drefcopy + #define ATL_dscal ATL_drefscal + #define ATL_dswap ATL_drefswap + #define ATL_drotm ATL_drefrotm + #define ATL_drot ATL_drefrot + #define ATL_drotmg ATL_drefrotmg + #define ATL_drotg ATL_drefrotg + #define ATL_idamax ATL_idrefamax + +/* + * Complex BLAS + */ + #define ATL_cdotc_sub ATL_crefdotc_sub + #define ATL_cdotu_sub ATL_crefdotu_sub + #define ATL_caxpy ATL_crefaxpy + #define ATL_ccopy ATL_crefcopy + #define ATL_cscal ATL_crefscal + #define ATL_cswap ATL_crefswap + #define ATL_icamax ATL_icrefamax + #define ATL_csscal ATL_csrefscal + #define ATL_scnrm2 ATL_screfnrm2 + #define ATL_scasum ATL_screfasum + + #define ATL_zdotc_sub ATL_zrefdotc_sub + #define ATL_zdotu_sub ATL_zrefdotu_sub + #define ATL_zaxpy ATL_zrefaxpy + #define ATL_zcopy ATL_zrefcopy + #define ATL_zscal ATL_zrefscal + #define ATL_zswap ATL_zrefswap + #define ATL_izamax ATL_izrefamax + #define ATL_zdscal ATL_zdrefscal + #define ATL_dznrm2 ATL_dzrefnrm2 + #define ATL_dzasum ATL_dzrefasum + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_refalias2.h b/kaldi_io/src/tools/ATLAS/include/atlas_refalias2.h new file mode 100644 index 0000000..5871491 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_refalias2.h @@ -0,0 +1,79 @@ +#ifndef ATLAS_REFALIAS2_H +#define ATLAS_REFALIAS2_H +/* + * Real BLAS + */ + #define ATL_sspr2 ATL_srefspr2 + #define ATL_ssyr2 ATL_srefsyr2 + #define ATL_sspr ATL_srefspr + #define ATL_ssyr ATL_srefsyr + #define ATL_sger ATL_srefger + #define ATL_stpsv ATL_sreftpsv + #define ATL_stbsv ATL_sreftbsv + #define ATL_strsv 
ATL_sreftrsv + #define ATL_stpmv ATL_sreftpmv + #define ATL_stbmv ATL_sreftbmv + #define ATL_strmv ATL_sreftrmv + #define ATL_sspmv ATL_srefspmv + #define ATL_ssbmv ATL_srefsbmv + #define ATL_ssymv ATL_srefsymv + #define ATL_sgbmv ATL_srefgbmv + #define ATL_sgemv ATL_srefgemv + + #define ATL_dspr2 ATL_drefspr2 + #define ATL_dsyr2 ATL_drefsyr2 + #define ATL_dspr ATL_drefspr + #define ATL_dsyr ATL_drefsyr + #define ATL_dger ATL_drefger + #define ATL_dtpsv ATL_dreftpsv + #define ATL_dtbsv ATL_dreftbsv + #define ATL_dtrsv ATL_dreftrsv + #define ATL_dtpmv ATL_dreftpmv + #define ATL_dtbmv ATL_dreftbmv + #define ATL_dtrmv ATL_dreftrmv + #define ATL_dspmv ATL_drefspmv + #define ATL_dsbmv ATL_drefsbmv + #define ATL_dsymv ATL_drefsymv + #define ATL_dgbmv ATL_drefgbmv + #define ATL_dgemv ATL_drefgemv + +/* + * Complex BLAS + */ + #define ATL_chpr2 ATL_crefhpr2 + #define ATL_cher2 ATL_crefher2 + #define ATL_chpr ATL_crefhpr + #define ATL_cher ATL_crefher + #define ATL_cgerc ATL_crefgerc + #define ATL_cgeru ATL_crefgeru + #define ATL_ctpsv ATL_creftpsv + #define ATL_ctbsv ATL_creftbsv + #define ATL_ctrsv ATL_creftrsv + #define ATL_ctpmv ATL_creftpmv + #define ATL_ctbmv ATL_creftbmv + #define ATL_ctrmv ATL_creftrmv + #define ATL_chpmv ATL_crefhpmv + #define ATL_chbmv ATL_crefhbmv + #define ATL_chemv ATL_crefhemv + #define ATL_cgbmv ATL_crefgbmv + #define ATL_cgemv ATL_crefgemv + + #define ATL_zhpr2 ATL_zrefhpr2 + #define ATL_zher2 ATL_zrefher2 + #define ATL_zhpr ATL_zrefhpr + #define ATL_zher ATL_zrefher + #define ATL_zgerc ATL_zrefgerc + #define ATL_zgeru ATL_zrefgeru + #define ATL_ztpsv ATL_zreftpsv + #define ATL_ztbsv ATL_zreftbsv + #define ATL_ztrsv ATL_zreftrsv + #define ATL_ztpmv ATL_zreftpmv + #define ATL_ztbmv ATL_zreftbmv + #define ATL_ztrmv ATL_zreftrmv + #define ATL_zhpmv ATL_zrefhpmv + #define ATL_zhbmv ATL_zrefhbmv + #define ATL_zhemv ATL_zrefhemv + #define ATL_zgbmv ATL_zrefgbmv + #define ATL_zgemv ATL_zrefgemv + +#endif diff --git 
a/kaldi_io/src/tools/ATLAS/include/atlas_refalias3.h b/kaldi_io/src/tools/ATLAS/include/atlas_refalias3.h new file mode 100644 index 0000000..f10e65c --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_refalias3.h @@ -0,0 +1,43 @@ +#ifndef ATLAS_REFALIAS3_H +#define ATLAS_REFALIAS3_H +/* + * Real BLAS + */ + #define ATL_strsm ATL_sreftrsm + #define ATL_strmm ATL_sreftrmm + #define ATL_ssyr2k ATL_srefsyr2k + #define ATL_ssyrk ATL_srefsyrk + #define ATL_ssymm ATL_srefsymm + #define ATL_sgemm ATL_srefgemm + + #define ATL_dtrsm ATL_dreftrsm + #define ATL_dtrmm ATL_dreftrmm + #define ATL_dsyr2k ATL_drefsyr2k + #define ATL_dsyrk ATL_drefsyrk + #define ATL_dsymm ATL_drefsymm + #define ATL_dgemm ATL_drefgemm + +/* + * Complex BLAS + */ + #define ATL_ctrsm ATL_creftrsm + #define ATL_ctrmm ATL_creftrmm + #define ATL_cher2k ATL_crefher2k + #define ATL_csyr2k ATL_crefsyr2k + #define ATL_cherk ATL_crefherk + #define ATL_csyrk ATL_crefsyrk + #define ATL_chemm ATL_crefhemm + #define ATL_csymm ATL_crefsymm + #define ATL_cgemm ATL_crefgemm + + #define ATL_ztrsm ATL_zreftrsm + #define ATL_ztrmm ATL_zreftrmm + #define ATL_zher2k ATL_zrefher2k + #define ATL_zsyr2k ATL_zrefsyr2k + #define ATL_zherk ATL_zrefherk + #define ATL_zsyrk ATL_zrefsyrk + #define ATL_zhemm ATL_zrefhemm + #define ATL_zsymm ATL_zrefsymm + #define ATL_zgemm ATL_zrefgemm + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflevel1.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel1.h new file mode 100644 index 0000000..2f79ac8 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel1.h @@ -0,0 +1,421 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. 
Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. + * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_REFLEVEL1_H +#define ATLAS_REFLEVEL1_H +/* + * ===================================================================== + * Prototypes for Level 1 Reference ATLAS BLAS routines + * ===================================================================== + */ +void ATL_srefrotg +( + float *, + float *, + float *, + float * +); + +void ATL_srefrotmg +( + float *, + float *, + float *, + const float, + float * +); + +float ATL_srefnrm2 +( + const int, + const float *, const int +); + +float ATL_srefasum +( + const int, + const float *, const int +); + +int ATL_isrefamax +( + const int, + const float *, const int +); + +void ATL_srefscal +( + const int, + const float, + float *, const int +); + +void ATL_srefswap +( + const int, + float *, const int, + float *, const int +); + +void ATL_srefcopy +( + const int, + const float *, const int, + float *, const int +); + +void ATL_srefaxpy +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_srefrot +( + const int, + float *, const int, + float *, const int, + const float, + const float +); + +void ATL_srefrotm +( + const int, + float *, const int, + float *, const int, + const float * +); + +float ATL_srefdot +( + const int, + const float *, const int, + const float *, const int +); + +float ATL_sdsrefdot +( + const int, + const float, + const float *, const int, + const float *, const int +); + +double ATL_dsrefdot +( + const int, + const float *, const int, + const float *, const int +); + +void ATL_drefrotg +( + double *, + double *, + double *, + double * +); + +void ATL_drefrotmg +( + double *, + double *, + double *, + const double, + double * +); + +double ATL_drefnrm2 +( + const int, + const double *, const int +); + +double ATL_drefasum +( + const int, + const double *, const int +); + +int ATL_idrefamax +( + const int, + const double *, const int +); + +void ATL_drefscal +( + const 
int, + const double, + double *, const int +); + +void ATL_drefswap +( + const int, + double *, const int, + double *, const int +); + +void ATL_drefcopy +( + const int, + const double *, const int, + double *, const int +); + +void ATL_drefaxpy +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_drefrot +( + const int, + double *, const int, + double *, const int, + const double, + const double +); + +void ATL_drefrotm +( + const int, + double *, const int, + double *, const int, + const double * +); + +double ATL_drefdot +( + const int, + const double *, const int, + const double *, const int +); + +void ATL_crefrotg +( + float *, + const float *, + float *, + float * +); + +float ATL_screfnrm2 +( + const int, + const float *, const int +); + +float ATL_screfasum +( + const int, + const float *, const int +); + +int ATL_icrefamax +( + const int, + const float *, const int +); + +void ATL_crefscal +( + const int, + const float *, + float *, const int +); + +void ATL_csrefscal +( + const int, + const float, + float *, const int +); + +void ATL_crefswap +( + const int, + float *, const int, + float *, const int +); + +void ATL_crefcopy +( + const int, + const float *, const int, + float *, const int +); + +void ATL_crefaxpy +( + const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_csrefrot +( + const int, + float *, const int, + float *, const int, + const float, + const float +); + +void ATL_crefdotc_sub +( + const int, + const float *, const int, + const float *, const int, + float * +); + +void ATL_crefdotu_sub +( + const int, + const float *, const int, + const float *, const int, + float * +); + +void ATL_zrefrotg +( + double *, + const double *, + double *, + double * +); + +double ATL_dzrefnrm2 +( + const int, + const double *, const int +); + +double ATL_dzrefasum +( + const int, + const double *, const int +); + +int ATL_izrefamax +( + const int, + const double *, const 
int +); + +void ATL_zrefscal +( + const int, + const double *, + double *, const int +); + +void ATL_zdrefscal +( + const int, + const double, + double *, const int +); + +void ATL_zrefswap +( + const int, + double *, const int, + double *, const int +); + +void ATL_zrefcopy +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zrefaxpy +( + const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zdrefrot +( + const int, + double *, const int, + double *, const int, + const double, + const double +); + +void ATL_zrefdotc_sub +( + const int, + const double *, const int, + const double *, const int, + double * +); + +void ATL_zrefdotu_sub +( + const int, + const double *, const int, + const double *, const int, + double * +); + +#endif +/* + * End of atlas_reflevel1.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflevel2.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel2.h new file mode 100644 index 0000000..6158d17 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel2.h @@ -0,0 +1,788 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. + * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_REFLEVEL2_H +#define ATLAS_REFLEVEL2_H + +#include "atlas_enum.h" +/* + * ===================================================================== + * Prototypes for Level 2 Reference ATLAS BLAS routines + * ===================================================================== + */ +void ATL_srefgbmv +( + const enum ATLAS_TRANS, + const int, const int, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgpmv +( + const enum ATLAS_UPLO, + const enum ATLAS_TRANS, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgemv +( + const enum ATLAS_TRANS, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgpr +( + const enum ATLAS_UPLO, + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_srefger +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_srefsbmv +( + const enum ATLAS_UPLO, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefspmv +( + const enum ATLAS_UPLO, + const int, + const float, + const float *, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefspr +( + const enum ATLAS_UPLO, + const int, + const float, + const float *, const int, + float * +); + +void ATL_srefspr2 +( + const enum ATLAS_UPLO, + const int, + const float, + const float *, const int, + const float *, const int, + float * +); + +void ATL_srefsymv +( + const enum ATLAS_UPLO, + const int, + const float, + const float *, 
const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyr +( + const enum ATLAS_UPLO, + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_srefsyr2 +( + const enum ATLAS_UPLO, + const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const float *, + float *, const int +); + +void ATL_sreftpsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const float *, + float *, const int +); + +void ATL_sreftrmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const float *, const int, + float *, const int +); + +void ATL_drefgbmv +( + const enum ATLAS_TRANS, + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgpmv +( + const enum ATLAS_UPLO, + const enum ATLAS_TRANS, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgemv +( + const enum ATLAS_TRANS, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgpr +( + const enum 
ATLAS_UPLO, + const int, const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_drefger +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_drefsbmv +( + const enum ATLAS_UPLO, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefspmv +( + const enum ATLAS_UPLO, + const int, + const double, + const double *, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefspr +( + const enum ATLAS_UPLO, + const int, + const double, + const double *, const int, + double * +); + +void ATL_drefspr2 +( + const enum ATLAS_UPLO, + const int, + const double, + const double *, const int, + const double *, const int, + double * +); + +void ATL_drefsymv +( + const enum ATLAS_UPLO, + const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyr +( + const enum ATLAS_UPLO, + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_drefsyr2 +( + const enum ATLAS_UPLO, + const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const double *, + double *, const int +); + +void ATL_dreftpsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + 
const double *, + double *, const int +); + +void ATL_dreftrmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const double *, const int, + double *, const int +); + +void ATL_crefgbmv +( + const enum ATLAS_TRANS, + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmv +( + const enum ATLAS_UPLO, + const enum ATLAS_TRANS, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemv +( + const enum ATLAS_TRANS, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgprc +( + const enum ATLAS_UPLO, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefgpru +( + const enum ATLAS_UPLO, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefgerc +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefgeru +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefhbmv +( + const enum ATLAS_UPLO, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhpmv +( + const enum ATLAS_UPLO, + const int, + const float *, + const float *, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhpr +( + 
const enum ATLAS_UPLO, + const int, + const float, + const float *, const int, + float * +); + +void ATL_crefhpr2 +( + const enum ATLAS_UPLO, + const int, + const float *, + const float *, const int, + const float *, const int, + float * +); + +void ATL_crefhemv +( + const enum ATLAS_UPLO, + const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefher +( + const enum ATLAS_UPLO, + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_crefher2 +( + const enum ATLAS_UPLO, + const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const float *, + float *, const int +); + +void ATL_creftpsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const float *, + float *, const int +); + +void ATL_creftrmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const float *, const int, + float *, const int +); + +void ATL_zrefgbmv +( + const enum ATLAS_TRANS, + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmv +( + const enum ATLAS_UPLO, + const enum ATLAS_TRANS, + const int, const int, + 
const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemv +( + const enum ATLAS_TRANS, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgprc +( + const enum ATLAS_UPLO, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefgpru +( + const enum ATLAS_UPLO, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefgerc +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefgeru +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefhbmv +( + const enum ATLAS_UPLO, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhpmv +( + const enum ATLAS_UPLO, + const int, + const double *, + const double *, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhpr +( + const enum ATLAS_UPLO, + const int, + const double, + const double *, const int, + double * +); + +void ATL_zrefhpr2 +( + const enum ATLAS_UPLO, + const int, + const double *, + const double *, const int, + const double *, const int, + double * +); + +void ATL_zrefhemv +( + const enum ATLAS_UPLO, + const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefher +( + const enum ATLAS_UPLO, + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_zrefher2 +( + const enum ATLAS_UPLO, + const int, + const double *, + 
const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const double *, + double *, const int +); + +void ATL_zreftpsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const double *, + double *, const int +); + +void ATL_zreftrmv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsv +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, + const double *, const int, + double *, const int +); + +#endif +/* + * End of atlas_reflevel2.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflevel3.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel3.h new file mode 100644 index 0000000..eba976b --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflevel3.h @@ -0,0 +1,374 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. 
+ * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. + * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_REFLEVEL3_H +#define ATLAS_REFLEVEL3_H + +#include "atlas_enum.h" +/* + * ===================================================================== + * Prototypes for Level 3 Reference ATLAS BLAS routines + * ===================================================================== + */ +void ATL_srefgemm +( + const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsymm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyrk +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyr2k +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sreftrmm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_drefgemm +( + const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsymm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, + const double, + const double *, const int, + const double 
*, const int, + const double, + double *, const int +); + +void ATL_drefsyrk +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyr2k +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dreftrmm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_crefgemm +( + const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhemm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefherk +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefher2k +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefsymm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyrk +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + 
const int, const int, + const float *, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyr2k +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_creftrmm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_zrefgemm +( + const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhemm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefherk +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefher2k +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefsymm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsyrk +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const double *, + const double *, const int, + const double *, + double *, const int 
+); + +void ATL_zrefsyr2k +( + const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zreftrmm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsm +( + const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +#endif +/* + * End of atlas_reflevel3.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflvl2.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflvl2.h new file mode 100644 index 0000000..c557f04 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflvl2.h @@ -0,0 +1,3184 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. + * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_REFLVL2_H +#define ATLAS_REFLVL2_H +/* + * ===================================================================== + * Prototypes for Level 2 Reference Internal ATLAS BLAS routines + * ===================================================================== + */ +void ATL_srefgbmvN +( + const int, const int, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgbmvT +( + const int, const int, + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgpmvUN +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgpmvUT +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgpmvLN +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgpmvLT +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgemvN +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgemvT +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgprL +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_srefgprU +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_srefsbmvL +( 
+ const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsbmvU +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefspmvL +( + const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefspmvU +( + const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsprL +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_srefsprU +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_srefspr2L +( + const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_srefspr2U +( + const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_srefsymvL +( + const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsymvU +( + const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyrL +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_srefsyrU +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_srefsyr2L +( + const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_srefsyr2U +( + const int, + const float, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmvLNN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmvLNU +( + const int, const 
int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmvLTN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmvLTU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmvUNN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmvUNU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmvUTN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbmvUTU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmvLNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmvLNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmvLTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmvLTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmvUNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmvUNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmvUTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpmvUTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrmvLNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrmvLNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrmvLTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrmvLTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrmvUNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrmvUNU +( + const int, + const float *, const int, + float *, const int +); + +void 
ATL_sreftrmvUTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrmvUTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsvLNN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsvLNU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsvLTN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsvLTU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsvUNN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsvUNU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsvUTN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftbsvUTU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpsvLNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpsvLNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpsvLTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpsvLTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpsvUNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpsvUNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpsvUTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftpsvUTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrsvLNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrsvLNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrsvLTN +( + const int, + const float *, 
const int, + float *, const int +); + +void ATL_sreftrsvLTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrsvUNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrsvUNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrsvUTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_sreftrsvUTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_drefgbmvN +( + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgbmvT +( + const int, const int, + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgpmvUN +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgpmvUT +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgpmvLN +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgpmvLT +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgemvN +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgemvT +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgprL +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + 
double *, const int +); + +void ATL_drefgprU +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_drefsbmvL +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsbmvU +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefspmvL +( + const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefspmvU +( + const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsprL +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_drefsprU +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_drefspr2L +( + const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_drefspr2U +( + const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_drefsymvL +( + const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsymvU +( + const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyrL +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_drefsyrU +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_drefsyr2L +( + const int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_drefsyr2U +( + const 
int, + const double, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmvLNN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmvLNU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmvLTN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmvLTU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmvUNN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmvUNU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmvUTN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbmvUTU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmvLNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmvLNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmvLTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmvLTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmvUNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmvUNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmvUTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpmvUTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrmvLNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrmvLNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrmvLTN +( + const int, + const double *, const int, + double *, const int +); + 
+void ATL_dreftrmvLTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrmvUNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrmvUNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrmvUTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrmvUTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsvLNN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsvLNU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsvLTN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsvLTU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsvUNN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsvUNU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsvUTN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftbsvUTU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpsvLNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpsvLNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpsvLTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpsvLTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpsvUNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpsvUNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftpsvUTN +( + const int, + const double *, const int, + double *, const int +); + +void 
ATL_dreftpsvUTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsvLNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsvLNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsvLTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsvLTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsvUNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsvUNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsvUTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_dreftrsvUTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_crefgbmvN +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgbmvT +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgbmvC +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgbmvH +( + const int, const int, + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmvUN +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmvUT +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmvUC +( + const int, const int, + const float *, + 
const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmvUH +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmvLN +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmvLT +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmvLC +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgpmvLH +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemvN +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemvT +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemvC +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemvH +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgprcL +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefgprcU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefgpruL +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + 
float *, const int +); + +void ATL_crefgpruU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefhbmvL +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhbmvU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhpmvL +( + const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhpmvU +( + const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhprL +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_crefhprU +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_crefhpr2L +( + const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefhpr2U +( + const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefhemvL +( + const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhemvU +( + const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefherL +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_crefherU +( + const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_crefher2L +( + const int, + const float *, + const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_crefher2U +( + const int, + const float *, + 
const float *, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvLNN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvLNU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvLTN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvLTU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvLCN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvLCU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvLHN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvLHU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvUNN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvUNU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvUTN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvUTU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvUCN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvUCU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvUHN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbmvUHU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvLNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvLNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvLTN +( + const int, + const float *, const int, + 
float *, const int +); + +void ATL_creftpmvLTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvLCN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvLCU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvLHN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvLHU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvUNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvUNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvUTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvUTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvUCN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvUCU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvUHN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpmvUHU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvLNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvLNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvLTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvLTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvLCN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvLCU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvLHN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvLHU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvUNN 
+( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvUNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvUTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvUTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvUCN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvUCU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvUHN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrmvUHU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvLNN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvLNU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvLTN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvLTU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvLCN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvLCU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvLHN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvLHU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvUNN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvUNU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvUTN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvUTU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvUCN +( + 
const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvUCU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvUHN +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftbsvUHU +( + const int, const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvLNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvLNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvLTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvLTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvLCN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvLCU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvLHN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvLHU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvUNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvUNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvUTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvUTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvUCN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvUCU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvUHN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftpsvUHU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvLNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvLNU +( + 
const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvLTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvLTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvLCN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvLCU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvLHN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvLHU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvUNN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvUNU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvUTN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvUTU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvUCN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvUCU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvUHN +( + const int, + const float *, const int, + float *, const int +); + +void ATL_creftrsvUHU +( + const int, + const float *, const int, + float *, const int +); + +void ATL_zrefgbmvN +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgbmvT +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgbmvC +( + const int, const int, + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgbmvH +( + const int, const int, + 
const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmvUN +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmvUT +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmvUC +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmvUH +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmvLN +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmvLT +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmvLC +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgpmvLH +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemvN +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemvT +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemvC +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, 
const int +); + +void ATL_zrefgemvH +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgprcL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefgprcU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefgpruL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefgpruU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefhbmvL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhbmvU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhpmvL +( + const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhpmvU +( + const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhprL +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_zrefhprU +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_zrefhpr2L +( + const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefhpr2U +( + const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefhemvL +( + const int, + const double *, + 
const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhemvU +( + const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefherL +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_zrefherU +( + const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_zrefher2L +( + const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zrefher2U +( + const int, + const double *, + const double *, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvLNN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvLNU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvLTN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvLTU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvLCN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvLCU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvLHN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvLHU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvUNN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvUNU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvUTN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvUTU +( + const int, const int, + const double *, const int, 
+ double *, const int +); + +void ATL_zreftbmvUCN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvUCU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvUHN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbmvUHU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvLNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvLNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvLTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvLTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvLCN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvLCU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvLHN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvLHU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvUNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvUNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvUTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvUTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvUCN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvUCU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvUHN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpmvUHU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvLNN +( + 
const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvLNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvLTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvLTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvLCN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvLCU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvLHN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvLHU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvUNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvUNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvUTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvUTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvUCN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvUCU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvUHN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrmvUHU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvLNN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvLNU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvLTN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvLTU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvLCN +( + const int, const int, + const double *, const int, + 
double *, const int +); + +void ATL_zreftbsvLCU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvLHN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvLHU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvUNN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvUNU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvUTN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvUTU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvUCN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvUCU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvUHN +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftbsvUHU +( + const int, const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvLNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvLNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvLTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvLTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvLCN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvLCU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvLHN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvLHU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvUNN +( + const int, + const 
double *, const int, + double *, const int +); + +void ATL_zreftpsvUNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvUTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvUTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvUCN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvUCU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvUHN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftpsvUHU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvLNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvLNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvLTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvLTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvLCN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvLCU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvLHN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvLHU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvUNN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvUNU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvUTN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvUTU +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvUCN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvUCU +( + const int, + const 
double *, const int, + double *, const int +); + +void ATL_zreftrsvUHN +( + const int, + const double *, const int, + double *, const int +); + +void ATL_zreftrsvUHU +( + const int, + const double *, const int, + double *, const int +); + +#endif +/* + * End of atlas_reflvl2.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_reflvl3.h b/kaldi_io/src/tools/ATLAS/include/atlas_reflvl3.h new file mode 100644 index 0000000..0451ff9 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_reflvl3.h @@ -0,0 +1,2292 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. 
+ * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --------------------------------------------------------------------- + */ +#ifndef ATLAS_REFLVL3_H +#define ATLAS_REFLVL3_H +/* + * ===================================================================== + * Prototypes for Level 3 Reference Internal ATLAS BLAS routines + * ===================================================================== + */ +void ATL_srefgemmNN +( + const int, const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgemmNT +( + const int, const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgemmTN +( + const int, const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefgemmTT +( + const int, const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsymmLL +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const 
float, + float *, const int +); + +void ATL_srefsymmLU +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsymmRL +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsymmRU +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyrkLN +( + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyrkLT +( + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyrkUN +( + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyrkUT +( + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyr2kLN +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyr2kLT +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyr2kUN +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_srefsyr2kUT +( + const int, const int, + const float, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_sreftrmmLLNN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmLLNU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmLLTN +( + const int, const int, + const 
float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmLLTU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmLUNN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmLUNU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmLUTN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmLUTU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmRLNN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmRLNU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmRLTN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmRLTU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmRUNN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmRUNU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmRUTN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrmmRUTU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmLLNN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmLLNU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmLLTN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void 
ATL_sreftrsmLLTU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmLUNN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmLUNU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmLUTN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmLUTU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmRLNN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmRLNU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmRLTN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmRLTU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmRUNN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmRUNU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmRUTN +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_sreftrsmRUTU +( + const int, const int, + const float, + const float *, const int, + float *, const int +); + +void ATL_drefgemmNN +( + const int, const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgemmNT +( + const int, const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgemmTN +( + const int, const int, const int, + const double, + const double 
*, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefgemmTT +( + const int, const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsymmLL +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsymmLU +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsymmRL +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsymmRU +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyrkLN +( + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyrkLT +( + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyrkUN +( + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyrkUT +( + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyr2kLN +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyr2kLT +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyr2kUN +( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_drefsyr2kUT 
+( + const int, const int, + const double, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_dreftrmmLLNN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmLLNU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmLLTN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmLLTU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmLUNN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmLUNU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmLUTN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmLUTU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmRLNN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmRLNU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmRLTN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmRLTU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmRUNN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmRUNU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrmmRUTN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void 
ATL_dreftrmmRUTU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmLLNN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmLLNU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmLLTN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmLLTU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmLUNN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmLUNU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmLUTN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmLUTU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmRLNN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmRLNU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmRLTN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmRLTU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmRUNN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmRUNU +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmRUTN +( + const int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_dreftrsmRUTU +( + const 
int, const int, + const double, + const double *, const int, + double *, const int +); + +void ATL_crefgemmNN +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemmNT +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemmNC +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemmTN +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemmTT +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemmTC +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemmCN +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemmCT +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefgemmCC +( + const int, const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhemmLL +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhemmLU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int 
+); + +void ATL_crefhemmRL +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefhemmRU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefherkLN +( + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefherkLC +( + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefherkUN +( + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefherkUC +( + const int, const int, + const float, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefher2kLN +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefher2kLC +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefher2kUN +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefher2kUC +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float, + float *, const int +); + +void ATL_crefsymmLL +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsymmLU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsymmRL +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float 
*, const int +); + +void ATL_crefsymmRU +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyrkLN +( + const int, const int, + const float *, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyrkLT +( + const int, const int, + const float *, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyrkUN +( + const int, const int, + const float *, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyrkUT +( + const int, const int, + const float *, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyr2kLN +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyr2kLT +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyr2kUN +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_crefsyr2kUT +( + const int, const int, + const float *, + const float *, const int, + const float *, const int, + const float *, + float *, const int +); + +void ATL_creftrmmLLNN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLLNU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLLTN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLLTU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLLCN +( + const int, const int, + const float *, + const float *, const int, + float *, const int 
+); + +void ATL_creftrmmLLCU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLUNN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLUNU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLUTN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLUTU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLUCN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmLUCU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRLNN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRLNU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRLTN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRLTU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRLCN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRLCU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRUNN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRUNU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRUTN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRUTU +( + const int, 
const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRUCN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrmmRUCU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLLNN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLLNU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLLTN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLLTU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLLCN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLLCU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLUNN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLUNU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLUTN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLUTU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLUCN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmLUCU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRLNN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRLNU +( + const int, const int, + const float *, + const float *, 
const int, + float *, const int +); + +void ATL_creftrsmRLTN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRLTU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRLCN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRLCU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRUNN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRUNU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRUTN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRUTU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRUCN +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_creftrsmRUCU +( + const int, const int, + const float *, + const float *, const int, + float *, const int +); + +void ATL_zrefgemmNN +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemmNT +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemmNC +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemmTN +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void 
ATL_zrefgemmTT +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemmTC +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemmCN +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemmCT +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefgemmCC +( + const int, const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhemmLL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhemmLU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhemmRL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefhemmRU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefherkLN +( + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefherkLC +( + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefherkUN +( + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void 
ATL_zrefherkUC +( + const int, const int, + const double, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefher2kLN +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefher2kLC +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefher2kUN +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefher2kUC +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double, + double *, const int +); + +void ATL_zrefsymmLL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsymmLU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsymmRL +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsymmRU +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsyrkLN +( + const int, const int, + const double *, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsyrkLT +( + const int, const int, + const double *, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsyrkUN +( + const int, const int, + const double *, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsyrkUT +( + const int, const int, + const double *, + const double *, const 
int, + const double *, + double *, const int +); + +void ATL_zrefsyr2kLN +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsyr2kLT +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsyr2kUN +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zrefsyr2kUT +( + const int, const int, + const double *, + const double *, const int, + const double *, const int, + const double *, + double *, const int +); + +void ATL_zreftrmmLLNN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLLNU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLLTN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLLTU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLLCN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLLCU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLUNN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLUNU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLUTN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLUTU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void 
ATL_zreftrmmLUCN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmLUCU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRLNN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRLNU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRLTN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRLTU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRLCN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRLCU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRUNN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRUNU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRUTN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRUTU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRUCN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrmmRUCU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLLNN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLLNU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void 
ATL_zreftrsmLLTN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLLTU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLLCN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLLCU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLUNN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLUNU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLUTN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLUTU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLUCN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmLUCU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRLNN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRLNU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRLTN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRLTU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRLCN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRLCU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void 
ATL_zreftrsmRUNN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRUNU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRUTN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRUTU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRUCN +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +void ATL_zreftrsmRUCU +( + const int, const int, + const double *, + const double *, const int, + double *, const int +); + +#endif +/* + * End of atlas_reflvl3.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_refmisc.h b/kaldi_io/src/tools/ATLAS/include/atlas_refmisc.h new file mode 100644 index 0000000..d8b600e --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_refmisc.h @@ -0,0 +1,367 @@ +/* --------------------------------------------------------------------- + * + * -- Automatically Tuned Linear Algebra Software (ATLAS) + * (C) Copyright 2000 All Rights Reserved + * + * -- ATLAS routine -- Version 3.2 -- December 25, 2000 + * + * Author : Antoine P. Petitet + * Originally developed at the University of Tennessee, + * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA. + * + * --------------------------------------------------------------------- + * + * -- Copyright notice and Licensing terms: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in + * the documentation and/or other materials provided with the distri- + * bution. + * 3. The name of the University, the ATLAS group, or the names of its + * contributors may not be used to endorse or promote products deri- + * ved from this software without specific written permission. + * + * -- Disclaimer: + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, + * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO- + * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN- + * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --------------------------------------------------------------------- + */ +#ifndef ATL_REFMISC_H +#define ATL_REFMISC_H +/* + * ===================================================================== + * Include files + * ===================================================================== + */ +#include <math.h> +#include "atlas_enum.h" +/* + * ===================================================================== + * #define macro constants + * ===================================================================== + */ +#define ATL_sNONE (-1.0f) +#define ATL_sNTWO (-2.0f) +#define ATL_sONE ( 1.0f) +#define ATL_sZERO ( 0.0f) + +#define ATL_dNONE (-1.0) +#define ATL_dNTWO (-2.0) +#define ATL_dONE ( 1.0) +#define ATL_dZERO ( 0.0) +/* + * ===================================================================== + * # macro functions + * ===================================================================== + */ +#define Msabs( a_ ) ( ( (a_) < ATL_sZERO ) ? -(a_) : (a_) ) + +#define Mszero( a_r_, a_i_ ) \ + ( ( (a_r_) == ATL_sZERO ) && ( (a_i_) == ATL_sZERO ) ) + +#define Msone( a_r_, a_i_ ) \ + ( ( (a_r_) == ATL_sONE ) && ( (a_i_) == ATL_sZERO ) ) + +#define Msscl( a_r_, a_i_, c_r_, c_i_ ) \ + { \ + register float tmp_r_, tmp_i_; \ + tmp_r_ = (a_r_) * c_r_ - (a_i_) * c_i_; \ + tmp_i_ = (a_r_) * c_i_ + (a_i_) * c_r_; \ + c_r_ = tmp_r_; \ + c_i_ = tmp_i_; \ + } +/* + * Msdiv performs complex division in real arithmetic + * a_r_ + i * a_i_ = ( a_r_ + i * a_i_ ) / ( b_r_ + i * b_i_ ); + * The algorithm is due to Robert L. Smith and can be found in D. 
Knuth, + * The Art of Computer Programming, Vol.2, p.195 + */ +#define Msdiv( b_r_, b_i_, a_r_, a_i_ ) \ + { \ + register float c_i_, c_r_, tmp1_, tmp2_; \ + if( Msabs( b_i_ ) < Msabs( b_r_ ) ) \ + { \ + tmp1_ = (b_i_) / (b_r_); \ + tmp2_ = (b_r_) + (b_i_) * tmp1_; \ + c_r_ = ( (a_r_) + (a_i_) * tmp1_ ) / tmp2_; \ + c_i_ = ( (a_i_) - (a_r_) * tmp1_ ) / tmp2_; \ + } \ + else \ + { \ + tmp1_ = (b_r_) / (b_i_); \ + tmp2_ = (b_i_) + (b_r_) * tmp1_; \ + c_r_ = ( (a_i_) + (a_r_) * tmp1_ ) / tmp2_; \ + c_i_ = ( -(a_r_) + (a_i_) * tmp1_ ) / tmp2_; \ + } \ + a_r_ = c_r_; \ + a_i_ = c_i_; \ + } + +#define Mdabs( a_ ) ( ( (a_) < ATL_dZERO ) ? -(a_) : (a_) ) + +#define Mdzero( a_r_, a_i_ ) \ + ( ( (a_r_) == ATL_dZERO ) && ( (a_i_) == ATL_dZERO ) ) + +#define Mdone( a_r_, a_i_ ) \ + ( ( (a_r_) == ATL_dONE ) && ( (a_i_) == ATL_dZERO ) ) + +#define Mdscl( a_r_, a_i_, c_r_, c_i_ ) \ + { \ + register double tmp_r_, tmp_i_; \ + tmp_r_ = (a_r_) * c_r_ - (a_i_) * c_i_; \ + tmp_i_ = (a_r_) * c_i_ + (a_i_) * c_r_; \ + c_r_ = tmp_r_; \ + c_i_ = tmp_i_; \ + } +/* + * Mddiv performs complex division in real arithmetic + * a_r_ + i * a_i_ = ( a_r_ + i * a_i_ ) / ( b_r_ + i * b_i_ ); + * The algorithm is due to Robert L. Smith and can be found in D. Knuth, + * The Art of Computer Programming, Vol.2, p.195 + */ +#define Mddiv( b_r_, b_i_, a_r_, a_i_ ) \ + { \ + register double c_i_, c_r_, tmp1_, tmp2_; \ + if( Mdabs( b_i_ ) < Mdabs( b_r_ ) ) \ + { \ + tmp1_ = (b_i_) / (b_r_); \ + tmp2_ = (b_r_) + (b_i_) * tmp1_; \ + c_r_ = ( (a_r_) + (a_i_) * tmp1_ ) / tmp2_; \ + c_i_ = ( (a_i_) - (a_r_) * tmp1_ ) / tmp2_; \ + } \ + else \ + { \ + tmp1_ = (b_r_) / (b_i_); \ + tmp2_ = (b_i_) + (b_r_) * tmp1_; \ + c_r_ = ( (a_i_) + (a_r_) * tmp1_ ) / tmp2_; \ + c_i_ = ( -(a_r_) + (a_i_) * tmp1_ ) / tmp2_; \ + } \ + a_r_ = c_r_; \ + a_i_ = c_i_; \ + } + +#define Mmin( a_, b_ ) ( ( (a_) < (b_) ) ? (a_) : (b_) ) + +#define Mmax( a_, b_ ) ( ( (a_) > (b_) ) ?
(a_) : (b_) ) + +#define Mmul( a_r_, a_i_, b_r_, b_i_, c_r_, c_i_ ) \ + { \ + c_r_ = (a_r_) * (b_r_) - (a_i_) * (b_i_); \ + c_i_ = (a_r_) * (b_i_) + (a_i_) * (b_r_); \ + } + +#define Mmla( a_r_, a_i_, b_r_, b_i_, c_r_, c_i_ ) \ + { \ + c_r_ += (a_r_) * (b_r_) - (a_i_) * (b_i_); \ + c_i_ += (a_r_) * (b_i_) + (a_i_) * (b_r_); \ + } + +#define Mmls( a_r_, a_i_, b_r_, b_i_, c_r_, c_i_ ) \ + { \ + c_r_ -= (a_r_) * (b_r_) - (a_i_) * (b_i_); \ + c_i_ -= (a_r_) * (b_i_) + (a_i_) * (b_r_); \ + } + +#define Mset( a_r_, a_i_, b_r_, b_i_ ) \ + { \ + b_r_ = (a_r_); \ + b_i_ = (a_i_); \ + } + +#define Mselscal( al_, a_ ) \ + { \ + if( (al_) == ATL_sZERO ) { (a_) = ATL_sZERO; } \ + else if( (al_) != ATL_sONE ) { (a_) *= (al_); } \ + } + +#define Mdelscal( al_, a_ ) \ + { \ + if( (al_) == ATL_dZERO ) { (a_) = ATL_dZERO; } \ + else if( (al_) != ATL_dONE ) { (a_) *= (al_); } \ + } + +#define Mcelscal( al_r_, al_i_, a_r_, a_i_ ) \ + { \ + if( Mszero( (al_r_), (al_i_) ) ) \ + { (a_r_) = (a_i_) = ATL_sZERO; } \ + else if( ! Msone( (al_r_), (al_i_) ) ) \ + { Msscl( (al_r_), (al_i_), (a_r_), (a_i_) ); } \ + } + +#define Mzelscal( al_r_, al_i_, a_r_, a_i_ ) \ + { \ + if( Mdzero( (al_r_), (al_i_) ) ) \ + { (a_r_) = (a_i_) = ATL_dZERO; } \ + else if( ! 
Mdone( (al_r_), (al_i_) ) ) \ + { Mdscl( (al_r_), (al_i_), (a_r_), (a_i_) ); } \ + } + +#define Msvscal( n_, al_, x_, incx_ ) \ + { \ + int i_, ix_; \ + if( (al_) == ATL_sZERO ) \ + { \ + for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx_) ) \ + { (x_)[ix_] = ATL_sZERO; } \ + } \ + else if( (al_) != ATL_sONE ) \ + { \ + for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx_) ) \ + { (x_)[ix_] *= (al_); } \ + } \ + } + +#define Mdvscal( n_, al_, x_, incx_ ) \ + { \ + int i_, ix_; \ + if( (al_) == ATL_dZERO ) \ + { \ + for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx_) ) \ + { (x_)[ix_] = ATL_dZERO; } \ + } \ + else if( (al_) != ATL_dONE ) \ + { \ + for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx_) ) \ + { (x_)[ix_] *= (al_); } \ + } \ + } + +#define Mcvscal( n_, al_, x_, incx_ ) \ + { \ + int i_, ix_, incx2_ = ( 2 * (incx_) ); \ + if( Mszero( (al_)[0], (al_)[1] ) ) \ + { \ + for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx2_) ) \ + { (x_)[ix_] = (x_)[ix_+1] = ATL_sZERO; } \ + } \ + else if( ! Msone( (al_)[0], (al_)[1] ) ) \ + { \ + for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx2_) ) \ + { Msscl( (al_)[0], (al_)[1], (x_)[ix_], (x_)[ix_+1] ); } \ + } \ + } + +#define Mzvscal( n_, al_, x_, incx_ ) \ + { \ + int i_, ix_, incx2_ = ( 2 * (incx_) ); \ + if( Mdzero( (al_)[0], (al_)[1] ) ) \ + { \ + for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx2_) ) \ + { (x_)[ix_] = (x_)[ix_+1] = ATL_dZERO; } \ + } \ + else if( ! 
Mdone( (al_)[0], (al_)[1] ) ) \ + { \ + for( i_ = 0, ix_ = 0; i_ < (n_); i_++, ix_ += (incx2_) ) \ + { Mdscl( (al_)[0], (al_)[1], (x_)[ix_], (x_)[ix_+1] ); } \ + } \ + } + +#define Msgescal( m_, n_, al_, a_, lda_ ) \ + { \ + int i_, iaij_, j_, jaj_; \ + if( (al_) == ATL_sZERO ) \ + { \ + for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += (lda_) ) \ + { \ + for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 1 ) \ + { (a_)[iaij_] = ATL_sZERO; } \ + } \ + } \ + else if( (al_) != ATL_sONE ) \ + { \ + for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += (lda_) ) \ + { \ + for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 1 ) \ + { (a_)[iaij_] *= (al_); } \ + } \ + } \ + } + +#define Mdgescal( m_, n_, al_, a_, lda_ ) \ + { \ + int i_, iaij_, j_, jaj_; \ + if( (al_) == ATL_dZERO ) \ + { \ + for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += (lda_) ) \ + { \ + for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 1 ) \ + { (a_)[iaij_] = ATL_dZERO; } \ + } \ + } \ + else if( (al_) != ATL_dONE ) \ + { \ + for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += (lda_) ) \ + { \ + for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 1 ) \ + { (a_)[iaij_] *= (al_); } \ + } \ + } \ + } + +#define Mcgescal( m_, n_, al_, a_, lda_ ) \ + { \ + int i_, iaij_, j_, jaj_, lda2_ = ( (lda_) << 1 ); \ + if( Mszero( (al_)[0], (al_)[1] ) ) \ + { \ + for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += lda2_ ) \ + { \ + for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 2 ) \ + { (a_)[iaij_] = (a_)[iaij_+1] = ATL_sZERO; } \ + } \ + } \ + else if( ! 
Msone( (al_)[0], (al_)[1] ) ) \ + { \ + for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += lda2_ ) \ + { \ + for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 2 ) \ + { \ + Msscl( (al_)[0], (al_)[1], (a_)[iaij_], (a_)[iaij_+1] ); \ + } \ + } \ + } \ + } + +#define Mzgescal( m_, n_, al_, a_, lda_ ) \ + { \ + int i_, iaij_, j_, jaj_, lda2_ = ( (lda_) << 1 ); \ + if( Mdzero( (al_)[0], (al_)[1] ) ) \ + { \ + for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += lda2_ ) \ + { \ + for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 2 ) \ + { (a_)[iaij_] = (a_)[iaij_+1] = ATL_dZERO; } \ + } \ + } \ + else if( ! Mdone( (al_)[0], (al_)[1] ) ) \ + { \ + for( j_ = 0, jaj_ = 0; j_ < (n_); j_++, jaj_ += lda2_ ) \ + { \ + for( i_ = 0, iaij_ = jaj_; i_ < (m_); i_++, iaij_ += 2 ) \ + { \ + Mdscl( (al_)[0], (al_)[1], (a_)[iaij_], (a_)[iaij_+1] ); \ + } \ + } \ + } \ + } + +#endif +/* + * End of atlas_refmisc.h + */ diff --git a/kaldi_io/src/tools/ATLAS/include/atlas_tst.h b/kaldi_io/src/tools/ATLAS/include/atlas_tst.h new file mode 100644 index 0000000..1ea5f5e --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/atlas_tst.h @@ -0,0 +1,909 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Code contributers : R. Clint Whaley, Antoine P. Petitet + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef ATLAS_TST_H + #define ATLAS_TST_H + +#include "atlas_enum.h" + +double time00(); +#ifndef UseCRand + void ATL_srand(int iseed); + int ATL_rand(void); + #define dumb_seed(iseed_) ATL_srand(iseed_) + #define dumb_rand() ( 0.5 - ((double)ATL_rand())/(2147483648.0) ) +#else + #define dumb_seed(iseed_) srand(iseed_) + #ifndef RAND_MAX /* rather dangerous non-ansi workaround */ + #define RAND_MAX ((unsigned long)(1<<30)) + #endif + #define dumb_rand() ( 0.5 - ((double)rand())/((double)RAND_MAX) ) +#endif + +void ATL_ststsqtran(const int N, float *A, const int lda); +void ATL_sgeprint + (char *mat, const int M, const int N, const float *A, const int lda); + +float ATL_sgediffnrm1 + (const int M, const int N, const float *A, const int lda, + const float *B, const int ldb); +float ATL_shediffnrm + (const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N, + const float *A0, const int ld0, const float *A1, const int ld1); +float ATL_sinfnrm(const int N, const float *X, const 
int incX); +float ATL_sgenrm1 + (const int M, const int N, const float *A, const int lda); +float ATL_strnrm1 + (const enum ATLAS_UPLO Upper, const enum ATLAS_DIAG Diag, const int N, + const float *A, const int lda); +float ATL_sgbnrm1 + (const int M, const int N, const int KL, const int KU, + const float *A, const int lda); +float ATL_stpnrm1 + (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, const int N, + const float *A); +float ATL_stbnrm1 + (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, + const int N, const int K, const float *A, const int LDA); +float ATL_ssynrm + (const enum ATLAS_UPLO UPLO, const int N, const float *A, const int LDA); +float ATL_shenrm + (const enum ATLAS_UPLO UPLO, const int N, const float *A, const int LDA); +float ATL_sspnrm + (const enum ATLAS_UPLO UPLO, const int N, const float *A); +float ATL_shpnrm + (const enum ATLAS_UPLO UPLO, const int N, const float *A); +float ATL_ssbnrm + (const enum ATLAS_UPLO UPLO, const int N, const int K, + const float *A, const int LDA); +float ATL_shbnrm + (const enum ATLAS_UPLO UPLO, const int N, const int K, + const float *A, const int LDA); + +void ATL_sgefillgap(const int M, const int N, float *A, const int lda0); +int ATL_sgechkgap(const int M0, const int N, float *A, const int lda0); +void ATL_strgen(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag, + const int N, float *A, const int lda, const int seed); +void ATL_sgegen(const int M0, const int N, float *A, const int lda, + const int seed); +float ATL_sepsilon(void); +void ATL_svdiff(const int N, const float *X, const int incX, + const float *Y, const int incY, float *Z, const int incZ); +void ATL_sgediff(const int M, const int N, const float *A, const int lda, + const float *B, const int ldb, float *C, const int ldc); +void ATL_dtstsqtran(const int N, double *A, const int lda); +void ATL_dgeprint + (char *mat, const int M, const int N, const double *A, const int lda); + +double ATL_dgediffnrm1 + (const int M, const int N, 
const double *A, const int lda, + const double *B, const int ldb); +double ATL_dhediffnrm + (const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N, + const double *A0, const int ld0, const double *A1, const int ld1); +double ATL_dinfnrm(const int N, const double *X, const int incX); +double ATL_dgenrm1 + (const int M, const int N, const double *A, const int lda); +double ATL_dtrnrm1 + (const enum ATLAS_UPLO Upper, const enum ATLAS_DIAG Diag, const int N, + const double *A, const int lda); +double ATL_dgbnrm1 + (const int M, const int N, const int KL, const int KU, + const double *A, const int lda); +double ATL_dtpnrm1 + (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, const int N, + const double *A); +double ATL_dtbnrm1 + (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, + const int N, const int K, const double *A, const int LDA); +double ATL_dsynrm + (const enum ATLAS_UPLO UPLO, const int N, const double *A, const int LDA); +double ATL_dhenrm + (const enum ATLAS_UPLO UPLO, const int N, const double *A, const int LDA); +double ATL_dspnrm + (const enum ATLAS_UPLO UPLO, const int N, const double *A); +double ATL_dhpnrm + (const enum ATLAS_UPLO UPLO, const int N, const double *A); +double ATL_dsbnrm + (const enum ATLAS_UPLO UPLO, const int N, const int K, + const double *A, const int LDA); +double ATL_dhbnrm + (const enum ATLAS_UPLO UPLO, const int N, const int K, + const double *A, const int LDA); + +void ATL_dgefillgap(const int M, const int N, double *A, const int lda0); +int ATL_dgechkgap(const int M0, const int N, double *A, const int lda0); +void ATL_dtrgen(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag, + const int N, double *A, const int lda, const int seed); +void ATL_dgegen(const int M0, const int N, double *A, const int lda, + const int seed); +double ATL_depsilon(void); +void ATL_dvdiff(const int N, const double *X, const int incX, + const double *Y, const int incY, double *Z, const int incZ); +void ATL_dgediff(const 
int M, const int N, const double *A, const int lda, + const double *B, const int ldb, double *C, const int ldc); +void ATL_ctstsqtran(const int N, float *A, const int lda); +void ATL_cgeprint + (char *mat, const int M, const int N, const float *A, const int lda); + +float ATL_cgediffnrm1 + (const int M, const int N, const float *A, const int lda, + const float *B, const int ldb); +float ATL_chediffnrm + (const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N, + const float *A0, const int ld0, const float *A1, const int ld1); +float ATL_cinfnrm(const int N, const float *X, const int incX); +float ATL_cgenrm1 + (const int M, const int N, const float *A, const int lda); +float ATL_ctrnrm1 + (const enum ATLAS_UPLO Upper, const enum ATLAS_DIAG Diag, const int N, + const float *A, const int lda); +float ATL_cgbnrm1 + (const int M, const int N, const int KL, const int KU, + const float *A, const int lda); +float ATL_ctpnrm1 + (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, const int N, + const float *A); +float ATL_ctbnrm1 + (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, + const int N, const int K, const float *A, const int LDA); +float ATL_csynrm + (const enum ATLAS_UPLO UPLO, const int N, const float *A, const int LDA); +float ATL_chenrm + (const enum ATLAS_UPLO UPLO, const int N, const float *A, const int LDA); +float ATL_cspnrm + (const enum ATLAS_UPLO UPLO, const int N, const float *A); +float ATL_chpnrm + (const enum ATLAS_UPLO UPLO, const int N, const float *A); +float ATL_csbnrm + (const enum ATLAS_UPLO UPLO, const int N, const int K, + const float *A, const int LDA); +float ATL_chbnrm + (const enum ATLAS_UPLO UPLO, const int N, const int K, + const float *A, const int LDA); + +void ATL_cgefillgap(const int M, const int N, float *A, const int lda0); +int ATL_cgechkgap(const int M0, const int N, float *A, const int lda0); +void ATL_ctrgen(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag, + const int N, float *A, const int 
lda, const int seed); +void ATL_cgegen(const int M0, const int N, float *A, const int lda, + const int seed); +float ATL_cepsilon(void); +void ATL_cvdiff(const int N, const float *X, const int incX, + const float *Y, const int incY, float *Z, const int incZ); +void ATL_cgediff(const int M, const int N, const float *A, const int lda, + const float *B, const int ldb, float *C, const int ldc); +void ATL_ztstsqtran(const int N, double *A, const int lda); +void ATL_zgeprint + (char *mat, const int M, const int N, const double *A, const int lda); + +double ATL_zgediffnrm1 + (const int M, const int N, const double *A, const int lda, + const double *B, const int ldb); +double ATL_zhediffnrm + (const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N, + const double *A0, const int ld0, const double *A1, const int ld1); +double ATL_zinfnrm(const int N, const double *X, const int incX); +double ATL_zgenrm1 + (const int M, const int N, const double *A, const int lda); +double ATL_ztrnrm1 + (const enum ATLAS_UPLO Upper, const enum ATLAS_DIAG Diag, const int N, + const double *A, const int lda); +double ATL_zgbnrm1 + (const int M, const int N, const int KL, const int KU, + const double *A, const int lda); +double ATL_ztpnrm1 + (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, const int N, + const double *A); +double ATL_ztbnrm1 + (const enum ATLAS_UPLO UPLO, const enum ATLAS_DIAG DIAG, + const int N, const int K, const double *A, const int LDA); +double ATL_zsynrm + (const enum ATLAS_UPLO UPLO, const int N, const double *A, const int LDA); +double ATL_zhenrm + (const enum ATLAS_UPLO UPLO, const int N, const double *A, const int LDA); +double ATL_zspnrm + (const enum ATLAS_UPLO UPLO, const int N, const double *A); +double ATL_zhpnrm + (const enum ATLAS_UPLO UPLO, const int N, const double *A); +double ATL_zsbnrm + (const enum ATLAS_UPLO UPLO, const int N, const int K, + const double *A, const int LDA); +double ATL_zhbnrm + (const enum ATLAS_UPLO UPLO, const 
int N, const int K, + const double *A, const int LDA); + +void ATL_zgefillgap(const int M, const int N, double *A, const int lda0); +int ATL_zgechkgap(const int M0, const int N, double *A, const int lda0); +void ATL_ztrgen(const enum ATLAS_UPLO Uplo, const enum ATLAS_DIAG Diag, + const int N, double *A, const int lda, const int seed); +void ATL_zgegen(const int M0, const int N, double *A, const int lda, + const int seed); +double ATL_zepsilon(void); +void ATL_zvdiff(const int N, const double *X, const int incX, + const double *Y, const int incY, double *Z, const int incZ); +void ATL_zgediff(const int M, const int N, const double *A, const int lda, + const double *B, const int ldb, double *C, const int ldc); + +/* + * Wrappers so that C can call F77 LAPACK + */ +int ATL_sf77getri + (const enum ATLAS_ORDER, const int, float*, const int, int*, + float*, int*); +int ATL_sf77getrf + (const enum ATLAS_ORDER, const int, const int, float*, const int, int*); +int ATL_sf77potrf(const enum ATLAS_UPLO, const int, float*, const int); +int ATL_sf77lauum(const enum ATLAS_UPLO, const int, float*, const int); +int ATL_sf77trtri(const enum ATLAS_UPLO, const enum ATLAS_DIAG, const int, + float*, const int); +int ATL_sf77posv(const enum ATLAS_UPLO, const int, const int, float*, const int, float*, const int); +int ATL_sf77gesv(const int, const int, float*, const int, int*, float*, const int); +int ATL_sf77gels(const enum ATLAS_TRANS, const int, const int, const int, float*, const int, float*, const int); +int ATL_df77getri + (const enum ATLAS_ORDER, const int, double*, const int, int*, + double*, int*); +int ATL_df77getrf + (const enum ATLAS_ORDER, const int, const int, double*, const int, int*); +int ATL_df77potrf(const enum ATLAS_UPLO, const int, double*, const int); +int ATL_df77lauum(const enum ATLAS_UPLO, const int, double*, const int); +int ATL_df77trtri(const enum ATLAS_UPLO, const enum ATLAS_DIAG, const int, + double*, const int); +int ATL_df77posv(const enum ATLAS_UPLO, const 
int, const int, double*, const int, double*, const int); +int ATL_df77gesv(const int, const int, double*, const int, int*, double*, const int); +int ATL_df77gels(const enum ATLAS_TRANS, const int, const int, const int, double*, const int, double*, const int); +int ATL_cf77getri + (const enum ATLAS_ORDER, const int, float*, const int, int*, + float*, int*); +int ATL_cf77getrf + (const enum ATLAS_ORDER, const int, const int, float*, const int, int*); +int ATL_cf77potrf(const enum ATLAS_UPLO, const int, float*, const int); +int ATL_cf77lauum(const enum ATLAS_UPLO, const int, float*, const int); +int ATL_cf77trtri(const enum ATLAS_UPLO, const enum ATLAS_DIAG, const int, + float*, const int); +int ATL_cf77posv(const enum ATLAS_UPLO, const int, const int, float*, const int, float*, const int); +int ATL_cf77gesv(const int, const int, float*, const int, int*, float*, const int); +int ATL_cf77gels(const enum ATLAS_TRANS, const int, const int, const int, float*, const int, float*, const int); +int ATL_zf77getri + (const enum ATLAS_ORDER, const int, double*, const int, int*, + double*, int*); +int ATL_zf77getrf + (const enum ATLAS_ORDER, const int, const int, double*, const int, int*); +int ATL_zf77potrf(const enum ATLAS_UPLO, const int, double*, const int); +int ATL_zf77lauum(const enum ATLAS_UPLO, const int, double*, const int); +int ATL_zf77trtri(const enum ATLAS_UPLO, const enum ATLAS_DIAG, const int, + double*, const int); +int ATL_zf77posv(const enum ATLAS_UPLO, const int, const int, double*, const int, double*, const int); +int ATL_zf77gesv(const int, const int, double*, const int, int*, double*, const int); +int ATL_zf77gels(const enum ATLAS_TRANS, const int, const int, const int, double*, const int, double*, const int); +/* + * ===================================================================== + * Prototypes for C-callable F77 interface to the Level 1 BLAS routines + * ===================================================================== + */ +void ATL_sf77rotg +( 
float *, float *, float *, float * ); +void ATL_df77rotg +( double *, double *, double *, double * ); +void ATL_cf77rotg +( float *, const float *, float *, float * ); +void ATL_zf77rotg +( double *, const double *, double *, double * ); + +void ATL_sf77rotmg +( float *, float *, float *, const float, + float * ); +void ATL_df77rotmg +( double *, double *, double *, const double, + double * ); + +float ATL_sf77nrm2 +( const int, const float *, const int ); +double ATL_df77nrm2 +( const int, const double *, const int ); +float ATL_scf77nrm2 +( const int, const float *, const int ); +double ATL_dzf77nrm2 +( const int, const double *, const int ); + +float ATL_sf77asum +( const int, const float *, const int ); +double ATL_df77asum +( const int, const double *, const int ); +float ATL_scf77asum +( const int, const float *, const int ); +double ATL_dzf77asum +( const int, const double *, const int ); + +int ATL_isf77amax +( const int, const float *, const int ); +int ATL_idf77amax +( const int, const double *, const int ); +int ATL_icf77amax +( const int, const float *, const int ); +int ATL_izf77amax +( const int, const double *, const int ); + +void ATL_sf77scal +( const int, const float, float *, const int ); +void ATL_df77scal +( const int, const double, double *, const int ); +void ATL_cf77scal +( const int, const float *, float *, const int ); +void ATL_zf77scal +( const int, const double *, double *, const int ); +void ATL_csf77scal +( const int, const float, float *, const int ); +void ATL_zdf77scal +( const int, const double, double *, const int ); + +void ATL_sf77set(const int, const float, float*, const int); +void ATL_df77set(const int, const double, double*, const int); +void ATL_cf77set(const int, const float*, float*, const int); +void ATL_zf77set(const int, const double*, double*, const int); +void ATL_sf77axpby + (const int, const float, const float*, const int, const float, + float*, const int); +void ATL_df77axpby + (const int, const double, const 
double*, const int, const double, + double*, const int); +void ATL_cf77axpby + (const int, const float*, const float*, const int, const float*, + float*, const int); +void ATL_zf77axpby + (const int, const double*, const double*, const int, const double*, + double*, const int); + +void ATL_sf77axpy +( const int, const float, const float *, const int, + float *, const int ); +void ATL_df77axpy +( const int, const double, const double *, const int, + double *, const int ); +void ATL_cf77axpy +( const int, const float *, const float *, const int, + float *, const int ); +void ATL_zf77axpy +( const int, const double *, const double *, const int, + double *, const int ); + +void ATL_sf77copy +( const int, const float *, const int, float *, + const int ); +void ATL_df77copy +( const int, const double *, const int, double *, + const int ); +void ATL_cf77copy +( const int, const float *, const int, float *, + const int ); +void ATL_zf77copy +( const int, const double *, const int, double *, + const int ); + +void ATL_sf77swap +( const int, float *, const int, float *, + const int ); +void ATL_df77swap +( const int, double *, const int, double *, + const int ); +void ATL_cf77swap +( const int, float *, const int, float *, + const int ); +void ATL_zf77swap +( const int, double *, const int, double *, + const int ); + +void ATL_sf77rot +( const int, float *, const int, float *, + const int, const float, const float ); +void ATL_df77rot +( const int, double *, const int, double *, + const int, const double, const double ); +void ATL_csf77rot +( const int, float *, const int, float *, + const int, const float, const float ); +void ATL_zdf77rot +( const int, double *, const int, double *, + const int, const double, const double ); + +void ATL_sf77rotm +( const int, float *, const int, float *, + const int, const float * ); +void ATL_df77rotm +( const int, double *, const int, double *, + const int, const double * ); + +float ATL_sf77dot +( const int, const float *, const int, 
const float *, + const int ); +double ATL_df77dot +( const int, const double *, const int, const double *, + const int ); +void ATL_cf77dotu_sub +( const int, const float *, const int, const float *, + const int, float * ); +void ATL_cf77dotc_sub +( const int, const float *, const int, const float *, + const int, float * ); +void ATL_zf77dotu_sub +( const int, const double *, const int, const double *, + const int, double * ); +void ATL_zf77dotc_sub +( const int, const double *, const int, const double *, + const int, double * ); + +float ATL_sdsf77dot +( const int, const float, const float *, const int, + const float *, const int ); +double ATL_dsf77dot +( const int, const float *, const int, const float *, + const int ); +/* + * ===================================================================== + * Prototypes for C-callable F77 interface to the Level 2 BLAS routines + * ===================================================================== + */ +void ATL_sf77gemv +( const enum ATLAS_TRANS, const int, const int, + const float, const float *, const int, const float *, + const int, const float, float *, const int ); +void ATL_df77gemv +( const enum ATLAS_TRANS, const int, const int, + const double, const double *, const int, const double *, + const int, const double, double *, const int ); +void ATL_cf77gemv +( const enum ATLAS_TRANS, const int, const int, + const float *, const float *, const int, const float *, + const int, const float *, float *, const int ); +void ATL_zf77gemv +( const enum ATLAS_TRANS, const int, const int, + const double *, const double *, const int, const double *, + const int, const double *, double *, const int ); + +void ATL_sf77gbmv +( const enum ATLAS_TRANS, const int, const int, + const int, const int, const float, const float *, + const int, const float *, const int, const float, + float *, const int ); +void ATL_df77gbmv +( const enum ATLAS_TRANS, const int, const int, + const int, const int, const double, const double *, + const 
int, const double *, const int, const double, + double *, const int ); +void ATL_cf77gbmv +( const enum ATLAS_TRANS, const int, const int, + const int, const int, const float *, const float *, + const int, const float *, const int, const float *, + float *, const int ); +void ATL_zf77gbmv +( const enum ATLAS_TRANS, const int, const int, + const int, const int, const double *, const double *, + const int, const double *, const int, const double *, + double *, const int ); + +void ATL_sf77trmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const float *, + const int, float *, const int ); +void ATL_df77trmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const double *, + const int, double *, const int ); +void ATL_cf77trmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const float *, + const int, float *, const int ); +void ATL_zf77trmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const double *, + const int, double *, const int ); + +void ATL_sf77tbmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const int, + const float *, const int, float *, const int ); +void ATL_df77tbmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const int, + const double *, const int, double *, const int ); +void ATL_cf77tbmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const int, + const float *, const int, float *, const int ); +void ATL_zf77tbmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const int, + const double *, const int, double *, const int ); + +void ATL_sf77tpmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const float *, + float *, const int ); +void ATL_df77tpmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum 
ATLAS_DIAG, const int, const double *, + double *, const int ); +void ATL_cf77tpmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const float *, + float *, const int ); +void ATL_zf77tpmv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const double *, + double *, const int ); + +void ATL_sf77trsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const float *, + const int, float *, const int ); +void ATL_df77trsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const double *, + const int, double *, const int ); +void ATL_cf77trsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const float *, + const int, float *, const int ); +void ATL_zf77trsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const double *, + const int, double *, const int ); + +void ATL_sf77tbsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const int, + const float *, const int, float *, const int ); +void ATL_df77tbsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const int, + const double *, const int, double *, const int ); +void ATL_cf77tbsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const int, + const float *, const int, float *, const int ); +void ATL_zf77tbsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const int, + const double *, const int, double *, const int ); + +void ATL_sf77tpsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const float *, + float *, const int ); +void ATL_df77tpsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const double *, + double *, const int ); +void ATL_cf77tpsv +( const enum ATLAS_UPLO, const enum 
ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const float *, + float *, const int ); +void ATL_zf77tpsv +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const enum ATLAS_DIAG, const int, const double *, + double *, const int ); + +void ATL_sf77symv +( const enum ATLAS_UPLO, const int, const float, + const float *, const int, const float *, const int, + const float, float *, const int ); +void ATL_df77symv +( const enum ATLAS_UPLO, const int, const double, + const double *, const int, const double *, const int, + const double, double *, const int ); + +void ATL_cf77hemv +( const enum ATLAS_UPLO, const int, const float *, + const float *, const int, const float *, const int, + const float *, float *, const int ); +void ATL_zf77hemv +( const enum ATLAS_UPLO, const int, const double *, + const double *, const int, const double *, const int, + const double *, double *, const int ); + +void ATL_sf77sbmv +( const enum ATLAS_UPLO, const int, const int, + const float, const float *, const int, const float *, + const int, const float, float *, const int ); +void ATL_df77sbmv +( const enum ATLAS_UPLO, const int, const int, + const double, const double *, const int, const double *, + const int, const double, double *, const int ); +void ATL_cf77hbmv +( const enum ATLAS_UPLO, const int, const int, + const float *, const float *, const int, const float *, + const int, const float *, float *, const int ); +void ATL_zf77hbmv +( const enum ATLAS_UPLO, const int, const int, + const double *, const double *, const int, const double *, + const int, const double *, double *, const int ); + +void ATL_sf77spmv +( const enum ATLAS_UPLO, const int, const float, + const float *, const float *, const int, const float, + float *, const int ); +void ATL_df77spmv +( const enum ATLAS_UPLO, const int, const double, + const double *, const double *, const int, const double, + double *, const int ); +void ATL_cf77hpmv +( const enum ATLAS_UPLO, const int, const float *, + const float *, const 
float *, const int, const float *, + float *, const int ); +void ATL_zf77hpmv +( const enum ATLAS_UPLO, const int, const double *, + const double *, const double *, const int, const double *, + double *, const int ); + +void ATL_sf77ger +( const int, const int, const float, const float *, + const int, const float *, const int, float *, + const int ); +void ATL_df77ger +( const int, const int, const double, const double *, + const int, const double *, const int, double *, + const int ); +void ATL_cf77gerc +( const int, const int, const float *, const float *, + const int, const float *, const int, float *, + const int ); +void ATL_cf77geru +( const int, const int, const float *, const float *, + const int, const float *, const int, float *, + const int ); +void ATL_zf77gerc +( const int, const int, const double *, const double *, + const int, const double *, const int, double *, + const int ); +void ATL_zf77geru +( const int, const int, const double *, const double *, + const int, const double *, const int, double *, + const int ); + +void ATL_sf77syr +( const enum ATLAS_UPLO, const int, const float, + const float *, const int, float *, const int ); +void ATL_df77syr +( const enum ATLAS_UPLO, const int, const double, + const double *, const int, double *, const int ); +void ATL_cf77her +( const enum ATLAS_UPLO, const int, const float, + const float *, const int, float *, const int ); +void ATL_zf77her +( const enum ATLAS_UPLO, const int, const double, + const double *, const int, double *, const int ); + +void ATL_sf77spr +( const enum ATLAS_UPLO, const int, const float, + const float *, const int, float * ); +void ATL_df77spr +( const enum ATLAS_UPLO, const int, const double, + const double *, const int, double * ); +void ATL_cf77hpr +( const enum ATLAS_UPLO, const int, const float, + const float *, const int, float * ); +void ATL_zf77hpr +( const enum ATLAS_UPLO, const int, const double, + const double *, const int, double * ); + +void ATL_sf77syr2 +( const enum 
ATLAS_UPLO, const int, const float, + const float *, const int, const float *, const int, + float *, const int ); +void ATL_df77syr2 +( const enum ATLAS_UPLO, const int, const double, + const double *, const int, const double *, const int, + double *, const int ); +void ATL_cf77her2 +( const enum ATLAS_UPLO, const int, const float *, + const float *, const int, const float *, const int, + float *, const int ); +void ATL_zf77her2 +( const enum ATLAS_UPLO, const int, const double *, + const double *, const int, const double *, const int, + double *, const int ); + +void ATL_sf77spr2 +( const enum ATLAS_UPLO, const int, const float, + const float *, const int, const float *, const int, + float * ); +void ATL_df77spr2 +( const enum ATLAS_UPLO, const int, const double, + const double *, const int, const double *, const int, + double * ); +void ATL_cf77hpr2 +( const enum ATLAS_UPLO, const int, const float *, + const float *, const int, const float *, const int, + float * ); +void ATL_zf77hpr2 +( const enum ATLAS_UPLO, const int, const double *, + const double *, const int, const double *, const int, + double * ); +/* + * ===================================================================== + * Prototypes for C-callable F77 interface to the Level 3 BLAS routines + * ===================================================================== + */ +void ATL_sf77gemm +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const float, + const float *, const int, const float *, const int, + const float, float *, const int ); +void ATL_df77gemm +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const double, + const double *, const int, const double *, const int, + const double, double *, const int ); +void ATL_cf77gemm +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const float *, + const float *, const int, const float *, const int, + const float *, float *, const int ); +void 
ATL_zf77gemm +( const enum ATLAS_TRANS, const enum ATLAS_TRANS, + const int, const int, const int, const double *, + const double *, const int, const double *, const int, + const double *, double *, const int ); + +void ATL_cf77hemm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const float *, const float *, + const int, const float *, const int, const float *, + float *, const int ); +void ATL_zf77hemm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const double *, const double *, + const int, const double *, const int, const double *, + double *, const int ); + +void ATL_cf77herk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float, const float *, + const int, const float, float *, const int ); +void ATL_zf77herk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double, const double *, + const int, const double, double *, const int ); + +void ATL_cf77her2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float *, const float *, + const int, const float *, const int, const float, + float *, const int ); +void ATL_zf77her2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double *, const double *, + const int, const double *, const int, const double, + double *, const int ); + +void ATL_sf77symm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const float, const float *, + const int, const float *, const int, const float, + float *, const int ); +void ATL_df77symm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const double, const double *, + const int, const double *, const int, const double, + double *, const int ); +void ATL_cf77symm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const float *, const float *, + const int, const float *, const int, const float *, + float *, const int ); +void ATL_zf77symm +( const 
enum ATLAS_SIDE, const enum ATLAS_UPLO, + const int, const int, const double *, const double *, + const int, const double *, const int, const double *, + double *, const int ); + +void ATL_sf77syrk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float, const float *, + const int, const float, float *, const int ); +void ATL_df77syrk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double, const double *, + const int, const double, double *, const int ); +void ATL_cf77syrk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float *, const float *, + const int, const float *, float *, const int ); +void ATL_zf77syrk +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double *, const double *, + const int, const double *, double *, const int ); + +void ATL_sf77syr2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float, const float *, + const int, const float *, const int, const float, + float *, const int ); +void ATL_df77syr2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double, const double *, + const int, const double *, const int, const double, + double *, const int ); +void ATL_cf77syr2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const float *, const float *, + const int, const float *, const int, const float *, + float *, const int ); +void ATL_zf77syr2k +( const enum ATLAS_UPLO, const enum ATLAS_TRANS, + const int, const int, const double *, const double *, + const int, const double *, const int, const double *, + double *, const int ); + +void ATL_sf77trmm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const float, const float *, + const int, float *, const int ); +void ATL_df77trmm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum 
ATLAS_DIAG, + const int, const int, const double, const double *, + const int, double *, const int ); +void ATL_cf77trmm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const float *, const float *, + const int, float *, const int ); +void ATL_zf77trmm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const double *, const double *, + const int, double *, const int ); + +void ATL_sf77trsm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const float, const float *, + const int, float *, const int ); +void ATL_df77trsm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const double, const double *, + const int, double *, const int ); +void ATL_cf77trsm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const float *, const float *, + const int, float *, const int ); +void ATL_zf77trsm +( const enum ATLAS_SIDE, const enum ATLAS_UPLO, + const enum ATLAS_TRANS, const enum ATLAS_DIAG, + const int, const int, const double *, const double *, + const int, double *, const int ); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/cblas.h b/kaldi_io/src/tools/ATLAS/include/cblas.h new file mode 100644 index 0000000..4087ffb --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/cblas.h @@ -0,0 +1,596 @@ +#ifndef CBLAS_H + +#ifndef CBLAS_ENUM_DEFINED_H + #define CBLAS_ENUM_DEFINED_H + enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; + enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, + AtlasConj=114}; + enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; + enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; + enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; +#endif + +#ifndef 
CBLAS_ENUM_ONLY +#define CBLAS_H +#define CBLAS_INDEX int + +int cblas_errprn(int ierr, int info, char *form, ...); + +/* + * =========================================================================== + * Prototypes for level 1 BLAS functions (complex are recast as routines) + * =========================================================================== + */ +float cblas_sdsdot(const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY); +double cblas_dsdot(const int N, const float *X, const int incX, const float *Y, + const int incY); +float cblas_sdot(const int N, const float *X, const int incX, + const float *Y, const int incY); +double cblas_ddot(const int N, const double *X, const int incX, + const double *Y, const int incY); +/* + * Functions having prefixes Z and C only + */ +void cblas_cdotu_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu); +void cblas_cdotc_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc); + +void cblas_zdotu_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu); +void cblas_zdotc_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc); + + +/* + * Functions having prefixes S D SC DZ + */ +float cblas_snrm2(const int N, const float *X, const int incX); +float cblas_sasum(const int N, const float *X, const int incX); + +double cblas_dnrm2(const int N, const double *X, const int incX); +double cblas_dasum(const int N, const double *X, const int incX); + +float cblas_scnrm2(const int N, const void *X, const int incX); +float cblas_scasum(const int N, const void *X, const int incX); + +double cblas_dznrm2(const int N, const void *X, const int incX); +double cblas_dzasum(const int N, const void *X, const int incX); + + +/* + * Functions having standard 4 prefixes (S D C Z) + */ +CBLAS_INDEX cblas_isamax(const int N, const float *X, const int 
incX); +CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX); +CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX); +CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX); + +/* + * =========================================================================== + * Prototypes for level 1 BLAS routines + * =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (s, d, c, z) + */ +void cblas_sswap(const int N, float *X, const int incX, + float *Y, const int incY); +void cblas_scopy(const int N, const float *X, const int incX, + float *Y, const int incY); +void cblas_saxpy(const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY); +void catlas_saxpby(const int N, const float alpha, const float *X, + const int incX, const float beta, float *Y, const int incY); +void catlas_sset + (const int N, const float alpha, float *X, const int incX); + +void cblas_dswap(const int N, double *X, const int incX, + double *Y, const int incY); +void cblas_dcopy(const int N, const double *X, const int incX, + double *Y, const int incY); +void cblas_daxpy(const int N, const double alpha, const double *X, + const int incX, double *Y, const int incY); +void catlas_daxpby(const int N, const double alpha, const double *X, + const int incX, const double beta, double *Y, const int incY); +void catlas_dset + (const int N, const double alpha, double *X, const int incX); + +void cblas_cswap(const int N, void *X, const int incX, + void *Y, const int incY); +void cblas_ccopy(const int N, const void *X, const int incX, + void *Y, const int incY); +void cblas_caxpy(const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY); +void catlas_caxpby(const int N, const void *alpha, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void catlas_cset + (const int N, const void *alpha, void *X, const int 
incX); + +void cblas_zswap(const int N, void *X, const int incX, + void *Y, const int incY); +void cblas_zcopy(const int N, const void *X, const int incX, + void *Y, const int incY); +void cblas_zaxpy(const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY); +void catlas_zaxpby(const int N, const void *alpha, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void catlas_zset + (const int N, const void *alpha, void *X, const int incX); + + +/* + * Routines with S and D prefix only + */ +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void cblas_srot(const int N, float *X, const int incX, + float *Y, const int incY, const float c, const float s); +void cblas_srotm(const int N, float *X, const int incX, + float *Y, const int incY, const float *P); + +void cblas_drotg(double *a, double *b, double *c, double *s); +void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); +void cblas_drot(const int N, double *X, const int incX, + double *Y, const int incY, const double c, const double s); +void cblas_drotm(const int N, double *X, const int incX, + double *Y, const int incY, const double *P); + + +/* + * Routines with S D C Z CS and ZD prefixes + */ +void cblas_sscal(const int N, const float alpha, float *X, const int incX); +void cblas_dscal(const int N, const double alpha, double *X, const int incX); +void cblas_cscal(const int N, const void *alpha, void *X, const int incX); +void cblas_zscal(const int N, const void *alpha, void *X, const int incX); +void cblas_csscal(const int N, const float alpha, void *X, const int incX); +void cblas_zdscal(const int N, const double alpha, void *X, const int incX); + +/* + * Extra reference routines provided by ATLAS, but not mandated by the standard + */ +void cblas_crotg(void *a, void *b, void *c, void *s); +void cblas_zrotg(void *a, void *b, void *c, void 
*s); +void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY, + const float c, const float s); +void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY, + const double c, const double s); + +/* + * =========================================================================== + * Prototypes for level 2 BLAS + * =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (S, D, C, Z) + */ +void cblas_sgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY); +void cblas_sgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const float alpha, + const float *A, const int lda, const float *X, + const int incX, const float beta, float *Y, const int incY); +void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *A, const int lda, + float *X, const int incX); +void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX); +void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX); +void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *A, const int lda, float *X, + const int incX); +void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const 
enum CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX); +void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX); + +void cblas_dgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY); +void cblas_dgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const double alpha, + const double *A, const int lda, const double *X, + const int incX, const double beta, double *Y, const int incY); +void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *A, const int lda, + double *X, const int incX); +void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX); +void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX); +void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *A, const int lda, double *X, + const int incX); +void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX); +void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum 
CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX); + +void cblas_cgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY); +void cblas_cgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const void *alpha, + const void *A, const int lda, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX); +void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); +void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX); +void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); + +void cblas_zgemv(const enum CBLAS_ORDER Order, + const enum 
CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY); +void cblas_zgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const void *alpha, + const void *A, const int lda, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX); +void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); +void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX); +void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ztpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); + + +/* + * Routines with S and D prefixes only + */ +void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int 
incY); +void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *Ap, + const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N, + const float alpha, const float *X, const int incX, + const float *Y, const int incY, float *A, const int lda); +void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *A, const int lda); +void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *Ap); +void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A, + const int lda); +void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A); + +void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const double alpha, const double *A, + const int lda, const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *Ap, + const double *X, const int incX, + const double 
beta, double *Y, const int incY); +void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N, + const double alpha, const double *X, const int incX, + const double *Y, const int incY, double *A, const int lda); +void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *A, const int lda); +void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *Ap); +void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A, + const int lda); +void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A); + + +/* + * Routines with C and Z prefixes only + */ +void cblas_chemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int 
incY, void *A, const int lda); +void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, const int incX, + void *A, const int lda); +void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, + const int incX, void *A); +void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *Ap); + +void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, const int incX, + void *A, const int lda); +void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO 
Uplo, + const int N, const double alpha, const void *X, + const int incX, void *A); +void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *Ap); + +/* + * =========================================================================== + * Prototypes for level 3 BLAS + * =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (S, D, C, Z) + */ +void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const float alpha, const float *A, + const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc); +void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc); +void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float beta, float *C, const int ldc); +void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc); +void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const 
float alpha, const float *A, const int lda, + float *B, const int ldb); +void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb); + +void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const double alpha, const double *A, + const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc); +void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc); +void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double beta, double *C, const int ldc); +void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc); +void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb); +void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb); + +void 
cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc); +void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc); +void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); +void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); + +void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc); +void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, 
const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc); +void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); +void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); + + +/* + * Routines with prefixes C and Z only + */ +void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const void *A, const int lda, + const float beta, void *C, const int ldc); +void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const float beta, + void *C, const int 
ldc); +void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const void *A, const int lda, + const double beta, void *C, const int ldc); +void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const double beta, + void *C, const int ldc); + +int cblas_errprn(int ierr, int info, char *form, ...); + +#endif /* end #ifdef CBLAS_ENUM_ONLY */ +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/cblas_test.h b/kaldi_io/src/tools/ATLAS/include/cblas_test.h new file mode 100644 index 0000000..b871a47 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/cblas_test.h @@ -0,0 +1,542 @@ +/* + * Added by R. 
Clint Whaley to make compatible with ATLAS + */ +#if defined(Add_) || defined(Add__) + #define ADD_ +#elif defined(NoChange) + #define NOCHANGE +#elif defined(UpCase) + #define UPCASE +#endif + +#ifdef ADD_ + #define F77_crotg crotgtest_ + #define F77_zrotg zrotgtest_ + #define F77_csrot csrottest_ + #define F77_zdrot zdrottest_ +#elif defined NOCHANGE + #define F77_crotg crotgtest + #define F77_zrotg zrotgtest + #define F77_csrot csrottest + #define F77_zdrot zdrottest +#elif defined UPCASE + #define F77_crotg CROTGTEST + #define F77_zrotg ZROTGTEST + #define F77_csrot CSROTTEST + #define F77_zdrot ZDROTTEST +#endif + + +/* + * cblas_test.h + * Written by Keita Teranishi + */ +#ifndef CBLAS_TEST_H +#define CBLAS_TEST_H +#include "cblas.h" + +#define TRUE 1 +#define PASSED 1 +#define TEST_ROW_MJR 1 + +#define FALSE 0 +#define FAILED 0 +#define TEST_COL_MJR 0 + +#define INVALID -1 +#define UNDEFINED -1 + +typedef struct { float real; float imag; } CBLAS_TEST_COMPLEX; +typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; + +#if defined(ADD_) + #define F77_xerbla xerbla_ +/* + * Level 1 BLAS + */ + #define F77_srotg srotgtest_ + #define F77_srotmg srotmgtest_ + #define F77_srot srottest_ + #define F77_srotm srotmtest_ + #define F77_drotg drotgtest_ + #define F77_drotmg drotmgtest_ + #define F77_drot drottest_ + #define F77_drotm drotmtest_ + #define F77_sswap sswaptest_ + #define F77_scopy scopytest_ + #define F77_saxpy saxpytest_ + #define F77_isamax isamaxtest_ + #define F77_dswap dswaptest_ + #define F77_dcopy dcopytest_ + #define F77_daxpy daxpytest_ + #define F77_idamax idamaxtest_ + #define F77_cswap cswaptest_ + #define F77_ccopy ccopytest_ + #define F77_caxpy caxpytest_ + #define F77_icamax icamaxtest_ + #define F77_zswap zswaptest_ + #define F77_zcopy zcopytest_ + #define F77_zaxpy zaxpytest_ + #define F77_izamax izamaxtest_ + #define F77_sdot sdottestsub_ + #define F77_ddot ddottestsub_ + #define F77_dsdot dsdottest_ + #define F77_sscal 
sscaltest_ + #define F77_dscal dscaltest_ + #define F77_cscal cscaltest_ + #define F77_zscal zscaltest_ + #define F77_csscal csscaltest_ + #define F77_zdscal zdscaltest_ + #define F77_cdotu cdotutest_ + #define F77_cdotc cdotctest_ + #define F77_zdotu zdotutest_ + #define F77_zdotc zdotctest_ + #define F77_snrm2 snrm2testsub_ + #define F77_sasum sasumtestsub_ + #define F77_dnrm2 dnrm2testsub_ + #define F77_dasum dasumtestsub_ + #define F77_scnrm2 scnrm2testsub_ + #define F77_scasum scasumtestsub_ + #define F77_dznrm2 dznrm2testsub_ + #define F77_dzasum dzasumtestsub_ + #define F77_sdsdot sdsdottest_ +/* + * Level 2 BLAS + */ + #define F77_s2chke cs2chke_ + #define F77_d2chke cd2chke_ + #define F77_c2chke cc2chke_ + #define F77_z2chke cz2chke_ + #define F77_ssymv cssymv_ + #define F77_ssbmv cssbmv_ + #define F77_sspmv csspmv_ + #define F77_sger csger_ + #define F77_ssyr cssyr_ + #define F77_sspr csspr_ + #define F77_ssyr2 cssyr2_ + #define F77_sspr2 csspr2_ + #define F77_dsymv cdsymv_ + #define F77_dsbmv cdsbmv_ + #define F77_dspmv cdspmv_ + #define F77_dger cdger_ + #define F77_dsyr cdsyr_ + #define F77_dspr cdspr_ + #define F77_dsyr2 cdsyr2_ + #define F77_dspr2 cdspr2_ + #define F77_chemv cchemv_ + #define F77_chbmv cchbmv_ + #define F77_chpmv cchpmv_ + #define F77_cgeru ccgeru_ + #define F77_cgerc ccgerc_ + #define F77_cher ccher_ + #define F77_chpr cchpr_ + #define F77_cher2 ccher2_ + #define F77_chpr2 cchpr2_ + #define F77_zhemv czhemv_ + #define F77_zhbmv czhbmv_ + #define F77_zhpmv czhpmv_ + #define F77_zgeru czgeru_ + #define F77_zgerc czgerc_ + #define F77_zher czher_ + #define F77_zhpr czhpr_ + #define F77_zher2 czher2_ + #define F77_zhpr2 czhpr2_ + #define F77_sgemv csgemv_ + #define F77_sgbmv csgbmv_ + #define F77_strmv cstrmv_ + #define F77_stbmv cstbmv_ + #define F77_stpmv cstpmv_ + #define F77_strsv cstrsv_ + #define F77_stbsv cstbsv_ + #define F77_stpsv cstpsv_ + #define F77_dgemv cdgemv_ + #define F77_dgbmv cdgbmv_ + #define F77_dtrmv cdtrmv_ + 
#define F77_dtbmv cdtbmv_ + #define F77_dtpmv cdtpmv_ + #define F77_dtrsv cdtrsv_ + #define F77_dtbsv cdtbsv_ + #define F77_dtpsv cdtpsv_ + #define F77_cgemv ccgemv_ + #define F77_cgbmv ccgbmv_ + #define F77_ctrmv cctrmv_ + #define F77_ctbmv cctbmv_ + #define F77_ctpmv cctpmv_ + #define F77_ctrsv cctrsv_ + #define F77_ctbsv cctbsv_ + #define F77_ctpsv cctpsv_ + #define F77_zgemv czgemv_ + #define F77_zgbmv czgbmv_ + #define F77_ztrmv cztrmv_ + #define F77_ztbmv cztbmv_ + #define F77_ztpmv cztpmv_ + #define F77_ztrsv cztrsv_ + #define F77_ztbsv cztbsv_ + #define F77_ztpsv cztpsv_ +/* + * Level 3 BLAS + */ + #define F77_s3chke cs3chke_ + #define F77_d3chke cd3chke_ + #define F77_c3chke cc3chke_ + #define F77_z3chke cz3chke_ + #define F77_chemm cchemm_ + #define F77_cherk ccherk_ + #define F77_cher2k ccher2k_ + #define F77_zhemm czhemm_ + #define F77_zherk czherk_ + #define F77_zher2k czher2k_ + #define F77_sgemm csgemm_ + #define F77_ssymm cssymm_ + #define F77_ssyrk cssyrk_ + #define F77_ssyr2k cssyr2k_ + #define F77_strmm cstrmm_ + #define F77_strsm cstrsm_ + #define F77_dgemm cdgemm_ + #define F77_dsymm cdsymm_ + #define F77_dsyrk cdsyrk_ + #define F77_dsyr2k cdsyr2k_ + #define F77_dtrmm cdtrmm_ + #define F77_dtrsm cdtrsm_ + #define F77_cgemm ccgemm_ + #define F77_csymm ccsymm_ + #define F77_csyrk ccsyrk_ + #define F77_csyr2k ccsyr2k_ + #define F77_ctrmm cctrmm_ + #define F77_ctrsm cctrsm_ + #define F77_zgemm czgemm_ + #define F77_zsymm czsymm_ + #define F77_zsyrk czsyrk_ + #define F77_zsyr2k czsyr2k_ + #define F77_ztrmm cztrmm_ + #define F77_ztrsm cztrsm_ +#elif defined(UPCASE) + #define F77_xerbla XERBLA +/* + * Level 1 BLAS + */ + #define F77_srotg SROTGTEST + #define F77_srotmg SROTMGTEST + #define F77_srot SROTTEST + #define F77_srotm SROTMTEST + #define F77_drotg DROTGTEST + #define F77_drotmg DROTMGTEST + #define F77_drot DROTTEST + #define F77_drotm DROTMTEST + #define F77_sswap SSWAPTEST + #define F77_scopy SCOPYTEST + #define F77_saxpy SAXPYTEST + 
#define F77_isamax ISAMAXTEST + #define F77_dswap DSWAPTEST + #define F77_dcopy DCOPYTEST + #define F77_daxpy DAXPYTEST + #define F77_idamax IDAMAXTEST + #define F77_cswap CSWAPTEST + #define F77_ccopy CCOPYTEST + #define F77_caxpy CAXPYTEST + #define F77_icamax ICAMAXTEST + #define F77_zswap ZSWAPTEST + #define F77_zcopy ZCOPYTEST + #define F77_zaxpy ZAXPYTEST + #define F77_izamax IZAMAXTEST + #define F77_sdot SDOTTESTSUB + #define F77_ddot DDOTTESTSUB + #define F77_dsdot DSDOTTEST + #define F77_sscal SSCALTEST + #define F77_dscal DSCALTEST + #define F77_cscal CSCALTEST + #define F77_zscal ZSCALTEST + #define F77_csscal CSSCALTEST + #define F77_zdscal ZDSCALTEST + #define F77_cdotu CDOTUTEST + #define F77_cdotc CDOTCTEST + #define F77_zdotu ZDOTUTEST + #define F77_zdotc ZDOTCTEST + #define F77_snrm2 SNRM2TESTSUB + #define F77_sasum SASUMTESTSUB + #define F77_dnrm2 DNRM2TESTSUB + #define F77_dasum DASUMTESTSUB + #define F77_scnrm2 SCNRM2TESTSUB + #define F77_scasum SCASUMTESTSUB + #define F77_dznrm2 DZNRM2TESTSUB + #define F77_dzasum DZASUMTESTSUB + #define F77_sdsdot SDSDOTTEST +/* + * Level 2 BLAS + */ + #define F77_s2chke CS2CHKE + #define F77_d2chke CD2CHKE + #define F77_c2chke CC2CHKE + #define F77_z2chke CZ2CHKE + #define F77_ssymv CSSYMV + #define F77_ssbmv CSSBMV + #define F77_sspmv CSSPMV + #define F77_sger CSGER + #define F77_ssyr CSSYR + #define F77_sspr CSSPR + #define F77_ssyr2 CSSYR2 + #define F77_sspr2 CSSPR2 + #define F77_dsymv CDSYMV + #define F77_dsbmv CDSBMV + #define F77_dspmv CDSPMV + #define F77_dger CDGER + #define F77_dsyr CDSYR + #define F77_dspr CDSPR + #define F77_dsyr2 CDSYR2 + #define F77_dspr2 CDSPR2 + #define F77_chemv CCHEMV + #define F77_chbmv CCHBMV + #define F77_chpmv CCHPMV + #define F77_cgeru CCGERU + #define F77_cgerc CCGERC + #define F77_cher CCHER + #define F77_chpr CCHPR + #define F77_cher2 CCHER2 + #define F77_chpr2 CCHPR2 + #define F77_zhemv CZHEMV + #define F77_zhbmv CZHBMV + #define F77_zhpmv CZHPMV + #define F77_zgeru 
CZGERU + #define F77_zgerc CZGERC + #define F77_zher CZHER + #define F77_zhpr CZHPR + #define F77_zher2 CZHER2 + #define F77_zhpr2 CZHPR2 + #define F77_sgemv CSGEMV + #define F77_sgbmv CSGBMV + #define F77_strmv CSTRMV + #define F77_stbmv CSTBMV + #define F77_stpmv CSTPMV + #define F77_strsv CSTRSV + #define F77_stbsv CSTBSV + #define F77_stpsv CSTPSV + #define F77_dgemv CDGEMV + #define F77_dgbmv CDGBMV + #define F77_dtrmv CDTRMV + #define F77_dtbmv CDTBMV + #define F77_dtpmv CDTPMV + #define F77_dtrsv CDTRSV + #define F77_dtbsv CDTBSV + #define F77_dtpsv CDTPSV + #define F77_cgemv CCGEMV + #define F77_cgbmv CCGBMV + #define F77_ctrmv CCTRMV + #define F77_ctbmv CCTBMV + #define F77_ctpmv CCTPMV + #define F77_ctrsv CCTRSV + #define F77_ctbsv CCTBSV + #define F77_ctpsv CCTPSV + #define F77_zgemv CZGEMV + #define F77_zgbmv CZGBMV + #define F77_ztrmv CZTRMV + #define F77_ztbmv CZTBMV + #define F77_ztpmv CZTPMV + #define F77_ztrsv CZTRSV + #define F77_ztbsv CZTBSV + #define F77_ztpsv CZTPSV +/* + * Level 3 BLAS + */ + #define F77_s3chke CS3CHKE + #define F77_d3chke CD3CHKE + #define F77_c3chke CC3CHKE + #define F77_z3chke CZ3CHKE + #define F77_chemm CCHEMM + #define F77_cherk CCHERK + #define F77_cher2k CCHER2K + #define F77_zhemm CZHEMM + #define F77_zherk CZHERK + #define F77_zher2k CZHER2K + #define F77_sgemm CSGEMM + #define F77_ssymm CSSYMM + #define F77_ssyrk CSSYRK + #define F77_ssyr2k CSSYR2K + #define F77_strmm CSTRMM + #define F77_strsm CSTRSM + #define F77_dgemm CDGEMM + #define F77_dsymm CDSYMM + #define F77_dsyrk CDSYRK + #define F77_dsyr2k CDSYR2K + #define F77_dtrmm CDTRMM + #define F77_dtrsm CDTRSM + #define F77_cgemm CCGEMM + #define F77_csymm CCSYMM + #define F77_csyrk CCSYRK + #define F77_csyr2k CCSYR2K + #define F77_ctrmm CCTRMM + #define F77_ctrsm CCTRSM + #define F77_zgemm CZGEMM + #define F77_zsymm CZSYMM + #define F77_zsyrk CZSYRK + #define F77_zsyr2k CZSYR2K + #define F77_ztrmm CZTRMM + #define F77_ztrsm CZTRSM +#elif defined(NOCHANGE) + 
#define F77_xerbla xerbla +/* + * Level 1 BLAS + */ + #define F77_srotg srotgtest + #define F77_srotmg srotmgtest + #define F77_srot srottest + #define F77_srotm srotmtest + #define F77_drotg drotgtest + #define F77_drotmg drotmgtest + #define F77_drot drottest + #define F77_drotm drotmtest + #define F77_sswap sswaptest + #define F77_scopy scopytest + #define F77_saxpy saxpytest + #define F77_isamax isamaxtest + #define F77_dswap dswaptest + #define F77_dcopy dcopytest + #define F77_daxpy daxpytest + #define F77_idamax idamaxtest + #define F77_cswap cswaptest + #define F77_ccopy ccopytest + #define F77_caxpy caxpytest + #define F77_icamax icamaxtest + #define F77_zswap zswaptest + #define F77_zcopy zcopytest + #define F77_zaxpy zaxpytest + #define F77_izamax izamaxtest + #define F77_sdot sdottestsub + #define F77_ddot ddottestsub + #define F77_dsdot dsdottest + #define F77_sscal sscaltest + #define F77_dscal dscaltest + #define F77_cscal cscaltest + #define F77_zscal zscaltest + #define F77_csscal csscaltest + #define F77_zdscal zdscaltest + #define F77_cdotu cdotutest + #define F77_cdotc cdotctest + #define F77_zdotu zdotutest + #define F77_zdotc zdotctest + #define F77_snrm2 snrm2testsub + #define F77_sasum sasumtestsub + #define F77_dnrm2 dnrm2testsub + #define F77_dasum dasumtestsub + #define F77_scnrm2 scnrm2testsub + #define F77_scasum scasumtestsub + #define F77_dznrm2 dznrm2testsub + #define F77_dzasum dzasumtestsub + #define F77_sdsdot sdsdottest +/* + * Level 2 BLAS + */ + #define F77_s2chke cs2chke + #define F77_d2chke cd2chke + #define F77_c2chke cc2chke + #define F77_z2chke cz2chke + #define F77_ssymv cssymv + #define F77_ssbmv cssbmv + #define F77_sspmv csspmv + #define F77_sger csger + #define F77_ssyr cssyr + #define F77_sspr csspr + #define F77_ssyr2 cssyr2 + #define F77_sspr2 csspr2 + #define F77_dsymv cdsymv + #define F77_dsbmv cdsbmv + #define F77_dspmv cdspmv + #define F77_dger cdger + #define F77_dsyr cdsyr + #define F77_dspr cdspr + #define 
F77_dsyr2 cdsyr2 + #define F77_dspr2 cdspr2 + #define F77_chemv cchemv + #define F77_chbmv cchbmv + #define F77_chpmv cchpmv + #define F77_cgeru ccgeru + #define F77_cgerc ccgerc + #define F77_cher ccher + #define F77_chpr cchpr + #define F77_cher2 ccher2 + #define F77_chpr2 cchpr2 + #define F77_zhemv czhemv + #define F77_zhbmv czhbmv + #define F77_zhpmv czhpmv + #define F77_zgeru czgeru + #define F77_zgerc czgerc + #define F77_zher czher + #define F77_zhpr czhpr + #define F77_zher2 czher2 + #define F77_zhpr2 czhpr2 + #define F77_sgemv csgemv + #define F77_sgbmv csgbmv + #define F77_strmv cstrmv + #define F77_stbmv cstbmv + #define F77_stpmv cstpmv + #define F77_strsv cstrsv + #define F77_stbsv cstbsv + #define F77_stpsv cstpsv + #define F77_dgemv cdgemv + #define F77_dgbmv cdgbmv + #define F77_dtrmv cdtrmv + #define F77_dtbmv cdtbmv + #define F77_dtpmv cdtpmv + #define F77_dtrsv cdtrsv + #define F77_dtbsv cdtbsv + #define F77_dtpsv cdtpsv + #define F77_cgemv ccgemv + #define F77_cgbmv ccgbmv + #define F77_ctrmv cctrmv + #define F77_ctbmv cctbmv + #define F77_ctpmv cctpmv + #define F77_ctrsv cctrsv + #define F77_ctbsv cctbsv + #define F77_ctpsv cctpsv + #define F77_zgemv czgemv + #define F77_zgbmv czgbmv + #define F77_ztrmv cztrmv + #define F77_ztbmv cztbmv + #define F77_ztpmv cztpmv + #define F77_ztrsv cztrsv + #define F77_ztbsv cztbsv + #define F77_ztpsv cztpsv +/* + * Level 3 BLAS + */ + #define F77_s3chke cs3chke + #define F77_d3chke cd3chke + #define F77_c3chke cc3chke + #define F77_z3chke cz3chke + #define F77_chemm cchemm + #define F77_cherk ccherk + #define F77_cher2k ccher2k + #define F77_zhemm czhemm + #define F77_zherk czherk + #define F77_zher2k czher2k + #define F77_sgemm csgemm + #define F77_ssymm cssymm + #define F77_ssyrk cssyrk + #define F77_ssyr2k cssyr2k + #define F77_strmm cstrmm + #define F77_strsm cstrsm + #define F77_dgemm cdgemm + #define F77_dsymm cdsymm + #define F77_dsyrk cdsyrk + #define F77_dsyr2k cdsyr2k + #define F77_dtrmm cdtrmm + 
#define F77_dtrsm cdtrsm + #define F77_cgemm ccgemm + #define F77_csymm ccsymm + #define F77_csyrk ccsyrk + #define F77_csyr2k ccsyr2k + #define F77_ctrmm cctrmm + #define F77_ctrsm cctrsm + #define F77_zgemm czgemm + #define F77_zsymm czsymm + #define F77_zsyrk czsyrk + #define F77_zsyr2k czsyr2k + #define F77_ztrmm cztrmm + #define F77_ztrsm cztrsm +#endif + +void get_transpose_type(char *type, enum CBLAS_TRANSPOSE *trans); +void get_uplo_type(char *type, enum CBLAS_UPLO *uplo); +void get_diag_type(char *type, enum CBLAS_DIAG *diag); +void get_side_type(char *type, enum CBLAS_SIDE *side); + +#endif /* CBLAS_TEST_H */ diff --git a/kaldi_io/src/tools/ATLAS/include/clapack.h b/kaldi_io/src/tools/ATLAS/include/clapack.h new file mode 100644 index 0000000..c5dde3f --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/clapack.h @@ -0,0 +1,149 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef CLAPACK_H + +#define CLAPACK_H +#include "cblas.h" + +#ifndef ATLAS_ORDER + #define ATLAS_ORDER CBLAS_ORDER +#endif +#ifndef ATLAS_UPLO + #define ATLAS_UPLO CBLAS_UPLO +#endif +#ifndef ATLAS_DIAG + #define ATLAS_DIAG CBLAS_DIAG +#endif +int clapack_sgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS, + float *A, const int lda, int *ipiv, + float *B, const int ldb); +int clapack_sgetrf(const enum CBLAS_ORDER Order, const int M, const int N, + float *A, const int lda, int *ipiv); +int clapack_sgetrs + (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, + const int N, const int NRHS, const float *A, const int lda, + const int *ipiv, float *B, const int ldb); +int clapack_sgetri(const enum CBLAS_ORDER Order, const int N, float *A, + const int lda, const int *ipiv); +int clapack_sposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, const int NRHS, float *A, const int lda, + float *B, const int ldb); +int clapack_spotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, float *A, const int lda); +int clapack_spotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int NRHS, const float *A, const int lda, + float *B, const int ldb); +int clapack_spotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, float *A, const int lda); +int clapack_slauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO 
Uplo, + const int N, float *A, const int lda); +int clapack_strtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo, + const enum ATLAS_DIAG Diag,const int N, float *A, const int lda); + +int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS, + double *A, const int lda, int *ipiv, + double *B, const int ldb); +int clapack_dgetrf(const enum CBLAS_ORDER Order, const int M, const int N, + double *A, const int lda, int *ipiv); +int clapack_dgetrs + (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, + const int N, const int NRHS, const double *A, const int lda, + const int *ipiv, double *B, const int ldb); +int clapack_dgetri(const enum CBLAS_ORDER Order, const int N, double *A, + const int lda, const int *ipiv); +int clapack_dposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, const int NRHS, double *A, const int lda, + double *B, const int ldb); +int clapack_dpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, double *A, const int lda); +int clapack_dpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int NRHS, const double *A, const int lda, + double *B, const int ldb); +int clapack_dpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, double *A, const int lda); +int clapack_dlauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, double *A, const int lda); +int clapack_dtrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo, + const enum ATLAS_DIAG Diag,const int N, double *A, const int lda); + +int clapack_cgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS, + void *A, const int lda, int *ipiv, + void *B, const int ldb); +int clapack_cgetrf(const enum CBLAS_ORDER Order, const int M, const int N, + void *A, const int lda, int *ipiv); +int clapack_cgetrs + (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, + const int N, const int NRHS, const void *A, const 
int lda, + const int *ipiv, void *B, const int ldb); +int clapack_cgetri(const enum CBLAS_ORDER Order, const int N, void *A, + const int lda, const int *ipiv); +int clapack_cposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, const int NRHS, void *A, const int lda, + void *B, const int ldb); +int clapack_cpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, void *A, const int lda); +int clapack_cpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int NRHS, const void *A, const int lda, + void *B, const int ldb); +int clapack_cpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, void *A, const int lda); +int clapack_clauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, void *A, const int lda); +int clapack_ctrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo, + const enum ATLAS_DIAG Diag,const int N, void *A, const int lda); + +int clapack_zgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS, + void *A, const int lda, int *ipiv, + void *B, const int ldb); +int clapack_zgetrf(const enum CBLAS_ORDER Order, const int M, const int N, + void *A, const int lda, int *ipiv); +int clapack_zgetrs + (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans, + const int N, const int NRHS, const void *A, const int lda, + const int *ipiv, void *B, const int ldb); +int clapack_zgetri(const enum CBLAS_ORDER Order, const int N, void *A, + const int lda, const int *ipiv); +int clapack_zposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, const int NRHS, void *A, const int lda, + void *B, const int ldb); +int clapack_zpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, void *A, const int lda); +int clapack_zpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int NRHS, const void *A, const int lda, + void *B, const int ldb); +int 
clapack_zpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, void *A, const int lda); +int clapack_zlauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, + const int N, void *A, const int lda); +int clapack_ztrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo, + const enum ATLAS_DIAG Diag,const int N, void *A, const int lda); + +#endif diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h new file mode 100644 index 0000000..118d3de --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h @@ -0,0 +1,188 @@ +#ifdef GER +#undef NO_TRANSPOSE +#define NO_TRANSPOSE +#endif + + +#if NDPM > 4 +#error Max NDPM is 4 +#endif + +#if !defined(ATL_SSE1) && ( defined(SREAL) || defined(SCPLX) ) +#error This routine needs ATL_SSE1 defined +#endif + +#if !defined(ATL_SSE2) && ( defined(DREAL) || defined(DCPLX) ) +#error This routine needs ATL_SSE2 defined +#endif + +#include <stdio.h> +#include <stdlib.h> + +#include "camm_util.h" + +#ifndef GER +#if defined(BETAX) || defined(BETAXI0) +#include "camm_scale.h" +#endif +#endif + +#if NDPM >= 4 +#define EXT4 Mjoin(4dp,BLC) +#undef NDP +#define NDP 4 +#undef EXT +#define EXT EXT4 +#include "camm_dpa.h" +#endif + +#if NDPM >= 3 +#define EXT3 Mjoin(3dp,BLC) +#undef NDP +#define NDP 3 +#undef EXT +#define EXT EXT3 +#include "camm_dpa.h" +#endif + +#if NDPM >= 2 +#define EXT2 Mjoin(2dp,BLC) +#undef NDP +#define NDP 2 +#undef EXT +#define EXT EXT2 +#include "camm_dpa.h" +#endif + +#define EXT1 Mjoin(1dp,BLC) +#undef NDP +#define NDP 1 +#undef EXT +#define EXT EXT1 +#include "camm_dpa.h" + +#undef NDP +#define NDP NDPM +#undef EXT +#define EXT Mjoin(Mjoin(NDP,Mjoin(dp,BLC)),m) +#include "camm_dpa.h" + +#ifdef GER +#if defined(SCPLX) || defined(DCPLX) +#ifdef Conj_ +#define IM 1c +#else +#define IM 1u +#endif +#else +#define IM 1 +#endif + + +#define FN 
Mjoin(Mjoin(Mjoin(ATL_,PREC),Mjoin(ger,IM)),_a1_x1_yX) + +#undef MY_FUNCTION +#define MY_FUNCTION FN + +void +MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *c, + int cinc,const TYPE *b,int binc, + TYPE *a,int lda) { + +#else + + +#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),gemv),Mjoin(FEXT,Mjoin(_a1_x1_,Mjoin(BL,_y1)))) + +#undef MY_FUNCTION +#define MY_FUNCTION FN + +void +MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *a, + int lda,const TYPE *b,int binc, + const SCALAR beta,TYPE *c,int cinc) { + +#endif + + int i,mm,nn; + const TYPE *ae; +#ifdef NO_TRANSPOSE + int len=m,w=n; +#define zz b +#else + int len=n,w=m; +#define zz c +#endif + +#ifdef GER +#define zzinc binc +#else +#define zzinc 1 + + +#if defined(NO_TRANSPOSE) && defined(BETA0) + memset(c,0,m*sizeof(*c)); +#endif + +#if defined(BETAX) || defined(BETAXI0) +#if defined(SCPLX) || defined(DCPLX) + SCALE(beta,c,m); +#endif +#if defined(SREAL) || defined(DREAL) + SCALE(&beta,c,m); +#endif +#endif + +#endif + + ae=a+w*lda; + nn=STRIDE*lda; + + +#if NDPM == 1 + for (;a<ae;a+=lda,zz+=zzinc) + Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len); + +#else + + while (a+NDPM*nn<=ae) { + for (i=0;i<STRIDE;i++,a+=lda,zz+=zzinc) + Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len); + + a+=(NDPM-1)*nn; + zz+=(NDPM-1)*STRIDE*zzinc; + } + + for (i=0;a<ae && i<STRIDE;i++,a+=lda,zz+=zzinc) { + + mm=(ae-a)/nn; +#if STRIDE > 1 + if (((ae-a)/lda)%STRIDE) + mm++; +#endif + + if (mm == 1) + Mjoin(dp,EXT1)(a,nn,b,c,STRIDE*zzinc,len); + +#if ( NDPM == 2 && STRIDE > 1 ) || NDPM > 2 + else if (mm == 2) + Mjoin(dp,EXT2)(a,nn,b,c,STRIDE*zzinc,len); +#endif + +#if ( NDPM == 3 && STRIDE > 1 ) || NDPM > 3 + else if (mm == 3) + Mjoin(dp,EXT3)(a,nn,b,c,STRIDE*zzinc,len); +#endif + +#if ( NDPM == 4 && STRIDE > 1 ) || NDPM > 4 + else if (mm == 4) + Mjoin(dp,EXT4)(a,nn,b,c,STRIDE*zzinc,len); +#endif + + + } + +#endif + +} + diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext new file 
mode 100644 index 0000000..f7f9a0a --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext @@ -0,0 +1,39 @@ + +topd = /home/whaley/atlas3.8/AtlasBase +incs = -def topd /home/whaley/atlas3.8/AtlasBase \ + -def incd /home/whaley/atlas3.8/AtlasBase/Clint \ + -def BASEdir /home/whaley/atlas3.8/AtlasBase/Antoine/ \ + -def basd /home/whaley/atlas3.8/AtlasBase/Clint +ext = extract +extF = $(ext) -langF -lnlen71 -Remtblank -llwarn2 -LAPACK1 $(incs) +extC = $(ext) -langC -lnlen79 -Remtblank -llwarn2 $(incs) +extM = $(ext) -langM -lnlen79 -llwarn2 $(incs) + +default: all +force_build: +basd = /home/whaley/atlas3.8/AtlasBase/Clint +basdRCW = /home/whaley/atlas3.8/AtlasBase/Clint +basdAPP = /home/whaley/atlas3.8/AtlasBase/Antoine +incf = /home/whaley/atlas3.8/AtlasBase/gen.inc + +files = ATL_gemv_ger_SSE.h SSE3Dnow.h camm_dpa.h camm_pipe3.h camm_scale.h \ + camm_strat1.h camm_tpipe.h camm_util.h + +all : $(files) + +camm_strat1.h : $(topd)/kernel/CammMaguire/camm_strat1.h + cp $(topd)/kernel/CammMaguire/camm_strat1.h . +camm_tpipe.h : $(topd)/kernel/CammMaguire/camm_tpipe.h + cp $(topd)/kernel/CammMaguire/camm_tpipe.h . +camm_pipe3.h : $(topd)/kernel/CammMaguire/camm_pipe3.h + cp $(topd)/kernel/CammMaguire/camm_pipe3.h . +ATL_gemv_ger_SSE.h : $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h + cp $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h . +camm_util.h : $(topd)/kernel/CammMaguire/camm_util.h + cp $(topd)/kernel/CammMaguire/camm_util.h . +camm_scale.h : $(topd)/kernel/CammMaguire/camm_scale.h + cp $(topd)/kernel/CammMaguire/camm_scale.h . +camm_dpa.h : $(topd)/kernel/CammMaguire/camm_dpa.h + cp $(topd)/kernel/CammMaguire/camm_dpa.h . +SSE3Dnow.h : $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h + cp $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h . 
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h new file mode 100644 index 0000000..a783749 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h @@ -0,0 +1,709 @@ +#if !defined(ATL_GAS_x8632) && !defined(ATL_GAS_x8664) + #error "This kernel requires gas x86 assembler!" +#endif +#ifndef Mstr /* Added by RCW to make multiline macros work */ + #define Mstr2(m) # m + #define Mstr(m) Mstr2(m) +#endif +/* The meaning of the defined macros is as follows: + * VECLEN: The length of a singleprecision vector register + * vec_add: Add two single precision vectors. + * vec_mul: Multiply two single precision vectors. + * vec_mov: Moves data around + * vec_mov1: Load one element in a vector and zero all other entries! + * vec_splat: Load one element replicated in all positions in the vector. + * vec_load_apart: Load elements from different memory positions into a register. + * vec_sum: Sums a register. + * vec_store_one: Stores lowest element in vector to memory, no zero-extend! + * Meaning of suffixes is as follows: + * mr means memory to register + * rr means register to register + * rm means register to memory + * a means that instruction needs aligned data + * 1 means that the instruction only operates on the lowest element of the + * vector. + * + * The _1 instructions work under one important assumption: That you never mix + * them with regular instructions, e.g. loading into a register with a normal + * mov, and then using add_rr_1 will not work under 3dnow! since it is in + * reality a normal add. However, if using a mov_1 first, the upper part of + * the register will be zeroed, and it will therefore work. The _1 system is + * more robust under SSE, but other architectures might be implemented the + * same way as 3dnow! 
+ * + * RCW: I added the following functionality for SSE only (note that vw may + * be overwritten with intermediate results, but is not used as input, + * and that all input arrays may be overwritten with intermediate results. + * VL : vector length -1): + * vec_red(vd, vw) : vd[0] = sum(vd[0:VL]) + * vec_red2(v1, v2, vw) : v1[0] = sum(v1[0:VL]); v1[1] = sum(v2[0:VL]) + * vec_red4(v0, v1, v2, v3, vw1, vw2) : + * v0[0] = sum(v0[0:VL]); v0[1] = sum(v1[0:VL]) + * if type = double: + * v2[0] = sum(v2[0:VL]); v2[1] = sum(v3[0:VL]) + * else + * v0[2] = sum(v2[0:VL]); v0[3] = sum(v3[0:VL]) + * vec_zero(vd) : vd[0:VL] = 0.0 + */ + + +/* Things to try: + * Non-temporal stores + * Sequences of instructions instead of movups + * + * + * + * + */ + + + +#define gen_vec_rr(op,reg1,reg2) \ + __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \ + : /* nothing */ \ + : /* nothing */) + + +#define w(p) p + +#define nop() __asm__ __volatile__ ("nop") + +#define rep() __asm__ __volatile__ ("rep") + +#define align() __asm__ __volatile__ (".align 16") + + +#ifdef x87double + +#define st0 %%st(0) +#define st1 %%st(1) +#define st2 %%st(2) +#define st3 %%st(3) +#define st4 %%st(4) +#define st5 %%st(5) +#define st6 %%st(6) +#define st7 %%st(7) + + +#define gen_stack_rt(op,reg) \ + __asm__ __volatile__ (#op " " #reg \ + : /* nothing */ \ + : /* nothing */) + +#define gen_stack_tr(op,reg) \ + __asm__ __volatile__ (#op " %%st(0)," #reg \ + : \ + : ) + + +#define gen_stack_rr(op,reg1,reg2) \ + __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \ + : /* nothing */ \ + : /* nothing */) + +#define gen_stack_t(op) \ + __asm__ __volatile__ (#op \ + : /* nothing */ \ + : /* nothing */) + + +#define gen_stack_tm(op,mem) \ + __asm__ __volatile__ (#op " %0" \ + : "=m" (((mem)[0])) \ + : ) + +#define gen_stack_mt(op,mem) \ + __asm__ __volatile__ (#op " %0" \ + : \ + : "m" (((mem)[0]))) + + +#define stack_mov_mt_push(mem) gen_stack_mt(fldl,mem) + +#define stack_add_tr_pop(reg) gen_stack_tr(faddp,reg) +#define 
stack_add_mt(mem) gen_stack_mt(faddl,mem) + +#define stack_mul_tr(reg) gen_stack_tr(fmul,reg) +#define stack_mul_tr_pop(reg) gen_stack_tr(fmulp,reg) +#define stack_mul_mt(mem) gen_stack_mt(fmul,mem) + +#define stack_mov_tm_pop(mem) gen_stack_tm(fstpl,mem) + +#define stack_zero_push() gen_stack_t(fldz) + +#endif /* x87double */ + +#ifdef SSE + +/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to + * load/store from misaligned adresses using movups at a cost of some cycles. Loading + * using mul/add must always be aligned. Alignment is 16 bytes. + * No muladd. + */ + + + +#define gen_vec_mr(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, " #reg \ + : /* nothing */ \ + : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3]))) + + +#define gen_vec_rm(op,reg,mem) \ + __asm__ __volatile__ (#op " " #reg ", %0" \ + : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \ + : /* nothing */ ) + + + + +#define VECLEN 4 + +#define reg0 %%xmm0 +#define reg1 %%xmm1 +#define reg2 %%xmm2 +#define reg3 %%xmm3 +#define reg4 %%xmm4 +#define reg5 %%xmm5 +#define reg6 %%xmm6 +#define reg7 %%xmm7 +#ifdef ATL_GAS_x8664 + #define reg8 %%xmm8 + #define reg9 %%xmm9 + #define reg10 %%xmm10 + #define reg11 %%xmm11 + #define reg12 %%xmm12 + #define reg13 %%xmm13 + #define reg14 %%xmm14 + #define reg15 %%xmm15 +#endif + +#define vec_mov_mr(mem,reg) gen_vec_mr(movups,mem,reg) +#define vec_mov_rm(reg,mem) gen_vec_rm(movups,reg,mem) +#define vec_mov_mr_a(mem,reg) gen_vec_mr(movaps,mem,reg) +#define vec_mov_rm_a(reg,mem) gen_vec_rm(movaps,reg,mem) +#define vec_mov_rr(reg1,reg2) gen_vec_rr(movaps,reg1,reg2) + +#define vec_add_mr_a(mem,reg) gen_vec_mr(addps,mem,reg) +#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulps,mem,reg) + +#define vec_add_rr(mem,reg) gen_vec_rr(addps,mem,reg) +#define vec_mul_rr(mem,reg) gen_vec_rr(mulps,mem,reg) + +#define vec_mov_mr_1(mem,reg) gen_vec_mr(movss,mem,reg) +#define vec_mov_rm_1(reg,mem) 
gen_vec_rm(movss,reg,mem) +#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movss,reg1,reg2) + +#define vec_add_mr_1(mem,reg) gen_vec_mr(addss,mem,reg) +#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addss,reg1,reg2) + +#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulss,mem,reg) +#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulss,reg1,reg2) + +#define vec_unpack_low(reg1,reg2) gen_vec_rr(unpcklps,reg1,reg2) +#define vec_unpack_high(reg1,reg2) gen_vec_rr(unpckhps,reg1,reg2) +#define vec_shuffle(mode,reg1,reg2) vec_shuffle_wrap(mode,reg1,reg2) +#define vec_shuffle_wrap(mode,reg1,reg2) \ + __asm__ __volatile__ ("shufps " #mode ", " #reg1 ", " #reg2 \ + : /* nothing */\ + : /* nothing */) + +/* Hack! */ +/* To use this instruction be sure that register 7 is not in use!!! */ +/* It must be possible to reduce this sequence to only four instructions. + * please tell me how! */ +#define vec_sum(reg) vec_sum_wrap(reg) +#define vec_sum_wrap(reg) \ + __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\ + "addps " #reg ", %%xmm7\n"\ + "movaps %%xmm7, " #reg "\n"\ + "shufps $1, " #reg ", %%xmm7\n"\ + "addss %%xmm7, " #reg "\n"\ + : /* nothing */\ + : /* nothing */) + +/* RCW: added to safely replace vec_sum (vec reduce), and use SSE3 when avail */ +#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::) +#ifdef ATL_SSE3 + #define vec_red(vr, vwrk) \ + __asm__ __volatile__("haddps " Mstr(vr) ", " Mstr(vr) "\n"\ + "haddps " Mstr(vr) ", " Mstr(vr) "\n" ::) +/* + * haddps v1 v0 # v0 = {v1cd, v1ab, v0cd, v0ab} + * haddps v0 v0 # v0 = {v1abcd, v0abcd, v1abcd, v0abcd} + */ + #define vec_red2(v0, v1, vwork) \ + __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\ + "haddps " Mstr(v0) ", " Mstr(v0) "\n" ::) +/* + * haddps v1, v0 # v0 = {v1cd,v1ab,v0cd,v0ab} + * haddps v3, v2 # v2 = {v3cd,v3ab,v2cd,v2ab} + * haddps v2, v0 # v0 = {v3abcd,v2abcd,v1abcd, v0abcd} + */ + #define vec_red4(v0, v1, v2, v3, w0, w1) \ + __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\ + 
"haddps " Mstr(v3) ", " Mstr(v2) "\n"\ + "haddps " Mstr(v2) ", " Mstr(v0) "\n" ::) +#elif defined(ATL_SSE2) + #define vec_red(vr, vwrk) \ + __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\ + "pshufd $0xE5, " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\ + ::) +#else + #define vec_red(vr, vwrk) \ + __asm__ __volatile__ ("movhlps " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\ + "movaps " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "shufps $0xE5, " Mstr(vr) ", " Mstr(vr) "\n"\ + "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\ + ::) +#endif +#ifndef ATL_SSE3 /* codes that are the same for SSE2 and SSE1 */ +/* + # v0 = {v0d,v0c,v0b,v0a} + # v1 = {v1d,v1c,v1b,v1a} + movaps v0, vw # vw = {v0d,v0c,v0b,v0a} + unpacklps v1, v0 # v0 = {v1b,v0b,v1a,v0a} + unpackhps v1, vw # vw = {v1d,v0d,v1c,v0c} + addps vw, v0 # v0 = {v1bd,v0bd,v1ac,v0ac} + movhlps v0, vw # vw = {X , X,v1bd,v0bd} + addps vw, v0 # v0 = {X , X,v1abcd,v0abcd} +*/ + #define vec_red2(v0, v1, vw) \ + __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(vw) "\n"\ + "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\ + "unpckhps " Mstr(v1) ", " Mstr(vw) "\n"\ + "addps " Mstr(vw) ", " Mstr(v0) "\n"\ + "movhlps " Mstr(v0) ", " Mstr(vw) "\n"\ + "addps " Mstr(vw) ", " Mstr(v0) "\n"\ + ::) +/* + * movaps v0, w0 # w0 = {v0d, v0c, v0b, v0a} + * unpcklps v1, v0 # v0 = {v1b, v0b, v1a, v0a} + * movaps v2, w1 # w1 = {v2d, v2c, v2b, v2a} + * unpckhps v1, w0 # w0 = {v1d, v0d, v1c, v0c} + * unpcklps v3, v2 # v2 = {v3b, v2b, v3a, v2a} + * addps w0, v0 # v0 = {v1bd, v0bd, v1ac, v0ac} + * unpckhps v3, w1 # w1 = {v3d, v2d, v3c, v2c} + * movaps v0, w0 # w0 = {v1bd, v0bd, v1ac, v0ac} + * addps w1, v2 # v2 = {v3bd, v2bd, v3ac, v2ac} + * shufps $0x44,v2,v0 # v0 = {v3ac, v2ac, v1ac, v0ac} + * shufps $0xEE,v2,w0 # w0 = {v3bd, v2bd, v1bd, v0bd} + * addps w0, v0 # v0 = {v3abcd, v2abcd, v1abcd, v0abcd} + */ + #define vec_red4(v0, v1, v2, v3, w0, w1) \ + 
__asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(w0) "\n"\ + "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\ + "movaps " Mstr(v2) ", " Mstr(w1) "\n"\ + "unpckhps " Mstr(v1) ", " Mstr(w0) "\n"\ + "unpcklps " Mstr(v3) ", " Mstr(v2) "\n"\ + "addps " Mstr(w0) ", " Mstr(v0) "\n"\ + "unpckhps " Mstr(v3) ", " Mstr(w1) "\n"\ + "movaps " Mstr(v0) ", " Mstr(w0) "\n"\ + "addps " Mstr(w1) ", " Mstr(v2) "\n"\ + "shufps $0x44, " Mstr(v2) ", " Mstr(v0) "\n"\ + "shufps $0xEE, " Mstr(v2) ", " Mstr(w0) "\n"\ + "addps " Mstr(w0) ", " Mstr(v0) "\n"\ + ::) +#endif + +#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) +#define vec_splat_wrap(mem,reg) \ + __asm__ __volatile__ ("movss %0, " #reg "\n"\ + "unpcklps " #reg ", " #reg "\n"\ + "movlhps " #reg ", " #reg "\n"\ + : /* nothing */ \ + : "m" ((mem)[0])) + + +/* This instruction sequence appears courtesy of Camm Maguire. */ +#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) +#define vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) \ + __asm__ __volatile__ ("movaps " #reg0 "," #empty0 "\n"\ + "unpcklps " #reg1 "," #reg0 "\n"\ + "movaps " #reg2 "," #empty1 "\n"\ + "unpckhps " #reg1 "," #empty0 "\n"\ + "unpcklps " #reg3 "," #reg2 "\n"\ + "addps " #empty0 "," #reg0 "\n"\ + "unpckhps " #reg3 "," #empty1 "\n"\ + "movaps " #reg0 "," #regout "\n"\ + "addps " #empty1 "," #reg2 "\n"\ + "shufps $0x44," #reg2 "," #reg0 "\n"\ + "shufps $0xee," #reg2 "," #regout "\n"\ + "addps " #reg0 "," #regout "\n"\ + : /* nothing */ \ + : /* nothing */) + + + +typedef float vector[VECLEN]; + +#endif /* end ifdef SSE */ + + +#ifdef SSE2 + +/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to + * load/store from misaligned adresses using movups at a cost of some cycles. Loading + * using mul/add must always be aligned. Alignment is 16 bytes. + * No muladd. 
+ */ + + + +#define gen_vec_mr(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, " #reg \ + : /* nothing */ \ + : "m" (((mem)[0])), "m" (((mem)[1]))) + + +#define gen_vec_rm(op,reg,mem) \ + __asm__ __volatile__ (#op " " #reg ", %0" \ + : "=m" (((mem)[0])), "=m" (((mem)[1])) \ + : /* nothing */ ) + + + + +#define VECLEN 2 + +#define reg0 %%xmm0 +#define reg1 %%xmm1 +#define reg2 %%xmm2 +#define reg3 %%xmm3 +#define reg4 %%xmm4 +#define reg5 %%xmm5 +#define reg6 %%xmm6 +#define reg7 %%xmm7 +#ifdef ATL_GAS_x8664 + #define reg8 %%xmm8 + #define reg9 %%xmm9 + #define reg10 %%xmm10 + #define reg11 %%xmm11 + #define reg12 %%xmm12 + #define reg13 %%xmm13 + #define reg14 %%xmm14 + #define reg15 %%xmm15 +#endif + + +#define vec_mov_mr(mem,reg) gen_vec_mr(movupd,mem,reg) +#define vec_mov_rm(reg,mem) gen_vec_rm(movupd,reg,mem) +#define vec_mov_mr_a(mem,reg) gen_vec_mr(movapd,mem,reg) +#define vec_mov_rm_a(reg,mem) gen_vec_rm(movapd,reg,mem) +#define vec_mov_rr(reg1,reg2) gen_vec_rr(movapd,reg1,reg2) + +#define vec_add_mr_a(mem,reg) gen_vec_mr(addpd,mem,reg) +#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulpd,mem,reg) + +#define vec_add_rr(mem,reg) gen_vec_rr(addpd,mem,reg) +#define vec_mul_rr(mem,reg) gen_vec_rr(mulpd,mem,reg) + +#define vec_mov_mr_1(mem,reg) gen_vec_mr(movsd,mem,reg) +#define vec_mov_rm_1(reg,mem) gen_vec_rm(movsd,reg,mem) +#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movsd,reg1,reg2) + +#define vec_add_mr_1(mem,reg) gen_vec_mr(addsd,mem,reg) +#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addsd,reg1,reg2) + +#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulsd,mem,reg) +#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulsd,reg1,reg2) + +#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) +#define vec_splat_wrap(mem,reg) \ + __asm__ __volatile__ ("movsd %0, " #reg "\n"\ + "unpcklpd " #reg ", " #reg \ + : /* nothing */ \ + : "m" ((mem)[0])) + +/* Hack! */ +/* To use this instruction be sure that register 7 is not in use!!! 
*/ +#define vec_sum(reg) vec_sum_wrap(reg) +#define vec_sum_wrap(reg) \ + __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\ + "addpd %%xmm7, " #reg "\n"\ + : /* nothing */\ + : /* nothing */) +/* + * Added by RCW to improve performance and avoid xmm7 hack (replace vec_sum) + */ +#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::) +#ifdef ATL_SSE3 + #define vec_red(vr, vwrk) \ + __asm__ __volatile__("haddpd " Mstr(vr) ", " Mstr(vr) "\n" ::) + #define vec_red2(v0, v1, vw) \ + __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n" ::) + #define vec_red4(v0, v1, v2, v3, w0, w1) \ + __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n"\ + "haddpd " Mstr(v3) ", " Mstr(v2) "\n"\ + ::) +#else + #define vec_red(vr, vwrk) \ + __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "addsd " Mstr(vwrk) ", " Mstr(vr) "\n" ::) +/* + * movapd v0, vw # vw = {v0b, v0a} + * unpcklpd v1,v0 # v0 = {v1a, v0a} + * unpckhpd v1, vw # vw = {v1b, v0b} + * addpd vw, v0 # v0 = {v1ab,v0ab} + */ + #define vec_red2(v0, v1, vw) \ + __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(vw) "\n"\ + "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\ + "unpckhpd " Mstr(v1) ", " Mstr(vw) "\n"\ + "addpd " Mstr(vw) ", " Mstr(v0) "\n"\ + ::) +/* + * movapd v0, w0 # w0 = {v0b, v0a} + * movapd v2, w1 # w1 = {v2b, v2a} + * unpcklpd v1, v0 # v0 = {v1a, v0a} + * unpcklpd v3, v2 # v2 = {v3a, v2a} + * unpckhpd v1, w0 # w0 = {v1b, v0b} + * unpckhpd v3, w1 # w1 = {v3b, v2b} + * addpd w0, v0 # v0 = {v1ab, v0ab} + * addpd w1, v2 # v2 = {v3ab, v2ab} + */ + #define vec_red4(v0, v1, v2, v3, w0, w1) \ + __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(w0) "\n"\ + "movapd " Mstr(v2) ", " Mstr(w1) "\n"\ + "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\ + "unpcklpd " Mstr(v3) ", " Mstr(v2) "\n"\ + "unpckhpd " Mstr(v1) ", " Mstr(w0) "\n"\ + "unpckhpd " Mstr(v3) ", " Mstr(w1) "\n"\ + "addpd " Mstr(w0) ", " Mstr(v0) "\n"\ + "addpd " Mstr(w1) ", " Mstr(v2) "\n"\ + ::) +#endif + +#define 
vec_sum_full(reg1,reg2,empty1) vec_sum_full_wrap(reg1,reg2,empty1) +#define vec_sum_full_wrap(reg1,reg2,empty1) \ + __asm__ __volatile__ ("movhlps " #reg2 ", " #empty1 "\n"\ + "movlhps " #reg2 ", " #empty1 "\n"\ + "addpd " #empty1 ", " #reg1 "\n"\ + : /* nothing */\ + : /* nothing */) + + +typedef double vector[VECLEN]; + +#endif /* end ifdef SSE2 */ + + +#ifdef THREEDNOW + +/* Peculiarities of 3DNOW. Alignment is not an issue, + * all alignments are legal, however alignment gives a speed increase. + * The vec_acc instruction can be used to sum two registers at once more efficiently + * than a series of vec_sum and vec_store_one + * No muladd. + */ + + +/* gen_vec_mr / gen_vec_rm: emit "op mem, reg" / "op reg, mem"; both elements of mem are listed as asm operands, only %0 appears in the template. */ +#define gen_vec_mr(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, " #reg \ + : /* nothing */ \ + : "m" (((mem)[0])), "m" (((mem)[1]))) + +#define gen_vec_rm(op,reg,mem) \ + __asm__ __volatile__ (#op " " #reg ", %0" \ + : "=m" (((mem)[0])), "=m" (((mem)[1])) \ + : /* nothing */ ) + + + + +#define VECLEN 2 + +#define reg0 %%mm0 +#define reg1 %%mm1 +#define reg2 %%mm2 +#define reg3 %%mm3 +#define reg4 %%mm4 +#define reg5 %%mm5 +#define reg6 %%mm6 +#define reg7 %%mm7 + +#define vec_add_mr(mem,reg) gen_vec_mr(pfadd,mem,reg) +#define vec_mul_mr(mem,reg) gen_vec_mr(pfmul,mem,reg) +#define vec_mov_mr(mem,reg) gen_vec_mr(movq,mem,reg) +#define vec_mov_rm(reg,mem) gen_vec_rm(movq,reg,mem) +#define vec_add_rr(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2) +#define vec_mul_rr(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2) +#define vec_acc_rr(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2) +#define vec_mov_rr(reg1,reg2) gen_vec_rr(movq,reg1,reg2) + +#define vec_sum(reg) gen_vec_rr(pfacc,reg,reg) +#define vec_sum_full(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2) + +#define vec_mov_mr_1(mem,reg) gen_vec_mr(movd,mem,reg) +#define vec_mov_rm_1(reg,mem) gen_vec_rm(movd,reg,mem) +#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movd,reg1,reg2) + +#define vec_add_rr_1(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2) +#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2) + + 
+#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) +#define vec_splat_wrap(mem,reg) \ + __asm__ __volatile__ ("movd %0, " #reg "\n"\ + "punpckldq " #reg ", " #reg \ + : /* nothing */ \ + : "m" ((mem)[0])) + + +#define vec_load_apart(mem1,mem2,reg) vec_load_apart_wrap(mem1,mem2,reg) +#define vec_load_apart_wrap(mem1,mem2,reg) \ + __asm__ __volatile__ ("movd %0, " #reg "\n"\ + "punpckldq %1, " #reg \ + : /* nothing */ \ + : "m" ((mem1)[0]), "m" (((mem2)[0]))) + + +#define vec_zero(reg) gen_vec_rr(pxor,reg,reg) + +#define vec_enter() __asm__ __volatile__ ("femms") +#define vec_exit() __asm__ __volatile__ ("femms") + +#define align() __asm__ __volatile__ (".align 16") + + +typedef float vector[VECLEN]; + +#endif + + + + + +#ifdef ALTIVEC + +#define VECLEN 4 + +#define reg0 %%vr0 +#define reg1 %%vr1 +#define reg2 %%vr2 +#define reg3 %%vr3 +#define reg4 %%vr4 +#define reg5 %%vr5 +#define reg6 %%vr6 +#define reg7 %%vr7 +#define reg8 %%vr8 +#define reg9 %%vr9 +#define reg10 %%vr10 +#define reg11 %%vr11 +#define reg12 %%vr12 +#define reg13 %%vr13 +#define reg14 %%vr14 +#define reg15 %%vr15 +#define reg16 %%vr16 +#define reg17 %%vr17 +#define reg18 %%vr18 +#define reg19 %%vr19 +#define reg20 %%vr20 +#define reg21 %%vr21 +#define reg22 %%vr22 +#define reg23 %%vr23 +#define reg24 %%vr24 +#define reg25 %%vr25 +#define reg26 %%vr26 +#define reg27 %%vr27 +#define reg28 %%vr28 +#define reg29 %%vr29 +#define reg30 %%vr30 +#define reg31 %%vr31 + +#define gen_vec_mr(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, " #reg \ + : /* nothing */ \ + : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3]))) + + +#define gen_vec_rm(op,reg,mem) \ + __asm__ __volatile__ (#op " " #reg ", %0" \ + : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \ + : /* nothing */ ) + + +#define gen_alti3(op,reg1,reg2,regout) \ + __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout \ + : /* nothing */ \ + : /* nothing */) + +#define 
gen_alti_muladd(op,reg1,reg2,regout) \ + __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout ", " #regout \ + : /* nothing */ \ + : /* nothing */) + + + +/* Aligned vector load/store use lvx / stvx ("svx" is not an AltiVec mnemonic). NOTE(review): the operand layout emitted by gen_vec_mr/gen_vec_rm has not been checked against the lvx/stvx assembler syntax -- confirm before enabling this path. */ +#define vec_mov_mr_a(mem,reg) gen_vec_mr(lvx,mem,reg) +#define vec_mov_rm_a(reg,mem) gen_vec_rm(stvx,reg,mem) +#define vec_muladd(reg1,reg2,regout) gen_alti3(vmaddfp,reg1,reg2,regout) + +#define vec_zero(reg) gen_alti3(vxor,reg,reg,reg) + + +typedef float vector[VECLEN]; + +#endif + + +#ifdef ALTIVEC_C + +/* These macros have been written by, or greatly inspired by, + * Nicholas A. Coult . Thanks. + */ + +/* assumes that last four registers are not in use! */ +/* transpose: rewrites x0..x3 in place through vec_mergeh/vec_mergel, using reg28..reg31 as scratch; takes exactly four arguments. */ +#define transpose(x0,x1,x2,x3) \ +reg28 = vec_mergeh(x0,x2); \ +reg29 = vec_mergeh(x1,x3); \ +reg30 = vec_mergel(x0,x2); \ +reg31 = vec_mergel(x1,x3); \ +x0 = vec_mergeh(reg28,reg29); \ +x1 = vec_mergel(reg28,reg29); \ +x2 = vec_mergeh(reg30,reg31); \ +x3 = vec_mergel(reg30,reg31) + +#define vec_mov_rm(v, where) \ +low = vec_ld(0, (where)); \ +high = vec_ld(16, (where)); \ +p_vector = vec_lvsr(0, (int *)(where)); \ +mask = vec_perm((vector unsigned char)(0), (vector unsigned char)(-1), p_vector); \ +v = vec_perm(v, v, p_vector); \ +low = vec_sel(low, v, mask); \ +high = vec_sel(v, high, mask); \ +vec_st(low, 0, (where)); \ +vec_st(high, 16, (where)) + +#define vec_mov_mr_a(mem,reg) reg = vec_ld(0, mem) + +#define vec_mov_mr(u,v) \ +p_vector = (vector unsigned char)vec_lvsl(0, (int*)(v)); \ +low = (vector unsigned char)vec_ld(0, (v)); \ +high = (vector unsigned char)vec_ld(16, (v)); \ +u=(vector float)vec_perm(low, high, p_vector) + +#define vec_muladd(reg1,reg2,regout) regout = vec_madd(reg1,reg2,regout) +#define vec_add_rr(reg1,reg2) reg2 = vec_add(reg1,reg2) + +#define vec_zero(reg) reg = vec_xor(reg,reg) + +/* vec_sum_full: transpose reg0..reg3 in place, then add the four vectors into regout; empty0/empty1 receive the two partial sums. */ +#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) \ +transpose(reg0,reg1,reg2,reg3); \ +empty0 = vec_add(reg0,reg1); \ +empty1 = vec_add(reg2,reg3); \ +regout = vec_add(empty0,empty1) + + +#endif /* ALTIVEC_C 
*/ + + + + + + + + diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h new file mode 100644 index 0000000..af9c6b1 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h @@ -0,0 +1,1626 @@ +#include <stdlib.h> +#include <sys/time.h> +#include <stdio.h> + +#include "camm_util.h" + + +#if defined(ALIGN) +#if( defined(SCPLX) || defined(DCPLX)) +#error Cannot align complex routines +#endif +#if defined(SREAL) && ( NDPM != 1 ) && ( STRIDE % 4 != 0) +#error Can only align SREAL with NDPM 1 or STRIDE % 4 = 0 +#endif +#if defined(DREAL) && ( NDPM != 1 ) && ( STRIDE % 2 != 0) +#error Can only align DREAL with NDPM 1 or STRIDE % 2 = 0 +#endif +#endif + +/****************************************************************************** + * Single Precision Complex Macros + ******************************************************************************/ + +#ifdef SCPLX + +#ifdef NO_TRANSPOSE + +#if NDPM > 3 +#error Max NDPM is 3 for SCPLX NO_TRANSPOSE +#endif + +#undef plax +#define plax + +#undef R1 +#define R1 2 +#undef R2 +#define R2 4 +#undef R3 +#define R3 6 +#undef R4 +#define R4 6 + +#undef TREG +#define TREG 1 +#undef SREG +#define SREG 0 +#undef CREG +#define CREG 0 + +#ifdef GER +#undef AREG +#define AREG 0 +#undef targ +#define targ(a_) AREG +#undef wb +#define wb(a_,b_) pu(AREG,a_,b_) +#undef wbd +#define wbd(a_,b_) pud(AREG,a_,b_) +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#else +#undef AREG +#define AREG TREG +#undef targ +#define targ(a_) CREG +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef w +#define w(a_) pu(CREG,a_ ## 0,si) +#undef w1_2 +#define w1_2(a_) pud(CREG,a_ ## 0,si) +#endif + +#undef src +#define src(a_) a_ +#undef mpx +#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(4,si,P(a_,1)) \ + ps(0,P(a_,1),P(a_,1)) sign(a_) +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#undef ulfa +#define ulfa(a_) + +#else + +#undef R1 +#define R1 4 +#undef 
R2 +#define R2 5 +#undef R3 +#define R3 6 +#undef R4 +#define R4 7 + +#undef TREG +#define TREG 3 +#undef SREG +#define SREG 2 +#undef CREG +#define CREG 0 +#undef targ +#define targ(a_) a_ +#undef src +#define src(a_) 0 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef mpx +#define mpx(a_) px(a_) +#ifdef BETA0 +#undef ulfa +#define ulfa(a_) phl(a_,0) pa(0,a_) pud(a_,0,si) +#else +#undef ulfa +#define ulfa(a_) pld(0,si,TREG) phl(a_,0) pa(0,a_) pa(TREG,a_) pud(a_,0,si) +#endif +#undef AREG +#define AREG TREG +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_) + + +#undef plax +#define plax pc(CREG,1) ps(160,CREG,CREG) ps(245,1,1) sign(CREG) + + + +#endif + +#if defined(Conj_) && ! defined(GER) +#undef sign +#define sign(a_) pm(SREG,a_) +#else +#undef sign +#define sign(a_) pm(SREG,P(a_,1)) +#endif + + + +#undef plb +#define plb(a_,b_) pl(a_,b_,AREG) +#undef plbd +#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) + +#undef dpr +#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprp +#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dpi +#define dpi(a_) pm(P(src(a_),1),TREG) ps(177,TREG,TREG) pa(TREG,targ(a_)) + +#ifndef GER + + +#undef plaa +#define plaa(a_) pl(a_ ## 0,si,CREG) plax +#undef wa +#define wa(a_) w(a_) +#undef dp +#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax +#undef wa1_2 +#define wa1_2(a_) w1_2(a_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) +#undef dpp1_2 +#define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) +#undef ddpp1_2 +#define 
ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) + + +#else + +#undef lqc +#define lqc(a_) pl(a_ ## 0,si,TREG) +/* lqc1_2 is the macro (re)defined here, so that is the name to #undef */ +#undef lqc1_2 +#define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG) + + +#undef plaa +#define plaa(a_) +#undef wa +#define wa(a_) +#undef dp +#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \ + lqc(a_) dpi(c_) wb(a_ ## 0,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ + lqc(a_) dpi(c_) wb(a_ ## 0,b_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +#define plaa1_2(a_) +#undef wa1_2 +#define wa1_2(a_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ + lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) +#undef dpp1_2 +#define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ + lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) +#undef ddpp1_2 +#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) + +#endif + +#endif + +/****************************************************************************** + * Single Precision Real Macros + ******************************************************************************/ + +#ifdef SREAL + +#ifdef NO_TRANSPOSE + +#undef mpx +#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#undef TREG +#define TREG 1 +#undef targ +#define targ(a_) 0 +#undef src +#define src(a_) a_ +#undef ulfa +#define ulfa(a_) + +#ifdef GER +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef w1_4 +#define w1_4(a_) +#undef CREG +#define CREG 2 +#undef AREG +#define AREG 0 +#undef cp +#define cp pc(CREG,TREG) +#undef wb +#define wb(a_,b_) pu(AREG,a_,b_) +#undef wbd +#define wbd(a_,b_) pud(AREG,a_,b_) +#undef wbs +#define wbs(a_,b_) pus(AREG,a_,b_) +#else +#undef CREG +#define CREG 0 +#undef AREG +#define AREG TREG +#undef cp +#define cp +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef 
wbs +#define wbs(a_,b_) +#undef w +#define w(a_) pu(CREG,a_ ## 0,si) +#undef w1_2 +#define w1_2(a_) pud(CREG,a_ ## 0,si) +#undef w1_4 +#define w1_4(a_) pus(CREG,a_ ## 0,si) +#endif + +#else + +#undef mpx +#define mpx(a_) px(a_) +#ifdef BETA0 +#undef madd +#define madd(a_,b_,c_) +#else +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#endif +#undef TREG +#define TREG 3 +#undef targ +#define targ(a_) a_ +#undef src +#define src(a_) 0 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef w1_4 +#define w1_4(a_) +#undef ulfa +#undef ulfa +#define ulfa(a_) phl(a_,0) pa(0,a_) pc(a_,0) ps(1,0,0) pa(0,a_) \ + madd(0,si,a_) pus(a_,0,si) + +#undef CREG +#define CREG 0 +#undef AREG +#define AREG TREG +#undef cp +#define cp +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_) + +#endif + +#if defined(ALIGN) +#undef plb +#define plb(a_,b_) pla(a_,b_,AREG) +#else +#undef plb +#define plb(a_,b_) pl(a_,b_,AREG) +#endif +#undef plbd +#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) +#undef plbs +#define plbs(a_,b_) pls(a_,b_,AREG) +#undef dpr +#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprp +#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprs +#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) +#undef dprps +#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) + +#undef plaa +#define plaa(a_) pl(a_ ## 0,si,CREG) +#undef wa +#define wa(a_) w(a_) +#undef dp +#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) +#undef wa1_2 +#define wa1_2(a_) w1_2(a_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dpr(c_) wbd(a_ ## 0,b_) +#undef dpp1_2 +#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 
0,b_) dprp(c_,d_,e_) wbd(a_ ## 0,b_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) +#undef ddpp1_2 +#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) + +#undef plaa1_4 +#define plaa1_4(a_) pls(a_ ## 0,si,CREG) +#undef wa1_4 +#define wa1_4(a_) w1_4(a_) +#undef dp1_4 +#define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) +#undef dpp1_4 +#define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) +#undef ddp1_4 +#define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_) +#undef ddpp1_4 +#define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) + + + +#undef R1 +#define R1 4 +#undef R2 +#define R2 5 +#undef R3 +#define R3 6 +#undef R4 +#define R4 7 + +#endif + +/****************************************************************************** + * Double Precision Real Macros + ******************************************************************************/ + +#ifdef DREAL + +#ifdef ATL_SSE2 + +#ifdef NO_TRANSPOSE + +#undef mpx +#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#undef TREG +#define TREG 1 +#undef targ +#define targ(a_) 0 +#undef src +#define src(a_) a_ +#undef ulfa +#define ulfa(a_) + +#ifdef GER +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef w1_4 +#define w1_4(a_) +#undef CREG +#define CREG 2 +#undef AREG +#define AREG 0 +#undef cp +#define cp pc(CREG,TREG) +#undef wb +#define wb(a_,b_) pu(AREG,a_,b_) +#undef wbd +#define wbd(a_,b_) pus(AREG,a_,b_) +#undef wbs +/* #define wbs(a_,b_) pus(AREG,a_,b_) */ +#else +#undef CREG +#define CREG 0 +#undef AREG +#define AREG TREG +#undef cp +#define cp +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +/* #define wbs(a_,b_) */ +#undef w +#define w(a_) pu(CREG,a_ ## 0,si) +#undef w1_2 +#define w1_2(a_) pus(CREG,a_ ## 0,si) +#undef w1_4 +/* #define w1_4(a_) pus(CREG,a_ ## 0,si) */ +#endif + +#else + +#undef mpx +#define mpx(a_) px(a_) +#ifdef BETA0 +#undef madd +#define madd(a_,b_,c_) +#else +#undef madd 
+#define madd(a_,b_,c_) pas(a_,b_,c_) +#endif +#undef TREG +#define TREG 3 +#undef targ +#define targ(a_) a_ +#undef src +#define src(a_) 0 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef w1_4 +#define w1_4(a_) +#undef ulfa +#undef ulfa +#define ulfa(a_) /* phl(a_,0) pa(0,a_) */ pc(a_,0) ps(1,0,0) pa(0,a_) \ + madd(0,si,a_) pus(a_,0,si) + +#undef CREG +#define CREG 0 +#undef AREG +#define AREG TREG +#undef cp +#define cp +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_) + +#endif + +#if defined(ALIGN) +#undef plb +#define plb(a_,b_) pla(a_,b_,AREG) +#else +#undef plb +#define plb(a_,b_) pl(a_,b_,AREG) +#endif +#undef plbd +#define plbd(a_,b_) /* px(AREG) */pls(a_,b_,AREG) +#undef plbs +/* #define plbs(a_,b_) pls(a_,b_,AREG) */ +#undef dpr +#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprp +#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprs +#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) +#undef dprps +#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) + +#undef plaa +#define plaa(a_) pl(a_ ## 0,si,CREG) +#undef wa +#define wa(a_) w(a_) +#undef dp +#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +#define plaa1_2(a_) /* px(CREG) */pls(a_ ## 0,si,CREG) +#undef wa1_2 +#define wa1_2(a_) w1_2(a_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dprs(c_) wbd(a_ ## 0,b_) +#undef dpp1_2 +#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprps(c_,d_,e_) wbd(a_ ## 0,b_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) +#undef ddpp1_2 +#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) + +#undef plaa1_4 +/* #define plaa1_4(a_) pls(a_ ## 0,si,CREG) */ +#undef wa1_4 +/* #define wa1_4(a_) 
w1_4(a_) */ +#undef dp1_4 +/* #define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) */ +#undef dpp1_4 +/* #define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) */ +#undef ddp1_4 +/* #define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_) */ +#undef ddpp1_4 +/* #define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) */ + + + +#undef R1 +#define R1 4 +#undef R2 +#define R2 5 +#undef R3 +#define R3 6 +#undef R4 +#define R4 7 + +#else + +#ifdef NO_TRANSPOSE + +#undef t0 +#define t0(a_) 1 +#undef s0 +#define s0(a_) a_ +#undef t8 +#define t8(a_) 2 +#undef s8 +#define s8(a_) a_ +#undef w +#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef w1_2 +#define w1_2(a_) fp(a_ ## 0,si) +#undef mpx +#define mpx(a_) fl(0,si) fc(M(a_,2)) +#undef madd +#define madd(a_,b_,c_) faa(a_,b_) +#undef ulfa +#define ulfa(a_) fc(0) + +#else + +#undef t0 +#define t0(a_) a_ +#undef s0 +#define s0(a_) 1 +#undef t8 +#define t8(a_) a_ +#undef s8 +#define s8(a_) 2 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef mpx +#define mpx(a_) fz +#ifdef BETA0 +#undef madd +#define madd(a_,b_,c_) +#else +#undef madd +#define madd(a_,b_,c_) faa(a_,b_) +#endif +#undef ulfa +#define ulfa(a_) madd(0,si,a_) fp(0,si) + +#endif + + +#ifndef GER + +#undef plaa1_2 +#define plaa1_2(a_) fl(a_ ## 0,si) +#undef wa1_2 +#define wa1_2(a_) w1_2(a_) +#ifdef NO_TRANSPOSE +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(M(s0(c_),1),0) fap(0,t0(c_)) +#undef dp1_2 +#define dp1_2(a_,b_,c_) ddp1_2(a_,b_,c_) +#else +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fap(0,M(t0(c_),1)) +#undef dp1_2 +#define dp1_2(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fap(0,M(t0(c_),2)) +#endif + +#else + +#undef plaa1_2 +#define plaa1_2(a_) fl(a_ ## 0,si) +#undef wa1_2 +#define wa1_2(a_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) fd(M(s0(c_),2)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) fm(M(s0(c_),2),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) + +#endif 
+ + + +#undef plaa +#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) fx1 + +#ifndef GER + + +#undef wa +#define wa(a_) w(a_) + + +#undef ddp +#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \ + fm(P(s8(c_),1),0) fx1 fap(0,P(t0(c_),1)) \ + fap(0,t8(c_)) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \ + fm(P(s8(c_),1),0) pf(d_,e_) fx1 fap(0,P(t0(c_),1)) \ + fap(0,t8(c_)) + +/* #define ddp(a_,b_,c_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */ +/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) */ +/* #define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */ +/* \ */ +/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) pf(d_,e_) */ + +#ifdef NO_TRANSPOSE + +#undef dp +#define dp(a_,b_,c_) ddp(a_,b_,c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_) + +#else + +#undef dp +#define dp(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \ + fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2)) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) pf(d_ ,e_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \ + fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2)) + +/* #define dp(a_,b_,c_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */ +/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) */ +/* #define dpp(a_,b_,c_,d_,e_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */ +/* \ */ +/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) pf(d_,e_) */ + +#endif + + +#else + +#undef wa +#define wa(a_) +#undef ddp +#define ddp(a_,b_,c_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ + fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ + fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_) + +#undef dp +#define dp(a_,b_,c_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ + fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) 
\ + fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_) + +#endif + + +#undef R1 +#define R1 3 +#undef R2 +#define R2 4 +#undef R3 +#define R3 5 +#undef R4 +#define R4 6 + +#endif + +#endif + +/****************************************************************************** + * Double Precision Complex Macros + ******************************************************************************/ + +#ifdef DCPLX + +#ifdef ATL_SSE2 +#ifdef NO_TRANSPOSE + +#if NDPM > 3 +#error Max NDPM is 3 for DCPLX NO_TRANSPOSE +#endif + +#undef plax +#define plax + +#undef R1 +#define R1 2 +#undef R2 +#define R2 4 +#undef R3 +#define R3 6 +#undef R4 +#define R4 6 + +#undef TREG +#define TREG 1 +#undef SREG +#define SREG 0 +#undef CREG +#define CREG 0 + +#ifdef GER +#undef AREG +#define AREG 0 +#undef targ +#define targ(a_) AREG +#undef wb +#define wb(a_,b_) pu(AREG,a_,b_) +#undef wbd +/* #define wbd(a_,b_) pud(AREG,a_,b_) */ +#undef w +#define w(a_) +#undef w1_2 +/* #define w1_2(a_) */ +#else +#undef AREG +#define AREG TREG +#undef targ +#define targ(a_) CREG +#undef wb +#define wb(a_,b_) +#undef wbd +/* #define wbd(a_,b_) */ +#undef w +#define w(a_) pu(CREG,a_ ## 0,si) +#undef w1_2 +/* #define w1_2(a_) pud(CREG,a_ ## 0,si) */ +#endif + +#undef src +#define src(a_) a_ +#undef mpx +#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(8,si,P(a_,1)) \ + ps(0,P(a_,1),P(a_,1)) sign(a_) +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#undef ulfa +#define ulfa(a_) + +#else + +#undef R1 +#define R1 4 +#undef R2 +#define R2 5 +#undef R3 +#define R3 6 +#undef R4 +#define R4 7 + +#undef TREG +#define TREG 3 +#undef SREG +#define SREG 2 +#undef CREG +#define CREG 0 +#undef targ +#define targ(a_) a_ +#undef src +#define src(a_) 0 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef mpx +#define mpx(a_) px(a_) +#ifdef BETA0 +#undef ulfa +#define ulfa(a_) /* phl(a_,0) pa(0,a_) */pu(a_,0,si) +#else +#undef ulfa +#define ulfa(a_) pl(0,si,TREG) /* phl(a_,0) pa(0,a_) */ pa(TREG,a_) pu(a_,0,si) 
+#endif +#undef AREG +#define AREG TREG +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_) + + +#undef plax +#define plax pc(CREG,1) ps(0,CREG,CREG) ps(3,1,1) sign(CREG) + + + +#endif + +#if defined(Conj_) && ! defined(GER) +#undef sign +#define sign(a_) pm(SREG,a_) +#else +#undef sign +#define sign(a_) pm(SREG,P(a_,1)) +#endif + + + +#undef plb +#define plb(a_,b_) pl(a_,b_,AREG) +#undef plbd +/* #define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) */ + +#undef dpr +#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprp +#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dpi +#define dpi(a_) pm(P(src(a_),1),TREG) ps(1,TREG,TREG) pa(TREG,targ(a_)) + +#ifndef GER + +#undef plaa +#define plaa(a_) pl(a_ ## 0,si,CREG) plax +#undef wa +#define wa(a_) w(a_) +#undef dp +#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +/* #define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax */ +#undef wa1_2 +/* #define wa1_2(a_) w1_2(a_) */ +#undef dp1_2 +/* #define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) */ +#undef dpp1_2 +/* #define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) */ +#undef ddp1_2 +/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */ +#undef ddpp1_2 +/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */ + + +#else + +#undef lqc +#define lqc(a_) pl(a_ ## 0,si,TREG) +/* lqc1_2 has no definition in this configuration; #undef it so none carries over */ +#undef lqc1_2 +/* #define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG) */ + + +#undef plaa +#define plaa(a_) +#undef wa +#define wa(a_) +#undef dp +#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \ + lqc(a_) dpi(c_) wb(a_ ## 0,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ + lqc(a_) dpi(c_) wb(a_ ## 
0,b_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +/* #define plaa1_2(a_) */ +#undef wa1_2 +/* #define wa1_2(a_) */ +#undef dp1_2 +/* #define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ */ +/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */ +#undef dpp1_2 +/* #define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ */ +/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */ +#undef ddp1_2 +/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */ +#undef ddpp1_2 +/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */ + +#endif + +#else + +#if NDPM > 2 +#error Max NDPM is 2 for DCPLX +#endif + +#undef TREG +#define TREG 2 + +#ifdef NO_TRANSPOSE + +#undef w +#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef plax +#define plax fx1 +#undef srr +#define srr(a_) a_ +#undef sri +#define sri(a_) a_ +#undef sir +#define sir(a_) a_ +#undef sii +#define sii(a_) a_ +#undef trr +#define trr(a_) P(TREG,1) +#undef tri +#define tri(a_) M(TREG,1) +#undef tir +#define tir(a_) TREG +#undef tii +#define tii(a_) TREG +#undef mpx +#define mpx(a_) fl(0,si) fl(8,si) fc(M(a_,2)) fc(M(a_,2)) +#undef madd +#define madd(a_,b_,c_) faa(a_,b_) +#undef ulfa +#define ulfa(a_) fc(0) fc(0) + +#else + +#undef srr +#define srr(a_) P(TREG,1) +#undef sri +#define sri(a_) M(TREG,1) +#undef sir +#define sir(a_) TREG +#undef sii +#define sii(a_) TREG +#undef trr +#define trr(a_) a_ +#undef tri +#define tri(a_) a_ +#undef tir +#define tir(a_) a_ +#undef tii +#define tii(a_) a_ +#undef w +#define w(a_) +#undef plax +#define plax +#undef mpx +#define mpx(a_) fz fz +#ifdef BETA0 +#undef madd +#define madd(a_,b_,c_) +#else +#undef madd +#define madd(a_,b_,c_) faa(a_,b_) +#endif +#undef ulfa +#define ulfa(a_) madd(0,si,a_) fp(0,si) madd(8,si,a_) fp(8,si) + +#endif + + + +#ifdef Conj_ +#undef fapi +#define fapi(a_,b_) fsp(b_) +#undef fspi +#define fspi(a_,b_) fap(a_,b_) +#else +#undef fapi +#define fapi(a_,b_) fap(a_,b_) 
+#undef fspi +#define fspi(a_,b_) fsp(b_) +#endif + +#ifndef GER + + +#undef plaa +#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax +#undef wa +#define wa(a_) w(a_) +#undef ddp +#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ + fm(sri(c_),0) fap(0,tri(c_))\ + fl(a_ ## 8,b_) fd(0) fm(sir(c_),0) fspi(0,tir(c_)) \ + fm(sii(c_),0) fapi(0,tii(c_)) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ + fm(sri(c_),0) fap(0,tri(c_))\ + fl(a_ ## 8,b_) fd(0) pf(d_,e_) fm(sir(c_),0) fspi(0,tir(c_))\ + fm(sii(c_),0) fapi(0,tii(c_)) + + + +#ifdef NO_TRANSPOSE + + + +#undef dp +#define dp(a_,b_,c_) ddp(a_,b_,c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_) + + + +#else + +#undef dp +#define dp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ + fm(sri(c_),0) fap(0,tri(c_))\ + fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \ + fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2)) + +#undef dpp +#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ + pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\ + fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \ + fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2)) + + +#endif + +#else + +#undef plaa +#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax +#undef wa +#define wa(a_) + +#undef ddprr +#define ddprr(a_,b_,c_) fl(a_ ## 0,b_) \ + fd(tri(c_)) fm(P(sri(c_),1),0) fap(0,1) \ + fd(M(trr(c_),1)) fm(srr(c_),0) fspi(0,1) \ + fp(a_ ## 0,b_) +#undef ddpri +#define ddpri(a_,b_,c_) fl(a_ ## 8,b_) \ + fd(tii(c_)) fm(P(sii(c_),1),0) fap(0,1) \ + fd(M(tir(c_),1)) fm(sir(c_),0) fapi(0,1) \ + fp(a_ ## 8,b_) +#undef dpri +#define dpri(a_,b_,c_) fl(a_ ## 8,b_) \ + fx(2) fm(sir(c_),0) fap(0,2) \ + fm(M(sii(c_),2),0) fapi(0,1) \ + fp(a_ ## 8,b_) + + +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_) +#undef ddp +#define ddp(a_,b_,c_) ddprr(a_,b_,c_) ddpri(a_,b_,c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) 
pf(d_,e_) dpri(a_,b_,c_) +#undef dp +#define dp(a_,b_,c_) ddprr(a_,b_,c_) dpri(a_,b_,c_) + +#endif + + +#undef R1 +#define R1 4 +#undef R2 +#define R2 6 +#undef R3 +#define R3 6 +#undef R4 +#define R4 6 + +#endif + +#endif + + +/****************************************************************************** + * General Macros + ******************************************************************************/ + + + + +#undef bla1 +#define bla1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_) +#undef blb1 +#define blb1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_) + +#undef bla2 +#undef bla2 +#define bla2(a_,b_) pf(b_,si) plaa(a_) ddp(a_,ax,R1) pf(b_,ax) dp(a_,bx,R2) wa(a_) +#undef blb2 +#undef blb2 +#define blb2(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) dp(a_,bx,R2) wa(a_) + +#undef bla3 +#define bla3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \ + dpp(a_,cx,R3,b_,ax) wa(a_) +#undef blb3 +#define blb3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \ + dpp(a_,cx,R3,b_,cx) wa(a_) + +#undef bla4 +#define bla4(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \ + ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_) +#undef blb4 +#define blb4(a_,b_) plaa(a_) ddp(a_,ax,R1) ddpp(a_,bx,R2,b_,cx) \ + ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_) + +#undef bla +#define bla(a_,b_) Mjoin(bla,NDP)(a_,b_) +#undef blb +#define blb(a_,b_) Mjoin(blb,NDP)(a_,b_) + + + +#undef bla11_2 +#define bla11_2(a_) plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_) +#undef bla21_2 +#define bla21_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_) +#undef bla31_2 +#define bla31_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \ + dp1_2(a_,cx,R3) wa1_2(a_) +#undef bla41_2 +#define bla41_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \ + ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_) + +#undef bla1_2 +#define bla1_2(a_) Mjoin(Mjoin(bla,NDP),1_2)(a_) + + + +#undef bla11_4 +#define bla11_4(a_) plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_) +#undef bla21_4 +#define bla21_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2) 
wa1_4(a_) +#undef bla31_4 +#define bla31_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \ + dp1_4(a_,cx,R3) wa1_4(a_) +#undef bla41_4 +#define bla41_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \ + ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_) + +#undef bla1_4 +#define bla1_4(a_) Mjoin(Mjoin(bla,NDP),1_4)(a_) + + + +#undef inc1 +#define inc1(a_) a(a_,si) a(a_,ax) +#undef inc2 +#define inc2(a_) inc1(a_) a(a_,bx) +#undef inc3 +#define inc3(a_) inc2(a_) a(a_,cx) +#undef inc4 +#define inc4(a_) inc3(a_) a(a_,dx) + +#undef inc +#define inc(a_) Mjoin(inc,NDP)(a_) + + +#ifdef PREFETCH +/* #include "camm_arith.h" */ +#undef S +#define S(a_,b_) (a_) + (b_) +#undef PF1 +#define PF1 PREFETCH +#undef PF2 +#define PF2 S(PF1,32) +#undef PF3 +#define PF3 S(PF1,64) +#undef PF4 +#define PF4 S(PF1,96) +#undef PF5 +#define PF5 S(PF1,128) +#undef PF6 +#define PF6 S(PF1,160) +#undef PF7 +#define PF7 S(PF1,192) +#undef PF8 +#define PF8 S(PF1,224) +#else +#undef PF1 +#define PF1 64 +#undef PF2 +#define PF2 96 +#undef PF3 +#define PF3 128 +#undef PF4 +#define PF4 160 +#undef PF5 +#define PF5 192 +#undef PF6 +#define PF6 224 +#undef PF7 +#define PF7 256 +#undef PF8 +#define PF8 288 +#endif + + +#if defined(NO_TRANSPOSE) && !defined(SREAL) && !defined(GER) +#undef pf +#define pf(a_,b_) f(t0,a_,b_) +#else +#undef pf +#define pf(a_,b_) f(nta,a_,b_) +#endif + +#undef bl1 +#define bl1 bla1_4(0x0) inc(4) +#undef bl2 +#define bl2 bla1_2(0x0) inc(8) +#undef bl4 +#define bl4 bla(0x0,PF1) inc(16) +#undef bl8 +#define bl8 bla(0x0,PF1) blb(0x1,PF1) inc(32) +#undef bl16 +#define bl16 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64) +#undef bl32 +#define bl32 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \ + bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128) +#undef bl64 +#define bl64 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \ + bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \ + bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \ + bla(0xc,PF7) 
blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256) + +/* #define in2 inc(8) */ +/* #define in4 inc(16) */ +/* #define in8 inc(32) */ +/* #define in16 inc(64) */ + +#undef in2 +#define in2 +#undef in4 +#define in4 +#undef in8 +#define in8 +#undef in16 +#define in16 + +#ifdef NO_TRANSPOSE +#undef incf +#define incf ra(di,si) +#else +#undef incf +#define incf +#endif + +#undef lf1 +#define lf1 mpx(R1) +#undef lf2 +#define lf2 lf1 incf mpx(R2) +#undef lf3 +#define lf3 lf2 incf mpx(R3) +#undef lf4 +#define lf4 lf3 incf mpx(R4) + +#undef lf +#define lf Mjoin(lf,NDP) + + +#undef ulf1 +#define ulf1 ulfa(R1) +#undef ulf2 +#define ulf2 ulf1 ra(di,si) ulfa(R2) +#undef ulf3 +#define ulf3 ulf2 ra(di,si) ulfa(R3) +#undef ulf4 +#define ulf4 ulf3 ra(di,si) ulfa(R4) + +#undef ulf +#define ulf Mjoin(ulf,NDP) + +#undef lpba +#define lpba(a_) "movl %%esi,%%e" #a_ "\n\t" + +#undef lpb1 +#define lpb1 lpba(ax) +#undef lpb2 +#define lpb2 lpb1 ra(di,si) lpba(bx) +#undef lpb3 +#define lpb3 lpb2 ra(di,si) lpba(cx) +#undef lpb4 +#define lpb4 lpb3 ra(di,si) lpba(dx) + +#undef lpb +#define lpb Mjoin(lpb,NDP) + +#undef ipf1 +#define ipf1(a_) pf(a_,si) pf(a_,ax) +#undef ipf2 +#define ipf2(a_) ipf1(a_) pf(a_,bx) +#undef ipf3 +#define ipf3(a_) ipf2(a_) pf(a_,cx) +#undef ipf4 +#define ipf4(a_) ipf3(a_) pf(a_,dx) + +#undef ipf +#define ipf(a_) Mjoin(ipf,NDP)(a_) + +#ifdef LUNROLL +#undef UNROLL +#ifdef SREAL +#undef UNROLL +#define UNROLL LUNROLL +#elif defined(DREAL) || defined(SCPLX) +#undef UNROLL +#define UNROLL LUNROLL*2 +#elif defined(DCPLX) +#undef UNROLL +#define UNROLL LUNROLL*4 +#endif +#else +#undef UNROLL +#define UNROLL 16 +#endif + +#undef UNROLL1_2 +#if UNROLL == 64 +#undef blUNROLL +#define blUNROLL bl64 +#undef UNROLL1_2 +#define UNROLL1_2 32 +#elif UNROLL == 32 +#undef blUNROLL +#define blUNROLL bl32 +#undef UNROLL1_2 +#define UNROLL1_2 16 +#elif UNROLL == 16 +#undef blUNROLL +#define blUNROLL bl16 +#undef UNROLL1_2 +#define UNROLL1_2 8 +#elif UNROLL == 8 +#undef blUNROLL +#define 
blUNROLL bl8 +#undef UNROLL1_2 +#define UNROLL1_2 4 +#elif UNROLL == 4 +#undef blUNROLL +#define blUNROLL bl4 +#undef UNROLL1_2 +#define UNROLL1_2 2 +#elif UNROLL == 2 +#undef blUNROLL +#define blUNROLL bl2 +#undef UNROLL1_2 +#define UNROLL1_2 1 +#elif UNROLL == 1 +#undef blUNROLL +#define blUNROLL bl1 +#undef UNROLL1_2 +#define UNROLL1_2 stop +#endif +#ifndef UNROLL1_2 +#error UNROLL must be set to power of 2 < 128 +#endif + + +#ifdef GER +#undef aconst +#define aconst +#undef cconst +#define cconst const +#else +#undef aconst +#define aconst const +#undef cconst +#define cconst +#endif + +#undef MY_FUNCTION +#define MY_FUNCTION Mjoin(dp,EXT) + +static void +MY_FUNCTION(aconst TYPE *a,int lda, + const TYPE *b, + cconst TYPE *c,int stride,int len) { + +#ifdef SCPLX +#if defined(GER) && defined(Conj_) + const TYPE w1[2]={{-1.0,1.0},{-1.0,1.0}},*w=w1; +#else + const TYPE w1[2]={{1.0,-1.0},{1.0,-1.0}},*w=w1; +#endif +#endif + +#if defined(DCPLX) && defined(ATL_SSE2) +#if defined(GER) && defined(Conj_) + const TYPE w1[1]={{-1.0,1.0}},*w=w1; +#else + const TYPE w1[1]={{1.0,-1.0}},*w=w1; +#endif +#endif + +#ifdef NO_TRANSPOSE +#undef movm +#define movm c +#undef fixm +#define fixm b +#else +#undef movm +#define movm b +#undef fixm +#define fixm c +#endif + NO_INLINE + unsigned u1=stride*sizeof(*fixm),u2=lda*sizeof(*a),u3=len*sizeof(*movm)/sizeof(float); + + ASM ( + + "pushl %%ebx\n\t" + a(4,sp) + +#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) + "movl %6,%%esi\n\t" + pl(0,si,SREG) +#endif + +#ifdef NO_TRANSPOSE + "movl %1,%%esi\n\t" /* fixm */ + "movl %2,%%edi\n\t" /* fixm2fixm */ +#endif + + lf + + "movl %3,%%esi\n\t" /* a */ + "movl %4,%%edi\n\t" /* a2a */ + + lpb + + ipf(0) + + "movl %0,%%esi\n\t" /* movm */ + "movl %5,%%edi\n\t" /* len */ + +#if defined(ALIGN) + +#if defined(SREAL) + + test(4,ax) + je(Mjoin(a1,EXT)) + test(-1,di) + je(Mjoin(a1,EXT)) + sub(1,di) + bl1 + + lab(Mjoin(a1,EXT)) + +#endif + +#if defined(DREAL) || defined(SREAL) + + test(8,ax) 
+ je(Mjoin(as,EXT)) + test(-2,di) + je(Mjoin(as,EXT)) + sub(2,di) + bl2 + + lab(Mjoin(as,EXT)) + +#endif + +#endif + + + ipf(32) + + lab(Mjoin(loop,EXT)) + + test(-UNROLL,di) + je(Mjoin(UNROLL1_2,EXT)) + sub(UNROLL,di) + + blUNROLL + + jmp(Mjoin(loop,EXT)) + +#if UNROLL > 32 + lab(Mjoin(32,EXT)) + test(32,di) + je(Mjoin(16,EXT)) + bl32 +#endif + +#if UNROLL > 16 + lab(Mjoin(16,EXT)) + test(16,di) + je(Mjoin(8,EXT)) + bl16 +#endif + +#if UNROLL > 8 + lab(Mjoin(8,EXT)) + test(8,di) + je(Mjoin(4,EXT)) + bl8 +#endif + +#if UNROLL > 4 + lab(Mjoin(4,EXT)) + test(4,di) + je(Mjoin(2,EXT)) + bl4 +#endif + +#if UNROLL > 2 + lab(Mjoin(2,EXT)) +#ifndef DCPLX + test(2,di) + je(Mjoin(1,EXT)) + bl2 +#endif +#endif + +#if UNROLL > 1 + lab(Mjoin(1,EXT)) +#ifdef SREAL + test(1,di) + je(Mjoin(stop,EXT)) + bl1 +#endif +#endif + + lab(Mjoin(stop,EXT)) + +#ifndef NO_TRANSPOSE + "movl %1,%%esi\n\t" /* fixm */ + "movl %2,%%edi\n\t" /* fixm2fixm */ +#endif + + ulf + + a(-4,sp) + "popl %%ebx\n\t" + + + ::"m" (movm),"m" (fixm),"m" (u1),"m" (a),"m" (u2),"m" (u3) + +#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) + ,"m" (w) +#endif + :"ax","bx","cx","dx","si","di"); + + +} + diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h new file mode 100644 index 0000000..7fd1404 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h @@ -0,0 +1,295 @@ +#include "camm_util.h" + +#ifndef N +#error N must be defined in camm_pipe3.h +#endif +#ifndef KB +#error KB must be defined in camm_pipe3.h +#endif + +#undef p1 +#define p1(a_) Mjoin(p1_4_,N)(a_) +#undef p2 +#define p2(a_) Mjoin(p1_2_,N)(a_) +#undef p4 +#define p4(a_) Mjoin(p1_,N)(a_) +#undef load_pipe +#define load_pipe(a_) Mjoin(lp,N)(a_) +#undef drain_pipe +#define drain_pipe(a_) Mjoin(dp,N)(a_) +#undef pipe_len +#define pipe_len Mjoin(pl,N) + +#undef p8 +#if pipe_len > 4 +#define p8(a_) Mjoin(p2_,N)(a_) +#else +#define p8(a_) p4(a_) p4(SS(a_,16)) 
+#endif + +#undef p16 +#if pipe_len > 8 +#define p16(a_) Mjoin(p4_,N)(a_) +#else +#define p16(a_) p8(a_) p8(SS(a_,32)) +#endif + +#undef p32 +#if pipe_len > 16 +#define p32(a_) Mjoin(p8_,N)(a_) +#else +#define p32(a_) p16(a_) p16(SS(a_,64)) +#endif + +#undef p64 +#if pipe_len > 32 +#define p64(a_) Mjoin(p16_,N)(a_) +#else +#define p64(a_) p32(a_) p32(SS(a_,128)) +#endif + +#undef p128 +#if pipe_len > 64 +#define p128(a_) Mjoin(p32_,N)(a_) +#else +#define p128(a_) p64(a_) p64(SS(a_,256)) +#endif + +#undef p256 +#if pipe_len > 128 +#define p256(a_) Mjoin(p64_,N)(a_) +#else +#define p256(a_) p128(a_) p128(SS(a_,512)) +#endif + +#if KB < pipe_len +#undef pipe_len +#define pipe_len 0 +#undef load_pipe +#define load_pipe(a_) +#undef drain_pipe +#define drain_pipe(a_) +#endif + + +#undef MKB +/* #ifdef SREAL */ +#define MKB KB +/* #elif defined (DCPLX) */ +/* #define MKB ( KB * 4 ) */ +/* #else */ +/* #define MKB ( KB * 2 ) */ +/* #endif */ + +#if MKB >= 512 +#error MKB must be less than 512 +#endif + +#undef x0 +#undef o0 +#define x0 load_pipe(0) +#define o0 0 + +#undef MKBB +#define MKBB ( MKB - pipe_len ) + +#undef xx1 +#undef oo1 +#if MKBB >= 256 +#define xx1 x0 p256(o0) +#define oo1 SS(1024,o0) +#else +#define xx1 x0 +#define oo1 o0 +#endif + +#undef xx1a +#undef oo1a +#if pipe_len == 256 +#define xx1a xx1 drain_pipe(oo1) +#define oo1a SS(1024,oo1) +#undef MKBB +#define MKBB MKB +#else +#define xx1a xx1 +#define oo1a oo1 +#endif + +#undef x1 +#undef o1 +#if ( MKBB / 128 ) % 2 +#define x1 xx1a p128(oo1a) +#define o1 SS(512,oo1a) +#else +#define x1 xx1a +#define o1 oo1a +#endif + +#undef x1a +#undef o1a +#if pipe_len == 128 +#define x1a x1 drain_pipe(o1) +#define o1a SS(512,o1) +#undef MKBB +#define MKBB MKB +#else +#define x1a x1 +#define o1a o1 +#endif + +#undef x2 +#undef o2 +#if ( MKBB / 64 ) % 2 +#define x2 x1a p64(o1a) +#define o2 SS(256,o1a) +#else +#define x2 x1a +#define o2 o1a +#endif + +#undef x2a +#undef o2a +#if pipe_len == 64 +#define x2a x2 
drain_pipe(o2) +#define o2a SS(256,o2) +#undef MKBB +#define MKBB MKB +#else +#define x2a x2 +#define o2a o2 +#endif + +#undef x3 +#undef o3 +#if ( MKBB / 32 ) % 2 +#define x3 x2a p32(o2a) +#define o3 SS(128,o2a) +#else +#define x3 x2a +#define o3 o2a +#endif + +#undef x3a +#undef o3a +#if pipe_len == 32 +#define x3a x3 drain_pipe(o3) +#define o3a SS(128,o3) +#undef MKBB +#define MKBB MKB +#else +#define x3a x3 +#define o3a o3 +#endif + +#undef x4 +#undef o4 +#if ( MKBB / 16 ) % 2 +#define x4 x3a p16(o3a) +#define o4 SS(64,o3a) +#else +#define x4 x3a +#define o4 o3a +#endif + +#undef x4a +#undef o4a +#if pipe_len == 16 +#define x4a x4 drain_pipe(o4) +#define o4a SS(64,o4) +#undef MKBB +#define MKBB MKB +#else +#define x4a x4 +#define o4a o4 +#endif + +#undef x5 +#undef o5 +#if ( MKBB / 8 ) % 2 +#define x5 x4a p8(o4a) +#define o5 SS(32,o4a) +#else +#define x5 x4a +#define o5 o4a +#endif + +#undef x5a +#undef o5a +#if pipe_len == 8 +#define x5a x5 drain_pipe(o5) +#define o5a SS(32,o5) +#undef MKBB +#define MKBB MKB +#else +#define x5a x5 +#define o5a o5 +#endif + +#undef x6 +#undef o6 +#if ( MKBB / 4 ) % 2 +#define x6 x5a p4(o5a) +#define o6 SS(16,o5a) +#else +#define x6 x5a +#define o6 o5a +#endif + +#undef x6a +#undef o6a +#if pipe_len == 4 +#define x6a x6 drain_pipe(o6) +#define o6a SS(16,o6) +#undef MKBB +#define MKBB MKB +#else +#define x6a x6 +#define o6a o6 +#endif + +#undef x7 +#undef o7 +#if ( MKB / 2 ) % 2 +#define x7 x6a p2(o6a) +#define o7 SS(8,o6a) +#else +#define x7 x6a +#define o7 o6a +#endif + +#undef x7a +#undef o7a +#if pipe_len == 2 +#define x7a x7 drain_pipe(o7) +#define o7a SS(8,o7) +#undef MKBB +#define MKBB MKB +#else +#define x7a x7 +#define o7a o7 +#endif + +#undef x8 +#undef o8 +#if ( MKB / 1 ) % 2 +#define x8 x7a p1(o7a) +#define o8 SS(4,o7a) +#else +#define x8 x7a +#define o8 o7a +#endif + +#undef x8a +#undef o8a +#if pipe_len == 1 +#define x8a x8 drain_pipe(o8) +#define o8a SS(4,o8) +#undef MKBB +#define MKBB MKB +#else +#define x8a x8 
+#define o8a o8 +#endif + +#undef KB_block +#define KB_block x8a diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h new file mode 100644 index 0000000..35e9e59 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h @@ -0,0 +1,215 @@ +#ifndef CAMM_SCALE_H +#define CAMM_SCALE_H /*+ To stop multiple inclusions. +*/ + +#include "camm_util.h" + +#undef spf +#define spf(a_,b_) f(t0,a_,b_) + +#ifdef SCPLX +#ifdef BETAX +#undef SSREG +#define SSREG 2 +#undef lbx +#define lbx pls(4,ax,1) ps(0,1,1) pm(SSREG,1) +#undef cxx +#define cxx pm(1,3) ps(177,3,3) pa(3,2) +#undef pcx +#define pcx pc(2,3) +#else +#undef lbx +#define lbx +#undef cxx +#define cxx +#undef pcx +#define pcx +#endif +#undef lb +#define lb pls(0,ax,0) ps(0,0,0) lbx +#undef c +#define c(a_) pl(a_ ## 0,si,2) pcx pm(0,2) cxx pu(2,a_ ## 0,si) +#undef cp +#define cp(a_,b_) pl(a_ ## 0,si,2) pcx pm(0,2) spf(b_,si) cxx pu(2,a_ ## 0,si) +#undef c1_2 +#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pcx pm(0,2) cxx pud(2,a_ ## 0,si) +#undef ub +#define ub +#endif + +#ifdef SREAL +#undef lb +#define lb pls(0,ax,0) ps(0,0,0) +#undef c +#define c(a_) pl(a_ ## 0,si,2) pm(0,2) pu(2,a_ ## 0,si) +#undef cp +#define cp(a_,b_) pl(a_ ## 0,si,2) spf(b_,si) pm(0,2) pu(2,a_ ## 0,si) +#undef c1_2 +#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pm(0,2) pud(2,a_ ## 0,si) +#undef c1_4 +#define c1_4(a_) pls(a_ ## 0,si,2) pm(0,2) pus(2,a_ ## 0,si) +#undef ub +#define ub +#endif + +#ifdef DREAL +#undef lb +#define lb fl(0,ax) +#undef c +#define c(a_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) fm(2,0) fx1 \ + fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef cp +#define cp(a_,b_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) spf(b_,si) fm(2,0) fx1 \ + fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef c1_2 +#define c1_2(a_) fl(a_ ## 0,si) fm(1,0) fp(a_ ## 0,si) +#undef ub +#define ub fc(0) +#endif + +#ifdef DCPLX +#undef lb +#define lb fl(0,ax) fl(8,ax) +#undef c +#define c(a_) fl(a_ ## 0,si) fl(a_ ## 
8,si) fd(3) fm(2,0) fd(3) \ + fm(2,0) fx(3) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) fsp(2) fx1 \ + fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef cp +#define cp(a_,b_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \ + fm(2,0) fx(3) spf(b_,si) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) \ + fsp(2) fx1 fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef ub +#define ub fc(0) fc(0) +#endif + +#undef sbl1 +#define sbl1 c1_4(0x0) +#undef sbl2 +#define sbl2 c1_2(0x0) +#undef sbl4 +#define sbl4 cp(0x0,0x40) +#undef sbl8 +#define sbl8 sbl4 c(0x1) +#undef sbl16 +#define sbl16 sbl8 cp(0x2,0x60) c(0x3) + +#undef sinc16 +#define sinc16 a(0x40,si) +#undef sinc8 +#define sinc8 a(0x20,si) +#undef sinc4 +#define sinc4 a(0x10,si) +#undef sinc2 +#define sinc2 a(0x8,si) +#undef sinc1 +#define sinc1 a(0x4,si) + +#undef SCALE +#define SCALE Mjoin(Mjoin(PREC,Mjoin(scale,BLC)),FEXT) + +#undef MY_FUNCTION +#define MY_FUNCTION SCALE +/* Scale kernel. esi is loaded with c, eax with b, edi with len (len is doubled when SREAL is not defined and doubled again for DCPLX). The loop runs sbl16 while any bit of edi at or above 16 is set, then the 8/4/2/1 tails test single bits. The c/cp macros appear to load from and store back to (esi), i.e. c is updated in place using the value(s) at b; the "m"(c) operand only names the pointer variable, so "memory" is listed as a clobber. ebx is saved/restored by hand around the body. */ +static void +MY_FUNCTION(const TYPE *b,TYPE *c,int len) { + + const TYPE *ce=c+len; +#if defined(BETAX) && defined(SCPLX) + const TYPE z1[2]={{1.0,-1.0},{1.0,-1.0}},*z=z1; +#endif + NO_INLINE + +#ifndef SREAL + len+=len; +#endif +#ifdef DCPLX + len+=len; +#endif + + + ASM( + + "pushl %%ebx\n\t" + a(4,sp) + + + "movl %0,%%esi\n\t" + + spf(0x00,si) + spf(0x20,si) + + "movl %1,%%eax\n\t" + "movl %2,%%edi\n\t" + +#if defined(BETAX) && defined(SCPLX) + "movl %3,%%ebx\n\t" + pl(0,bx,SSREG) +#endif + + lb + + lab(loop) + + test(-16,di) + je(8) + sub(16,di) + align + + sbl16 + sinc16 + + jmp(loop) + align + + lab(8) + + test(8,di) + je(4) + + sbl8 + sinc8 + + lab(4) + + test(4,di) + je(2) + + sbl4 + sinc4 + + lab(2) + +#ifndef DCPLX + test(2,di) + je(1) + + sbl2 + sinc2 + + lab(1) + +#ifdef SREAL + test(1,di) + je(stop) + + sbl1 + sinc1 + + lab(stop) +#endif +#endif + + ub + + a(-4,sp) + "popl %%ebx\n\t" + + + ::"m" (c),"m" (b), "m" (len) +#if defined(BETAX) && defined(SCPLX) + ,"m" (z) +#endif + : "si","ax","di","memory","cc"); + + +} +#endif /* CAMM_SCALE_H */ diff --git 
a/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h new file mode 100644 index 0000000..4a92006 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h @@ -0,0 +1,2982 @@ +#include "camm_util.h" + +#undef p1_4_swap_1 +#define p1_4_swap_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,0) \ + pus(0,a_,ax) \ + pus(1,a_,cx) +#undef p1_2_swap_1 +#define p1_2_swap_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(0) \ + pld(a_,cx,0) \ + pud(0,a_,ax) \ + pud(1,a_,cx) +#undef p1_swap_1 +#define p1_swap_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,0) \ + puq(0,a_,ax) \ + pu(1,a_,cx) +#undef p2_swap_1 +#define p2_swap_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,2) \ + puq(0,a_,ax) \ + pu(1,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,0) \ + puq(2,SS(a_,RS4),ax) \ + pu(3,SS(a_,RS4),cx) +#undef lpswap_1 +#define lpswap_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) +#undef dpswap_1 +#define dpswap_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,2) \ + puq(0,a_,ax) \ + pu(1,a_,cx) \ + puq(2,SS(a_,RS4),ax) \ + pu(3,SS(a_,RS4),cx) +#undef plswap_1 +#define plswap_1 8 + + +#undef p1_4_scal_3 +#define p1_4_scal_3(a_) \ + pls(a_,ax,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_scal_3 +#define p1_2_scal_3(a_) \ + pld(a_,ax,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_scal_3 +#define p1_scal_3(a_) \ + plq(a_,ax,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_scal_3 +#define p2_scal_3(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_3 +#define p4_scal_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + 
f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(6,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpscal_3 +#define lpscal_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(6,1) +#undef dpscal_3 +#define dpscal_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plscal_3 +#define plscal_3 16 + +#undef p1_4_scal_3c +#define p1_4_scal_3c(a_) +#undef p1_2_scal_3c +#define p1_2_scal_3c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_scal_3c +#define p1_scal_3c(a_) \ + plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_scal_3c +#define p2_scal_3c(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pc(0,2) \ + pm(6,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + puq(0,a_,ax) \ + pc(1,3) \ + pm(6,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_3c +#define p4_scal_3c(a_) \ + pm(7,5) \ + pa(5,1) \ + puq(0,a_,ax) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pc(3,5) \ + pm(6,3) \ + pm(7,4) \ + pa(4,2) \ + puq(1,SS(a_,RS4),ax) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + pm(7,5) \ + pa(5,3) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pc(1,5) \ + pm(6,1) \ + pm(7,4) \ + pa(4,0) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(7,RS4)),ax,3) \ + pc(2,4) \ + pm(6,2) +#undef lpscal_3c +#define lpscal_3c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + 
plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(1,5) \ + pm(6,1) \ + pm(7,4) \ + pa(4,0) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pc(2,4) \ + pm(6,2) +#undef dpscal_3c +#define dpscal_3c(a_) \ + pm(7,5) \ + pa(5,1) \ + ps(CSHUF,4,4) \ + puq(0,a_,ax) \ + pm(7,4) \ + pa(4,2) \ + pc(3,5) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + ps(CSHUF,5,5) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,5) \ + pa(5,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plscal_3c +#define plscal_3c 16 + +#undef p1_4_scal_4 +#define p1_4_scal_4(a_) \ + pls(SS(a_,MM(0,RS4)),ax,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_scal_4 +#define p1_2_scal_4(a_) \ + pld(SS(a_,MM(0,RS4)),ax,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_scal_4 +#define p1_scal_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_scal_4 +#define p2_scal_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_4 +#define p4_scal_4(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_scal_4 +#define p8_scal_4(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + plq(SS(a_,MM(4,RS4)),ax,4) \ + plq(SS(a_,MM(5,RS4)),ax,5) \ + plq(SS(a_,MM(6,RS4)),ax,7) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + pm(6,4) \ + pm(6,5) \ + plq(SS(a_,MM(7,RS4)),ax,0) \ + pm(6,7) \ + pm(6,0) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + 
puq(5,SS(a_,MM(5,RS4)),ax) \ + puq(7,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpscal_4 +#define lpscal_4(a_) +#undef dpscal_4 +#define dpscal_4(a_) p4_scal_4(a_) +#undef plscal_4 +#define plscal_4 16 + +#undef p1_4_scal_4c +#define p1_4_scal_4c(a_) +#undef p1_2_scal_4c +#define p1_2_scal_4c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_scal_4c +#define p1_scal_4c(a_) \ + plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_scal_4c +#define p2_scal_4c(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pc(1,5) \ + pm(6,0) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,0) \ + pm(7,5) \ + pa(5,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_4c +#define p4_scal_4c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pc(0,4) \ + pc(1,5) \ + pm(6,0) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,0) \ + pc(2,4) \ + pm(7,5) \ + pa(5,1) \ + pc(3,5) \ + pm(6,2) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,2) \ + pm(7,5) \ + pa(5,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpscal_4c +#define lpscal_4c(a_) +#undef dpscal_4c +#define dpscal_4c(a_) p4_scal_4c(a_) +#undef plscal_4c +#define plscal_4c 16 + +#undef p1_4_scal_1 +#define p1_4_scal_1(a_) \ + pls(a_,ax,1) \ + pmsr(0,1) \ + pus(1,a_,ax) +#undef p1_2_scal_1 +#define p1_2_scal_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pm(0,1) \ + pud(1,a_,ax) +#undef p1_scal_1 +#define p1_scal_1(a_) \ + plq(a_,ax,1) \ + pm(0,1) \ + puq(1,a_,ax) +#undef p2_scal_1 +#define p2_scal_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pm(0,1) \ + pm(0,2) \ + puq(1,a_,ax) \ + puq(2,SS(a_,RS4),ax) +#undef p4_scal_1 
+#define p4_scal_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lpscal_1 +#define lpscal_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pm(0,7) +#undef dpscal_1 +#define dpscal_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef plscal_1 +#define plscal_1 RS4 + + +#undef p1_4_set_1 +#define p1_4_set_1(a_) \ + pls(a_,ax,1) \ + pcs(0,1) \ + pus(1,a_,ax) +#undef p1_2_set_1 +#define p1_2_set_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pc(0,1) \ + pud(1,a_,ax) +#undef p1_set_1 +#define p1_set_1(a_) \ + plq(a_,ax,1) \ + pc(0,1) \ + puq(1,a_,ax) +#undef p2_set_1 +#define p2_set_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pc(0,1) \ + pc(0,2) \ + puq(1,a_,ax) \ + puq(2,SS(a_,RS4),ax) +#undef p4_set_1 +#define p4_set_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pc(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pc(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pc(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pc(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lpset_1 +#define lpset_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pc(0,7) +#undef dpset_1 +#define dpset_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pc(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pc(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + pc(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef plset_1 +#define plset_1 RS4 + + +#undef p1_4_set_2 
+#define p1_4_set_2(a_) \ + pus(0,a_,ax) +#undef p1_2_set_2 +#define p1_2_set_2(a_) \ + pud(0,a_,ax) +#undef p1_set_2 +#define p1_set_2(a_) \ + puq(0,a_,ax) +#undef p2_set_2 +#define p2_set_2(a_) \ + puq(0,a_,ax) \ + puq(0,SS(a_,RS4),ax) +#undef p4_set_2 +#define p4_set_2(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + puq(0,a_,ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef lpset_2 +#define lpset_2(a_) +#undef dpset_2 +#define dpset_2(a_) \ + puq(0,a_,ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef plset_2 +#define plset_2 RS4 + + +#undef p1_4_set_3 +#define p1_4_set_3(a_) \ + pus(0,a_,ax) +#undef p1_2_set_3 +#define p1_2_set_3(a_) \ + pud(0,a_,ax) +#undef p1_set_3 +#define p1_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) +#undef p2_set_3 +#define p2_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) +#undef p4_set_3 +#define p4_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef p8_set_3 +#define p8_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) \ + puq(0,SS(a_,MM(4,RS4)),ax) \ + puq(0,SS(a_,MM(5,RS4)),ax) \ + puq(0,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpset_3 +#define lpset_3(a_) +#undef dpset_3 +#define dpset_3(a_) p8_set_3(a_) +#undef plset_3 +#define plset_3 32 + + +#undef p1_4_0x1_nrm2_1 +#define p1_4_0x1_nrm2_1(a_) \ + pls(a_,ax,1) \ + pmsr(1,1) \ + pasr(1,0) +#undef p1_2_0x1_nrm2_1 +#define p1_2_0x1_nrm2_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pm(1,1) \ + pa(1,0) +#undef p1_0x1_nrm2_1 +#define p1_0x1_nrm2_1(a_) \ + plq(a_,ax,1) \ + pm(1,1) \ + pa(1,0) +#undef p2_0x1_nrm2_1 /* Two-quad step of the 0x1_nrm2_1 kernel: loads a_ and a_+RS4 (off ax) into regs 1 and 2, applies pm(r,r) to each, then folds both into reg 0 with pa, matching the p1_ variant and the pa(...,0) sequence of the p4_/dp variants. Assumes pm/pa are the packed multiply/add wrappers from camm_util.h - confirm. */ +#define p2_0x1_nrm2_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pm(1,1) \ + pm(2,2) \ + pa(1,0) \ + pa(2,0) +#undef 
p4_0x1_nrm2_1 +#define p4_0x1_nrm2_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(3,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(1,1) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(2,2) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(7,7) \ + pa(2,0) +#undef lp0x1_nrm2_1 +#define lp0x1_nrm2_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pm(7,7) +#undef dp0x1_nrm2_1 +#define dp0x1_nrm2_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(3,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(1,1) \ + pa(3,0) \ + pm(2,2) \ + pa(1,0) \ + pa(2,0) +#undef pl0x1_nrm2_1 +#define pl0x1_nrm2_1 RS4 + + +#undef p1_4_nrm2_2 +#define p1_4_nrm2_2(a_) \ + pls(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pcs(5,6) dbg(6) \ + pcs(5,7) dbg(7) \ + paxs(1,5) dbg(5) \ + prps(5,2) dbg(2) \ + px(3) \ + pcms(0,2,3) dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pasr(3,7) dbg(7) \ + pcs(7,5) dbg(5) \ + pdsr(5,6) dbg(6) \ + pdsr(5,1) dbg(1) \ + pmsr(6,6) dbg(6) \ + pmsr(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pasr(1,0) dbg(0) +#undef p1_2_nrm2_2 +#define p1_2_nrm2_2(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef p1_nrm2_2 +#define p1_nrm2_2(a_) \ + plq(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#define p2_nrm2_2(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) 
\ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef lpnrm2_2 +#define lpnrm2_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef dpnrm2_2 +#define dpnrm2_2(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef plnrm2_2 +#define plnrm2_2 8 + + +#undef p1_4_nrm2_3 +#define p1_4_nrm2_3(a_) \ + pls(a_,ax,1) dbg(1) \ + pcs(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + paxs(1,5) dbg(5) \ + pdsr(5,6) dbg(6) \ + pdsr(5,1) dbg(1) \ + pmsr(6,6) dbg(6) \ + pmsr(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pasr(1,0) dbg(0) +#undef p1_2_nrm2_3 +#define p1_2_nrm2_3(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) 
dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef p1_nrm2_3 +#define p1_nrm2_3(a_) \ + plq(a_,ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#define p2_nrm2_3(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef lpnrm2_3 +#define lpnrm2_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef dpnrm2_3 +#define dpnrm2_3(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef plnrm2_3 +#define plnrm2_3 8 + +#define block_nrm2_4(a_,b_) \ + Mjoin(pc,a_)(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + Mjoin(pax,a_)(1,5) dbg(5) \ + Mjoin(pc,a_)(2,7) dbg(7) \ + Mjoin(pd,b_)(5,7) dbg(7) \ + Mjoin(pm,b_)(7,6) dbg(6) \ + Mjoin(pm,b_)(7,1) dbg(1) \ + Mjoin(pm,b_)(6,6) dbg(6) \ + Mjoin(pm,b_)(6,0) dbg(0) \ + Mjoin(pm,b_)(1,1) dbg(1) \ + Mjoin(pa,b_)(1,0) dbg(0) + + +/* #undef p1_4_nrm2_4 */ +/* #define p1_4_nrm2_4(a_) \ */ +/* pls(a_,ax,1) dbg(1) \ */ +/* pcs(5,6) dbg(6) \ */ +/* pan(4,1) dbg(1) \ */ +/* paxs(1,5) dbg(5) \ */ +/* pcs(2,7) dbg(7) \ */ +/* pdsr(5,7) dbg(7) \ */ +/* pmsr(7,6) dbg(6) \ */ +/* pmsr(7,1) dbg(1) \ */ +/* pmsr(6,6) dbg(6) \ */ +/* pmsr(6,0) dbg(0) \ */ 
+/* pmsr(1,1) dbg(1) \ */ +/* pasr(1,0) dbg(0) */ +#undef p1_4_nrm2_4 +#define p1_4_nrm2_4(a_) \ + pls(a_,ax,1) dbg(1) \ + block_nrm2_4(s,sr) +#undef p1_2_nrm2_4 +#define p1_2_nrm2_4(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + block_nrm2_4(,) +#undef p1_nrm2_4 +#define p1_nrm2_4(a_) \ + plq(a_,ax,1) dbg(1) \ + block_nrm2_4(,) +#define p2_nrm2_4(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + block_nrm2_4(,) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + block_nrm2_4(,) +#undef lpnrm2_4 +#define lpnrm2_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + block_nrm2_4(,) +#undef dpnrm2_4 +#define dpnrm2_4(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + block_nrm2_4(,) +#undef plnrm2_4 +#define plnrm2_4 8 + + +#undef p1_4_1x1_1 +#define p1_4_1x1_1(a_) \ + pls(a_,ax,1) \ + pls(a_,bx,0) \ + pm(0,1) \ + pa(1,6) +#undef p1_2_1x1_1 +#define p1_2_1x1_1(a_) \ + pld(a_,ax,1) \ + pld(a_,bx,0) \ + pm(0,1) \ + pa(1,6) +#undef p1_1x1_1 +#define p1_1x1_1(a_) \ + plq(a_,ax,1) \ + plq(a_,bx,0) \ + pm(0,1) \ + pa(0,6) +#undef p2_1x1_1 +#define p2_1x1_1(a_) \ + plq(a_,ax,1) \ + plq(a_,bx,0) \ + plq(SS(a_,RS4),ax,2) \ + plq(SS(a_,RS4),bx,3) \ + pm(0,1) \ + pm(2,3) \ + pa(1,6) \ + pa(3,6) +#undef p4_1x1_1 +#define p4_1x1_1(a_) \ + f(nta,SS(a_,MM(4,RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM(6,RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lp1x1_1 +#define lp1x1_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,RS4),ax,3) \ + pm(0,7) +#undef dp1x1_1 +#define dp1x1_1(a_) \ + plq(SS(,a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,RS4),ax) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef pl1x1_1 +#define 
pl1x1_1 RS4 + + +#undef p1_4_0x1_asum_1 +#define p1_4_0x1_asum_1(a_) \ + pls(a_,ax,1) \ + pan(4,1) \ + pasr(1,0) +#undef p1_2_0x1_asum_1 +#define p1_2_0x1_asum_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pan(4,1) \ + pa(1,0) +#undef p1_0x1_asum_1 +#define p1_0x1_asum_1(a_) \ + plq(a_,ax,1) \ + pan(4,1) \ + pa(1,0) +#undef p2_0x1_asum_1 +#define p2_0x1_asum_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pan(4,1) \ + pan(4,2) \ + pa(1,0) \ + pa(2,0) +#undef p4_0x1_asum_1 +#define p4_0x1_asum_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pan(4,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pan(4,1) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pan(4,2) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pan(4,7) \ + pa(2,0) +#undef lp0x1_asum_1 +#define lp0x1_asum_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pan(4,7) +#undef dp0x1_asum_1 +#define dp0x1_asum_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pan(4,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pan(4,1) \ + pa(3,0) \ + pan(4,2) \ + pa(1,0) \ + pa(2,0) +#undef pl0x1_asum_1 +#define pl0x1_asum_1 RS4 + + +#undef p1_4_sum_1 +#define p1_4_sum_1(a_) \ + pls(a_,ax,1) \ + pasr(1,0) +#undef p1_2_sum_1 +#define p1_2_sum_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pa(1,0) +#undef p1_sum_1 +#define p1_sum_1(a_) \ + plq(a_,ax,1) \ + pa(1,0) +#undef p2_sum_1 +#define p2_sum_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pa(1,0) \ + pa(2,0) +#undef p4_sum_1 +#define p4_sum_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pa(2,0) +#undef lpsum_1 +#define lpsum_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) +#undef dpsum_1 +#define dpsum_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pa(3,0) \ + pa(1,0) \ 
+ pa(2,0) +#undef plsum_1 +#define plsum_1 RS4 + + +#undef p1_4_dot_1 +#define p1_4_dot_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,2) \ + pmsr(2,1) \ + pasr(1,0) +#undef p1_2_dot_1 +#define p1_2_dot_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pm(2,1) \ + pa(1,0) +#undef p1_dot_1 +#define p1_dot_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pm(2,1) \ + pa(1,0) +#undef p2_dot_1 +#define p2_dot_1(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pm(4,3) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(2,1) \ + pa(1,0) +#undef lpdot_1 +#define lpdot_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(a_,ax,3) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_1 +#define dpdot_1(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pm(4,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,0) +#undef pldot_1 +#define pldot_1 8 + +#undef p1_4_dot_1c +#define p1_4_dot_1c(a_) +#undef p1_2_dot_1c +#define p1_2_dot_1c(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pc(1,3) \ + ps(HSHUF,1,1) \ + ps(LSHUF,3,3) \ + pm(7,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p1_dot_1c +#define p1_dot_1c(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pc(1,3) \ + ps(HSHUF,1,1) \ + ps(LSHUF,3,3) \ + pm(7,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p2_dot_1c +#define p2_dot_1c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(HSHUF,3,3) \ + ps(LSHUF,5,5) \ + pm(7,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + pc(1,5) \ + ps(HSHUF,1,1) \ + ps(LSHUF,5,5) \ + pm(7,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef lpdot_1c +#define lpdot_1c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(a_,ax,3) \ + 
f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_1c +#define dpdot_1c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(HSHUF,3,3) \ + ps(LSHUF,5,5) \ + pm(7,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + pc(1,5) \ + ps(HSHUF,1,1) \ + ps(LSHUF,5,5) \ + pm(7,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef pldot_1c +#define pldot_1c 8 + +#undef p1_4_dot_2c +#define p1_4_dot_2c(a_) +#undef p1_2_dot_2c +#define p1_2_dot_2c(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pc(1,3) \ + ps(CSHUF,1,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p1_dot_2c +#define p1_dot_2c(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pc(1,3) \ + ps(CSHUF,1,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p2_dot_2c +#define p2_dot_2c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(CSHUF,3,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + pc(1,5) \ + ps(CSHUF,1,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef lpdot_2c +#define lpdot_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(a_,ax,3) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_2c +#define dpdot_2c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(CSHUF,3,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + pc(1,5) \ + ps(CSHUF,1,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef pldot_2c +#define pldot_2c 8 + +#undef p1_4_axpby_3 +#define p1_4_axpby_3(a_) \ + pls(a_,ax,0) \ + pls(a_,cx,3) \ + pmsr(5,0) \ + pmsr(6,3) \ + pasr(3,0) \ + pus(0,a_,ax) +#undef p1_2_axpby_3 +#define p1_2_axpby_3(a_) \ + pld(a_,ax,0) \ + pld(a_,cx,3) \ + pm(5,0) \ + pm(6,3) \ + pa(3,0) \ + pud(0,a_,ax) +#undef p1_axpby_3 +#define p1_axpby_3(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,3) \ + pm(5,0) \ + 
pm(6,3) \ + pa(3,0) \ + punt(0,a_,ax) +#undef p2_axpby_3 +#define p2_axpby_3(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,3) \ + plq(SS(a_,RS4),ax,1) \ + pm(5,0) \ + pm(6,3) \ + pa(3,0) \ + pl(SS(a_,RS4),cx,3) \ + punt(0,a_,ax) \ + pm(5,1) \ + pm(6,3) \ + pa(3,1) \ + punt(1,SS(a_,RS4),ax) +#undef p4_axpby_3 +#define p4_axpby_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(5,2) \ + pl(SS(a_,MM(3,RS4)),cx,7) \ + pm(6,4) \ + pa(4,2) \ + punt(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(5,3) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,7) \ + pa(7,3) \ + punt(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(5,0) \ + pl(SS(a_,MM(5,RS4)),cx,7) \ + pm(6,4) \ + pa(4,0) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(5,1) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,7) \ + pa(7,1) \ + punt(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_3 +#define lpaxpby_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,7) \ + pm(5,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(5,1) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,7) \ + pa(7,1) +#undef dpaxpby_3 +#define dpaxpby_3(a_) \ + pl(SS(a_,MM(3,RS4)),cx,7) \ + pm(5,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + pm(5,3) \ + punt(0,a_,ax) \ + pm(6,7) \ + pa(7,3) \ + punt(1,SS(a_,RS4),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_3 +#define plaxpby_3 16 + +#undef p1_4_axpby_3c +#define p1_4_axpby_3c(a_) +#undef p1_2_axpby_3c +#define p1_2_axpby_3c(a_) \ + pld(a_,ax,0) \ + pld(a_,cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + pud(0,a_,ax) +#undef p1_axpby_3c +#define 
p1_axpby_3c(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + puq(0,a_,ax) +#undef p2_axpby_3c +#define p2_axpby_3c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,3) \ + pc(1,2) \ + pm(5,1) \ + ps(CSHUF,2,2) \ + pm(4,2) \ + pa(2,1) \ + pc(3,2) \ + pm(6,3) \ + pa(3,1) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,1) \ + puq(0,a_,ax) \ + plq(SS(a_,MM(2,RS4)),ax,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + puq(1,SS(a_,RS4),ax) +#undef lpaxpby_3c +#define lpaxpby_3c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) +#undef dpaxpby_3c +#define dpaxpby_3c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,3) \ + pc(1,2) \ + pm(5,1) \ + ps(CSHUF,2,2) \ + pm(4,2) \ + pa(2,1) \ + pc(3,2) \ + pm(6,3) \ + pa(3,1) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef plaxpby_3c +#define plaxpby_3c 8 + +#undef p1_4_axpby_2 +#define p1_4_axpby_2(a_) \ + pls(a_,cx,5) \ + pls(a_,ax,0) \ + pmsr(6,5) \ + pasr(5,0) \ + pus(0,a_,ax) +#undef p1_2_axpby_2 +#define p1_2_axpby_2(a_) \ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + pud(0,a_,ax) +#undef p1_axpby_2 +#define p1_axpby_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + puq(0,a_,ax) +#undef p2_axpby_2 +#define p2_axpby_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,5) \ + pa(5,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pm(6,4) \ + 
pa(4,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpby_2 +#define p4_axpby_2(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(6,4) \ + pa(4,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,5) \ + pa(5,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_2 +#define lpaxpby_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,5) \ + pa(5,1) +#undef dpaxpby_2 +#define dpaxpby_2(a_) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_2 +#define plaxpby_2 16 + +#undef p1_4_axpby_2c +#define p1_4_axpby_2c(a_) +#undef p1_2_axpby_2c +#define p1_2_axpby_2c(a_) \ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_axpby_2c +#define p1_axpby_2c(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_axpby_2c +#define p2_axpby_2c(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pc(4,3) \ + pm(6,4) \ + pa(4,1) \ + 
ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpby_2c +#define p4_axpby_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(7,0) \ + pa(0,2) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + puq(1,SS(a_,RS4),ax) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(7,1) \ + pa(1,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(7,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_2c +#define lpaxpby_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) +#undef dpaxpby_2c +#define dpaxpby_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + puq(1,SS(a_,RS4),ax) \ + pm(7,0) \ + pa(0,2) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,1) \ + pa(1,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_2c +#define plaxpby_2c 16 + +#undef p1_4_axpby_1 +#define p1_4_axpby_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,2) \ + pmsr(5,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpby_1 +#define p1_2_axpby_1(a_) \ + 
pld(a_,ax,1) \ + pld(a_,cx,2) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpby_1 +#define p1_axpby_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) +#undef p2_axpby_1 +#define p2_axpby_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,4) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(5,3) \ + pm(6,4) \ + pa(4,3) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpby_1 +#define lpaxpby_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) +#undef dpaxpby_1 +#define dpaxpby_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,4) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) \ + pm(5,3) \ + pm(6,4) \ + pa(4,3) \ + puq(3,SS(a_,RS4),ax) +#undef plaxpby_1 +#define plaxpby_1 8 + +#undef p1_4_axpy_0 +#define p1_4_axpy_0(a_) \ + pls(a_,cx,2) \ + pls(a_,ax,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpy_0 +#define p1_2_axpy_0(a_) \ + pld(a_,cx,2) \ + pld(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpy_0 +#define p1_axpy_0(a_) \ + pl(a_,cx,2) \ + plq(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) +#undef p2_axpy_0 +#define p2_axpy_0(a_) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + plq(SS(a_,RS4),ax,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + puq(1,a_,ax) \ + pm(6,4) \ + pa(4,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpy_0 +#define lpaxpy_0(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) +#undef dpaxpy_0 +#define dpaxpy_0(a_) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + plq(SS(a_,RS4),ax,3) \ + puq(1,a_,ax) \ + pm(6,4) \ + pa(4,3) 
\ + puq(3,SS(a_,RS4),ax) +#undef plaxpy_0 +#define plaxpy_0 8 + +#undef p1_4_axpy_1 +#define p1_4_axpy_1(a_) \ + pls(a_,cx,2) \ + pls(a_,ax,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpy_1 +#define p1_2_axpy_1(a_) \ + pld(a_,cx,2) \ + pld(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpy_1 +#define p1_axpy_1(a_) \ + pl(a_,cx,2) \ + pm(6,2) \ + pam(a_,ax,2) \ + puq(2,a_,ax) +#undef p2_axpy_1 +#define p2_axpy_1(a_) \ + pl(a_,cx,2) \ + pm(6,2) \ + pl(SS(a_,RS4),cx,4) \ + pam(a_,ax,2) \ + pm(6,4) \ + puq(2,a_,ax) \ + pam(SS(a_,RS4),ax,4) \ + puq(4,SS(a_,RS4),ax) +#undef p4_axpy_1 +#define p4_axpy_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + pam(SS(a_,MM(2,RS4)),ax,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + pl(SS(a_,MM(4,RS4)),cx,0) \ + pm(6,3) \ + pam(SS(a_,MM(3,RS4)),ax,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(5,RS4)),cx,1) \ + pm(6,0) \ + pam(SS(a_,MM(4,RS4)),ax,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + pl(SS(a_,MM(6,RS4)),cx,2) \ + pm(6,1) \ + pam(SS(a_,MM(5,RS4)),ax,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpaxpy_1 +#define lpaxpy_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + pl(SS(a_,RS4),cx,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pm(6,0) \ + pam(a_,ax,0) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + pm(6,1) \ + pam(SS(a_,RS4),ax,1) +#undef dpaxpy_1 +#define dpaxpy_1(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + pam(SS(a_,MM(2,RS4)),ax,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + pam(SS(a_,MM(3,RS4)),ax,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_1 +#define plaxpy_1 16 + +#undef p1_4_axpy_2 +#define p1_4_axpy_2(a_) \ + pls(a_,cx,5) \ + pls(a_,ax,0) \ + pmsr(6,5) \ + pasr(5,0) \ + pus(0,a_,ax) +#undef p1_2_axpy_2 +#define p1_2_axpy_2(a_) 
\ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + pud(0,a_,ax) +#undef p1_axpy_2 +#define p1_axpy_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + puq(0,a_,ax) +#undef p2_axpy_2 +#define p2_axpy_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,5) \ + pa(5,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pm(6,4) \ + pa(4,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpy_2 +#define p4_axpy_2(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(6,4) \ + pa(4,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,5) \ + pa(5,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpy_2 +#define lpaxpy_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,5) \ + pa(5,1) +#undef dpaxpy_2 +#define dpaxpy_2(a_) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_2 +#define plaxpy_2 16 + +#undef p1_4_axpy_2c +#define p1_4_axpy_2c(a_) +#undef p1_2_axpy_2c +#define p1_2_axpy_2c(a_) \ + pld(a_,cx,4) \ + pld(a_,ax,0) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + pud(0,a_,ax) +#undef p1_axpy_2c +#define p1_axpy_2c(a_) \ + pl(a_,cx,4) \ + plq(a_,ax,0) \ + 
pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + puq(0,a_,ax) +#undef p2_axpy_2c +#define p2_axpy_2c(a_) \ + pl(a_,cx,4) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,5) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpy_2c +#define p4_axpy_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(7,0) \ + pa(0,2) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + puq(1,SS(a_,RS4),ax) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(7,1) \ + pa(1,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(7,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpy_2c +#define lpaxpy_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) +#undef dpaxpy_2c +#define dpaxpy_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + puq(1,SS(a_,RS4),ax) \ + pm(7,0) \ + pa(0,2) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + 
ps(CSHUF,1,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,1) \ + pa(1,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_2c +#define plaxpy_2c 16 + +#undef p1_4_axpy_1c +#define p1_4_axpy_1c(a_) +#undef p1_2_axpy_1c +#define p1_2_axpy_1c(a_) \ + pld(a_,cx,2) \ + pc(2,0) \ + pld(a_,ax,1) \ + ps(CSHUF,0,0) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pud(1,a_,ax) +#undef p1_axpy_1c +#define p1_axpy_1c(a_) \ + pl(a_,cx,2) \ + pc(2,0) \ + plq(a_,ax,1) \ + ps(CSHUF,0,0) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + puq(1,a_,ax) +#undef p2_axpy_1c +#define p2_axpy_1c(a_) \ + plq(SS(a_,RS4),ax,3) \ + ps(CSHUF,0,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pc(4,0) \ + puq(1,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(6,4) \ + pa(4,3) \ + pm(7,0) \ + pa(0,3) \ + pc(2,0) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpy_1c +#define lpaxpy_1c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + pc(2,0) +#undef dpaxpy_1c +#define dpaxpy_1c(a_) \ + plq(SS(a_,RS4),ax,3) \ + ps(CSHUF,0,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pc(4,0) \ + puq(1,a_,ax) \ + ps(CSHUF,0,0) \ + pm(6,4) \ + pa(4,3) \ + pm(7,0) \ + pa(0,3) \ + puq(3,SS(a_,RS4),ax) +#undef plaxpy_1c +#define plaxpy_1c 8 + +#undef p1_4_copy_1 +#define p1_4_copy_1(a_) \ + pls(a_,cx,2) \ + pus(2,a_,ax) +#undef p1_2_copy_1 +#define p1_2_copy_1(a_) \ + pld(a_,cx,2) \ + pud(2,a_,ax) +#undef p1_copy_1 +#define p1_copy_1(a_) \ + pl(a_,cx,2) \ + puq(2,a_,ax) +#undef p2_copy_1 +#define p2_copy_1(a_) \ + pl(SS(a_,RS4),cx,4) \ + puq(2,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + puq(4,SS(a_,RS4),ax) +#undef lpcopy_1 +#define lpcopy_1(a_) \ + 
f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) +#undef dpcopy_1 +#define dpcopy_1(a_) \ + pl(SS(a_,RS4),cx,4) \ + puq(2,a_,ax) \ + puq(4,SS(a_,RS4),ax) +#undef plcopy_1 +#define plcopy_1 8 + +#undef p1_4_copy_2 +#define p1_4_copy_2(a_) \ + pls(a_,ax,2) \ + pus(2,a_,cx) +#undef p1_2_copy_2 +#define p1_2_copy_2(a_) \ + pld(a_,ax,2) \ + pud(2,a_,cx) +#undef p1_copy_2 +#define p1_copy_2(a_) \ + plq(a_,ax,2) \ + pu(2,a_,cx) +#undef p2_copy_2 +#define p2_copy_2(a_) \ + plq(SS(a_,RS4),ax,4) \ + pu(2,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pu(4,SS(a_,RS4),cx) +#undef lpcopy_2 +#define lpcopy_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,2) +#undef dpcopy_2 +#define dpcopy_2(a_) \ + plq(SS(a_,RS4),ax,4) \ + pu(2,a_,cx) \ + pu(4,SS(a_,RS4),cx) +#undef plcopy_2 +#define plcopy_2 8 + +#undef p1_4_copy_3 +#define p1_4_copy_3(a_) \ + pls(a_,cx,2) \ + pus(2,a_,ax) +#undef p1_2_copy_3 +#define p1_2_copy_3(a_) \ + pld(a_,cx,2) \ + pud(2,a_,ax) +#undef p1_copy_3 +#define p1_copy_3(a_) \ + pl(a_,cx,2) \ + punt(2,a_,ax) +#undef p2_copy_3 +#define p2_copy_3(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) +#undef p4_copy_3 +#define p4_copy_3(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) +#undef p8_copy_3 +#define p8_copy_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pl(SS(a_,MM(6,RS4)),cx,6) \ + pl(SS(a_,MM(7,RS4)),cx,7) 
\ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) \ + punt(4,SS(a_,MM(4,RS4)),ax) \ + punt(5,SS(a_,MM(5,RS4)),ax) \ + punt(6,SS(a_,MM(6,RS4)),ax) \ + punt(7,SS(a_,MM(7,RS4)),ax) +#undef lpcopy_3 +#define lpcopy_3(a_) +#undef dpcopy_3 +#define dpcopy_3(a_) p8_copy_3(a_) +#undef plcopy_3 +#define plcopy_3 32 + +#undef p1_4_cpsc_3 +#define p1_4_cpsc_3(a_) \ + pls(a_,ax,0) \ + pmsr(6,0) \ + pus(0,a_,cx) +#undef p1_2_cpsc_3 +#define p1_2_cpsc_3(a_) \ + pld(a_,ax,0) \ + pm(6,0) \ + pud(0,a_,cx) +#undef p1_cpsc_3 +#define p1_cpsc_3(a_) \ + plq(a_,ax,0) \ + pm(6,0) \ + pu(0,a_,cx) +#undef p2_cpsc_3 +#define p2_cpsc_3(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pm(6,0) \ + pm(6,1) \ + pu(0,a_,cx) \ + pu(1,SS(a_,RS4),cx) +#undef p4_cpsc_3 +#define p4_cpsc_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + pu(0,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,3) \ + pu(1,SS(a_,RS4),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(6,0) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,1) \ + pu(3,SS(a_,MM(3,RS4)),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) +#undef lpcpsc_3 +#define lpcpsc_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(6,1) +#undef dpcpsc_3 +#define dpcpsc_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + pu(0,a_,cx) \ + pm(6,3) \ + pu(1,SS(a_,RS4),cx) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pu(3,SS(a_,MM(3,RS4)),cx) +#undef plcpsc_3 +#define plcpsc_3 16 + +#undef p1_4_cpsc_3c +#define p1_4_cpsc_3c(a_) +#undef p1_2_cpsc_3c +#define p1_2_cpsc_3c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,cx) +#undef p1_cpsc_3c +#define p1_cpsc_3c(a_) \ + 
plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pu(0,a_,cx) +#undef p2_cpsc_3c +#define p2_cpsc_3c(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pc(0,2) \ + pm(6,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + pu(0,a_,cx) \ + pc(1,3) \ + pm(6,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + pu(1,SS(a_,RS4),cx) +#undef p4_cpsc_3c +#define p4_cpsc_3c(a_) \ + pu(0,a_,cx) \ + pc(2,4) \ + pm(6,2) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(7,4) \ + pa(4,2) \ + pu(1,SS(a_,RS4),cx) \ + pc(3,4) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(7,4) \ + pa(4,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(7,4) \ + pa(4,0) \ + pu(3,SS(a_,MM(3,RS4)),cx) \ + pc(1,4) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(7,RS4)),ax,3) \ + pm(7,4) \ + pa(4,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) +#undef lpcpsc_3c +#define lpcpsc_3c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(7,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pc(1,4) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(7,4) \ + pa(4,1) +#undef dpcpsc_3c +#define dpcpsc_3c(a_) \ + pu(0,a_,cx) \ + pc(2,4) \ + pm(6,2) \ + ps(CSHUF,4,4) \ + pu(1,SS(a_,RS4),cx) \ + pm(7,4) \ + pa(4,2) \ + pc(3,4) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pm(7,4) \ + pa(4,3) \ + pu(3,SS(a_,MM(3,RS4)),cx) +#undef plcpsc_3c +#define plcpsc_3c 16 + +#undef p1_4_cpsc_4 +#define p1_4_cpsc_4(a_) \ + pls(a_,cx,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_cpsc_4 +#define p1_2_cpsc_4(a_) \ + pld(a_,cx,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_cpsc_4 +#define p1_cpsc_4(a_) \ + pl(a_,cx,0) \ + pm(6,0) \ + 
puq(0,a_,ax) +#undef p2_cpsc_4 +#define p2_cpsc_4(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_4 +#define p4_cpsc_4(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,0) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + pl(SS(a_,MM(5,RS4)),cx,1) \ + pm(6,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,2) \ + pm(6,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpcpsc_4 +#define lpcpsc_4(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pm(6,0) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(6,1) +#undef dpcpsc_4 +#define dpcpsc_4(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plcpsc_4 +#define plcpsc_4 16 + +#undef p1_4_cpsc_5 +#define p1_4_cpsc_5(a_) \ + pls(a_,cx,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_cpsc_5 +#define p1_2_cpsc_5(a_) \ + pld(a_,cx,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_cpsc_5 +#define p1_cpsc_5(a_) \ + pl(a_,cx,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_cpsc_5 +#define p2_cpsc_5(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_5 +#define p4_cpsc_5(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_cpsc_5 +#define p8_cpsc_5(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + 
pl(SS(a_,MM(6,RS4)),cx,7) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + pl(SS(a_,MM(7,RS4)),cx,0) \ + pm(6,4) \ + pm(6,5) \ + pm(6,7) \ + pm(6,0) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + puq(5,SS(a_,MM(5,RS4)),ax) \ + puq(7,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpcpsc_5 +#define lpcpsc_5(a_) +#undef dpcpsc_5 +#define dpcpsc_5(a_) p8_cpsc_5(a_) +#undef plcpsc_5 +#define plcpsc_5 32 + +#undef cpsc_cdp +#define cpsc_cdp(a_) pc(a_,5) pm(6,a_) ps(CSHUF,5,5) pm(7,5) pa(5,a_) +#undef p1_4_cpsc_5c +#define p1_4_cpsc_5c(a_) +#undef p1_2_cpsc_5c +#define p1_2_cpsc_5c(a_) \ + pld(a_,cx,0) \ + cpsc_cdp(0) \ + pud(0,a_,ax) +#undef p1_cpsc_5c +#define p1_cpsc_5c(a_) \ + pl(a_,cx,0) \ + cpsc_cdp(0) \ + puq(0,a_,ax) +#undef p2_cpsc_5c +#define p2_cpsc_5c(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_5c +#define p4_cpsc_5c(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + cpsc_cdp(2) \ + cpsc_cdp(3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_cpsc_5c +#define p8_cpsc_5c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + puq(0,a_,ax) \ + pl(SS(a_,MM(5,RS4)),cx,0) \ + cpsc_cdp(2) \ + cpsc_cdp(3) \ + puq(1,SS(a_,RS4),ax) \ + pl(SS(a_,MM(6,RS4)),cx,1) \ + cpsc_cdp(4) \ + cpsc_cdp(0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pl(SS(a_,MM(7,RS4)),cx,2) \ + cpsc_cdp(1) \ + cpsc_cdp(2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + puq(0,SS(a_,MM(5,RS4)),ax) \ + puq(1,SS(a_,MM(6,RS4)),ax) \ + 
puq(2,SS(a_,MM(7,RS4)),ax) +#undef lpcpsc_5c +#define lpcpsc_5c(a_) +#undef dpcpsc_5c +#define dpcpsc_5c(a_) p8_cpsc_5c(a_) +#undef plcpsc_5c +#define plcpsc_5c 32 + +#undef p1_4_cpsc_1 +#define p1_4_cpsc_1(a_) \ + pls(a_,ax,2) \ + pmsr(3,2) \ + pus(2,a_,cx) +#undef p1_2_cpsc_1 +#define p1_2_cpsc_1(a_) \ + pld(a_,ax,2) \ + pm(3,2) \ + pud(2,a_,cx) +#undef p1_cpsc_1 +#define p1_cpsc_1(a_) \ + plq(a_,ax,2) \ + pm(3,2) \ + pu(2,a_,cx) +#undef p2_cpsc_1 +#define p2_cpsc_1(a_) \ + plq(SS(a_,RS4),ax,4) \ + pm(3,2) \ + pu(2,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(3,4) \ + pu(4,SS(a_,RS4),cx) +#undef lpcpsc_1 +#define lpcpsc_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,2) +#undef dpcpsc_1 +#define dpcpsc_1(a_) \ + plq(SS(a_,RS4),ax,4) \ + pm(3,2) \ + pu(2,a_,cx) \ + pm(3,4) \ + pu(4,SS(a_,RS4),cx) +#undef plcpsc_1 +#define plcpsc_1 8 + +#undef p1_4_cpsc_2 +#define p1_4_cpsc_2(a_) \ + pls(a_,ax,2) \ + pmsr(3,2) \ + pus(2,a_,cx) +#undef p1_2_cpsc_2 +#define p1_2_cpsc_2(a_) \ + pld(a_,ax,2) \ + pm(3,2) \ + pud(2,a_,cx) +#undef p1_cpsc_2 +#define p1_cpsc_2(a_) \ + plq(a_,ax,2) \ + pm(3,2) \ + pu(2,a_,cx) +#undef p2_cpsc_2 +#define p2_cpsc_2(a_) \ + plq(a_,ax,2) \ + plq(SS(a_,RS4),ax,4) \ + pm(3,2) \ + pm(3,4) \ + pu(2,a_,cx) \ + pu(4,SS(a_,RS4),cx) +#undef p4_cpsc_2 +#define p4_cpsc_2(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,7) \ + pm(3,6) \ + pu(4,a_,cx) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(3,7) \ + pu(6,SS(a_,RS4),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,4) \ + pm(3,2) \ + pu(7,SS(a_,MM(2,RS4)),cx) \ + plq(SS(a_,MM(5,RS4)),ax,6) \ + pm(3,4) \ + pu(2,SS(a_,MM(3,RS4)),cx) +#undef lpcpsc_2 +#define lpcpsc_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + 
f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,4) \ + plq(SS(a_,MM(1,RS4)),ax,6) \ + pm(3,4) +#undef dpcpsc_2 +#define dpcpsc_2(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,7) \ + pm(3,6) \ + pu(4,a_,cx) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(3,7) \ + pu(6,SS(a_,RS4),cx) \ + pm(3,2) \ + pu(7,SS(a_,MM(2,RS4)),cx) \ + pu(2,SS(a_,MM(3,RS4)),cx) +#undef plcpsc_2 +#define plcpsc_2 RS4 + + +#undef p1_4_iamax_1 +#define p1_4_iamax_1(a_) \ + px(4) \ + pls(a_,ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + paxs(4,3) \ + pan(5,6) \ + pann(0,5) \ + pasr(5,6) \ + pasr(1,0) \ + ps(57,0,0) +#undef p1_2_iamax_1 +#define p1_2_iamax_1(a_) \ + px(4) \ + pld(a_,ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pasr(1,0) \ + ps(57,0,0)\ + pasr(1,0) \ + ps(57,0,0) +#undef p1_iamax_1 +#define p1_iamax_1(a_) \ + plq(a_,ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) +#define p2_iamax_1(a_) \ + plq(SS(a_,RS4),ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) \ + f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) +#undef lpiamax_1 +#define lpiamax_1(a_) \ + f(nta,SS(a_,MM(CL,RS4)),ax) \ + plq(a_,ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) +#undef dpiamax_1 +#define dpiamax_1(a_) \ + plq(SS(a_,RS4),ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) +#undef pliamax_1 +#define pliamax_1 8 + +#undef p1_4_iamax_1d +#define p1_4_iamax_1d(a_) +#undef p1_2_iamax_1d +#define p1_2_iamax_1d(a_) \ + px(4) \ + pld(a_,ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + 
pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pasr(1,0) \ + dbg(0) \ + ps(1,0,0) +#undef p1_iamax_1d +#define p1_iamax_1d(a_) \ + plq(a_,ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) +#define p2_iamax_1d(a_) \ + plq(SS(a_,RS4),ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) \ + dbg(0) \ + f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) +#undef lpiamax_1d +#define lpiamax_1d(a_) \ + f(nta,SS(a_,MM(CL,RS4)),ax) \ + plq(a_,ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) +#undef dpiamax_1d +#define dpiamax_1d(a_) \ + plq(SS(a_,RS4),ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) +#undef pliamax_1d +#define pliamax_1d 8 + diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h new file mode 100644 index 0000000..03486cf --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h @@ -0,0 +1,331 @@ +/*************************************** + $Header: /cvsroot/math-atlas/AtlasBase/kernel/CammMaguire/camm_tpipe.h,v 1.2 2003/10/18 18:13:30 yycamm Exp $ + + +***************************************/ + + +/* #ifndef CAMM_TPIPE_H */ +/* #define 
CAMM_TPIPE_H */ /*+ To stop multiple inclusions. +*/ + +#ifndef BITS +#error BITS must be defined in camm_tpipe.h +#endif +#ifndef DIV +#error DIV must be defined in camm_tpipe.h +#endif +#ifndef INC +#error INC(a_) must be defined in camm_tpipe.h +#endif +#ifndef LR +#error LR must be defined in camm_tpipe.h +#endif + +#ifdef ALIGN + +#if defined(SREAL) + + test(4,ax) + je(a2) + +#undef KB +#define KB ( 1 /* / DIV */ ) +#include "camm_pipe3.h" + + KB_block + INC(4) + sub(1,LR) + + lab(a2) + +#endif + +#if defined(SREAL) || defined(DREAL) + + test(8,ax) + je(a4) + test(-2,LR) + je(a4) + +#undef KB +#define KB ( 2 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(8) + sub(2,LR) + + lab(a4) + +#endif +#endif + +/* "movl %%edx,%%edi\n\t" */ + push(LR) + shr(BITS,LR) + shl(BITS,LR) + m(4,LR) + ra(ax,LR) + +#if defined(ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) + test(12,ax) + je(loopa) +#endif + +#if !defined(ALIGN) || defined(SCPLX) || defined(DCPLX) +#undef plq +#define plq(a_,b_,c_) pl(a_,b_,c_) +#undef puq +#define puq(a_,b_,c_) pu(a_,b_,c_) +#undef plqx +#define plqx(a_,b_,c_,d_,e_) plx(a_,b_,c_,d_,e_) +#undef puqx +#define puqx(a_,b_,c_,d_,e_) pux(a_,b_,c_,d_,e_) +#else +#undef plq +#define plq(a_,b_,c_) pla(a_,b_,c_) +#undef puq +#define puq(a_,b_,c_) punt(a_,b_,c_) +#undef plqx +#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_) +#undef puqx +#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_) +#endif + + align + lab(loop) + cmp(ax,LR) + je(stop) + +#undef KB +#define KB ( (1 << BITS) /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(4*KB/**DIV*/) + + jmp(loop) + + lab(stop) + pop(LR) + +#if ( 1 << BITS ) > 128 + test(128,LR) + je(64) +#undef KB +#define KB ( 128 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(512) + + lab(64) +#endif + +#if ( 1 << BITS ) > 64 + test(64,LR) + je(32) +#undef KB +#define KB ( 64 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(256) + + lab(32) +#endif + +#if ( 1 << BITS ) > 32 + test(32,LR) + je(16) 
+#undef KB +#define KB ( 32 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(128) + + lab(16) +#endif + +#if ( 1 << BITS ) > 16 + test(16,LR) + je(8) +#undef KB +#define KB ( 16 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(64) + + lab(8) +#endif + +#if ( 1 << BITS ) > 8 + test(8,LR) + je(4) +#undef KB +#define KB ( 8 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(32) + + lab(4) +#endif + +#if ( 1 << BITS ) > 4 + test(4,LR) + je(2) +#undef KB +#define KB ( 4 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(16) + + lab(2) +#endif + +#if DIV != 4 && ( 1 << BITS ) > 2 + test(2,LR) + je(1) +#undef KB +#define KB ( 2 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(8) + + lab(1) +#endif + +#if DIV == 1 && ( 1 << BITS ) > 1 + test(1,LR) + je(end) +#undef KB +#define KB ( 1 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + lab(end) +#endif + +#if defined (ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) + + jmp(tend) + +#undef plq +#define plq(a_,b_,c_) pla(a_,b_,c_) +#undef puq +#define puq(a_,b_,c_) punt(a_,b_,c_) +#undef plqx +#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_) +#undef puqx +#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_) + + align + lab(loopa) + cmp(ax,LR) + je(stopa) + +#undef KB +#define KB ( (1 << BITS) /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(4*KB/**DIV*/) + + jmp(loopa) + + lab(stopa) + pop(LR) + +#if ( 1 << BITS ) > 128 + test(128,LR) + je(64a) +#undef KB +#define KB ( 128 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(512) + + lab(64a) +#endif + +#if ( 1 << BITS ) > 64 + test(64,LR) + je(32a) +#undef KB +#define KB ( 64 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(256) + + lab(32a) +#endif + +#if ( 1 << BITS ) > 32 + test(32,LR) + je(16a) +#undef KB +#define KB ( 32 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(128) + + lab(16a) +#endif + +#if ( 1 << BITS ) > 16 + test(16,LR) + je(8a) +#undef KB +#define KB ( 16 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block 
+ INC(64) + + lab(8a) +#endif + +#if ( 1 << BITS ) > 8 + test(8,LR) + je(4a) +#undef KB +#define KB ( 8 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(32) + + lab(4a) +#endif + +#if ( 1 << BITS ) > 4 + test(4,LR) + je(2a) +#undef KB +#define KB ( 4 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(16) + + lab(2a) +#endif + +#if DIV != 4 && ( 1 << BITS ) > 2 + test(2,LR) + je(1a) +#undef KB +#define KB ( 2 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(8) + + lab(1a) +#endif + +#if DIV == 1 && ( 1 << BITS ) > 1 + test(1,LR) + je(enda) +#undef KB +#define KB ( 1 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + lab(enda) +#endif + + lab(tend) + +#endif + +/* #endif */ /* CAMM_TPIPE_H */ diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h new file mode 100644 index 0000000..6b150d3 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h @@ -0,0 +1,508 @@ +#ifndef CAMM_UTIL_H +#define CAMM_UTIL_H /*+ To stop multiple inclusions. 
+*/ + +typedef struct { + float r,i; +} Complex; + +typedef struct { + double r,i; +} Dcomplex; + +#undef str +#define str(a_) xstr(a_) +#undef xstr +#define xstr(a_) #a_ + +#undef val +#define val(a_) xval(a_) +#undef xval +#define xval(a_) a_ + +#ifndef Mjoin +#define Mjoin(a,b) mjoin(a,b) +#ifdef mjoin + #undef mjoin +#endif +#define mjoin(a,b) a ## b +#endif + +#undef VOLATILE +#define VOLATILE __volatile__ +#undef ASM +#define ASM __asm__ VOLATILE + +#ifdef BETA0 +#undef BL +#define BL b0 +#endif +#ifdef BETA1 +#undef BL +#define BL b1 +#endif +#ifdef BETAX +#undef BL +#define BL bX +#endif +#ifdef BETAXI0 +#undef BL +#define BL bXi0 +#endif + +#ifdef NO_TRANSPOSE +#ifdef GER +#ifdef Conj_ +#undef FEXT +#define FEXT Gc +#else +#undef FEXT +#define FEXT Gu +#endif +#else +#ifdef Conj_ +#undef FEXT +#define FEXT Nc +#else +#undef FEXT +#define FEXT N +#endif +#endif +#else +#ifdef Conj_ +#undef FEXT +#define FEXT C +#else +#undef FEXT +#define FEXT T +#endif +#endif + +#undef BLC +#define BLC Mjoin(FEXT,BL) + +#ifdef __GNUC__ +#undef NO_INLINE +#define NO_INLINE double sq(double x) {return x*x;} +#else +#undef NO_INLINE +#define NO_INLINE +#endif + +#undef lab +#define lab(a_) "\n" str(MY_FUNCTION) "_" str(N) "_" str(a_) ":\n\t" +#undef jmp +#define jmp(a_) "jmp " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" +#undef je +#define je(a_) "je " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" +#undef jge +#define jge(a_) "jge " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" +#undef jle +#define jle(a_) "jle " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" +#undef jl +#define jl(a_) "jl " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" +#undef jne +#define jne(a_) "jne " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" +#undef align +#define align ".align 16\n\t" +#undef test +#define test(a_,b_) "testl $" str(a_) ",%%e" str(b_) "\n\t" +#undef and +#define and(a_,b_) "andl $" str(a_) ",%%e" str(b_) "\n\t" +#undef sub +#define sub(a_,b_) "subl $" str(a_) ",%%e" str(b_) 
"\n\t" +#undef SS +#define SS(a_,b_) a_ + b_ +#undef MM +#define MM(a_,b_) a_ * b_ +#undef E4 +#define E4(a_) (( a_ >> 2 ) << 2 ) + +#undef TYPE +#undef SCALAR +#undef PREC +#undef CSHUF +#undef LSHUF +#undef HSHUF +#undef ISHUF +#undef RSHUF +#undef SINGLE +#undef REAL +#undef DIV + +#ifdef SCPLX +#define TYPE Complex +#define SCALAR Complex * +#define PREC c +#define CSHUF 177 +#define LSHUF 160 +#define HSHUF 245 +#define ISHUF 13*17 +#define RSHUF 8*17 +#define SINGLE +#define DIV 2 +/* #ifdef Conj_ */ +/* static const TYPE signd[2]={{-1.0,1.0},{-1.0,1.0}}; */ +/* #else */ + static const TYPE signd[2]={{1.0,-1.0},{1.0,-1.0}}; +/* #endif */ +#endif + +#ifdef SREAL +#define TYPE float +#define SCALAR float +#define PREC s +#define SINGLE +#define REAL +#define DIV 1 +#endif + +#ifdef DREAL +#define TYPE double +#define SCALAR double +#define PREC d +#define REAL +#define DIV 2 +#endif + +#ifdef DCPLX +#define TYPE Dcomplex +#define SCALAR Dcomplex * +#define PREC z +#define CSHUF 1 +#define LSHUF 0 +#define HSHUF 3 +#define ISHUF 3 +#define RSHUF 0 +#define DIV 4 +/* #ifdef Conj_ */ +/* static const TYPE signd[1]={{-1.0,1.0}}; */ +/* #else */ + static const TYPE signd[1]={{1.0,-1.0}}; +/* #endif */ +#endif + +#undef M11 +#define M11 0 +#undef M12 +#define M12 1 +#undef M13 +#define M13 2 +#undef M14 +#define M14 3 +#undef M15 +#define M15 4 +#undef M16 +#define M16 5 +#undef M17 +#define M17 6 +#undef M18 +#define M18 7 + +#undef M23 +#define M23 1 +#undef M24 +#define M24 2 +#undef M25 +#define M25 3 +#undef M26 +#define M26 4 +#undef M27 +#define M27 5 +#undef M28 +#define M28 6 + +#undef M33 +#define M33 0 +#undef M34 +#define M34 1 +#undef M35 +#define M35 2 +#undef M36 +#define M36 3 +#undef M37 +#define M37 4 +#undef M38 +#define M38 5 + +#undef P10 +#define P10 1 +#undef P11 +#define P11 2 +#undef P12 +#define P12 3 +#undef P13 +#define P13 4 +#undef P14 +#define P14 5 +#undef P15 +#define P15 6 +#undef P16 +#define P16 7 + +#undef XM +#define XM(a_,b_) M 
## b_ ## a_ +#undef M +#define M(a_,b_) XM(a_,b_) + +#undef XP +#define XP(a_,b_) P ## b_ ## a_ +#undef P +#define P(a_,b_) XP(a_,b_) + +#undef mex +#define mex(a_) str(%%e ## a_) +#undef msx +#define msx(a_) "%%st(" str(a_) ")" + +#undef cmp +#define cmp(a_,b_) "cmp " mex(a_) "," mex(b_) "\n\t" +#undef icmpr +#define icmpr(a_,b_) "cmp " mex(a_) ",(" mex(b_) ")\n\t" +#undef f +#define f(a_,b_,c_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ")\n\t" +#undef pfx +#define pfx(a_,b_,c_,d_,e_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ",%%e" #d_ "," str(e_) ")\n\t" +#undef a +#define a(a_,b_) "addl $" str(a_) "," mex(b_) "\n\t" +#undef m +#define m(a_,b_) "imul $" str(a_) "," mex(b_) "\n\t" +#undef pop +#define pop(a_) "popl %%e" str(a_) "\n\t" +#undef push +#define push(a_) "pushl %%e" str(a_) "\n\t" +#undef d +#define d(a_,b_) "idiv $" str(a_) "," mex(b_) "\n\t" +#undef shl +#define shl(a_,b_) "shl $" str(a_) "," mex(b_) "\n\t" +#undef shr +#define shr(a_,b_) "shr $" str(a_) "," mex(b_) "\n\t" +#undef mm +#define mm(a_,b_) "mov $" str(a_) "," mex(b_) "\n\t" +#undef ra +#define ra(a_,b_) "addl %%e" str(a_) "," mex(b_) "\n\t" +#undef rs +#define rs(a_,b_) "subl %%e" str(a_) "," mex(b_) "\n\t" + +#undef fl +#define fl(a_,b_) "fldl " str(a_) "(" mex(b_) ")\n\t" +#undef fp +#define fp(a_,b_) "fstpl " str(a_) "(" mex(b_) ")\n\t" +#undef fd +#define fd(a_) "fld " msx(a_) "\n\t" +#undef fap +#define fap(a_,b_) "faddp " msx(a_) "," msx(b_) "\n\t" +/* #define fsp(a_) fx(a_) "fsubp %%st," msx(a_) "\n\t" */ +#undef fsp +#define fsp(a_) "fsubrp %%st," msx(a_) "\n\t" +#undef fmp +#define fmp(a_,b_) "fmulp " msx(a_) "," msx(b_) "\n\t" +#undef fa +#define fa(a_,b_) "fadd " msx(a_) "," msx(b_) "\n\t" +#undef fm +#define fm(a_,b_) "fmul " msx(a_) "," msx(b_) "\n\t" +#undef faa +#define faa(a_,b_) "faddl " str(a_) "(" mex(b_) ")\n\t" +#undef fma +#define fma(a_,b_) "fmull " str(a_) "(" mex(b_) ")\n\t" +#undef fz +#define fz "fldz\n\t" +#undef fx +#define fx(a_) "fxch " msx(a_) "\n\t" 
+#undef fx1 +#define fx1 "fxch\n\t" +#undef fc +#define fc(a_) "fstp " msx(a_) "\n\t" + + +#ifndef ATHLON + + +#if defined(DREAL) || defined(DCPLX) +#undef SSESUF +#define SSESUF "d " +#undef RS4 +#define RS4 16 +#undef RS +#define RS 4 +#else +#undef SSESUF +#define SSESUF "s " +#undef RS4 +#define RS4 16 +#undef RS +#define RS 4 +#endif + +#undef mxx +#define mxx(a_) str(%%xmm ## a_) +#undef prp +#define prp(a_,b_) "rcpp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef prps +#define prps(a_,b_) "rcps" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pann +#define pann(a_,b_) "andnp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef psqs +#define psqs(a_,b_) "sqrts" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef por +#define por(a_,b_) "orp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pan +#define pan(a_,b_) "andp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pcm +#define pcm(a_,b_,c_) "cmpp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" +#undef pcms +#define pcms(a_,b_,c_) "cmps" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" +#undef pax +#define pax(a_,b_) "maxp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef paxs +#define paxs(a_,b_) "maxs" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pd +#define pd(a_,b_) "divp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pdsr +#define pdsr(a_,b_) "divs" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pxx +#define pxx(a_,b_) "xorp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef px +#define px(a_) "xorp" SSESUF mxx(a_) "," mxx(a_) "\n\t" +#undef pm +#define pm(a_,b_) "mulp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pa +#define pa(a_,b_) "addp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pmm +#define pmm(a_,b_,c_) "mulp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pam +#define pam(a_,b_,c_) "addp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pl +#define pl(a_,b_,c_) "movup" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pla +#define pla(a_,b_,c_) "movap" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pu +#define pu(a_,b_,c_) "movup" 
SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" +#undef punt +#define punt(a_,b_,c_) "movntp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" +#undef pua +#define pua(a_,b_,c_) "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" +#undef pud +#define pud(a_,b_,c_) "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" +#undef pudr +#define pudr(a_,b_) "movlp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pc +#define pc(a_,b_) "movap" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef ps +#define ps(a_,b_,c_) "shufp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" +#undef phl +#define phl(a_,b_) "movhlp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pus +#define pus(a_,b_,c_) "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" +#undef pls +#define pls(a_,b_,c_) "movs" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pld +#define pld(a_,b_,c_) "movlp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef plh +#define plh(a_,b_) "movlhp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pas +#define pas(a_,b_,c_) "adds" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pms +#define pms(a_,b_,c_) "muls" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pcs +#define pcs(a_,b_) "movs" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pasr +#define pasr(a_,b_) "adds" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pmsr +#define pmsr(a_,b_) "muls" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef pul +#define pul(a_,b_) "unpcklp" SSESUF mxx(a_) "," mxx(b_) "\n\t" +#undef puh +#define puh(a_,b_) "unpckhp" SSESUF mxx(a_) "," mxx(b_) "\n\t" + +#undef plsx +#define plsx(a_,b_,c_,d_,e_) \ + "movs" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" +#undef plx +#define plx(a_,b_,c_,d_,e_) \ + "movup" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" +#undef plax +#define plax(a_,b_,c_,d_,e_) \ + "movap" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" +#undef pasx +#define pasx(a_,b_,c_,d_,e_) \ + "adds" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ 
")," mxx(e_) "\n\t" +#undef pusx +#define pusx(a_,b_,c_,d_,e_) \ + "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" +#undef pux +#define pux(a_,b_,c_,d_,e_) \ + "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" +#undef puax +#define puax(a_,b_,c_,d_,e_) \ + "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" +#undef pudx +#define pudx(a_,b_,c_,d_,e_) \ + "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" + +#undef pldx +#define pldx(a_,b_,c_,d_,e_) \ + "movlp" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" + +#else + +#undef RS4 +#define RS4 8 +#undef RS +#define RS 2 + +#undef mxx +#define mxx(a_) str(%%mm ## a_) +#undef pul +#define pul(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t" +#undef puh +#define puh(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t" + +#undef px +#define px(a_) "pxor " mxx(a_) "," mxx(a_) "\n\t" +#undef pm +#define pm(a_,b_) "pfmul " mxx(a_) "," mxx(b_) "\n\t" +#undef pa +#define pa(a_,b_) "pfadd " mxx(a_) "," mxx(b_) "\n\t" +#undef pac +#define pac(a_,b_) "pfacc " mxx(a_) "," mxx(b_) "\n\t" +#undef pmm +#define pmm(a_,b_,c_) "pfmul " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pam +#define pam(a_,b_,c_) "pfadd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pl +#define pl(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pla +#define pla(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" +#undef pu +#define pu(a_,b_,c_) "movq " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" +#undef pc +#define pc(a_,b_) "movq " mxx(a_) "," mxx(b_) "\n\t" +#undef ps +#define ps(a_,b_,c_) "pswapd " mxx(b_) "," mxx(c_) "\n\t" +#undef phl +#define phl(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t" +#undef plh +#define plh(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t" +#undef pus +#define pus(a_,b_,c_) "movd " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" +#undef pls +#define pls(a_,b_,c_) "movd " str(a_) "(" mex(b_) ")," mxx(c_) 
"\n\t" + +#undef plsx +#define plsx(a_,b_,c_,d_,e_) \ + "movd " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" +#undef plx +#define plx(a_,b_,c_,d_,e_) \ + "movq " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" +#undef pasx +#define pasx(a_,b_,c_,d_,e_) \ + "addss " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" +#undef pusx +#define pusx(a_,b_,c_,d_,e_) \ + "movd " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" +#undef pux +#define pux(a_,b_,c_,d_,e_) \ + "movq " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" +#endif + +#endif /* CAMM_UTIL_H */ diff --git a/kaldi_io/src/tools/ATLAS/include/f77wrap_lapack.h b/kaldi_io/src/tools/ATLAS/include/f77wrap_lapack.h new file mode 100644 index 0000000..89417f7 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/f77wrap_lapack.h @@ -0,0 +1,91 @@ +/* + * Automatically Tuned Linear Algebra Software v3.8.3 + * (C) Copyright 1999 R. Clint Whaley + * + * Code contributers : R. Clint Whaley, Antoine P. Petitet + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions, and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the ATLAS group or the names of its contributers may + * not be used to endorse or promote products derived from this + * software without specific written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef F77WRAP_LAPACK_H +#define F77WRAP_LAPACK_H + +#include "atlas_misc.h" +#include "atlas_f77.h" + +#ifdef UpCase + #define PFW Mjoin(ATL_F77WRAP_,PREU) +#else + #define PFW Mjoin(atl_f77wrap_,PRE) +#endif + +#ifdef Add_ + #define F77WRAP_GETRI Mjoin(PFW,getri_) + #define F77WRAP_LAUUM Mjoin(PFW,lauum_) + #define F77WRAP_TRTRI Mjoin(PFW,trtri_) + #define F77WRAP_GETNB Mjoin(PFW,getnb_) + #define F77WRAP_GETRS Mjoin(PFW,getrs_) + #define F77WRAP_GETRF Mjoin(PFW,getrf_) + #define F77WRAP_GESV Mjoin(PFW,gesv_) + #define F77WRAP_POTRS Mjoin(PFW,potrs_) + #define F77WRAP_POTRF Mjoin(PFW,potrf_) + #define F77WRAP_POSV Mjoin(PFW,posv_) +#elif defined(Add__) + #define F77WRAP_GETRI Mjoin(PFW,getri__) + #define F77WRAP_LAUUM Mjoin(PFW,lauum__) + #define F77WRAP_TRTRI Mjoin(PFW,trtri__) + #define F77WRAP_GETNB Mjoin(PFW,getnb__) + #define F77WRAP_GETRS Mjoin(PFW,getrs__) + #define F77WRAP_GETRF Mjoin(PFW,getrf__) + #define F77WRAP_GESV Mjoin(PFW,gesv__) + #define F77WRAP_POTRS Mjoin(PFW,potrs__) + #define F77WRAP_POTRF Mjoin(PFW,potrf__) + #define F77WRAP_POSV Mjoin(PFW,posv__) +#elif defined(NoChange) + #define F77WRAP_GETRI Mjoin(PFW,getri) + #define F77WRAP_LAUUM Mjoin(PFW,lauum) + #define F77WRAP_TRTRI Mjoin(PFW,trtri) + #define F77WRAP_GETNB Mjoin(PFW,getnb) + #define F77WRAP_GETRS Mjoin(PFW,getrs) + #define F77WRAP_GETRF Mjoin(PFW,getrf) + #define F77WRAP_GESV Mjoin(PFW,gesv) + #define F77WRAP_POTRS Mjoin(PFW,potrs) + 
#define F77WRAP_POTRF Mjoin(PFW,potrf) + #define F77WRAP_POSV Mjoin(PFW,posv) +#elif defined(UpCase) + #define F77WRAP_GETRI Mjoin(PFW,GETRI) + #define F77WRAP_LAUUM Mjoin(PFW,LAUUM) + #define F77WRAP_TRTRI Mjoin(PFW,TRTRI) + #define F77WRAP_GETNB Mjoin(PFW,GETNB) + #define F77WRAP_GETRS Mjoin(PFW,GETRS) + #define F77WRAP_GETRF Mjoin(PFW,GETRF) + #define F77WRAP_GESV Mjoin(PFW,GESV) + #define F77WRAP_POTRS Mjoin(PFW,POTRS) + #define F77WRAP_POTRF Mjoin(PFW,POTRF) + #define F77WRAP_POSV Mjoin(PFW,POSV) +#endif + +#endif |