Diffstat (limited to 'kaldi_io/src/tools/ATLAS/include/contrib')
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h    188
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/Make.ext               39
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h            709
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h           1626
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h          295
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h          215
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h        2982
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h          331
-rw-r--r--   kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h           508
9 files changed, 6893 insertions, 0 deletions
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h
new file mode 100644
index 0000000..118d3de
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h
@@ -0,0 +1,188 @@
+#ifdef GER
+#undef NO_TRANSPOSE
+#define NO_TRANSPOSE
+#endif
+
+
+#if NDPM > 4
+#error Max NDPM is 4
+#endif
+
+#if !defined(ATL_SSE1) && ( defined(SREAL) || defined(SCPLX) )
+#error This routine needs ATL_SSE1 defined
+#endif
+
+#if !defined(ATL_SSE2) && ( defined(DREAL) || defined(DCPLX) )
+#error This routine needs ATL_SSE2 defined
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "camm_util.h"
+
+#ifndef GER
+#if defined(BETAX) || defined(BETAXI0)
+#include "camm_scale.h"
+#endif
+#endif
+
+#if NDPM >= 4
+#define EXT4 Mjoin(4dp,BLC)
+#undef NDP
+#define NDP 4
+#undef EXT
+#define EXT EXT4
+#include "camm_dpa.h"
+#endif
+
+#if NDPM >= 3
+#define EXT3 Mjoin(3dp,BLC)
+#undef NDP
+#define NDP 3
+#undef EXT
+#define EXT EXT3
+#include "camm_dpa.h"
+#endif
+
+#if NDPM >= 2
+#define EXT2 Mjoin(2dp,BLC)
+#undef NDP
+#define NDP 2
+#undef EXT
+#define EXT EXT2
+#include "camm_dpa.h"
+#endif
+
+#define EXT1 Mjoin(1dp,BLC)
+#undef NDP
+#define NDP 1
+#undef EXT
+#define EXT EXT1
+#include "camm_dpa.h"
+
+#undef NDP
+#define NDP NDPM
+#undef EXT
+#define EXT Mjoin(Mjoin(NDP,Mjoin(dp,BLC)),m)
+#include "camm_dpa.h"
+
+#ifdef GER
+#if defined(SCPLX) || defined(DCPLX)
+#ifdef Conj_
+#define IM 1c
+#else
+#define IM 1u
+#endif
+#else
+#define IM 1
+#endif
+
+
+#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),Mjoin(ger,IM)),_a1_x1_yX)
+
+#undef MY_FUNCTION
+#define MY_FUNCTION FN
+
+void
+MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *c,
+ int cinc,const TYPE *b,int binc,
+ TYPE *a,int lda) {
+
+#else
+
+
+#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),gemv),Mjoin(FEXT,Mjoin(_a1_x1_,Mjoin(BL,_y1))))
+
+#undef MY_FUNCTION
+#define MY_FUNCTION FN
+
+void
+MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *a,
+ int lda,const TYPE *b,int binc,
+ const SCALAR beta,TYPE *c,int cinc) {
+
+#endif
+
+ int i,mm,nn;
+ const TYPE *ae;
+#ifdef NO_TRANSPOSE
+ int len=m,w=n;
+#define zz b
+#else
+ int len=n,w=m;
+#define zz c
+#endif
+
+#ifdef GER
+#define zzinc binc
+#else
+#define zzinc 1
+
+
+#if defined(NO_TRANSPOSE) && defined(BETA0)
+ memset(c,0,m*sizeof(*c));
+#endif
+
+#if defined(BETAX) || defined(BETAXI0)
+#if defined(SCPLX) || defined(DCPLX)
+ SCALE(beta,c,m);
+#endif
+#if defined(SREAL) || defined(DREAL)
+ SCALE(&beta,c,m);
+#endif
+#endif
+
+#endif
+
+ ae=a+w*lda;
+ nn=STRIDE*lda;
+
+
+#if NDPM == 1
+ for (;a<ae;a+=lda,zz+=zzinc)
+ Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len);
+
+#else
+
+ while (a+NDPM*nn<=ae) {
+ for (i=0;i<STRIDE;i++,a+=lda,zz+=zzinc)
+ Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len);
+
+ a+=(NDPM-1)*nn;
+ zz+=(NDPM-1)*STRIDE*zzinc;
+ }
+
+ for (i=0;a<ae && i<STRIDE;i++,a+=lda,zz+=zzinc) {
+
+ mm=(ae-a)/nn;
+#if STRIDE > 1
+ if (((ae-a)/lda)%STRIDE)
+ mm++;
+#endif
+
+ if (mm == 1)
+ Mjoin(dp,EXT1)(a,nn,b,c,STRIDE*zzinc,len);
+
+#if ( NDPM == 2 && STRIDE > 1 ) || NDPM > 2
+ else if (mm == 2)
+ Mjoin(dp,EXT2)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+#if ( NDPM == 3 && STRIDE > 1 ) || NDPM > 3
+ else if (mm == 3)
+ Mjoin(dp,EXT3)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+#if ( NDPM == 4 && STRIDE > 1 ) || NDPM > 4
+ else if (mm == 4)
+ Mjoin(dp,EXT4)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+
+ }
+
+#endif
+
+}
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext
new file mode 100644
index 0000000..f7f9a0a
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext
@@ -0,0 +1,39 @@
+
+topd = /home/whaley/atlas3.8/AtlasBase
+incs = -def topd /home/whaley/atlas3.8/AtlasBase \
+ -def incd /home/whaley/atlas3.8/AtlasBase/Clint \
+ -def BASEdir /home/whaley/atlas3.8/AtlasBase/Antoine/ \
+ -def basd /home/whaley/atlas3.8/AtlasBase/Clint
+ext = extract
+extF = $(ext) -langF -lnlen71 -Remtblank -llwarn2 -LAPACK1 $(incs)
+extC = $(ext) -langC -lnlen79 -Remtblank -llwarn2 $(incs)
+extM = $(ext) -langM -lnlen79 -llwarn2 $(incs)
+
+default: all
+force_build:
+basd = /home/whaley/atlas3.8/AtlasBase/Clint
+basdRCW = /home/whaley/atlas3.8/AtlasBase/Clint
+basdAPP = /home/whaley/atlas3.8/AtlasBase/Antoine
+incf = /home/whaley/atlas3.8/AtlasBase/gen.inc
+
+files = ATL_gemv_ger_SSE.h SSE3Dnow.h camm_dpa.h camm_pipe3.h camm_scale.h \
+ camm_strat1.h camm_tpipe.h camm_util.h
+
+all : $(files)
+
+camm_strat1.h : $(topd)/kernel/CammMaguire/camm_strat1.h
+ cp $(topd)/kernel/CammMaguire/camm_strat1.h .
+camm_tpipe.h : $(topd)/kernel/CammMaguire/camm_tpipe.h
+ cp $(topd)/kernel/CammMaguire/camm_tpipe.h .
+camm_pipe3.h : $(topd)/kernel/CammMaguire/camm_pipe3.h
+ cp $(topd)/kernel/CammMaguire/camm_pipe3.h .
+ATL_gemv_ger_SSE.h : $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h
+ cp $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h .
+camm_util.h : $(topd)/kernel/CammMaguire/camm_util.h
+ cp $(topd)/kernel/CammMaguire/camm_util.h .
+camm_scale.h : $(topd)/kernel/CammMaguire/camm_scale.h
+ cp $(topd)/kernel/CammMaguire/camm_scale.h .
+camm_dpa.h : $(topd)/kernel/CammMaguire/camm_dpa.h
+ cp $(topd)/kernel/CammMaguire/camm_dpa.h .
+SSE3Dnow.h : $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h
+ cp $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h .
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h
new file mode 100644
index 0000000..a783749
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h
@@ -0,0 +1,709 @@
+#if !defined(ATL_GAS_x8632) && !defined(ATL_GAS_x8664)
+ #error "This kernel requires gas x86 assembler!"
+#endif
+#ifndef Mstr /* Added by RCW to make multiline macros work */
+ #define Mstr2(m) # m
+ #define Mstr(m) Mstr2(m)
+#endif
+/* The meaning of the defined macros is as follows:
+ * VECLEN: The length of a single precision vector register
+ * vec_add: Add two single precision vectors.
+ * vec_mul: Multiply two single precision vectors.
+ * vec_mov: Moves data around
+ * vec_mov1: Load one element in a vector and zero all other entries!
+ * vec_splat: Load one element replicated in all positions in the vector.
+ * vec_load_apart: Load elements from different memory positions into a register.
+ * vec_sum: Sums a register.
+ * vec_store_one: Stores lowest element in vector to memory, no zero-extend!
+ * Meaning of suffixes is as follows:
+ * mr means memory to register
+ * rr means register to register
+ * rm means register to memory
+ * a means that the instruction needs aligned data
+ * 1 means that the instruction only operates on the lowest element of the
+ * vector.
+ *
+ * The _1 instructions work under one important assumption: That you never mix
+ * them with regular instructions, e.g. loading into a register with a normal
+ * mov, and then using add_rr_1 will not work under 3dnow! since it is in
+ * reality a normal add. However, if using a mov_1 first, the upper part of
+ * the register will be zeroed, and it will therefore work. The _1 system is
+ * more robust under SSE, but other architectures might be implemented the
+ * same way as 3dnow!
+ *
+ * RCW: I added the following functionality for SSE only (note that vw may
+ * be overwritten with intermediate results, but is not used as input,
+ * and that all input arrays may be overwritten with intermediate results.
+ * VL = vector length - 1):
+ * vec_red(vd, vw) : vd[0] = sum(vd[0:VL])
+ * vec_red2(v1, v2, vw) : v1[0] = sum(v1[0:VL]); v1[1] = sum(v2[0:VL])
+ * vec_red4(v0, v1, v2, v3, vw1, vw2) :
+ * v0[0] = sum(v0[0:VL]); v0[1] = sum(v1[0:VL])
+ * if type = double:
+ * v2[0] = sum(v2[0:VL]); v2[1] = sum(v3[0:VL])
+ * else
+ * v0[2] = sum(v2[0:VL]); v0[3] = sum(v3[0:VL])
+ * vec_zero(vd) : vd[0:VL] = 0.0
+ */
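
To make the register-naming convention above concrete, here is a minimal usage sketch of a single-precision dot product built only from the macros documented in this header. It is illustrative, not part of the file: it assumes a 32-bit x86/GNU-as build with the SSE macro set selected, that n is a multiple of VECLEN, and that the surrounding code is compiled so the compiler itself does not touch xmm0-xmm3 between statements (the same assumption the ATLAS kernels make); the function name sdot_sketch is hypothetical.

    /* Assumed build setup for the sketch. */
    #define ATL_GAS_x8632
    #define SSE
    #include "SSE3Dnow.h"

    static float sdot_sketch(const float *x, const float *y, int n)
    {
        vector buf;                       /* typedef float vector[VECLEN]          */
        int i;
        vec_zero(reg2);                   /* accumulator xmm2 <- 0                 */
        for (i = 0; i < n; i += VECLEN) {
            vec_mov_mr(x + i, reg0);      /* movups: unaligned load of x[i..i+3]   */
            vec_mov_mr(y + i, reg1);      /* movups: unaligned load of y[i..i+3]   */
            vec_mul_rr(reg0, reg1);       /* xmm1 *= xmm0, element-wise            */
            vec_add_rr(reg1, reg2);       /* xmm2 += xmm1                          */
        }
        vec_red(reg2, reg3);              /* xmm2[0] = sum of all VECLEN lanes     */
        vec_mov_rm(reg2, buf);            /* spill the register and read lane 0    */
        return buf[0];
    }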
+
+
+/* Things to try:
+ * Non-temporal stores
+ * Sequences of instructions instead of movups
+ *
+ *
+ *
+ *
+ */
+
+
+
+#define gen_vec_rr(op,reg1,reg2) \
+ __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \
+ : /* nothing */ \
+ : /* nothing */)
+
+
+#define w(p) p
+
+#define nop() __asm__ __volatile__ ("nop")
+
+#define rep() __asm__ __volatile__ ("rep")
+
+#define align() __asm__ __volatile__ (".align 16")
+
+
+#ifdef x87double
+
+#define st0 %%st(0)
+#define st1 %%st(1)
+#define st2 %%st(2)
+#define st3 %%st(3)
+#define st4 %%st(4)
+#define st5 %%st(5)
+#define st6 %%st(6)
+#define st7 %%st(7)
+
+
+#define gen_stack_rt(op,reg) \
+ __asm__ __volatile__ (#op " " #reg \
+ : /* nothing */ \
+ : /* nothing */)
+
+#define gen_stack_tr(op,reg) \
+ __asm__ __volatile__ (#op " %%st(0)," #reg \
+ : \
+ : )
+
+
+#define gen_stack_rr(op,reg1,reg2) \
+ __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \
+ : /* nothing */ \
+ : /* nothing */)
+
+#define gen_stack_t(op) \
+ __asm__ __volatile__ (#op \
+ : /* nothing */ \
+ : /* nothing */)
+
+
+#define gen_stack_tm(op,mem) \
+ __asm__ __volatile__ (#op " %0" \
+ : "=m" (((mem)[0])) \
+ : )
+
+#define gen_stack_mt(op,mem) \
+ __asm__ __volatile__ (#op " %0" \
+ : \
+ : "m" (((mem)[0])))
+
+
+#define stack_mov_mt_push(mem) gen_stack_mt(fldl,mem)
+
+#define stack_add_tr_pop(reg) gen_stack_tr(faddp,reg)
+#define stack_add_mt(mem) gen_stack_mt(faddl,mem)
+
+#define stack_mul_tr(reg) gen_stack_tr(fmul,reg)
+#define stack_mul_tr_pop(reg) gen_stack_tr(fmulp,reg)
+#define stack_mul_mt(mem) gen_stack_mt(fmul,mem)
+
+#define stack_mov_tm_pop(mem) gen_stack_tm(fstpl,mem)
+
+#define stack_zero_push() gen_stack_t(fldz)
+
+#endif /* x87double */
+
+#ifdef SSE
+
+/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to
+ * load/store from misaligned addresses using movups at a cost of some cycles. Loading
+ * using mul/add must always be aligned. Alignment is 16 bytes.
+ * No muladd.
+ */
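
In practice this means only the movups-based forms (vec_mov_mr / vec_mov_rm) may be used on arbitrary addresses, while the _a forms and the memory-operand add/mul macros require 16-byte alignment. A caller that cannot guarantee alignment might select the load form at run time, roughly as in this sketch (the helper name and the choice of reg0 are illustrative, assuming the same SSE build setup as above):

    /* Hypothetical helper: load VECLEN floats from p into reg0, using the
     * aligned form only when the address allows it. */
    static void load_vec0(const float *p)
    {
        if (((unsigned long)p & 0xF) == 0)
            vec_mov_mr_a(p, reg0);   /* movaps: faster, faults on a misaligned address  */
        else
            vec_mov_mr(p, reg0);     /* movups: always legal, costs a few extra cycles  */
    }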
+
+
+
+#define gen_vec_mr(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, " #reg \
+ : /* nothing */ \
+ : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3])))
+
+
+#define gen_vec_rm(op,reg,mem) \
+ __asm__ __volatile__ (#op " " #reg ", %0" \
+ : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \
+ : /* nothing */ )
+
+
+
+
+#define VECLEN 4
+
+#define reg0 %%xmm0
+#define reg1 %%xmm1
+#define reg2 %%xmm2
+#define reg3 %%xmm3
+#define reg4 %%xmm4
+#define reg5 %%xmm5
+#define reg6 %%xmm6
+#define reg7 %%xmm7
+#ifdef ATL_GAS_x8664
+ #define reg8 %%xmm8
+ #define reg9 %%xmm9
+ #define reg10 %%xmm10
+ #define reg11 %%xmm11
+ #define reg12 %%xmm12
+ #define reg13 %%xmm13
+ #define reg14 %%xmm14
+ #define reg15 %%xmm15
+#endif
+
+#define vec_mov_mr(mem,reg) gen_vec_mr(movups,mem,reg)
+#define vec_mov_rm(reg,mem) gen_vec_rm(movups,reg,mem)
+#define vec_mov_mr_a(mem,reg) gen_vec_mr(movaps,mem,reg)
+#define vec_mov_rm_a(reg,mem) gen_vec_rm(movaps,reg,mem)
+#define vec_mov_rr(reg1,reg2) gen_vec_rr(movaps,reg1,reg2)
+
+#define vec_add_mr_a(mem,reg) gen_vec_mr(addps,mem,reg)
+#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulps,mem,reg)
+
+#define vec_add_rr(mem,reg) gen_vec_rr(addps,mem,reg)
+#define vec_mul_rr(mem,reg) gen_vec_rr(mulps,mem,reg)
+
+#define vec_mov_mr_1(mem,reg) gen_vec_mr(movss,mem,reg)
+#define vec_mov_rm_1(reg,mem) gen_vec_rm(movss,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movss,reg1,reg2)
+
+#define vec_add_mr_1(mem,reg) gen_vec_mr(addss,mem,reg)
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addss,reg1,reg2)
+
+#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulss,mem,reg)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulss,reg1,reg2)
+
+#define vec_unpack_low(reg1,reg2) gen_vec_rr(unpcklps,reg1,reg2)
+#define vec_unpack_high(reg1,reg2) gen_vec_rr(unpckhps,reg1,reg2)
+#define vec_shuffle(mode,reg1,reg2) vec_shuffle_wrap(mode,reg1,reg2)
+#define vec_shuffle_wrap(mode,reg1,reg2) \
+ __asm__ __volatile__ ("shufps " #mode ", " #reg1 ", " #reg2 \
+ : /* nothing */\
+ : /* nothing */)
+
+/* Hack! */
+/* To use this instruction be sure that register 7 is not in use!!! */
+/* It must be possible to reduce this sequence to only four instructions.
+ * please tell me how! */
+#define vec_sum(reg) vec_sum_wrap(reg)
+#define vec_sum_wrap(reg) \
+ __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\
+ "addps " #reg ", %%xmm7\n"\
+ "movaps %%xmm7, " #reg "\n"\
+ "shufps $1, " #reg ", %%xmm7\n"\
+ "addss %%xmm7, " #reg "\n"\
+ : /* nothing */\
+ : /* nothing */)
+
+/* RCW: added to safely replace vec_sum (vec reduce), and use SSE3 when avail */
+#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::)
+#ifdef ATL_SSE3
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__("haddps " Mstr(vr) ", " Mstr(vr) "\n"\
+ "haddps " Mstr(vr) ", " Mstr(vr) "\n" ::)
+/*
+ * haddps v1 v0 # v0 = {v1cd, v1ab, v0cd, v0ab}
+ * haddps v0 v0 # v0 = {v1abcd, v0abcd, v1abcd, v0abcd}
+ */
+ #define vec_red2(v0, v1, vwork) \
+ __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\
+ "haddps " Mstr(v0) ", " Mstr(v0) "\n" ::)
+/*
+ * haddps v1, v0 # v0 = {v1cd,v1ab,v0cd,v0ab}
+ * haddps v3, v2 # v2 = {v3cd,v3ab,v2cd,v2ab}
+ * haddps v2, v0 # v0 = {v3abcd,v2abcd,v1abcd, v0abcd}
+ */
+ #define vec_red4(v0, v1, v2, v3, w0, w1) \
+ __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\
+ "haddps " Mstr(v3) ", " Mstr(v2) "\n"\
+ "haddps " Mstr(v2) ", " Mstr(v0) "\n" ::)
+#elif defined(ATL_SSE2)
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\
+ "pshufd $0xE5, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\
+ ::)
+#else
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__ ("movhlps " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\
+ "movaps " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "shufps $0xE5, " Mstr(vr) ", " Mstr(vr) "\n"\
+ "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\
+ ::)
+#endif
+#ifndef ATL_SSE3 /* codes that are the same for SSE2 and SSE1 */
+/*
+ # v0 = {v0d,v0c,v0b,v0a}
+ # v1 = {v1d,v1c,v1b,v1a}
+ movaps v0, vw # vw = {v0d,v0c,v0b,v0a}
+ unpacklps v1, v0 # v0 = {v1b,v0b,v1a,v0a}
+ unpackhps v1, vw # vw = {v1d,v0d,v1c,v0c}
+ addps vw, v0 # v0 = {v1bd,v0bd,v1ac,v0ac}
+ movhlps v0, vw # vw = {X , X,v1bd,v0bd}
+ addps vw, v0 # v0 = {X , X,v1abcd,v0abcd}
+*/
+ #define vec_red2(v0, v1, vw) \
+ __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(vw) "\n"\
+ "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\
+ "unpckhps " Mstr(v1) ", " Mstr(vw) "\n"\
+ "addps " Mstr(vw) ", " Mstr(v0) "\n"\
+ "movhlps " Mstr(v0) ", " Mstr(vw) "\n"\
+ "addps " Mstr(vw) ", " Mstr(v0) "\n"\
+ ::)
+/*
+ * movaps v0, w0 # w0 = {v0d, v0c, v0b, v0a}
+ * unpcklps v1, v0 # v0 = {v1b, v0b, v1a, v0a}
+ * movaps v2, w1 # w1 = {v2d, v2c, v2b, v2a}
+ * unpckhps v1, w0 # w0 = {v1d, v0d, v1c, v0c}
+ * unpcklps v3, v2 # v2 = {v3b, v2b, v3a, v2a}
+ * addps w0, v0 # v0 = {v1bd, v0bd, v1ac, v0ac}
+ * unpckhps v3, w1 # w1 = {v3d, v2d, v3c, v2c}
+ * movaps v0, w0 # w0 = {v1bd, v0bd, v1ac, v0ac}
+ * addps w1, v2 # v2 = {v3bd, v2bd, v3ac, v2ac}
+ * shufps $0x44,v2,v0 # v0 = {v3ac, v2ac, v1ac, v0ac}
+ * shufps $0xEE,v2,w0 # w0 = {v3bd, v2bd, v1bd, v0bd}
+ * addps w0, v0 # v0 = {v3abcd, v2abcd, v1abcd, v0abcd}
+ */
+ #define vec_red4(v0, v1, v2, v3, w0, w1) \
+ __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(w0) "\n"\
+ "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\
+ "movaps " Mstr(v2) ", " Mstr(w1) "\n"\
+ "unpckhps " Mstr(v1) ", " Mstr(w0) "\n"\
+ "unpcklps " Mstr(v3) ", " Mstr(v2) "\n"\
+ "addps " Mstr(w0) ", " Mstr(v0) "\n"\
+ "unpckhps " Mstr(v3) ", " Mstr(w1) "\n"\
+ "movaps " Mstr(v0) ", " Mstr(w0) "\n"\
+ "addps " Mstr(w1) ", " Mstr(v2) "\n"\
+ "shufps $0x44, " Mstr(v2) ", " Mstr(v0) "\n"\
+ "shufps $0xEE, " Mstr(v2) ", " Mstr(w0) "\n"\
+ "addps " Mstr(w0) ", " Mstr(v0) "\n"\
+ ::)
+#endif
+
+#define vec_splat(mem,reg) vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+ __asm__ __volatile__ ("movss %0, " #reg "\n"\
+ "unpcklps " #reg ", " #reg "\n"\
+ "movlhps " #reg ", " #reg "\n"\
+ : /* nothing */ \
+ : "m" ((mem)[0]))
+
+
+/* This instruction sequence appears courtesy of Camm Maguire. */
+#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1)
+#define vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) \
+ __asm__ __volatile__ ("movaps " #reg0 "," #empty0 "\n"\
+ "unpcklps " #reg1 "," #reg0 "\n"\
+ "movaps " #reg2 "," #empty1 "\n"\
+ "unpckhps " #reg1 "," #empty0 "\n"\
+ "unpcklps " #reg3 "," #reg2 "\n"\
+ "addps " #empty0 "," #reg0 "\n"\
+ "unpckhps " #reg3 "," #empty1 "\n"\
+ "movaps " #reg0 "," #regout "\n"\
+ "addps " #empty1 "," #reg2 "\n"\
+ "shufps $0x44," #reg2 "," #reg0 "\n"\
+ "shufps $0xee," #reg2 "," #regout "\n"\
+ "addps " #reg0 "," #regout "\n"\
+ : /* nothing */ \
+ : /* nothing */)
+
+
+
+typedef float vector[VECLEN];
+
+#endif /* end ifdef SSE */
+
+
+#ifdef SSE2
+
+/* Peculiarities of SSE2: Alignment is good, but not mandatory. It is possible to
+ * load/store from misaligned addresses using movupd at a cost of some cycles. Loading
+ * using mul/add must always be aligned. Alignment is 16 bytes.
+ * No muladd.
+ */
+
+
+
+#define gen_vec_mr(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, " #reg \
+ : /* nothing */ \
+ : "m" (((mem)[0])), "m" (((mem)[1])))
+
+
+#define gen_vec_rm(op,reg,mem) \
+ __asm__ __volatile__ (#op " " #reg ", %0" \
+ : "=m" (((mem)[0])), "=m" (((mem)[1])) \
+ : /* nothing */ )
+
+
+
+
+#define VECLEN 2
+
+#define reg0 %%xmm0
+#define reg1 %%xmm1
+#define reg2 %%xmm2
+#define reg3 %%xmm3
+#define reg4 %%xmm4
+#define reg5 %%xmm5
+#define reg6 %%xmm6
+#define reg7 %%xmm7
+#ifdef ATL_GAS_x8664
+ #define reg8 %%xmm8
+ #define reg9 %%xmm9
+ #define reg10 %%xmm10
+ #define reg11 %%xmm11
+ #define reg12 %%xmm12
+ #define reg13 %%xmm13
+ #define reg14 %%xmm14
+ #define reg15 %%xmm15
+#endif
+
+
+#define vec_mov_mr(mem,reg) gen_vec_mr(movupd,mem,reg)
+#define vec_mov_rm(reg,mem) gen_vec_rm(movupd,reg,mem)
+#define vec_mov_mr_a(mem,reg) gen_vec_mr(movapd,mem,reg)
+#define vec_mov_rm_a(reg,mem) gen_vec_rm(movapd,reg,mem)
+#define vec_mov_rr(reg1,reg2) gen_vec_rr(movapd,reg1,reg2)
+
+#define vec_add_mr_a(mem,reg) gen_vec_mr(addpd,mem,reg)
+#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulpd,mem,reg)
+
+#define vec_add_rr(mem,reg) gen_vec_rr(addpd,mem,reg)
+#define vec_mul_rr(mem,reg) gen_vec_rr(mulpd,mem,reg)
+
+#define vec_mov_mr_1(mem,reg) gen_vec_mr(movsd,mem,reg)
+#define vec_mov_rm_1(reg,mem) gen_vec_rm(movsd,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movsd,reg1,reg2)
+
+#define vec_add_mr_1(mem,reg) gen_vec_mr(addsd,mem,reg)
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addsd,reg1,reg2)
+
+#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulsd,mem,reg)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulsd,reg1,reg2)
+
+#define vec_splat(mem,reg) vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+ __asm__ __volatile__ ("movsd %0, " #reg "\n"\
+ "unpcklpd " #reg ", " #reg \
+ : /* nothing */ \
+ : "m" ((mem)[0]))
+
+/* Hack! */
+/* To use this instruction be sure that register 7 is not in use!!! */
+#define vec_sum(reg) vec_sum_wrap(reg)
+#define vec_sum_wrap(reg) \
+ __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\
+ "addpd %%xmm7, " #reg "\n"\
+ : /* nothing */\
+ : /* nothing */)
+/*
+ * Added by RCW to improve performance and avoid xmm7 hack (replace vec_sum)
+ */
+#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::)
+#ifdef ATL_SSE3
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__("haddpd " Mstr(vr) ", " Mstr(vr) "\n" ::)
+ #define vec_red2(v0, v1, vw) \
+ __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n" ::)
+ #define vec_red4(v0, v1, v2, v3, w0, w1) \
+ __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n"\
+ "haddpd " Mstr(v3) ", " Mstr(v2) "\n"\
+ ::)
+#else
+ #define vec_red(vr, vwrk) \
+ __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+ "addsd " Mstr(vwrk) ", " Mstr(vr) "\n" ::)
+/*
+ * movapd v0, vw # vw = {v0b, v0a}
+ * unpcklpd v1,v0 # v0 = {v1a, v0a}
+ * unpckhpd v1, vw # vw = {v1b, v0b}
+ * addpd vw, v0 # v0 = {v1ab,v0ab}
+ */
+ #define vec_red2(v0, v1, vw) \
+ __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(vw) "\n"\
+ "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\
+ "unpckhpd " Mstr(v1) ", " Mstr(vw) "\n"\
+ "addpd " Mstr(vw) ", " Mstr(v0) "\n"\
+ ::)
+/*
+ * movapd v0, w0 # w0 = {v0b, v0a}
+ * movapd v2, w1 # w1 = {v2b, v2a}
+ * unpcklpd v1, v0 # v0 = {v1a, v0a}
+ * unpcklpd v3, v2 # v2 = {v3a, v2a}
+ * unpckhpd v1, w0 # w0 = {v1b, v0b}
+ * unpckhpd v3, w1 # w1 = {v3b, v2b}
+ * addpd w0, v0 # v0 = {v1ab, v0ab}
+ * addpd w1, v2 # v2 = {v3ab, v2ab}
+ */
+ #define vec_red4(v0, v1, v2, v3, w0, w1) \
+ __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(w0) "\n"\
+ "movapd " Mstr(v2) ", " Mstr(w1) "\n"\
+ "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\
+ "unpcklpd " Mstr(v3) ", " Mstr(v2) "\n"\
+ "unpckhpd " Mstr(v1) ", " Mstr(w0) "\n"\
+ "unpckhpd " Mstr(v3) ", " Mstr(w1) "\n"\
+ "addpd " Mstr(w0) ", " Mstr(v0) "\n"\
+ "addpd " Mstr(w1) ", " Mstr(v2) "\n"\
+ ::)
+#endif
+
+#define vec_sum_full(reg1,reg2,empty1) vec_sum_full_wrap(reg1,reg2,empty1)
+#define vec_sum_full_wrap(reg1,reg2,empty1) \
+ __asm__ __volatile__ ("movhlps " #reg2 ", " #empty1 "\n"\
+ "movlhps " #reg2 ", " #empty1 "\n"\
+ "addpd " #empty1 ", " #reg1 "\n"\
+ : /* nothing */\
+ : /* nothing */)
+
+
+typedef double vector[VECLEN];
+
+#endif /* end ifdef SSE2 */
+
+
+#ifdef THREEDNOW
+
+/* Peculiarities of 3DNOW: Alignment is not an issue;
+ * all alignments are legal, although alignment gives a speed increase.
+ * The vec_acc instruction can be used to sum two registers at once more efficiently
+ * than a series of vec_sum and vec_store_one.
+ * No muladd.
+ */
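
As a small illustration of that last point, two accumulator registers holding partial sums can be reduced and stored together with a single vec_acc_rr instead of separate vec_sum / single-element stores. A sketch, assuming the THREEDNOW macro set is selected and that reg0/reg1 already hold the partial sums (the helper name and out buffer are hypothetical):

    /* Hypothetical helper: reg0 and reg1 each hold two partial sums. */
    static void store_two_sums(float *out)   /* out must have room for 2 floats */
    {
        vec_acc_rr(reg1, reg0);   /* pfacc: reg0 = { sum(reg1 lanes), sum(reg0 lanes) } */
        vec_mov_rm(reg0, out);    /* out[0] = sum of reg0, out[1] = sum of reg1         */
        vec_exit();               /* femms: leave 3DNow!/MMX state afterwards           */
    }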
+
+
+#define gen_vec_mr(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, " #reg \
+ : /* nothing */ \
+ : "m" (((mem)[0])), "m" (((mem)[1])))
+
+#define gen_vec_rm(op,reg,mem) \
+ __asm__ __volatile__ (#op " " #reg ", %0" \
+ : "=m" (((mem)[0])), "=m" (((mem)[1])) \
+ : /* nothing */ )
+
+
+
+
+#define VECLEN 2
+
+#define reg0 %%mm0
+#define reg1 %%mm1
+#define reg2 %%mm2
+#define reg3 %%mm3
+#define reg4 %%mm4
+#define reg5 %%mm5
+#define reg6 %%mm6
+#define reg7 %%mm7
+
+#define vec_add_mr(mem,reg) gen_vec_mr(pfadd,mem,reg)
+#define vec_mul_mr(mem,reg) gen_vec_mr(pfmul,mem,reg)
+#define vec_mov_mr(mem,reg) gen_vec_mr(movq,mem,reg)
+#define vec_mov_rm(reg,mem) gen_vec_rm(movq,reg,mem)
+#define vec_add_rr(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2)
+#define vec_mul_rr(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2)
+#define vec_acc_rr(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2)
+#define vec_mov_rr(reg1,reg2) gen_vec_rr(movq,reg1,reg2)
+
+#define vec_sum(reg) gen_vec_rr(pfacc,reg,reg)
+#define vec_sum_full(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2)
+
+#define vec_mov_mr_1(mem,reg) gen_vec_mr(movd,mem,reg)
+#define vec_mov_rm_1(reg,mem) gen_vec_rm(movd,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movd,reg1,reg2)
+
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2)
+
+
+#define vec_splat(mem,reg) vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+ __asm__ __volatile__ ("movd %0, " #reg "\n"\
+ "punpckldq " #reg ", " #reg \
+ : /* nothing */ \
+ : "m" ((mem)[0]))
+
+
+#define vec_load_apart(mem1,mem2,reg) vec_load_apart_wrap(mem1,mem2,reg)
+#define vec_load_apart_wrap(mem1,mem2,reg) \
+ __asm__ __volatile__ ("movd %0, " #reg "\n"\
+ "punpckldq %1, " #reg \
+ : /* nothing */ \
+ : "m" ((mem1)[0]), "m" (((mem2)[0])))
+
+
+#define vec_zero(reg) gen_vec_rr(pxor,reg,reg)
+
+#define vec_enter() __asm__ __volatile__ ("femms")
+#define vec_exit() __asm__ __volatile__ ("femms")
+
+#define align() __asm__ __volatile__ (".align 16")
+
+
+typedef float vector[VECLEN];
+
+#endif
+
+
+
+
+
+#ifdef ALTIVEC
+
+#define VECLEN 4
+
+#define reg0 %%vr0
+#define reg1 %%vr1
+#define reg2 %%vr2
+#define reg3 %%vr3
+#define reg4 %%vr4
+#define reg5 %%vr5
+#define reg6 %%vr6
+#define reg7 %%vr7
+#define reg8 %%vr8
+#define reg9 %%vr9
+#define reg10 %%vr10
+#define reg11 %%vr11
+#define reg12 %%vr12
+#define reg13 %%vr13
+#define reg14 %%vr14
+#define reg15 %%vr15
+#define reg16 %%vr16
+#define reg17 %%vr17
+#define reg18 %%vr18
+#define reg19 %%vr19
+#define reg20 %%vr20
+#define reg21 %%vr21
+#define reg22 %%vr22
+#define reg23 %%vr23
+#define reg24 %%vr24
+#define reg25 %%vr25
+#define reg26 %%vr26
+#define reg27 %%vr27
+#define reg28 %%vr28
+#define reg29 %%vr29
+#define reg30 %%vr30
+#define reg31 %%vr31
+
+#define gen_vec_mr(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, " #reg \
+ : /* nothing */ \
+ : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3])))
+
+
+#define gen_vec_rm(op,reg,mem) \
+ __asm__ __volatile__ (#op " " #reg ", %0" \
+ : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \
+ : /* nothing */ )
+
+
+#define gen_alti3(op,reg1,reg2,regout) \
+ __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout \
+ : /* nothing */ \
+ : /* nothing */)
+
+#define gen_alti_muladd(op,reg1,reg2,regout) \
+ __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout ", " #regout \
+ : /* nothing */ \
+ : /* nothing */)
+
+
+
+#define vec_mov_mr_a(mem,reg) gen_vec_mr(lvx,mem,reg)
+#define vec_mov_rm_a(reg,mem) gen_vec_rm(svx,reg,mem)
+#define vec_muladd(reg1,reg2,regout) gen_alti3(vmaddfp,reg1,reg2,regout)
+
+#define vec_zero(reg) gen_alti3(vxor,reg,reg,reg)
+
+
+typedef float vector[VECLEN];
+
+#endif
+
+
+#ifdef ALTIVEC_C
+
+/* These macros have been written by, or greatly inspired by,
+ * Nicholas A. Coult. Thanks.
+ */
+
+/* assumes that last four registers are not in use! */
+#define transpose(x0,x1,x2,x3) \
+reg28 = vec_mergeh(x0,x2); \
+reg29 = vec_mergeh(x1,x3); \
+reg30 = vec_mergel(x0,x2); \
+reg31 = vec_mergel(x1,x3); \
+x0 = vec_mergeh(reg28,reg29); \
+x1 = vec_mergel(reg28,reg29); \
+x2 = vec_mergeh(reg30,reg31); \
+x3 = vec_mergel(reg30,reg31)
+
+#define vec_mov_rm(v, where) \
+low = vec_ld(0, (where)); \
+high = vec_ld(16, (where)); \
+p_vector = vec_lvsr(0, (int *)(where)); \
+mask = vec_perm((vector unsigned char)(0), (vector unsigned char)(-1), p_vector); \
+v = vec_perm(v, v, p_vector); \
+low = vec_sel(low, v, mask); \
+high = vec_sel(v, high, mask); \
+vec_st(low, 0, (where)); \
+vec_st(high, 16, (where))
+
+#define vec_mov_mr_a(mem,reg) reg = vec_ld(0, mem)
+
+#define vec_mov_mr(u,v) \
+p_vector = (vector unsigned char)vec_lvsl(0, (int*)(v)); \
+low = (vector unsigned char)vec_ld(0, (v)); \
+high = (vector unsigned char)vec_ld(16, (v)); \
+u=(vector float)vec_perm(low, high, p_vector)
+
+#define vec_muladd(reg1,reg2,regout) regout = vec_madd(reg1,reg2,regout)
+#define vec_add_rr(reg1,reg2) reg2 = vec_add(reg1,reg2)
+
+#define vec_zero(reg) reg = vec_xor(reg,reg)
+
+#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) \
+transpose(reg0,reg1,reg2,reg3); \
+empty0 = vec_add(reg0,reg1); \
+empty1 = vec_add(reg2,reg3); \
+regout = vec_add(empty0,empty1)
+
+
+#endif /* ALTIVEC_C */
+
+
+
+
+
+
+
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h
new file mode 100644
index 0000000..af9c6b1
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h
@@ -0,0 +1,1626 @@
+#include <stdlib.h>
+#include <sys/time.h>
+#include <stdio.h>
+
+#include "camm_util.h"
+
+
+#if defined(ALIGN)
+#if( defined(SCPLX) || defined(DCPLX))
+#error Cannot align complex routines
+#endif
+#if defined(SREAL) && ( NDPM != 1 ) && ( STRIDE % 4 != 0)
+#error Can only align SREAL with NDPM 1 or STRIDE % 4 = 0
+#endif
+#if defined(DREAL) && ( NDPM != 1 ) && ( STRIDE % 2 != 0)
+#error Can only align DREAL with NDPM 1 or STRIDE % 2 = 0
+#endif
+#endif
+
+/******************************************************************************
+ * Single Precision Complex Macros
+ ******************************************************************************/
+
+#ifdef SCPLX
+
+#ifdef NO_TRANSPOSE
+
+#if NDPM > 3
+#error Max NDPM is 3 for SCPLX NO_TRANSPOSE
+#endif
+
+#undef plax
+#define plax
+
+#undef R1
+#define R1 2
+#undef R2
+#define R2 4
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#undef TREG
+#define TREG 1
+#undef SREG
+#define SREG 0
+#undef CREG
+#define CREG 0
+
+#ifdef GER
+#undef AREG
+#define AREG 0
+#undef targ
+#define targ(a_) AREG
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_) pud(AREG,a_,b_)
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#else
+#undef AREG
+#define AREG TREG
+#undef targ
+#define targ(a_) CREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef w
+#define w(a_) pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_) pud(CREG,a_ ## 0,si)
+#endif
+
+#undef src
+#define src(a_) a_
+#undef mpx
+#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(4,si,P(a_,1)) \
+ ps(0,P(a_,1),P(a_,1)) sign(a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef ulfa
+#define ulfa(a_)
+
+#else
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#undef TREG
+#define TREG 3
+#undef SREG
+#define SREG 2
+#undef CREG
+#define CREG 0
+#undef targ
+#define targ(a_) a_
+#undef src
+#define src(a_) 0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef mpx
+#define mpx(a_) px(a_)
+#ifdef BETA0
+#undef ulfa
+#define ulfa(a_) phl(a_,0) pa(0,a_) pud(a_,0,si)
+#else
+#undef ulfa
+#define ulfa(a_) pld(0,si,TREG) phl(a_,0) pa(0,a_) pa(TREG,a_) pud(a_,0,si)
+#endif
+#undef AREG
+#define AREG TREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+
+#undef plax
+#define plax pc(CREG,1) ps(160,CREG,CREG) ps(245,1,1) sign(CREG)
+
+
+
+#endif
+
+#if defined(Conj_) && ! defined(GER)
+#undef sign
+#define sign(a_) pm(SREG,a_)
+#else
+#undef sign
+#define sign(a_) pm(SREG,P(a_,1))
+#endif
+
+
+
+#undef plb
+#define plb(a_,b_) pl(a_,b_,AREG)
+#undef plbd
+#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG)
+
+#undef dpr
+#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dpi
+#define dpi(a_) pm(P(src(a_),1),TREG) ps(177,TREG,TREG) pa(TREG,targ(a_))
+
+#ifndef GER
+
+
+#undef plaa
+#define plaa(a_) pl(a_ ## 0,si,CREG) plax
+#undef wa
+#define wa(a_) w(a_)
+#undef dp
+#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax
+#undef wa1_2
+#define wa1_2(a_) w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_)
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+
+#else
+
+#undef lqc
+#define lqc(a_) pl(a_ ## 0,si,TREG)
+#undef lqc1_2
+#define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG)
+
+
+#undef plaa
+#define plaa(a_)
+#undef wa
+#define wa(a_)
+#undef dp
+#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \
+ lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+ lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_)
+#undef wa1_2
+#define wa1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \
+ lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+ lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_)
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#endif
+
+#endif
+
+/******************************************************************************
+ * Single Precision Real Macros
+ ******************************************************************************/
+
+#ifdef SREAL
+
+#ifdef NO_TRANSPOSE
+
+#undef mpx
+#define mpx(a_) pls(0,si,a_) ps(0,a_,a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef TREG
+#define TREG 1
+#undef targ
+#define targ(a_) 0
+#undef src
+#define src(a_) a_
+#undef ulfa
+#define ulfa(a_)
+
+#ifdef GER
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+#undef CREG
+#define CREG 2
+#undef AREG
+#define AREG 0
+#undef cp
+#define cp pc(CREG,TREG)
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_) pud(AREG,a_,b_)
+#undef wbs
+#define wbs(a_,b_) pus(AREG,a_,b_)
+#else
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+#undef w
+#define w(a_) pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_) pud(CREG,a_ ## 0,si)
+#undef w1_4
+#define w1_4(a_) pus(CREG,a_ ## 0,si)
+#endif
+
+#else
+
+#undef mpx
+#define mpx(a_) px(a_)
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#endif
+#undef TREG
+#define TREG 3
+#undef targ
+#define targ(a_) a_
+#undef src
+#define src(a_) 0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+#undef ulfa
+#undef ulfa
+#define ulfa(a_) phl(a_,0) pa(0,a_) pc(a_,0) ps(1,0,0) pa(0,a_) \
+ madd(0,si,a_) pus(a_,0,si)
+
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+#endif
+
+#if defined(ALIGN)
+#undef plb
+#define plb(a_,b_) pla(a_,b_,AREG)
+#else
+#undef plb
+#define plb(a_,b_) pl(a_,b_,AREG)
+#endif
+#undef plbd
+#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG)
+#undef plbs
+#define plbs(a_,b_) pls(a_,b_,AREG)
+#undef dpr
+#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprs
+#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+#undef dprps
+#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+
+#undef plaa
+#define plaa(a_) pl(a_ ## 0,si,CREG)
+#undef wa
+#define wa(a_) w(a_)
+#undef dp
+#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG)
+#undef wa1_2
+#define wa1_2(a_) w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dpr(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprp(c_,d_,e_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_)
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#undef plaa1_4
+#define plaa1_4(a_) pls(a_ ## 0,si,CREG)
+#undef wa1_4
+#define wa1_4(a_) w1_4(a_)
+#undef dp1_4
+#define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_)
+#undef dpp1_4
+#define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_)
+#undef ddp1_4
+#define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_)
+#undef ddpp1_4
+#define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_)
+
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#endif
+
+/******************************************************************************
+ * Double Precision Real Macros
+ ******************************************************************************/
+
+#ifdef DREAL
+
+#ifdef ATL_SSE2
+
+#ifdef NO_TRANSPOSE
+
+#undef mpx
+#define mpx(a_) pls(0,si,a_) ps(0,a_,a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef TREG
+#define TREG 1
+#undef targ
+#define targ(a_) 0
+#undef src
+#define src(a_) a_
+#undef ulfa
+#define ulfa(a_)
+
+#ifdef GER
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+#undef CREG
+#define CREG 2
+#undef AREG
+#define AREG 0
+#undef cp
+#define cp pc(CREG,TREG)
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_) pus(AREG,a_,b_)
+#undef wbs
+/* #define wbs(a_,b_) pus(AREG,a_,b_) */
+#else
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+/* #define wbs(a_,b_) */
+#undef w
+#define w(a_) pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_) pus(CREG,a_ ## 0,si)
+#undef w1_4
+/* #define w1_4(a_) pus(CREG,a_ ## 0,si) */
+#endif
+
+#else
+
+#undef mpx
+#define mpx(a_) px(a_)
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#endif
+#undef TREG
+#define TREG 3
+#undef targ
+#define targ(a_) a_
+#undef src
+#define src(a_) 0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+#undef ulfa
+#undef ulfa
+#define ulfa(a_) /* phl(a_,0) pa(0,a_) */ pc(a_,0) ps(1,0,0) pa(0,a_) \
+ madd(0,si,a_) pus(a_,0,si)
+
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+#endif
+
+#if defined(ALIGN)
+#undef plb
+#define plb(a_,b_) pla(a_,b_,AREG)
+#else
+#undef plb
+#define plb(a_,b_) pl(a_,b_,AREG)
+#endif
+#undef plbd
+#define plbd(a_,b_) /* px(AREG) */pls(a_,b_,AREG)
+#undef plbs
+/* #define plbs(a_,b_) pls(a_,b_,AREG) */
+#undef dpr
+#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprs
+#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+#undef dprps
+#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+
+#undef plaa
+#define plaa(a_) pl(a_ ## 0,si,CREG)
+#undef wa
+#define wa(a_) w(a_)
+#undef dp
+#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_) /* px(CREG) */pls(a_ ## 0,si,CREG)
+#undef wa1_2
+#define wa1_2(a_) w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dprs(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprps(c_,d_,e_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_)
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#undef plaa1_4
+/* #define plaa1_4(a_) pls(a_ ## 0,si,CREG) */
+#undef wa1_4
+/* #define wa1_4(a_) w1_4(a_) */
+#undef dp1_4
+/* #define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) */
+#undef dpp1_4
+/* #define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) */
+#undef ddp1_4
+/* #define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_) */
+#undef ddpp1_4
+/* #define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) */
+
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#else
+
+#ifdef NO_TRANSPOSE
+
+#undef t0
+#define t0(a_) 1
+#undef s0
+#define s0(a_) a_
+#undef t8
+#define t8(a_) 2
+#undef s8
+#define s8(a_) a_
+#undef w
+#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef w1_2
+#define w1_2(a_) fp(a_ ## 0,si)
+#undef mpx
+#define mpx(a_) fl(0,si) fc(M(a_,2))
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#undef ulfa
+#define ulfa(a_) fc(0)
+
+#else
+
+#undef t0
+#define t0(a_) a_
+#undef s0
+#define s0(a_) 1
+#undef t8
+#define t8(a_) a_
+#undef s8
+#define s8(a_) 2
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef mpx
+#define mpx(a_) fz
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#endif
+#undef ulfa
+#define ulfa(a_) madd(0,si,a_) fp(0,si)
+
+#endif
+
+
+#ifndef GER
+
+#undef plaa1_2
+#define plaa1_2(a_) fl(a_ ## 0,si)
+#undef wa1_2
+#define wa1_2(a_) w1_2(a_)
+#ifdef NO_TRANSPOSE
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(M(s0(c_),1),0) fap(0,t0(c_))
+#undef dp1_2
+#define dp1_2(a_,b_,c_) ddp1_2(a_,b_,c_)
+#else
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fap(0,M(t0(c_),1))
+#undef dp1_2
+#define dp1_2(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fap(0,M(t0(c_),2))
+#endif
+
+#else
+
+#undef plaa1_2
+#define plaa1_2(a_) fl(a_ ## 0,si)
+#undef wa1_2
+#define wa1_2(a_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_) fd(M(s0(c_),2)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_) fm(M(s0(c_),2),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_)
+
+#endif
+
+
+
+#undef plaa
+#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) fx1
+
+#ifndef GER
+
+
+#undef wa
+#define wa(a_) w(a_)
+
+
+#undef ddp
+#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \
+ fm(P(s8(c_),1),0) fx1 fap(0,P(t0(c_),1)) \
+ fap(0,t8(c_))
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \
+ fm(P(s8(c_),1),0) pf(d_,e_) fx1 fap(0,P(t0(c_),1)) \
+ fap(0,t8(c_))
+
+/* #define ddp(a_,b_,c_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */
+/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) */
+/* #define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */
+/* \ */
+/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) pf(d_,e_) */
+
+#ifdef NO_TRANSPOSE
+
+#undef dp
+#define dp(a_,b_,c_) ddp(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_)
+
+#else
+
+#undef dp
+#define dp(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \
+ fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2))
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) pf(d_ ,e_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \
+ fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2))
+
+/* #define dp(a_,b_,c_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */
+/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) */
+/* #define dpp(a_,b_,c_,d_,e_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */
+/* \ */
+/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) pf(d_,e_) */
+
+#endif
+
+
+#else
+
+#undef wa
+#define wa(a_)
+#undef ddp
+#define ddp(a_,b_,c_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+ fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+ fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_)
+
+#undef dp
+#define dp(a_,b_,c_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+ fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+ fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_)
+
+#endif
+
+
+#undef R1
+#define R1 3
+#undef R2
+#define R2 4
+#undef R3
+#define R3 5
+#undef R4
+#define R4 6
+
+#endif
+
+#endif
+
+/******************************************************************************
+ * Double Precision Complex Macros
+ ******************************************************************************/
+
+#ifdef DCPLX
+
+#ifdef ATL_SSE2
+#ifdef NO_TRANSPOSE
+
+#if NDPM > 3
+#error Max NDPM is 3 for DCPLX NO_TRANSPOSE
+#endif
+
+#undef plax
+#define plax
+
+#undef R1
+#define R1 2
+#undef R2
+#define R2 4
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#undef TREG
+#define TREG 1
+#undef SREG
+#define SREG 0
+#undef CREG
+#define CREG 0
+
+#ifdef GER
+#undef AREG
+#define AREG 0
+#undef targ
+#define targ(a_) AREG
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+/* #define wbd(a_,b_) pud(AREG,a_,b_) */
+#undef w
+#define w(a_)
+#undef w1_2
+/* #define w1_2(a_) */
+#else
+#undef AREG
+#define AREG TREG
+#undef targ
+#define targ(a_) CREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+/* #define wbd(a_,b_) */
+#undef w
+#define w(a_) pu(CREG,a_ ## 0,si)
+#undef w1_2
+/* #define w1_2(a_) pud(CREG,a_ ## 0,si) */
+#endif
+
+#undef src
+#define src(a_) a_
+#undef mpx
+#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(8,si,P(a_,1)) \
+ ps(0,P(a_,1),P(a_,1)) sign(a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef ulfa
+#define ulfa(a_)
+
+#else
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#undef TREG
+#define TREG 3
+#undef SREG
+#define SREG 2
+#undef CREG
+#define CREG 0
+#undef targ
+#define targ(a_) a_
+#undef src
+#define src(a_) 0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef mpx
+#define mpx(a_) px(a_)
+#ifdef BETA0
+#undef ulfa
+#define ulfa(a_) /* phl(a_,0) pa(0,a_) */pu(a_,0,si)
+#else
+#undef ulfa
+#define ulfa(a_) pl(0,si,TREG) /* phl(a_,0) pa(0,a_) */ pa(TREG,a_) pu(a_,0,si)
+#endif
+#undef AREG
+#define AREG TREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+
+#undef plax
+#define plax pc(CREG,1) ps(0,CREG,CREG) ps(3,1,1) sign(CREG)
+
+
+
+#endif
+
+#if defined(Conj_) && ! defined(GER)
+#undef sign
+#define sign(a_) pm(SREG,a_)
+#else
+#undef sign
+#define sign(a_) pm(SREG,P(a_,1))
+#endif
+
+
+
+#undef plb
+#define plb(a_,b_) pl(a_,b_,AREG)
+#undef plbd
+/* #define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) */
+
+#undef dpr
+#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dpi
+#define dpi(a_) pm(P(src(a_),1),TREG) ps(1,TREG,TREG) pa(TREG,targ(a_))
+
+#ifndef GER
+
+#undef plaa
+#define plaa(a_) pl(a_ ## 0,si,CREG) plax
+#undef wa
+#define wa(a_) w(a_)
+#undef dp
+#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+/* #define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax */
+#undef wa1_2
+/* #define wa1_2(a_) w1_2(a_) */
+#undef dp1_2
+/* #define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) */
+#undef dpp1_2
+/* #define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) */
+#undef ddp1_2
+/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */
+#undef ddpp1_2
+/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */
+
+
+#else
+
+#undef lqc
+#define lqc(a_) pl(a_ ## 0,si,TREG)
+#undef lqc1_2
+/* #define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG) */
+
+
+#undef plaa
+#define plaa(a_)
+#undef wa
+#define wa(a_)
+#undef dp
+#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \
+ lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+ lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_) dp(a_,b_,c_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+/* #define plaa1_2(a_) */
+#undef wa1_2
+/* #define wa1_2(a_) */
+#undef dp1_2
+/* #define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ */
+/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */
+#undef dpp1_2
+/* #define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ */
+/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */
+#undef ddp1_2
+/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */
+#undef ddpp1_2
+/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */
+
+#endif
+
+#else
+
+#if NDPM > 2
+#error Max NDPM is 2 for DCPLX
+#endif
+
+#undef TREG
+#define TREG 2
+
+#ifdef NO_TRANSPOSE
+
+#undef w
+#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef plax
+#define plax fx1
+#undef srr
+#define srr(a_) a_
+#undef sri
+#define sri(a_) a_
+#undef sir
+#define sir(a_) a_
+#undef sii
+#define sii(a_) a_
+#undef trr
+#define trr(a_) P(TREG,1)
+#undef tri
+#define tri(a_) M(TREG,1)
+#undef tir
+#define tir(a_) TREG
+#undef tii
+#define tii(a_) TREG
+#undef mpx
+#define mpx(a_) fl(0,si) fl(8,si) fc(M(a_,2)) fc(M(a_,2))
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#undef ulfa
+#define ulfa(a_) fc(0) fc(0)
+
+#else
+
+#undef srr
+#define srr(a_) P(TREG,1)
+#undef sri
+#define sri(a_) M(TREG,1)
+#undef sir
+#define sir(a_) TREG
+#undef sii
+#define sii(a_) TREG
+#undef trr
+#define trr(a_) a_
+#undef tri
+#define tri(a_) a_
+#undef tir
+#define tir(a_) a_
+#undef tii
+#define tii(a_) a_
+#undef w
+#define w(a_)
+#undef plax
+#define plax
+#undef mpx
+#define mpx(a_) fz fz
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#endif
+#undef ulfa
+#define ulfa(a_) madd(0,si,a_) fp(0,si) madd(8,si,a_) fp(8,si)
+
+#endif
+
+
+
+#ifdef Conj_
+#undef fapi
+#define fapi(a_,b_) fsp(b_)
+#undef fspi
+#define fspi(a_,b_) fap(a_,b_)
+#else
+#undef fapi
+#define fapi(a_,b_) fap(a_,b_)
+#undef fspi
+#define fspi(a_,b_) fsp(b_)
+#endif
+
+#ifndef GER
+
+
+#undef plaa
+#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax
+#undef wa
+#define wa(a_) w(a_)
+#undef ddp
+#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+ fm(sri(c_),0) fap(0,tri(c_))\
+ fl(a_ ## 8,b_) fd(0) fm(sir(c_),0) fspi(0,tir(c_)) \
+ fm(sii(c_),0) fapi(0,tii(c_))
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+ fm(sri(c_),0) fap(0,tri(c_))\
+ fl(a_ ## 8,b_) fd(0) pf(d_,e_) fm(sir(c_),0) fspi(0,tir(c_))\
+ fm(sii(c_),0) fapi(0,tii(c_))
+
+
+
+#ifdef NO_TRANSPOSE
+
+
+
+#undef dp
+#define dp(a_,b_,c_) ddp(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_)
+
+
+
+#else
+
+#undef dp
+#define dp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+ fm(sri(c_),0) fap(0,tri(c_))\
+ fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \
+ fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))
+
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+ pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\
+ fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \
+ fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))
+
+
+#endif
+
+#else
+
+#undef plaa
+#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax
+#undef wa
+#define wa(a_)
+
+#undef ddprr
+#define ddprr(a_,b_,c_) fl(a_ ## 0,b_) \
+ fd(tri(c_)) fm(P(sri(c_),1),0) fap(0,1) \
+ fd(M(trr(c_),1)) fm(srr(c_),0) fspi(0,1) \
+ fp(a_ ## 0,b_)
+#undef ddpri
+#define ddpri(a_,b_,c_) fl(a_ ## 8,b_) \
+ fd(tii(c_)) fm(P(sii(c_),1),0) fap(0,1) \
+ fd(M(tir(c_),1)) fm(sir(c_),0) fapi(0,1) \
+ fp(a_ ## 8,b_)
+#undef dpri
+#define dpri(a_,b_,c_) fl(a_ ## 8,b_) \
+ fx(2) fm(sir(c_),0) fap(0,2) \
+ fm(M(sii(c_),2),0) fapi(0,1) \
+ fp(a_ ## 8,b_)
+
+
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_)
+#undef ddp
+#define ddp(a_,b_,c_) ddprr(a_,b_,c_) ddpri(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) dpri(a_,b_,c_)
+#undef dp
+#define dp(a_,b_,c_) ddprr(a_,b_,c_) dpri(a_,b_,c_)
+
+#endif
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 6
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#endif
+
+#endif
+
+
+/******************************************************************************
+ * General Macros
+ ******************************************************************************/
+
+
+
+
+#undef bla1
+#define bla1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_)
+#undef blb1
+#define blb1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_)
+
+#undef bla2
+#undef bla2
+#define bla2(a_,b_) pf(b_,si) plaa(a_) ddp(a_,ax,R1) pf(b_,ax) dp(a_,bx,R2) wa(a_)
+#undef blb2
+#undef blb2
+#define blb2(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) dp(a_,bx,R2) wa(a_)
+
+#undef bla3
+#define bla3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \
+ dpp(a_,cx,R3,b_,ax) wa(a_)
+#undef blb3
+#define blb3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \
+ dpp(a_,cx,R3,b_,cx) wa(a_)
+
+#undef bla4
+#define bla4(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \
+ ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_)
+#undef blb4
+#define blb4(a_,b_) plaa(a_) ddp(a_,ax,R1) ddpp(a_,bx,R2,b_,cx) \
+ ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_)
+
+#undef bla
+#define bla(a_,b_) Mjoin(bla,NDP)(a_,b_)
+#undef blb
+#define blb(a_,b_) Mjoin(blb,NDP)(a_,b_)
+
+
+
+#undef bla11_2
+#define bla11_2(a_) plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_)
+#undef bla21_2
+#define bla21_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_)
+#undef bla31_2
+#define bla31_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \
+ dp1_2(a_,cx,R3) wa1_2(a_)
+#undef bla41_2
+#define bla41_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \
+ ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_)
+
+#undef bla1_2
+#define bla1_2(a_) Mjoin(Mjoin(bla,NDP),1_2)(a_)
+
+
+
+#undef bla11_4
+#define bla11_4(a_) plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_)
+#undef bla21_4
+#define bla21_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2) wa1_4(a_)
+#undef bla31_4
+#define bla31_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \
+ dp1_4(a_,cx,R3) wa1_4(a_)
+#undef bla41_4
+#define bla41_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \
+ ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_)
+
+#undef bla1_4
+#define bla1_4(a_) Mjoin(Mjoin(bla,NDP),1_4)(a_)
+
+
+
+#undef inc1
+#define inc1(a_) a(a_,si) a(a_,ax)
+#undef inc2
+#define inc2(a_) inc1(a_) a(a_,bx)
+#undef inc3
+#define inc3(a_) inc2(a_) a(a_,cx)
+#undef inc4
+#define inc4(a_) inc3(a_) a(a_,dx)
+
+#undef inc
+#define inc(a_) Mjoin(inc,NDP)(a_)
+
+
+#ifdef PREFETCH
+/* #include "camm_arith.h" */
+#undef S
+#define S(a_,b_) (a_) + (b_)
+#undef PF1
+#define PF1 PREFETCH
+#undef PF2
+#define PF2 S(PF1,32)
+#undef PF3
+#define PF3 S(PF1,64)
+#undef PF4
+#define PF4 S(PF1,96)
+#undef PF5
+#define PF5 S(PF1,128)
+#undef PF6
+#define PF6 S(PF1,160)
+#undef PF7
+#define PF7 S(PF1,192)
+#undef PF8
+#define PF8 S(PF1,224)
+#else
+#undef PF1
+#define PF1 64
+#undef PF2
+#define PF2 96
+#undef PF3
+#define PF3 128
+#undef PF4
+#define PF4 160
+#undef PF5
+#define PF5 192
+#undef PF6
+#define PF6 224
+#undef PF7
+#define PF7 256
+#undef PF8
+#define PF8 288
+#endif
+
+
+#if defined(NO_TRANSPOSE) && !defined(SREAL) && !defined(GER)
+#undef pf
+#define pf(a_,b_) f(t0,a_,b_)
+#else
+#undef pf
+#define pf(a_,b_) f(nta,a_,b_)
+#endif
+
+#undef bl1
+#define bl1 bla1_4(0x0) inc(4)
+#undef bl2
+#define bl2 bla1_2(0x0) inc(8)
+#undef bl4
+#define bl4 bla(0x0,PF1) inc(16)
+#undef bl8
+#define bl8 bla(0x0,PF1) blb(0x1,PF1) inc(32)
+#undef bl16
+#define bl16 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64)
+#undef bl32
+#define bl32 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \
+ bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128)
+#undef bl64
+#define bl64 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \
+ bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \
+ bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \
+ bla(0xc,PF7) blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256)
+
+/* #define in2 inc(8) */
+/* #define in4 inc(16) */
+/* #define in8 inc(32) */
+/* #define in16 inc(64) */
+
+#undef in2
+#define in2
+#undef in4
+#define in4
+#undef in8
+#define in8
+#undef in16
+#define in16
+
+#ifdef NO_TRANSPOSE
+#undef incf
+#define incf ra(di,si)
+#else
+#undef incf
+#define incf
+#endif
+
+#undef lf1
+#define lf1 mpx(R1)
+#undef lf2
+#define lf2 lf1 incf mpx(R2)
+#undef lf3
+#define lf3 lf2 incf mpx(R3)
+#undef lf4
+#define lf4 lf3 incf mpx(R4)
+
+#undef lf
+#define lf Mjoin(lf,NDP)
+
+
+#undef ulf1
+#define ulf1 ulfa(R1)
+#undef ulf2
+#define ulf2 ulf1 ra(di,si) ulfa(R2)
+#undef ulf3
+#define ulf3 ulf2 ra(di,si) ulfa(R3)
+#undef ulf4
+#define ulf4 ulf3 ra(di,si) ulfa(R4)
+
+#undef ulf
+#define ulf Mjoin(ulf,NDP)
+
+#undef lpba
+#define lpba(a_) "movl %%esi,%%e" #a_ "\n\t"
+
+#undef lpb1
+#define lpb1 lpba(ax)
+#undef lpb2
+#define lpb2 lpb1 ra(di,si) lpba(bx)
+#undef lpb3
+#define lpb3 lpb2 ra(di,si) lpba(cx)
+#undef lpb4
+#define lpb4 lpb3 ra(di,si) lpba(dx)
+
+#undef lpb
+#define lpb Mjoin(lpb,NDP)
+
+#undef ipf1
+#define ipf1(a_) pf(a_,si) pf(a_,ax)
+#undef ipf2
+#define ipf2(a_) ipf1(a_) pf(a_,bx)
+#undef ipf3
+#define ipf3(a_) ipf2(a_) pf(a_,cx)
+#undef ipf4
+#define ipf4(a_) ipf3(a_) pf(a_,dx)
+
+#undef ipf
+#define ipf(a_) Mjoin(ipf,NDP)(a_)
+
+#ifdef LUNROLL
+#undef UNROLL
+#ifdef SREAL
+#undef UNROLL
+#define UNROLL LUNROLL
+#elif defined(DREAL) || defined(SCPLX)
+#undef UNROLL
+#define UNROLL LUNROLL*2
+#elif defined(DCPLX)
+#undef UNROLL
+#define UNROLL LUNROLL*4
+#endif
+#else
+#undef UNROLL
+#define UNROLL 16
+#endif
+
+#undef UNROLL1_2
+#if UNROLL == 64
+#undef blUNROLL
+#define blUNROLL bl64
+#undef UNROLL1_2
+#define UNROLL1_2 32
+#elif UNROLL == 32
+#undef blUNROLL
+#define blUNROLL bl32
+#undef UNROLL1_2
+#define UNROLL1_2 16
+#elif UNROLL == 16
+#undef blUNROLL
+#define blUNROLL bl16
+#undef UNROLL1_2
+#define UNROLL1_2 8
+#elif UNROLL == 8
+#undef blUNROLL
+#define blUNROLL bl8
+#undef UNROLL1_2
+#define UNROLL1_2 4
+#elif UNROLL == 4
+#undef blUNROLL
+#define blUNROLL bl4
+#undef UNROLL1_2
+#define UNROLL1_2 2
+#elif UNROLL == 2
+#undef blUNROLL
+#define blUNROLL bl2
+#undef UNROLL1_2
+#define UNROLL1_2 1
+#elif UNROLL == 1
+#undef blUNROLL
+#define blUNROLL bl1
+#undef UNROLL1_2
+#define UNROLL1_2 stop
+#endif
+#ifndef UNROLL1_2
+#error UNROLL must be set to power of 2 < 128
+#endif
+
+
+#ifdef GER
+#undef aconst
+#define aconst
+#undef cconst
+#define cconst const
+#else
+#undef aconst
+#define aconst const
+#undef cconst
+#define cconst
+#endif
+
+#undef MY_FUNCTION
+#define MY_FUNCTION Mjoin(dp,EXT)
+
+static void
+MY_FUNCTION(aconst TYPE *a,int lda,
+ const TYPE *b,
+ cconst TYPE *c,int stride,int len) {
+
+#ifdef SCPLX
+#if defined(GER) && defined(Conj_)
+ const TYPE w1[2]={{-1.0,1.0},{-1.0,1.0}},*w=w1;
+#else
+ const TYPE w1[2]={{1.0,-1.0},{1.0,-1.0}},*w=w1;
+#endif
+#endif
+
+#if defined(DCPLX) && defined(ATL_SSE2)
+#if defined(GER) && defined(Conj_)
+ const TYPE w1[1]={{-1.0,1.0}},*w=w1;
+#else
+ const TYPE w1[1]={{1.0,-1.0}},*w=w1;
+#endif
+#endif
+
+#ifdef NO_TRANSPOSE
+#undef movm
+#define movm c
+#undef fixm
+#define fixm b
+#else
+#undef movm
+#define movm b
+#undef fixm
+#define fixm c
+#endif
+ NO_INLINE
+ unsigned u1=stride*sizeof(*fixm),u2=lda*sizeof(*a),u3=len*sizeof(*movm)/sizeof(float);
+
+ ASM (
+
+ "pushl %%ebx\n\t"
+ a(4,sp)
+
+#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))
+ "movl %6,%%esi\n\t"
+ pl(0,si,SREG)
+#endif
+
+#ifdef NO_TRANSPOSE
+ "movl %1,%%esi\n\t" /* fixm */
+ "movl %2,%%edi\n\t" /* fixm2fixm */
+#endif
+
+ lf
+
+ "movl %3,%%esi\n\t" /* a */
+ "movl %4,%%edi\n\t" /* a2a */
+
+ lpb
+
+ ipf(0)
+
+ "movl %0,%%esi\n\t" /* movm */
+ "movl %5,%%edi\n\t" /* len */
+
+#if defined(ALIGN)
+
+#if defined(SREAL)
+
+ test(4,ax)
+ je(Mjoin(a1,EXT))
+ test(-1,di)
+ je(Mjoin(a1,EXT))
+ sub(1,di)
+ bl1
+
+ lab(Mjoin(a1,EXT))
+
+#endif
+
+#if defined(DREAL) || defined(SREAL)
+
+ test(8,ax)
+ je(Mjoin(as,EXT))
+ test(-2,di)
+ je(Mjoin(as,EXT))
+ sub(2,di)
+ bl2
+
+ lab(Mjoin(as,EXT))
+
+#endif
+
+#endif
+
+
+ ipf(32)
+
+ lab(Mjoin(loop,EXT))
+
+ test(-UNROLL,di)
+ je(Mjoin(UNROLL1_2,EXT))
+ sub(UNROLL,di)
+
+ blUNROLL
+
+ jmp(Mjoin(loop,EXT))
+
+#if UNROLL > 32
+ lab(Mjoin(32,EXT))
+ test(32,di)
+ je(Mjoin(16,EXT))
+ bl32
+#endif
+
+#if UNROLL > 16
+ lab(Mjoin(16,EXT))
+ test(16,di)
+ je(Mjoin(8,EXT))
+ bl16
+#endif
+
+#if UNROLL > 8
+ lab(Mjoin(8,EXT))
+ test(8,di)
+ je(Mjoin(4,EXT))
+ bl8
+#endif
+
+#if UNROLL > 4
+ lab(Mjoin(4,EXT))
+ test(4,di)
+ je(Mjoin(2,EXT))
+ bl4
+#endif
+
+#if UNROLL > 2
+ lab(Mjoin(2,EXT))
+#ifndef DCPLX
+ test(2,di)
+ je(Mjoin(1,EXT))
+ bl2
+#endif
+#endif
+
+#if UNROLL > 1
+ lab(Mjoin(1,EXT))
+#ifdef SREAL
+ test(1,di)
+ je(Mjoin(stop,EXT))
+ bl1
+#endif
+#endif
+
+ lab(Mjoin(stop,EXT))
+
+#ifndef NO_TRANSPOSE
+ "movl %1,%%esi\n\t" /* fixm */
+ "movl %2,%%edi\n\t" /* fixm2fixm */
+#endif
+
+ ulf
+
+ a(-4,sp)
+ "popl %%ebx\n\t"
+
+
+ ::"m" (movm),"m" (fixm),"m" (u1),"m" (a),"m" (u2),"m" (u3)
+
+#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))
+ ,"m" (w)
+#endif
+ :"ax","bx","cx","dx","si","di");
+
+
+}
+
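+/*
+ * Editorial annotation, not part of the original ATLAS source: the inline
+ * assembly above walks the vector UNROLL elements at a time and then clears
+ * the remainder with successively smaller power-of-two blocks (bl32 ... bl1).
+ * A minimal scalar C sketch of that control flow, using a hypothetical
+ * process_block() stand-in for the blN macros:
+ *
+ *   static void process_block(float *p, int n) { (void)p; (void)n; }
+ *
+ *   static void drive(float *p, int len, int unroll)
+ *   {
+ *       while (len >= unroll) {           // main unrolled loop
+ *           process_block(p, unroll);
+ *           p += unroll;
+ *           len -= unroll;
+ *       }
+ *       for (int n = unroll / 2; n >= 1; n /= 2)
+ *           if (len & n) {                // one tail block per set bit
+ *               process_block(p, n);
+ *               p += n;
+ *           }
+ *   }
+ */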
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h
new file mode 100644
index 0000000..7fd1404
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h
@@ -0,0 +1,295 @@
+#include "camm_util.h"
+
+#ifndef N
+#error N must be defined in camm_pipe3.h
+#endif
+#ifndef KB
+#error KB must be defined in camm_pipe3.h
+#endif
+
+#undef p1
+#define p1(a_) Mjoin(p1_4_,N)(a_)
+#undef p2
+#define p2(a_) Mjoin(p1_2_,N)(a_)
+#undef p4
+#define p4(a_) Mjoin(p1_,N)(a_)
+#undef load_pipe
+#define load_pipe(a_) Mjoin(lp,N)(a_)
+#undef drain_pipe
+#define drain_pipe(a_) Mjoin(dp,N)(a_)
+#undef pipe_len
+#define pipe_len Mjoin(pl,N)
+
+#undef p8
+#if pipe_len > 4
+#define p8(a_) Mjoin(p2_,N)(a_)
+#else
+#define p8(a_) p4(a_) p4(SS(a_,16))
+#endif
+
+#undef p16
+#if pipe_len > 8
+#define p16(a_) Mjoin(p4_,N)(a_)
+#else
+#define p16(a_) p8(a_) p8(SS(a_,32))
+#endif
+
+#undef p32
+#if pipe_len > 16
+#define p32(a_) Mjoin(p8_,N)(a_)
+#else
+#define p32(a_) p16(a_) p16(SS(a_,64))
+#endif
+
+#undef p64
+#if pipe_len > 32
+#define p64(a_) Mjoin(p16_,N)(a_)
+#else
+#define p64(a_) p32(a_) p32(SS(a_,128))
+#endif
+
+#undef p128
+#if pipe_len > 64
+#define p128(a_) Mjoin(p32_,N)(a_)
+#else
+#define p128(a_) p64(a_) p64(SS(a_,256))
+#endif
+
+#undef p256
+#if pipe_len > 128
+#define p256(a_) Mjoin(p64_,N)(a_)
+#else
+#define p256(a_) p128(a_) p128(SS(a_,512))
+#endif
+
+#if KB < pipe_len
+#undef pipe_len
+#define pipe_len 0
+#undef load_pipe
+#define load_pipe(a_)
+#undef drain_pipe
+#define drain_pipe(a_)
+#endif
+
+
+#undef MKB
+/* #ifdef SREAL */
+#define MKB KB
+/* #elif defined (DCPLX) */
+/* #define MKB ( KB * 4 ) */
+/* #else */
+/* #define MKB ( KB * 2 ) */
+/* #endif */
+
+#if MKB >= 512
+#error MKB must be less than 512
+#endif
+
+#undef x0
+#undef o0
+#define x0 load_pipe(0)
+#define o0 0
+
+#undef MKBB
+#define MKBB ( MKB - pipe_len )
+
+#undef xx1
+#undef oo1
+#if MKBB >= 256
+#define xx1 x0 p256(o0)
+#define oo1 SS(1024,o0)
+#else
+#define xx1 x0
+#define oo1 o0
+#endif
+
+#undef xx1a
+#undef oo1a
+#if pipe_len == 256
+#define xx1a xx1 drain_pipe(oo1)
+#define oo1a SS(1024,oo1)
+#undef MKBB
+#define MKBB MKB
+#else
+#define xx1a xx1
+#define oo1a oo1
+#endif
+
+#undef x1
+#undef o1
+#if ( MKBB / 128 ) % 2
+#define x1 xx1a p128(oo1a)
+#define o1 SS(512,oo1a)
+#else
+#define x1 xx1a
+#define o1 oo1a
+#endif
+
+#undef x1a
+#undef o1a
+#if pipe_len == 128
+#define x1a x1 drain_pipe(o1)
+#define o1a SS(512,o1)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x1a x1
+#define o1a o1
+#endif
+
+#undef x2
+#undef o2
+#if ( MKBB / 64 ) % 2
+#define x2 x1a p64(o1a)
+#define o2 SS(256,o1a)
+#else
+#define x2 x1a
+#define o2 o1a
+#endif
+
+#undef x2a
+#undef o2a
+#if pipe_len == 64
+#define x2a x2 drain_pipe(o2)
+#define o2a SS(256,o2)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x2a x2
+#define o2a o2
+#endif
+
+#undef x3
+#undef o3
+#if ( MKBB / 32 ) % 2
+#define x3 x2a p32(o2a)
+#define o3 SS(128,o2a)
+#else
+#define x3 x2a
+#define o3 o2a
+#endif
+
+#undef x3a
+#undef o3a
+#if pipe_len == 32
+#define x3a x3 drain_pipe(o3)
+#define o3a SS(128,o3)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x3a x3
+#define o3a o3
+#endif
+
+#undef x4
+#undef o4
+#if ( MKBB / 16 ) % 2
+#define x4 x3a p16(o3a)
+#define o4 SS(64,o3a)
+#else
+#define x4 x3a
+#define o4 o3a
+#endif
+
+#undef x4a
+#undef o4a
+#if pipe_len == 16
+#define x4a x4 drain_pipe(o4)
+#define o4a SS(64,o4)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x4a x4
+#define o4a o4
+#endif
+
+#undef x5
+#undef o5
+#if ( MKBB / 8 ) % 2
+#define x5 x4a p8(o4a)
+#define o5 SS(32,o4a)
+#else
+#define x5 x4a
+#define o5 o4a
+#endif
+
+#undef x5a
+#undef o5a
+#if pipe_len == 8
+#define x5a x5 drain_pipe(o5)
+#define o5a SS(32,o5)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x5a x5
+#define o5a o5
+#endif
+
+#undef x6
+#undef o6
+#if ( MKBB / 4 ) % 2
+#define x6 x5a p4(o5a)
+#define o6 SS(16,o5a)
+#else
+#define x6 x5a
+#define o6 o5a
+#endif
+
+#undef x6a
+#undef o6a
+#if pipe_len == 4
+#define x6a x6 drain_pipe(o6)
+#define o6a SS(16,o6)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x6a x6
+#define o6a o6
+#endif
+
+#undef x7
+#undef o7
+#if ( MKB / 2 ) % 2
+#define x7 x6a p2(o6a)
+#define o7 SS(8,o6a)
+#else
+#define x7 x6a
+#define o7 o6a
+#endif
+
+#undef x7a
+#undef o7a
+#if pipe_len == 2
+#define x7a x7 drain_pipe(o7)
+#define o7a SS(8,o7)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x7a x7
+#define o7a o7
+#endif
+
+#undef x8
+#undef o8
+#if ( MKB / 1 ) % 2
+#define x8 x7a p1(o7a)
+#define o8 SS(4,o7a)
+#else
+#define x8 x7a
+#define o8 o7a
+#endif
+
+#undef x8a
+#undef o8a
+#if pipe_len == 1
+#define x8a x8 drain_pipe(o8)
+#define o8a SS(4,o8)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x8a x8
+#define o8a o8
+#endif
+
+#undef KB_block
+#define KB_block x8a
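+
+/*
+ * Editorial annotation, not part of the original ATLAS source: KB_block
+ * expands at preprocessing time into a straight-line run of p256/p128/.../p1
+ * blocks whose sizes are the powers of two making up KB, with the software
+ * pipeline loaded before the first pipe_len elements and drained after them.
+ * A rough runtime C sketch of the same decomposition, using a hypothetical
+ * emit() stand-in for the pN macros (offsets advance by size*4 bytes here,
+ * matching the SREAL layout):
+ *
+ *   static void emit(int size, int off) { (void)size; (void)off; }
+ *
+ *   static void kb_block(int kb)
+ *   {
+ *       int off = 0;
+ *       for (int size = 256; size >= 1; size /= 2)
+ *           if ((kb / size) % 2) {        // this power of two is present in kb
+ *               emit(size, off);          // corresponds to p<size>(off)
+ *               off += size * 4;
+ *           }
+ *   }
+ */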
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h
new file mode 100644
index 0000000..35e9e59
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h
@@ -0,0 +1,215 @@
+#ifndef CAMM_SCALE_H
+#define CAMM_SCALE_H /*+ To stop multiple inclusions. +*/
+
+#include "camm_util.h"
+
+#undef spf
+#define spf(a_,b_) f(t0,a_,b_)
+
+#ifdef SCPLX
+#ifdef BETAX
+#undef SSREG
+#define SSREG 2
+#undef lbx
+#define lbx pls(4,ax,1) ps(0,1,1) pm(SSREG,1)
+#undef cxx
+#define cxx pm(1,3) ps(177,3,3) pa(3,2)
+#undef pcx
+#define pcx pc(2,3)
+#else
+#undef lbx
+#define lbx
+#undef cxx
+#define cxx
+#undef pcx
+#define pcx
+#endif
+#undef lb
+#define lb pls(0,ax,0) ps(0,0,0) lbx
+#undef c
+#define c(a_) pl(a_ ## 0,si,2) pcx pm(0,2) cxx pu(2,a_ ## 0,si)
+#undef cp
+#define cp(a_,b_) pl(a_ ## 0,si,2) pcx pm(0,2) spf(b_,si) cxx pu(2,a_ ## 0,si)
+#undef c1_2
+#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pcx pm(0,2) cxx pud(2,a_ ## 0,si)
+#undef ub
+#define ub
+#endif
+
+#ifdef SREAL
+#undef lb
+#define lb pls(0,ax,0) ps(0,0,0)
+#undef c
+#define c(a_) pl(a_ ## 0,si,2) pm(0,2) pu(2,a_ ## 0,si)
+#undef cp
+#define cp(a_,b_) pl(a_ ## 0,si,2) spf(b_,si) pm(0,2) pu(2,a_ ## 0,si)
+#undef c1_2
+#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pm(0,2) pud(2,a_ ## 0,si)
+#undef c1_4
+#define c1_4(a_) pls(a_ ## 0,si,2) pm(0,2) pus(2,a_ ## 0,si)
+#undef ub
+#define ub
+#endif
+
+#ifdef DREAL
+#undef lb
+#define lb fl(0,ax)
+#undef c
+#define c(a_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) fm(2,0) fx1 \
+ fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef cp
+#define cp(a_,b_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) spf(b_,si) fm(2,0) fx1 \
+ fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef c1_2
+#define c1_2(a_) fl(a_ ## 0,si) fm(1,0) fp(a_ ## 0,si)
+#undef ub
+#define ub fc(0)
+#endif
+
+#ifdef DCPLX
+#undef lb
+#define lb fl(0,ax) fl(8,ax)
+#undef c
+#define c(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \
+ fm(2,0) fx(3) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) fsp(2) fx1 \
+ fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef cp
+#define cp(a_,b_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \
+ fm(2,0) fx(3) spf(b_,si) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) \
+ fsp(2) fx1 fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef ub
+#define ub fc(0) fc(0)
+#endif
+
+#undef sbl1
+#define sbl1 c1_4(0x0)
+#undef sbl2
+#define sbl2 c1_2(0x0)
+#undef sbl4
+#define sbl4 cp(0x0,0x40)
+#undef sbl8
+#define sbl8 sbl4 c(0x1)
+#undef sbl16
+#define sbl16 sbl8 cp(0x2,0x60) c(0x3)
+
+#undef sinc16
+#define sinc16 a(0x40,si)
+#undef sinc8
+#define sinc8 a(0x20,si)
+#undef sinc4
+#define sinc4 a(0x10,si)
+#undef sinc2
+#define sinc2 a(0x8,si)
+#undef sinc1
+#define sinc1 a(0x4,si)
+
+#undef SCALE
+#define SCALE Mjoin(Mjoin(PREC,Mjoin(scale,BLC)),FEXT)
+
+#undef MY_FUNCTION
+#define MY_FUNCTION SCALE
+
+static void
+MY_FUNCTION(const TYPE *b,TYPE *c,int len) {
+
+ const TYPE *ce=c+len;
+#if defined(BETAX) && defined(SCPLX)
+ const TYPE z1[2]={{1.0,-1.0},{1.0,-1.0}},*z=z1;
+#endif
+ NO_INLINE
+
+#ifndef SREAL
+ len+=len;
+#endif
+#ifdef DCPLX
+ len+=len;
+#endif
+
+
+ ASM(
+
+ "pushl %%ebx\n\t"
+ a(4,sp)
+
+
+ "movl %0,%%esi\n\t"
+
+ spf(0x00,si)
+ spf(0x20,si)
+
+ "movl %1,%%eax\n\t"
+ "movl %2,%%edi\n\t"
+
+#if defined(BETAX) && defined(SCPLX)
+ "movl %3,%%ebx\n\t"
+ pl(0,bx,SSREG)
+#endif
+
+ lb
+
+ lab(loop)
+
+ test(-16,di)
+ je(8)
+ sub(16,di)
+ align
+
+ sbl16
+ sinc16
+
+ jmp(loop)
+ align
+
+ lab(8)
+
+ test(8,di)
+ je(4)
+
+ sbl8
+ sinc8
+
+ lab(4)
+
+ test(4,di)
+ je(2)
+
+ sbl4
+ sinc4
+
+ lab(2)
+
+#ifndef DCPLX
+ test(2,di)
+ je(1)
+
+ sbl2
+ sinc2
+
+ lab(1)
+
+#ifdef SREAL
+ test(1,di)
+ je(stop)
+
+ sbl1
+ sinc1
+
+ lab(stop)
+#endif
+#endif
+
+ ub
+
+ a(-4,sp)
+ "popl %%ebx\n\t"
+
+
+ ::"m" (c),"m" (b), "m" (len)
+#if defined(BETAX) && defined(SCPLX)
+ ,"m" (z)
+#endif
+ : "si","ax","di");
+
+
+}
+#endif /* CAMM_SCALE_H */
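+
+/*
+ * Editorial annotation, not part of the original ATLAS source: the SCALE
+ * helper above multiplies c[0..len) in place by the scalar pointed to by b,
+ * unrolling by 16 elements and finishing with 8/4/2/1-element tails.  A
+ * scalar C sketch of the real case and of the complex (SCPLX, BETAX) case,
+ * where b points to {beta_re, beta_im} and len counts complex pairs:
+ *
+ *   static void scale_real(const float *b, float *c, int len)
+ *   {
+ *       for (int i = 0; i < len; i++)
+ *           c[i] *= b[0];
+ *   }
+ *
+ *   static void scale_cplx(const float *b, float *c, int len)
+ *   {
+ *       for (int i = 0; i < len; i++) {
+ *           float re = c[2*i], im = c[2*i + 1];
+ *           c[2*i]     = re * b[0] - im * b[1];
+ *           c[2*i + 1] = re * b[1] + im * b[0];
+ *       }
+ *   }
+ */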
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h
new file mode 100644
index 0000000..4a92006
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h
@@ -0,0 +1,2982 @@
+#include "camm_util.h"
+
+#undef p1_4_swap_1
+#define p1_4_swap_1(a_) \
+ pls(a_,ax,1) \
+ pls(a_,cx,0) \
+ pus(0,a_,ax) \
+ pus(1,a_,cx)
+#undef p1_2_swap_1
+#define p1_2_swap_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ px(0) \
+ pld(a_,cx,0) \
+ pud(0,a_,ax) \
+ pud(1,a_,cx)
+#undef p1_swap_1
+#define p1_swap_1(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,0) \
+ puq(0,a_,ax) \
+ pu(1,a_,cx)
+#undef p2_swap_1
+#define p2_swap_1(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ pl(SS(a_,RS4),cx,2) \
+ puq(0,a_,ax) \
+ pu(1,a_,cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,0) \
+ puq(2,SS(a_,RS4),ax) \
+ pu(3,SS(a_,RS4),cx)
+#undef lpswap_1
+#define lpswap_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0)
+#undef dpswap_1
+#define dpswap_1(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ pl(SS(a_,RS4),cx,2) \
+ puq(0,a_,ax) \
+ pu(1,a_,cx) \
+ puq(2,SS(a_,RS4),ax) \
+ pu(3,SS(a_,RS4),cx)
+#undef plswap_1
+#define plswap_1 8
+
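+/*
+ * Editorial annotation, not part of the original ATLAS source: the *_swap_1
+ * macros above exchange the contents of the two vectors addressed by %eax
+ * and %ecx in 4/8/16-byte chunks.  Scalar C equivalent:
+ *
+ *   static void swap_ref(int n, float *x, float *y)
+ *   {
+ *       for (int i = 0; i < n; i++) {
+ *           float t = x[i];
+ *           x[i] = y[i];
+ *           y[i] = t;
+ *       }
+ *   }
+ */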
+
+#undef p1_4_scal_3
+#define p1_4_scal_3(a_) \
+ pls(a_,ax,0) \
+ pmsr(6,0) \
+ pus(0,a_,ax)
+#undef p1_2_scal_3
+#define p1_2_scal_3(a_) \
+ pld(a_,ax,0) \
+ pm(6,0) \
+ pud(0,a_,ax)
+#undef p1_scal_3
+#define p1_scal_3(a_) \
+ plq(a_,ax,0) \
+ pm(6,0) \
+ puq(0,a_,ax)
+#undef p2_scal_3
+#define p2_scal_3(a_) \
+ plq(a_,ax,0) \
+ plq(SS(a_,RS4),ax,1) \
+ pm(6,0) \
+ pm(6,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_scal_3
+#define p4_scal_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pm(6,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpscal_3
+#define lpscal_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(6,1)
+#undef dpscal_3
+#define dpscal_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plscal_3
+#define plscal_3 16
+
+#undef p1_4_scal_3c
+#define p1_4_scal_3c(a_)
+#undef p1_2_scal_3c
+#define p1_2_scal_3c(a_) \
+ pld(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pud(0,a_,ax)
+#undef p1_scal_3c
+#define p1_scal_3c(a_) \
+ plq(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ puq(0,a_,ax)
+#undef p2_scal_3c
+#define p2_scal_3c(a_) \
+ plq(a_,ax,0) \
+ plq(SS(a_,RS4),ax,1) \
+ pc(0,2) \
+ pm(6,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ puq(0,a_,ax) \
+ pc(1,3) \
+ pm(6,1) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_scal_3c
+#define p4_scal_3c(a_) \
+ pm(7,5) \
+ pa(5,1) \
+ puq(0,a_,ax) \
+ ps(CSHUF,4,4) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pc(3,5) \
+ pm(6,3) \
+ pm(7,4) \
+ pa(4,2) \
+ puq(1,SS(a_,RS4),ax) \
+ ps(CSHUF,5,5) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pc(0,4) \
+ pm(6,0) \
+ pm(7,5) \
+ pa(5,3) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pc(1,5) \
+ pm(6,1) \
+ pm(7,4) \
+ pa(4,0) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ ps(CSHUF,5,5) \
+ plq(SS(a_,MM(7,RS4)),ax,3) \
+ pc(2,4) \
+ pm(6,2)
+#undef lpscal_3c
+#define lpscal_3c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(0,4) \
+ pm(6,0) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pc(1,5) \
+ pm(6,1) \
+ pm(7,4) \
+ pa(4,0) \
+ ps(CSHUF,5,5) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pc(2,4) \
+ pm(6,2)
+#undef dpscal_3c
+#define dpscal_3c(a_) \
+ pm(7,5) \
+ pa(5,1) \
+ ps(CSHUF,4,4) \
+ puq(0,a_,ax) \
+ pm(7,4) \
+ pa(4,2) \
+ pc(3,5) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ ps(CSHUF,5,5) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pm(7,5) \
+ pa(5,3) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plscal_3c
+#define plscal_3c 16
+
+#undef p1_4_scal_4
+#define p1_4_scal_4(a_) \
+ pls(SS(a_,MM(0,RS4)),ax,0) \
+ pmsr(6,0) \
+ pus(0,a_,ax)
+#undef p1_2_scal_4
+#define p1_2_scal_4(a_) \
+ pld(SS(a_,MM(0,RS4)),ax,0) \
+ pm(6,0) \
+ pud(0,a_,ax)
+#undef p1_scal_4
+#define p1_scal_4(a_) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pm(6,0) \
+ puq(0,a_,ax)
+#undef p2_scal_4
+#define p2_scal_4(a_) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,0) \
+ pm(6,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_scal_4
+#define p4_scal_4(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,0) \
+ pm(6,1) \
+ pm(6,2) \
+ pm(6,3) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef p8_scal_4
+#define p8_scal_4(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ plq(SS(a_,MM(4,RS4)),ax,4) \
+ plq(SS(a_,MM(5,RS4)),ax,5) \
+ plq(SS(a_,MM(6,RS4)),ax,7) \
+ pm(6,0) \
+ pm(6,1) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ pm(6,3) \
+ pm(6,4) \
+ pm(6,5) \
+ plq(SS(a_,MM(7,RS4)),ax,0) \
+ pm(6,7) \
+ pm(6,0) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ puq(4,SS(a_,MM(4,RS4)),ax) \
+ puq(5,SS(a_,MM(5,RS4)),ax) \
+ puq(7,SS(a_,MM(6,RS4)),ax) \
+ puq(0,SS(a_,MM(7,RS4)),ax)
+#undef lpscal_4
+#define lpscal_4(a_)
+#undef dpscal_4
+#define dpscal_4(a_) p4_scal_4(a_)
+#undef plscal_4
+#define plscal_4 16
+
+#undef p1_4_scal_4c
+#define p1_4_scal_4c(a_)
+#undef p1_2_scal_4c
+#define p1_2_scal_4c(a_) \
+ pld(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pud(0,a_,ax)
+#undef p1_scal_4c
+#define p1_scal_4c(a_) \
+ plq(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ puq(0,a_,ax)
+#undef p2_scal_4c
+#define p2_scal_4c(a_) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(0,4) \
+ pc(1,5) \
+ pm(6,0) \
+ pm(6,1) \
+ ps(CSHUF,4,4) \
+ ps(CSHUF,5,5) \
+ pm(7,4) \
+ pa(4,0) \
+ pm(7,5) \
+ pa(5,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_scal_4c
+#define p4_scal_4c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pc(0,4) \
+ pc(1,5) \
+ pm(6,0) \
+ pm(6,1) \
+ ps(CSHUF,4,4) \
+ ps(CSHUF,5,5) \
+ pm(7,4) \
+ pa(4,0) \
+ pc(2,4) \
+ pm(7,5) \
+ pa(5,1) \
+ pc(3,5) \
+ pm(6,2) \
+ pm(6,3) \
+ ps(CSHUF,4,4) \
+ ps(CSHUF,5,5) \
+ pm(7,4) \
+ pa(4,2) \
+ pm(7,5) \
+ pa(5,3) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef lpscal_4c
+#define lpscal_4c(a_)
+#undef dpscal_4c
+#define dpscal_4c(a_) p4_scal_4c(a_)
+#undef plscal_4c
+#define plscal_4c 16
+
+#undef p1_4_scal_1
+#define p1_4_scal_1(a_) \
+ pls(a_,ax,1) \
+ pmsr(0,1) \
+ pus(1,a_,ax)
+#undef p1_2_scal_1
+#define p1_2_scal_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pm(0,1) \
+ pud(1,a_,ax)
+#undef p1_scal_1
+#define p1_scal_1(a_) \
+ plq(a_,ax,1) \
+ pm(0,1) \
+ puq(1,a_,ax)
+#undef p2_scal_1
+#define p2_scal_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pm(0,1) \
+ pm(0,2) \
+ puq(1,a_,ax) \
+ puq(2,SS(a_,RS4),ax)
+#undef p4_scal_1
+#define p4_scal_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(0,1) \
+ puq(3,SS(a_,MM(1,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pm(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pm(0,7) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef lpscal_1
+#define lpscal_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3) \
+ pm(0,7)
+#undef dpscal_1
+#define dpscal_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(0,1) \
+ puq(3,SS(a_,MM(1,RS4)),ax) \
+ pm(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef plscal_1
+#define plscal_1 RS4
+
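+/*
+ * Editorial annotation, not part of the original ATLAS source: the *_scal_*
+ * families above scale a vector in place by a scalar broadcast into an XMM
+ * register before the loop; the *c variants form the complex product by
+ * combining each element pair with a CSHUF-swapped copy of itself.  Scalar C
+ * sketches, with alpha_re/alpha_im standing in for those registers:
+ *
+ *   static void scal_real(int n, float alpha, float *x)
+ *   {
+ *       for (int i = 0; i < n; i++)
+ *           x[i] *= alpha;
+ *   }
+ *
+ *   static void scal_cplx(int n, float alpha_re, float alpha_im, float *x)
+ *   {
+ *       for (int i = 0; i < n; i++) {      // n complex pairs
+ *           float re = x[2*i], im = x[2*i + 1];
+ *           x[2*i]     = re * alpha_re - im * alpha_im;
+ *           x[2*i + 1] = re * alpha_im + im * alpha_re;
+ *       }
+ *   }
+ */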
+
+#undef p1_4_set_1
+#define p1_4_set_1(a_) \
+ pls(a_,ax,1) \
+ pcs(0,1) \
+ pus(1,a_,ax)
+#undef p1_2_set_1
+#define p1_2_set_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pc(0,1) \
+ pud(1,a_,ax)
+#undef p1_set_1
+#define p1_set_1(a_) \
+ plq(a_,ax,1) \
+ pc(0,1) \
+ puq(1,a_,ax)
+#undef p2_set_1
+#define p2_set_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pc(0,1) \
+ pc(0,2) \
+ puq(1,a_,ax) \
+ puq(2,SS(a_,RS4),ax)
+#undef p4_set_1
+#define p4_set_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pc(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pc(0,1) \
+ puq(3,SS(a_,MM(1,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pc(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pc(0,7) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef lpset_1
+#define lpset_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3) \
+ pc(0,7)
+#undef dpset_1
+#define dpset_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pc(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pc(0,1) \
+ puq(3,SS(a_,MM(1,RS4)),ax) \
+ pc(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef plset_1
+#define plset_1 RS4
+
+
+#undef p1_4_set_2
+#define p1_4_set_2(a_) \
+ pus(0,a_,ax)
+#undef p1_2_set_2
+#define p1_2_set_2(a_) \
+ pud(0,a_,ax)
+#undef p1_set_2
+#define p1_set_2(a_) \
+ puq(0,a_,ax)
+#undef p2_set_2
+#define p2_set_2(a_) \
+ puq(0,a_,ax) \
+ puq(0,SS(a_,RS4),ax)
+#undef p4_set_2
+#define p4_set_2(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ puq(0,a_,ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ puq(0,SS(a_,MM(2,RS4)),ax) \
+ puq(0,SS(a_,MM(3,RS4)),ax)
+#undef lpset_2
+#define lpset_2(a_)
+#undef dpset_2
+#define dpset_2(a_) \
+ puq(0,a_,ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax) \
+ puq(0,SS(a_,MM(2,RS4)),ax) \
+ puq(0,SS(a_,MM(3,RS4)),ax)
+#undef plset_2
+#define plset_2 RS4
+
+
+#undef p1_4_set_3
+#define p1_4_set_3(a_) \
+ pus(0,a_,ax)
+#undef p1_2_set_3
+#define p1_2_set_3(a_) \
+ pud(0,a_,ax)
+#undef p1_set_3
+#define p1_set_3(a_) \
+ puq(0,SS(a_,MM(0,RS4)),ax)
+#undef p2_set_3
+#define p2_set_3(a_) \
+ puq(0,SS(a_,MM(0,RS4)),ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax)
+#undef p4_set_3
+#define p4_set_3(a_) \
+ puq(0,SS(a_,MM(0,RS4)),ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax) \
+ puq(0,SS(a_,MM(2,RS4)),ax) \
+ puq(0,SS(a_,MM(3,RS4)),ax)
+#undef p8_set_3
+#define p8_set_3(a_) \
+ puq(0,SS(a_,MM(0,RS4)),ax) \
+ puq(0,SS(a_,MM(1,RS4)),ax) \
+ puq(0,SS(a_,MM(2,RS4)),ax) \
+ puq(0,SS(a_,MM(3,RS4)),ax) \
+ puq(0,SS(a_,MM(4,RS4)),ax) \
+ puq(0,SS(a_,MM(5,RS4)),ax) \
+ puq(0,SS(a_,MM(6,RS4)),ax) \
+ puq(0,SS(a_,MM(7,RS4)),ax)
+#undef lpset_3
+#define lpset_3(a_)
+#undef dpset_3
+#define dpset_3(a_) p8_set_3(a_)
+#undef plset_3
+#define plset_3 32
+
+
+#undef p1_4_0x1_nrm2_1
+#define p1_4_0x1_nrm2_1(a_) \
+ pls(a_,ax,1) \
+ pmsr(1,1) \
+ pasr(1,0)
+#undef p1_2_0x1_nrm2_1
+#define p1_2_0x1_nrm2_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pm(1,1) \
+ pa(1,0)
+#undef p1_0x1_nrm2_1
+#define p1_0x1_nrm2_1(a_) \
+ plq(a_,ax,1) \
+ pm(1,1) \
+ pa(1,0)
+#undef p2_0x1_nrm2_1
+#define p2_0x1_nrm2_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pm(1,1) \
+ pm(2,2) \
+ pa(1,0) \
+ pm(2,0)
+#undef p4_0x1_nrm2_1
+#define p4_0x1_nrm2_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(3,3) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(1,1) \
+ pa(3,0) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pm(2,2) \
+ pa(1,0) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pm(7,7) \
+ pa(2,0)
+#undef lp0x1_nrm2_1
+#define lp0x1_nrm2_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3) \
+ pm(7,7)
+#undef dp0x1_nrm2_1
+#define dp0x1_nrm2_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(3,3) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(1,1) \
+ pa(3,0) \
+ pm(2,2) \
+ pa(1,0) \
+ pa(2,0)
+#undef pl0x1_nrm2_1
+#define pl0x1_nrm2_1 RS4
+
+
+#undef p1_4_nrm2_2
+#define p1_4_nrm2_2(a_) \
+ pls(a_,ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pcs(5,6) dbg(6) \
+ pcs(5,7) dbg(7) \
+ paxs(1,5) dbg(5) \
+ prps(5,2) dbg(2) \
+ px(3) \
+ pcms(0,2,3) dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pasr(3,7) dbg(7) \
+ pcs(7,5) dbg(5) \
+ pdsr(5,6) dbg(6) \
+ pdsr(5,1) dbg(1) \
+ pmsr(6,6) dbg(6) \
+ pmsr(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pasr(1,0) dbg(0)
+#undef p1_2_nrm2_2
+#define p1_2_nrm2_2(a_) \
+ px(1) pld(a_,ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef p1_nrm2_2
+#define p1_nrm2_2(a_) \
+ plq(a_,ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#define p2_nrm2_2(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef lpnrm2_2
+#define lpnrm2_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef dpnrm2_2
+#define dpnrm2_2(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ pan(4,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pc(5,7) dbg(7) \
+ pax(1,5) dbg(5) \
+ prp(5,2) dbg(2) \
+ px(3) \
+ pcm(0,2,3)dbg(3) \
+ pan(3,7) dbg(7) \
+ pann(5,3) dbg(3) \
+ pa(3,7) dbg(7) \
+ pc(7,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef plnrm2_2
+#define plnrm2_2 8
+
+
+#undef p1_4_nrm2_3
+#define p1_4_nrm2_3(a_) \
+ pls(a_,ax,1) dbg(1) \
+ pcs(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ paxs(1,5) dbg(5) \
+ pdsr(5,6) dbg(6) \
+ pdsr(5,1) dbg(1) \
+ pmsr(6,6) dbg(6) \
+ pmsr(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pasr(1,0) dbg(0)
+#undef p1_2_nrm2_3
+#define p1_2_nrm2_3(a_) \
+ px(1) pld(a_,ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef p1_nrm2_3
+#define p1_nrm2_3(a_) \
+ plq(a_,ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#define p2_nrm2_3(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef lpnrm2_3
+#define lpnrm2_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef dpnrm2_3
+#define dpnrm2_3(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ pc(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ pax(1,5) dbg(5) \
+ pd(5,6) dbg(6) \
+ pd(5,1) dbg(1) \
+ pm(6,6) dbg(6) \
+ pm(1,1) dbg(1) \
+ pm(6,0) dbg(0) \
+ pa(1,0) dbg(0)
+#undef plnrm2_3
+#define plnrm2_3 8
+
+#define block_nrm2_4(a_,b_) \
+ Mjoin(pc,a_)(5,6) dbg(6) \
+ pan(4,1) dbg(1) \
+ Mjoin(pax,a_)(1,5) dbg(5) \
+ Mjoin(pc,a_)(2,7) dbg(7) \
+ Mjoin(pd,b_)(5,7) dbg(7) \
+ Mjoin(pm,b_)(7,6) dbg(6) \
+ Mjoin(pm,b_)(7,1) dbg(1) \
+ Mjoin(pm,b_)(6,6) dbg(6) \
+ Mjoin(pm,b_)(6,0) dbg(0) \
+ Mjoin(pm,b_)(1,1) dbg(1) \
+ Mjoin(pa,b_)(1,0) dbg(0)
+
+
+/* #undef p1_4_nrm2_4 */
+/* #define p1_4_nrm2_4(a_) \ */
+/* pls(a_,ax,1) dbg(1) \ */
+/* pcs(5,6) dbg(6) \ */
+/* pan(4,1) dbg(1) \ */
+/* paxs(1,5) dbg(5) \ */
+/* pcs(2,7) dbg(7) \ */
+/* pdsr(5,7) dbg(7) \ */
+/* pmsr(7,6) dbg(6) \ */
+/* pmsr(7,1) dbg(1) \ */
+/* pmsr(6,6) dbg(6) \ */
+/* pmsr(6,0) dbg(0) \ */
+/* pmsr(1,1) dbg(1) \ */
+/* pasr(1,0) dbg(0) */
+#undef p1_4_nrm2_4
+#define p1_4_nrm2_4(a_) \
+ pls(a_,ax,1) dbg(1) \
+ block_nrm2_4(s,sr)
+#undef p1_2_nrm2_4
+#define p1_2_nrm2_4(a_) \
+ px(1) pld(a_,ax,1) dbg(1) \
+ block_nrm2_4(,)
+#undef p1_nrm2_4
+#define p1_nrm2_4(a_) \
+ plq(a_,ax,1) dbg(1) \
+ block_nrm2_4(,)
+#define p2_nrm2_4(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ block_nrm2_4(,) \
+ plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ block_nrm2_4(,)
+#undef lpnrm2_4
+#define lpnrm2_4(a_) \
+ plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ block_nrm2_4(,)
+#undef dpnrm2_4
+#define dpnrm2_4(a_) \
+ plq(SS(a_,RS4),ax,1) dbg(1) \
+ block_nrm2_4(,)
+#undef plnrm2_4
+#define plnrm2_4 8
+
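+/*
+ * Editorial annotation, not part of the original ATLAS source: the *_nrm2_*
+ * families above accumulate a sum of squares for the 2-norm; *_0x1_nrm2_1 is
+ * the plain x[i]*x[i] accumulation, while the _2/_3/_4 variants keep a
+ * running scale (the largest |x[i]| seen so far) and rescale the partial sum
+ * to avoid overflow/underflow, in the spirit of the classic reference loop:
+ *
+ *   #include <math.h>
+ *
+ *   static float nrm2_ref(int n, const float *x)
+ *   {
+ *       float scale = 0.0f, ssq = 1.0f;
+ *       for (int i = 0; i < n; i++) {
+ *           float ax = fabsf(x[i]);
+ *           if (ax == 0.0f)
+ *               continue;
+ *           if (scale < ax) {                 // new maximum: rescale ssq
+ *               float r = scale / ax;
+ *               ssq = 1.0f + ssq * r * r;
+ *               scale = ax;
+ *           } else {
+ *               float r = ax / scale;
+ *               ssq += r * r;
+ *           }
+ *       }
+ *       return scale * sqrtf(ssq);
+ *   }
+ */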
+
+#undef p1_4_1x1_1
+#define p1_4_1x1_1(a_) \
+ pls(a_,ax,1) \
+ pls(a_,bx,0) \
+ pm(0,1) \
+ pa(1,6)
+#undef p1_2_1x1_1
+#define p1_2_1x1_1(a_) \
+ pld(a_,ax,1) \
+ pld(a_,bx,0) \
+ pm(0,1) \
+ pa(1,6)
+#undef p1_1x1_1
+#define p1_1x1_1(a_) \
+ plq(a_,ax,1) \
+ plq(a_,bx,0) \
+ pm(0,1) \
+ pa(0,6)
+#undef p2_1x1_1
+#define p2_1x1_1(a_) \
+ plq(a_,ax,1) \
+ plq(a_,bx,0) \
+ plq(SS(a_,RS4),ax,2) \
+ plq(SS(a_,RS4),bx,3) \
+ pm(0,1) \
+ pm(2,3) \
+ pa(1,6) \
+ pa(3,6)
+#undef p4_1x1_1
+#define p4_1x1_1(a_) \
+ f(nta,SS(a_,MM(4,RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(0,1) \
+ puq(3,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM(6,RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pm(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pm(0,7) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef lp1x1_1
+#define lp1x1_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,RS4),ax,3) \
+ pm(0,7)
+#undef dp1x1_1
+#define dp1x1_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pm(0,3) \
+ puq(7,a_,ax) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(0,1) \
+ puq(3,SS(a_,RS4),ax) \
+ pm(0,2) \
+ puq(1,SS(a_,MM(2,RS4)),ax) \
+ puq(2,SS(a_,MM(3,RS4)),ax)
+#undef pl1x1_1
+#define pl1x1_1 RS4
+
+
+#undef p1_4_0x1_asum_1
+#define p1_4_0x1_asum_1(a_) \
+ pls(a_,ax,1) \
+ pan(4,1) \
+ pasr(1,0)
+#undef p1_2_0x1_asum_1
+#define p1_2_0x1_asum_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pan(4,1) \
+ pa(1,0)
+#undef p1_0x1_asum_1
+#define p1_0x1_asum_1(a_) \
+ plq(a_,ax,1) \
+ pan(4,1) \
+ pa(1,0)
+#undef p2_0x1_asum_1
+#define p2_0x1_asum_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pan(4,1) \
+ pan(4,2) \
+ pa(1,0) \
+ pa(2,0)
+#undef p4_0x1_asum_1
+#define p4_0x1_asum_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pan(4,3) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pan(4,1) \
+ pa(3,0) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pan(4,2) \
+ pa(1,0) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pan(4,7) \
+ pa(2,0)
+#undef lp0x1_asum_1
+#define lp0x1_asum_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3) \
+ pan(4,7)
+#undef dp0x1_asum_1
+#define dp0x1_asum_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pan(4,3) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pan(4,1) \
+ pa(3,0) \
+ pan(4,2) \
+ pa(1,0) \
+ pa(2,0)
+#undef pl0x1_asum_1
+#define pl0x1_asum_1 RS4
+
+
+#undef p1_4_sum_1
+#define p1_4_sum_1(a_) \
+ pls(a_,ax,1) \
+ pasr(1,0)
+#undef p1_2_sum_1
+#define p1_2_sum_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ pa(1,0)
+#undef p1_sum_1
+#define p1_sum_1(a_) \
+ plq(a_,ax,1) \
+ pa(1,0)
+#undef p2_sum_1
+#define p2_sum_1(a_) \
+ plq(a_,ax,1) \
+ plq(SS(a_,RS4),ax,2) \
+ pa(1,0) \
+ pa(2,0)
+#undef p4_sum_1
+#define p4_sum_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pa(3,0) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,7) \
+ pa(1,0) \
+ plq(SS(a_,MM(5,RS4)),ax,3) \
+ pa(2,0)
+#undef lpsum_1
+#define lpsum_1(a_) \
+ plq(a_,ax,7) \
+ plq(SS(a_,MM(1,RS4)),ax,3)
+#undef dpsum_1
+#define dpsum_1(a_) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ pa(7,0) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pa(3,0) \
+ pa(1,0) \
+ pa(2,0)
+#undef plsum_1
+#define plsum_1 RS4
+
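+/*
+ * Editorial annotation, not part of the original ATLAS source: *_0x1_asum_1
+ * accumulates sum(|x[i]|), using the AND mask in register 4 to clear the sign
+ * bit, and *_sum_1 accumulates the plain sum(x[i]).  Scalar C sketches:
+ *
+ *   #include <math.h>
+ *
+ *   static float asum_ref(int n, const float *x)
+ *   {
+ *       float s = 0.0f;
+ *       for (int i = 0; i < n; i++)
+ *           s += fabsf(x[i]);
+ *       return s;
+ *   }
+ *
+ *   static float sum_ref(int n, const float *x)
+ *   {
+ *       float s = 0.0f;
+ *       for (int i = 0; i < n; i++)
+ *           s += x[i];
+ *       return s;
+ *   }
+ */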
+
+#undef p1_4_dot_1
+#define p1_4_dot_1(a_) \
+ pls(a_,ax,1) \
+ pls(a_,cx,2) \
+ pmsr(2,1) \
+ pasr(1,0)
+#undef p1_2_dot_1
+#define p1_2_dot_1(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ px(2) \
+ pld(a_,cx,2) \
+ pm(2,1) \
+ pa(1,0)
+#undef p1_dot_1
+#define p1_dot_1(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,2) \
+ pm(2,1) \
+ pa(1,0)
+#undef p2_dot_1
+#define p2_dot_1(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pm(4,3) \
+ pa(3,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(2,1) \
+ pa(1,0)
+#undef lpdot_1
+#define lpdot_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(a_,ax,3) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(a_,cx,4)
+#undef dpdot_1
+#define dpdot_1(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pm(4,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,0)
+#undef pldot_1
+#define pldot_1 8
+
+#undef p1_4_dot_1c
+#define p1_4_dot_1c(a_)
+#undef p1_2_dot_1c
+#define p1_2_dot_1c(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ px(2) \
+ pld(a_,cx,2) \
+ pc(1,3) \
+ ps(HSHUF,1,1) \
+ ps(LSHUF,3,3) \
+ pm(7,1) \
+ pm(2,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef p1_dot_1c
+#define p1_dot_1c(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,2) \
+ pc(1,3) \
+ ps(HSHUF,1,1) \
+ ps(LSHUF,3,3) \
+ pm(7,1) \
+ pm(2,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef p2_dot_1c
+#define p2_dot_1c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pc(3,5) \
+ ps(HSHUF,3,3) \
+ ps(LSHUF,5,5) \
+ pm(7,3) \
+ pm(4,5) \
+ pa(5,0) \
+ pm(4,3) \
+ pa(3,6) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ plq(SS(a_,MM(2,RS4)),ax,3) \
+ pc(1,5) \
+ ps(HSHUF,1,1) \
+ ps(LSHUF,5,5) \
+ pm(7,1) \
+ pm(2,5) \
+ pa(5,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef lpdot_1c
+#define lpdot_1c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(a_,ax,3) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(a_,cx,4)
+#undef dpdot_1c
+#define dpdot_1c(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pc(3,5) \
+ ps(HSHUF,3,3) \
+ ps(LSHUF,5,5) \
+ pm(7,3) \
+ pm(4,5) \
+ pa(5,0) \
+ pm(4,3) \
+ pa(3,6) \
+ pc(1,5) \
+ ps(HSHUF,1,1) \
+ ps(LSHUF,5,5) \
+ pm(7,1) \
+ pm(2,5) \
+ pa(5,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef pldot_1c
+#define pldot_1c 8
+
+#undef p1_4_dot_2c
+#define p1_4_dot_2c(a_)
+#undef p1_2_dot_2c
+#define p1_2_dot_2c(a_) \
+ px(1) \
+ pld(a_,ax,1) \
+ px(2) \
+ pld(a_,cx,2) \
+ pc(1,3) \
+ ps(CSHUF,1,1) \
+ pm(2,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef p1_dot_2c
+#define p1_dot_2c(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,2) \
+ pc(1,3) \
+ ps(CSHUF,1,1) \
+ pm(2,3) \
+ pa(3,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef p2_dot_2c
+#define p2_dot_2c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pc(3,5) \
+ ps(CSHUF,3,3) \
+ pm(4,5) \
+ pa(5,0) \
+ pm(4,3) \
+ pa(3,6) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ plq(SS(a_,MM(2,RS4)),ax,3) \
+ pc(1,5) \
+ ps(CSHUF,1,1) \
+ pm(2,5) \
+ pa(5,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef lpdot_2c
+#define lpdot_2c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(a_,ax,3) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(a_,cx,4)
+#undef dpdot_2c
+#define dpdot_2c(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,2) \
+ pc(3,5) \
+ ps(CSHUF,3,3) \
+ pm(4,5) \
+ pa(5,0) \
+ pm(4,3) \
+ pa(3,6) \
+ pc(1,5) \
+ ps(CSHUF,1,1) \
+ pm(2,5) \
+ pa(5,0) \
+ pm(2,1) \
+ pa(1,6)
+#undef pldot_2c
+#define pldot_2c 8
+
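+/*
+ * Editorial annotation, not part of the original ATLAS source: the *_dot_1
+ * macros accumulate the real dot product sum(x[i]*y[i]) of the vectors at
+ * %eax and %ecx, while the _1c/_2c variants build the two halves of the
+ * complex dot product (one of them the conjugated form).  Scalar C sketches:
+ *
+ *   static float dot_ref(int n, const float *x, const float *y)
+ *   {
+ *       float s = 0.0f;
+ *       for (int i = 0; i < n; i++)
+ *           s += x[i] * y[i];
+ *       return s;
+ *   }
+ *
+ *   static void dotc_ref(int n, const float *x, const float *y, float *out)
+ *   {
+ *       float re = 0.0f, im = 0.0f;
+ *       for (int i = 0; i < n; i++) {     // conjugate(x[i]) * y[i]
+ *           float xr = x[2*i], xi = x[2*i + 1];
+ *           float yr = y[2*i], yi = y[2*i + 1];
+ *           re += xr * yr + xi * yi;
+ *           im += xr * yi - xi * yr;
+ *       }
+ *       out[0] = re;
+ *       out[1] = im;
+ *   }
+ */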
+#undef p1_4_axpby_3
+#define p1_4_axpby_3(a_) \
+ pls(a_,ax,0) \
+ pls(a_,cx,3) \
+ pmsr(5,0) \
+ pmsr(6,3) \
+ pasr(3,0) \
+ pus(0,a_,ax)
+#undef p1_2_axpby_3
+#define p1_2_axpby_3(a_) \
+ pld(a_,ax,0) \
+ pld(a_,cx,3) \
+ pm(5,0) \
+ pm(6,3) \
+ pa(3,0) \
+ pud(0,a_,ax)
+#undef p1_axpby_3
+#define p1_axpby_3(a_) \
+ plq(a_,ax,0) \
+ pl(a_,cx,3) \
+ pm(5,0) \
+ pm(6,3) \
+ pa(3,0) \
+ punt(0,a_,ax)
+#undef p2_axpby_3
+#define p2_axpby_3(a_) \
+ plq(a_,ax,0) \
+ pl(a_,cx,3) \
+ plq(SS(a_,RS4),ax,1) \
+ pm(5,0) \
+ pm(6,3) \
+ pa(3,0) \
+ pl(SS(a_,RS4),cx,3) \
+ punt(0,a_,ax) \
+ pm(5,1) \
+ pm(6,3) \
+ pa(3,1) \
+ punt(1,SS(a_,RS4),ax)
+#undef p4_axpby_3
+#define p4_axpby_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(5,2) \
+ pl(SS(a_,MM(3,RS4)),cx,7) \
+ pm(6,4) \
+ pa(4,2) \
+ punt(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pm(5,3) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,7) \
+ pa(7,3) \
+ punt(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pm(5,0) \
+ pl(SS(a_,MM(5,RS4)),cx,7) \
+ pm(6,4) \
+ pa(4,0) \
+ punt(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ pm(5,1) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,7) \
+ pa(7,1) \
+ punt(3,SS(a_,MM(3,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpby_3
+#define lpaxpby_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,7) \
+ pm(5,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,4) \
+ pa(4,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(5,1) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(6,7) \
+ pa(7,1)
+#undef dpaxpby_3
+#define dpaxpby_3(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,7) \
+ pm(5,2) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,4) \
+ pa(4,2) \
+ pm(5,3) \
+ punt(0,a_,ax) \
+ pm(6,7) \
+ pa(7,3) \
+ punt(1,SS(a_,RS4),ax) \
+ punt(2,SS(a_,MM(2,RS4)),ax) \
+ punt(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_3
+#define plaxpby_3 16
+
+#undef p1_4_axpby_3c
+#define p1_4_axpby_3c(a_)
+#undef p1_2_axpby_3c
+#define p1_2_axpby_3c(a_) \
+ pld(a_,ax,0) \
+ pld(a_,cx,2) \
+ pc(0,3) \
+ pm(5,0) \
+ ps(CSHUF,3,3) \
+ pm(4,3) \
+ pa(3,0) \
+ pc(2,3) \
+ pm(6,2) \
+ pa(2,0) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,0) \
+ pud(0,a_,ax)
+#undef p1_axpby_3c
+#define p1_axpby_3c(a_) \
+ plq(a_,ax,0) \
+ pl(a_,cx,2) \
+ pc(0,3) \
+ pm(5,0) \
+ ps(CSHUF,3,3) \
+ pm(4,3) \
+ pa(3,0) \
+ pc(2,3) \
+ pm(6,2) \
+ pa(2,0) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,0) \
+ puq(0,a_,ax)
+#undef p2_axpby_3c
+#define p2_axpby_3c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,3) \
+ pc(1,2) \
+ pm(5,1) \
+ ps(CSHUF,2,2) \
+ pm(4,2) \
+ pa(2,1) \
+ pc(3,2) \
+ pm(6,3) \
+ pa(3,1) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,1) \
+ puq(0,a_,ax) \
+ plq(SS(a_,MM(2,RS4)),ax,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pc(0,3) \
+ pm(5,0) \
+ ps(CSHUF,3,3) \
+ pm(4,3) \
+ pa(3,0) \
+ pc(2,3) \
+ pm(6,2) \
+ pa(2,0) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,0) \
+ puq(1,SS(a_,RS4),ax)
+#undef lpaxpby_3c
+#define lpaxpby_3c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2) \
+ pc(0,3) \
+ pm(5,0) \
+ ps(CSHUF,3,3) \
+ pm(4,3) \
+ pa(3,0) \
+ pc(2,3) \
+ pm(6,2) \
+ pa(2,0) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,0)
+#undef dpaxpby_3c
+#define dpaxpby_3c(a_) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pl(SS(a_,MM(1,RS4)),cx,3) \
+ pc(1,2) \
+ pm(5,1) \
+ ps(CSHUF,2,2) \
+ pm(4,2) \
+ pa(2,1) \
+ pc(3,2) \
+ pm(6,3) \
+ pa(3,1) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef plaxpby_3c
+#define plaxpby_3c 8
+
+#undef p1_4_axpby_2
+#define p1_4_axpby_2(a_) \
+ pls(a_,cx,5) \
+ pls(a_,ax,0) \
+ pmsr(6,5) \
+ pasr(5,0) \
+ pus(0,a_,ax)
+#undef p1_2_axpby_2
+#define p1_2_axpby_2(a_) \
+ pld(a_,cx,5) \
+ pld(a_,ax,0) \
+ pm(6,5) \
+ pa(5,0) \
+ pud(0,a_,ax)
+#undef p1_axpby_2
+#define p1_axpby_2(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pm(6,5) \
+ pa(5,0) \
+ puq(0,a_,ax)
+#undef p2_axpby_2
+#define p2_axpby_2(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,5) \
+ pa(5,0) \
+ plq(SS(a_,RS4),ax,1) \
+ puq(0,a_,ax) \
+ pm(6,4) \
+ pa(4,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_axpby_2
+#define p4_axpby_2(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ pm(6,4) \
+ pa(4,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,5) \
+ pa(5,3) \
+ puq(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pm(6,4) \
+ pa(4,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,5) \
+ pa(5,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpby_2
+#define lpaxpby_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,5) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,4) \
+ pa(4,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(6,5) \
+ pa(5,1)
+#undef dpaxpby_2
+#define dpaxpby_2(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,4) \
+ pa(4,2) \
+ puq(0,a_,ax) \
+ pm(6,5) \
+ pa(5,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_2
+#define plaxpby_2 16
+
+#undef p1_4_axpby_2c
+#define p1_4_axpby_2c(a_)
+#undef p1_2_axpby_2c
+#define p1_2_axpby_2c(a_) \
+ pld(a_,cx,5) \
+ pld(a_,ax,0) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pud(0,a_,ax)
+#undef p1_axpby_2c
+#define p1_axpby_2c(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ puq(0,a_,ax)
+#undef p2_axpby_2c
+#define p2_axpby_2c(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ plq(SS(a_,RS4),ax,1) \
+ puq(0,a_,ax) \
+ pc(4,3) \
+ pm(6,4) \
+ pa(4,1) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_axpby_2c
+#define p4_axpby_2c(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ puq(0,a_,ax) \
+ pc(4,0) \
+ pm(6,4) \
+ pa(4,2) \
+ ps(CSHUF,0,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pm(7,0) \
+ pa(0,2) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ puq(1,SS(a_,RS4),ax) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,3) \
+ ps(CSHUF,1,1) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pm(7,1) \
+ pa(1,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ pm(7,2) \
+ pa(2,0) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pl(SS(a_,MM(7,RS4)),cx,5) \
+ pm(7,3) \
+ pa(3,1) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpby_2c
+#define lpaxpby_2c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,5) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(7,2) \
+ pa(2,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ pm(7,3) \
+ pa(3,1)
+#undef dpaxpby_2c
+#define dpaxpby_2c(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ puq(0,a_,ax) \
+ pc(4,0) \
+ pm(6,4) \
+ pa(4,2) \
+ ps(CSHUF,0,0) \
+ puq(1,SS(a_,RS4),ax) \
+ pm(7,0) \
+ pa(0,2) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,3) \
+ ps(CSHUF,1,1) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pm(7,1) \
+ pa(1,3) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_2c
+#define plaxpby_2c 16
+
+#undef p1_4_axpby_1
+#define p1_4_axpby_1(a_) \
+ pls(a_,ax,1) \
+ pls(a_,cx,2) \
+ pmsr(5,1) \
+ pmsr(6,2) \
+ pasr(2,1) \
+ pus(1,a_,ax)
+#undef p1_2_axpby_1
+#define p1_2_axpby_1(a_) \
+ pld(a_,ax,1) \
+ pld(a_,cx,2) \
+ pm(5,1) \
+ pm(6,2) \
+ pa(2,1) \
+ pud(1,a_,ax)
+#undef p1_axpby_1
+#define p1_axpby_1(a_) \
+ plq(a_,ax,1) \
+ pl(a_,cx,2) \
+ pm(5,1) \
+ pm(6,2) \
+ pa(2,1) \
+ puq(1,a_,ax)
+#undef p2_axpby_1
+#define p2_axpby_1(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(5,1) \
+ pm(6,2) \
+ pa(2,1) \
+ puq(1,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pm(5,3) \
+ pm(6,4) \
+ pa(4,3) \
+ puq(3,SS(a_,RS4),ax)
+#undef lpaxpby_1
+#define lpaxpby_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2)
+#undef dpaxpby_1
+#define dpaxpby_1(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(5,1) \
+ pm(6,2) \
+ pa(2,1) \
+ puq(1,a_,ax) \
+ pm(5,3) \
+ pm(6,4) \
+ pa(4,3) \
+ puq(3,SS(a_,RS4),ax)
+#undef plaxpby_1
+#define plaxpby_1 8
+
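+/*
+ * Editorial annotation, not part of the original ATLAS source: the *_axpby_*
+ * families above update the vector at %eax as alpha*x + beta*y, with the two
+ * scalars broadcast into XMM registers (5 and 6 in the _1/_3 variants) before
+ * the loop; the *c variants do the same with complex scalars.  Real-valued
+ * scalar C sketch:
+ *
+ *   static void axpby_ref(int n, float alpha, const float *x,
+ *                         float beta, float *y)
+ *   {
+ *       for (int i = 0; i < n; i++)
+ *           y[i] = alpha * x[i] + beta * y[i];
+ *   }
+ */
+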
+#undef p1_4_axpy_0
+#define p1_4_axpy_0(a_) \
+ pls(a_,cx,2) \
+ pls(a_,ax,1) \
+ pmsr(6,2) \
+ pasr(2,1) \
+ pus(1,a_,ax)
+#undef p1_2_axpy_0
+#define p1_2_axpy_0(a_) \
+ pld(a_,cx,2) \
+ pld(a_,ax,1) \
+ pm(6,2) \
+ pa(2,1) \
+ pud(1,a_,ax)
+#undef p1_axpy_0
+#define p1_axpy_0(a_) \
+ pl(a_,cx,2) \
+ plq(a_,ax,1) \
+ pm(6,2) \
+ pa(2,1) \
+ puq(1,a_,ax)
+#undef p2_axpy_0
+#define p2_axpy_0(a_) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,2) \
+ pa(2,1) \
+ plq(SS(a_,RS4),ax,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ puq(1,a_,ax) \
+ pm(6,4) \
+ pa(4,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ puq(3,SS(a_,RS4),ax)
+#undef lpaxpy_0
+#define lpaxpy_0(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1)
+#undef dpaxpy_0
+#define dpaxpy_0(a_) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,2) \
+ pa(2,1) \
+ plq(SS(a_,RS4),ax,3) \
+ puq(1,a_,ax) \
+ pm(6,4) \
+ pa(4,3) \
+ puq(3,SS(a_,RS4),ax)
+#undef plaxpy_0
+#define plaxpy_0 8
+
+#undef p1_4_axpy_1
+#define p1_4_axpy_1(a_) \
+ pls(a_,cx,2) \
+ pls(a_,ax,1) \
+ pmsr(6,2) \
+ pasr(2,1) \
+ pus(1,a_,ax)
+#undef p1_2_axpy_1
+#define p1_2_axpy_1(a_) \
+ pld(a_,cx,2) \
+ pld(a_,ax,1) \
+ pm(6,2) \
+ pa(2,1) \
+ pud(1,a_,ax)
+#undef p1_axpy_1
+#define p1_axpy_1(a_) \
+ pl(a_,cx,2) \
+ pm(6,2) \
+ pam(a_,ax,2) \
+ puq(2,a_,ax)
+#undef p2_axpy_1
+#define p2_axpy_1(a_) \
+ pl(a_,cx,2) \
+ pm(6,2) \
+ pl(SS(a_,RS4),cx,4) \
+ pam(a_,ax,2) \
+ pm(6,4) \
+ puq(2,a_,ax) \
+ pam(SS(a_,RS4),ax,4) \
+ puq(4,SS(a_,RS4),ax)
+#undef p4_axpy_1
+#define p4_axpy_1(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,2) \
+ pam(SS(a_,MM(2,RS4)),ax,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ pl(SS(a_,MM(4,RS4)),cx,0) \
+ pm(6,3) \
+ pam(SS(a_,MM(3,RS4)),ax,3) \
+ puq(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(5,RS4)),cx,1) \
+ pm(6,0) \
+ pam(SS(a_,MM(4,RS4)),ax,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ pl(SS(a_,MM(6,RS4)),cx,2) \
+ pm(6,1) \
+ pam(SS(a_,MM(5,RS4)),ax,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef lpaxpy_1
+#define lpaxpy_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(a_,cx,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ pl(SS(a_,RS4),cx,1) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pm(6,0) \
+ pam(a_,ax,0) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ pm(6,1) \
+ pam(SS(a_,RS4),ax,1)
+#undef dpaxpy_1
+#define dpaxpy_1(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,2) \
+ pam(SS(a_,MM(2,RS4)),ax,2) \
+ puq(0,a_,ax) \
+ pm(6,3) \
+ pam(SS(a_,MM(3,RS4)),ax,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_1
+#define plaxpy_1 16
+
+#undef p1_4_axpy_2
+#define p1_4_axpy_2(a_) \
+ pls(a_,cx,5) \
+ pls(a_,ax,0) \
+ pmsr(6,5) \
+ pasr(5,0) \
+ pus(0,a_,ax)
+#undef p1_2_axpy_2
+#define p1_2_axpy_2(a_) \
+ pld(a_,cx,5) \
+ pld(a_,ax,0) \
+ pm(6,5) \
+ pa(5,0) \
+ pud(0,a_,ax)
+#undef p1_axpy_2
+#define p1_axpy_2(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pm(6,5) \
+ pa(5,0) \
+ puq(0,a_,ax)
+#undef p2_axpy_2
+#define p2_axpy_2(a_) \
+ pl(a_,cx,5) \
+ plq(a_,ax,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,5) \
+ pa(5,0) \
+ plq(SS(a_,RS4),ax,1) \
+ puq(0,a_,ax) \
+ pm(6,4) \
+ pa(4,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_axpy_2
+#define p4_axpy_2(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ pm(6,4) \
+ pa(4,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,5) \
+ pa(5,3) \
+ puq(1,SS(a_,RS4),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pm(6,4) \
+ pa(4,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,5) \
+ pa(5,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpy_2
+#define lpaxpy_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,5) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,4) \
+ pa(4,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(6,5) \
+ pa(5,1)
+#undef dpaxpy_2
+#define dpaxpy_2(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,4) \
+ pa(4,2) \
+ puq(0,a_,ax) \
+ pm(6,5) \
+ pa(5,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_2
+#define plaxpy_2 16
+
+#undef p1_4_axpy_2c
+#define p1_4_axpy_2c(a_)
+#undef p1_2_axpy_2c
+#define p1_2_axpy_2c(a_) \
+ pld(a_,cx,4) \
+ pld(a_,ax,0) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ pud(0,a_,ax)
+#undef p1_axpy_2c
+#define p1_axpy_2c(a_) \
+ pl(a_,cx,4) \
+ plq(a_,ax,0) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ puq(0,a_,ax)
+#undef p2_axpy_2c
+#define p2_axpy_2c(a_) \
+ pl(a_,cx,4) \
+ plq(a_,ax,0) \
+ pl(SS(a_,RS4),cx,5) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ plq(SS(a_,RS4),ax,1) \
+ puq(0,a_,ax) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,1) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_axpy_2c
+#define p4_axpy_2c(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ puq(0,a_,ax) \
+ pc(4,0) \
+ pm(6,4) \
+ pa(4,2) \
+ ps(CSHUF,0,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pm(7,0) \
+ pa(0,2) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ puq(1,SS(a_,RS4),ax) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,3) \
+ ps(CSHUF,1,1) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pm(7,1) \
+ pa(1,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,4) \
+ pm(7,2) \
+ pa(2,0) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pl(SS(a_,MM(7,RS4)),cx,5) \
+ pm(7,3) \
+ pa(3,1) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax)
+#undef lpaxpy_2c
+#define lpaxpy_2c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,4) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ pl(SS(a_,MM(1,RS4)),cx,5) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(4,2) \
+ pm(6,4) \
+ pa(4,0) \
+ ps(CSHUF,2,2) \
+ pl(SS(a_,MM(2,RS4)),cx,4) \
+ pm(7,2) \
+ pa(2,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pc(5,3) \
+ pm(6,5) \
+ pa(5,1) \
+ ps(CSHUF,3,3) \
+ pl(SS(a_,MM(3,RS4)),cx,5) \
+ pm(7,3) \
+ pa(3,1)
+#undef dpaxpy_2c
+#define dpaxpy_2c(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ puq(0,a_,ax) \
+ pc(4,0) \
+ pm(6,4) \
+ pa(4,2) \
+ ps(CSHUF,0,0) \
+ puq(1,SS(a_,RS4),ax) \
+ pm(7,0) \
+ pa(0,2) \
+ pc(5,1) \
+ pm(6,5) \
+ pa(5,3) \
+ ps(CSHUF,1,1) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pm(7,1) \
+ pa(1,3) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_2c
+#define plaxpy_2c 16
+
+#undef p1_4_axpy_1c
+#define p1_4_axpy_1c(a_)
+#undef p1_2_axpy_1c
+#define p1_2_axpy_1c(a_) \
+ pld(a_,cx,2) \
+ pc(2,0) \
+ pld(a_,ax,1) \
+ ps(CSHUF,0,0) \
+ pm(6,2) \
+ pa(2,1) \
+ pm(7,0) \
+ pa(0,1) \
+ pud(1,a_,ax)
+#undef p1_axpy_1c
+#define p1_axpy_1c(a_) \
+ pl(a_,cx,2) \
+ pc(2,0) \
+ plq(a_,ax,1) \
+ ps(CSHUF,0,0) \
+ pm(6,2) \
+ pa(2,1) \
+ pm(7,0) \
+ pa(0,1) \
+ puq(1,a_,ax)
+#undef p2_axpy_1c
+#define p2_axpy_1c(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ ps(CSHUF,0,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,2) \
+ pa(2,1) \
+ pm(7,0) \
+ pa(0,1) \
+ pc(4,0) \
+ puq(1,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,1) \
+ ps(CSHUF,0,0) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pm(6,4) \
+ pa(4,3) \
+ pm(7,0) \
+ pa(0,3) \
+ pc(2,0) \
+ puq(3,SS(a_,RS4),ax)
+#undef lpaxpy_1c
+#define lpaxpy_1c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,1) \
+ pc(2,0)
+#undef dpaxpy_1c
+#define dpaxpy_1c(a_) \
+ plq(SS(a_,RS4),ax,3) \
+ ps(CSHUF,0,0) \
+ pl(SS(a_,RS4),cx,4) \
+ pm(6,2) \
+ pa(2,1) \
+ pm(7,0) \
+ pa(0,1) \
+ pc(4,0) \
+ puq(1,a_,ax) \
+ ps(CSHUF,0,0) \
+ pm(6,4) \
+ pa(4,3) \
+ pm(7,0) \
+ pa(0,3) \
+ puq(3,SS(a_,RS4),ax)
+#undef plaxpy_1c
+#define plaxpy_1c 8
+
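+/*
+ * Editorial annotation, not part of the original ATLAS source: the *_axpy_*
+ * families above perform the classic y += alpha*x update, reading x from
+ * %ecx and updating y at %eax, with alpha broadcast into register 6; the *c
+ * variants handle a complex alpha via a CSHUF-swapped copy.  Real-valued
+ * scalar C sketch:
+ *
+ *   static void axpy_ref(int n, float alpha, const float *x, float *y)
+ *   {
+ *       for (int i = 0; i < n; i++)
+ *           y[i] += alpha * x[i];
+ *   }
+ */
+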
+#undef p1_4_copy_1
+#define p1_4_copy_1(a_) \
+ pls(a_,cx,2) \
+ pus(2,a_,ax)
+#undef p1_2_copy_1
+#define p1_2_copy_1(a_) \
+ pld(a_,cx,2) \
+ pud(2,a_,ax)
+#undef p1_copy_1
+#define p1_copy_1(a_) \
+ pl(a_,cx,2) \
+ puq(2,a_,ax)
+#undef p2_copy_1
+#define p2_copy_1(a_) \
+ pl(SS(a_,RS4),cx,4) \
+ puq(2,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ puq(4,SS(a_,RS4),ax)
+#undef lpcopy_1
+#define lpcopy_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,2)
+#undef dpcopy_1
+#define dpcopy_1(a_) \
+ pl(SS(a_,RS4),cx,4) \
+ puq(2,a_,ax) \
+ puq(4,SS(a_,RS4),ax)
+#undef plcopy_1
+#define plcopy_1 8
+
+#undef p1_4_copy_2
+#define p1_4_copy_2(a_) \
+ pls(a_,ax,2) \
+ pus(2,a_,cx)
+#undef p1_2_copy_2
+#define p1_2_copy_2(a_) \
+ pld(a_,ax,2) \
+ pud(2,a_,cx)
+#undef p1_copy_2
+#define p1_copy_2(a_) \
+ plq(a_,ax,2) \
+ pu(2,a_,cx)
+#undef p2_copy_2
+#define p2_copy_2(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pu(2,a_,cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pu(4,SS(a_,RS4),cx)
+#undef lpcopy_2
+#define lpcopy_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,2)
+#undef dpcopy_2
+#define dpcopy_2(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pu(2,a_,cx) \
+ pu(4,SS(a_,RS4),cx)
+#undef plcopy_2
+#define plcopy_2 8
+
+#undef p1_4_copy_3
+#define p1_4_copy_3(a_) \
+ pls(a_,cx,2) \
+ pus(2,a_,ax)
+#undef p1_2_copy_3
+#define p1_2_copy_3(a_) \
+ pld(a_,cx,2) \
+ pud(2,a_,ax)
+#undef p1_copy_3
+#define p1_copy_3(a_) \
+ pl(a_,cx,2) \
+ punt(2,a_,ax)
+#undef p2_copy_3
+#define p2_copy_3(a_) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ punt(0,SS(a_,MM(0,RS4)),ax) \
+ punt(1,SS(a_,MM(1,RS4)),ax)
+#undef p4_copy_3
+#define p4_copy_3(a_) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ punt(0,SS(a_,MM(0,RS4)),ax) \
+ punt(1,SS(a_,MM(1,RS4)),ax) \
+ punt(2,SS(a_,MM(2,RS4)),ax) \
+ punt(3,SS(a_,MM(3,RS4)),ax)
+#undef p8_copy_3
+#define p8_copy_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pl(SS(a_,MM(6,RS4)),cx,6) \
+ pl(SS(a_,MM(7,RS4)),cx,7) \
+ punt(0,SS(a_,MM(0,RS4)),ax) \
+ punt(1,SS(a_,MM(1,RS4)),ax) \
+ punt(2,SS(a_,MM(2,RS4)),ax) \
+ punt(3,SS(a_,MM(3,RS4)),ax) \
+ punt(4,SS(a_,MM(4,RS4)),ax) \
+ punt(5,SS(a_,MM(5,RS4)),ax) \
+ punt(6,SS(a_,MM(6,RS4)),ax) \
+ punt(7,SS(a_,MM(7,RS4)),ax)
+#undef lpcopy_3
+#define lpcopy_3(a_)
+#undef dpcopy_3
+#define dpcopy_3(a_) p8_copy_3(a_)
+#undef plcopy_3
+#define plcopy_3 32
+
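+/*
+ * The cpsc_* kernels copy one vector to another while multiplying by a
+ * scale factor (kept in register 3 for cpsc_1/_2 and in register 6 for
+ * the others); the numeric suffix selects the pointer/register pairing.
+ * The *_3c/*_5c variants perform a full complex scale, combining
+ * register 6 (real part) and register 7 (imaginary part) with a CSHUF
+ * lane swap.
+ */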
+#undef p1_4_cpsc_3
+#define p1_4_cpsc_3(a_) \
+ pls(a_,ax,0) \
+ pmsr(6,0) \
+ pus(0,a_,cx)
+#undef p1_2_cpsc_3
+#define p1_2_cpsc_3(a_) \
+ pld(a_,ax,0) \
+ pm(6,0) \
+ pud(0,a_,cx)
+#undef p1_cpsc_3
+#define p1_cpsc_3(a_) \
+ plq(a_,ax,0) \
+ pm(6,0) \
+ pu(0,a_,cx)
+#undef p2_cpsc_3
+#define p2_cpsc_3(a_) \
+ plq(a_,ax,0) \
+ plq(SS(a_,RS4),ax,1) \
+ pm(6,0) \
+ pm(6,1) \
+ pu(0,a_,cx) \
+ pu(1,SS(a_,RS4),cx)
+#undef p4_cpsc_3
+#define p4_cpsc_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,2) \
+ pu(0,a_,cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(6,3) \
+ pu(1,SS(a_,RS4),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pm(6,0) \
+ pu(2,SS(a_,MM(2,RS4)),cx) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(6,1) \
+ pu(3,SS(a_,MM(3,RS4)),cx) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx)
+#undef lpcpsc_3
+#define lpcpsc_3(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pm(6,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(6,1)
+#undef dpcpsc_3
+#define dpcpsc_3(a_) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(6,2) \
+ pu(0,a_,cx) \
+ pm(6,3) \
+ pu(1,SS(a_,RS4),cx) \
+ pu(2,SS(a_,MM(2,RS4)),cx) \
+ pu(3,SS(a_,MM(3,RS4)),cx)
+#undef plcpsc_3
+#define plcpsc_3 16
+
+#undef p1_4_cpsc_3c
+#define p1_4_cpsc_3c(a_)
+#undef p1_2_cpsc_3c
+#define p1_2_cpsc_3c(a_) \
+ pld(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pud(0,a_,cx)
+#undef p1_cpsc_3c
+#define p1_cpsc_3c(a_) \
+ plq(a_,ax,0) \
+ pc(0,1) \
+ pm(6,0) \
+ ps(CSHUF,1,1) \
+ pm(7,1) \
+ pa(1,0) \
+ pu(0,a_,cx)
+#undef p2_cpsc_3c
+#define p2_cpsc_3c(a_) \
+ plq(a_,ax,0) \
+ plq(SS(a_,RS4),ax,1) \
+ pc(0,2) \
+ pm(6,0) \
+ ps(CSHUF,2,2) \
+ pm(7,2) \
+ pa(2,0) \
+ pu(0,a_,cx) \
+ pc(1,3) \
+ pm(6,1) \
+ ps(CSHUF,3,3) \
+ pm(7,3) \
+ pa(3,1) \
+ pu(1,SS(a_,RS4),cx)
+#undef p4_cpsc_3c
+#define p4_cpsc_3c(a_) \
+ pu(0,a_,cx) \
+ pc(2,4) \
+ pm(6,2) \
+ ps(CSHUF,4,4) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,0) \
+ pm(7,4) \
+ pa(4,2) \
+ pu(1,SS(a_,RS4),cx) \
+ pc(3,4) \
+ pm(6,3) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(5,RS4)),ax,1) \
+ pm(7,4) \
+ pa(4,3) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pu(2,SS(a_,MM(2,RS4)),cx) \
+ pc(0,4) \
+ pm(6,0) \
+ ps(CSHUF,4,4) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(6,RS4)),ax,2) \
+ pm(7,4) \
+ pa(4,0) \
+ pu(3,SS(a_,MM(3,RS4)),cx) \
+ pc(1,4) \
+ pm(6,1) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(7,RS4)),ax,3) \
+ pm(7,4) \
+ pa(4,1) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx)
+#undef lpcpsc_3c
+#define lpcpsc_3c(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,0) \
+ plq(SS(a_,MM(1,RS4)),ax,1) \
+ pc(0,4) \
+ pm(6,0) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(7,4) \
+ pa(4,0) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pc(1,4) \
+ pm(6,1) \
+ ps(CSHUF,4,4) \
+ plq(SS(a_,MM(3,RS4)),ax,3) \
+ pm(7,4) \
+ pa(4,1)
+#undef dpcpsc_3c
+#define dpcpsc_3c(a_) \
+ pu(0,a_,cx) \
+ pc(2,4) \
+ pm(6,2) \
+ ps(CSHUF,4,4) \
+ pu(1,SS(a_,RS4),cx) \
+ pm(7,4) \
+ pa(4,2) \
+ pc(3,4) \
+ pm(6,3) \
+ ps(CSHUF,4,4) \
+ pu(2,SS(a_,MM(2,RS4)),cx) \
+ pm(7,4) \
+ pa(4,3) \
+ pu(3,SS(a_,MM(3,RS4)),cx)
+#undef plcpsc_3c
+#define plcpsc_3c 16
+
+#undef p1_4_cpsc_4
+#define p1_4_cpsc_4(a_) \
+ pls(a_,cx,0) \
+ pmsr(6,0) \
+ pus(0,a_,ax)
+#undef p1_2_cpsc_4
+#define p1_2_cpsc_4(a_) \
+ pld(a_,cx,0) \
+ pm(6,0) \
+ pud(0,a_,ax)
+#undef p1_cpsc_4
+#define p1_cpsc_4(a_) \
+ pl(a_,cx,0) \
+ pm(6,0) \
+ puq(0,a_,ax)
+#undef p2_cpsc_4
+#define p2_cpsc_4(a_) \
+ pl(a_,cx,0) \
+ pl(SS(a_,RS4),cx,1) \
+ pm(6,0) \
+ pm(6,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_cpsc_4
+#define p4_cpsc_4(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(4,RS4)),cx,0) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ pl(SS(a_,MM(5,RS4)),cx,1) \
+ pm(6,0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ pl(SS(a_,MM(6,RS4)),cx,2) \
+ pm(6,1) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef lpcpsc_4
+#define lpcpsc_4(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pm(6,0) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pm(6,1)
+#undef dpcpsc_4
+#define dpcpsc_4(a_) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,2) \
+ puq(0,a_,ax) \
+ pm(6,3) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plcpsc_4
+#define plcpsc_4 16
+
+#undef p1_4_cpsc_5
+#define p1_4_cpsc_5(a_) \
+ pls(a_,cx,0) \
+ pmsr(6,0) \
+ pus(0,a_,ax)
+#undef p1_2_cpsc_5
+#define p1_2_cpsc_5(a_) \
+ pld(a_,cx,0) \
+ pm(6,0) \
+ pud(0,a_,ax)
+#undef p1_cpsc_5
+#define p1_cpsc_5(a_) \
+ pl(a_,cx,0) \
+ pm(6,0) \
+ puq(0,a_,ax)
+#undef p2_cpsc_5
+#define p2_cpsc_5(a_) \
+ pl(a_,cx,0) \
+ pl(SS(a_,RS4),cx,1) \
+ pm(6,0) \
+ pm(6,1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_cpsc_5
+#define p4_cpsc_5(a_) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pm(6,0) \
+ pm(6,1) \
+ pm(6,2) \
+ pm(6,3) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef p8_cpsc_5
+#define p8_cpsc_5(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ pl(SS(a_,MM(5,RS4)),cx,5) \
+ pl(SS(a_,MM(6,RS4)),cx,7) \
+ pm(6,0) \
+ pm(6,1) \
+ pm(6,2) \
+ pm(6,3) \
+ puq(0,a_,ax) \
+ pl(SS(a_,MM(7,RS4)),cx,0) \
+ pm(6,4) \
+ pm(6,5) \
+ pm(6,7) \
+ pm(6,0) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ puq(4,SS(a_,MM(4,RS4)),ax) \
+ puq(5,SS(a_,MM(5,RS4)),ax) \
+ puq(7,SS(a_,MM(6,RS4)),ax) \
+ puq(0,SS(a_,MM(7,RS4)),ax)
+#undef lpcpsc_5
+#define lpcpsc_5(a_)
+#undef dpcpsc_5
+#define dpcpsc_5(a_) p8_cpsc_5(a_)
+#undef plcpsc_5
+#define plcpsc_5 32
+
+#undef cpsc_cdp
+#define cpsc_cdp(a_) pc(a_,5) pm(6,a_) ps(CSHUF,5,5) pm(7,5) pa(5,a_)
+#undef p1_4_cpsc_5c
+#define p1_4_cpsc_5c(a_)
+#undef p1_2_cpsc_5c
+#define p1_2_cpsc_5c(a_) \
+ pld(a_,cx,0) \
+ cpsc_cdp(0) \
+ pud(0,a_,ax)
+#undef p1_cpsc_5c
+#define p1_cpsc_5c(a_) \
+ pl(a_,cx,0) \
+ cpsc_cdp(0) \
+ puq(0,a_,ax)
+#undef p2_cpsc_5c
+#define p2_cpsc_5c(a_) \
+ pl(a_,cx,0) \
+ pl(SS(a_,RS4),cx,1) \
+ cpsc_cdp(0) \
+ cpsc_cdp(1) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax)
+#undef p4_cpsc_5c
+#define p4_cpsc_5c(a_) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ cpsc_cdp(0) \
+ cpsc_cdp(1) \
+ cpsc_cdp(2) \
+ cpsc_cdp(3) \
+ puq(0,a_,ax) \
+ puq(1,SS(a_,RS4),ax) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ puq(3,SS(a_,MM(3,RS4)),ax)
+#undef p8_cpsc_5c
+#define p8_cpsc_5c(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ pl(SS(a_,MM(0,RS4)),cx,0) \
+ pl(SS(a_,MM(1,RS4)),cx,1) \
+ pl(SS(a_,MM(2,RS4)),cx,2) \
+ pl(SS(a_,MM(3,RS4)),cx,3) \
+ pl(SS(a_,MM(4,RS4)),cx,4) \
+ cpsc_cdp(0) \
+ cpsc_cdp(1) \
+ puq(0,a_,ax) \
+ pl(SS(a_,MM(5,RS4)),cx,0) \
+ cpsc_cdp(2) \
+ cpsc_cdp(3) \
+ puq(1,SS(a_,RS4),ax) \
+ pl(SS(a_,MM(6,RS4)),cx,1) \
+ cpsc_cdp(4) \
+ cpsc_cdp(0) \
+ puq(2,SS(a_,MM(2,RS4)),ax) \
+ pl(SS(a_,MM(7,RS4)),cx,2) \
+ cpsc_cdp(1) \
+ cpsc_cdp(2) \
+ puq(3,SS(a_,MM(3,RS4)),ax) \
+ puq(4,SS(a_,MM(4,RS4)),ax) \
+ puq(0,SS(a_,MM(5,RS4)),ax) \
+ puq(1,SS(a_,MM(6,RS4)),ax) \
+ puq(2,SS(a_,MM(7,RS4)),ax)
+#undef lpcpsc_5c
+#define lpcpsc_5c(a_)
+#undef dpcpsc_5c
+#define dpcpsc_5c(a_) p8_cpsc_5c(a_)
+#undef plcpsc_5c
+#define plcpsc_5c 32
+
+#undef p1_4_cpsc_1
+#define p1_4_cpsc_1(a_) \
+ pls(a_,ax,2) \
+ pmsr(3,2) \
+ pus(2,a_,cx)
+#undef p1_2_cpsc_1
+#define p1_2_cpsc_1(a_) \
+ pld(a_,ax,2) \
+ pm(3,2) \
+ pud(2,a_,cx)
+#undef p1_cpsc_1
+#define p1_cpsc_1(a_) \
+ plq(a_,ax,2) \
+ pm(3,2) \
+ pu(2,a_,cx)
+#undef p2_cpsc_1
+#define p2_cpsc_1(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pm(3,2) \
+ pu(2,a_,cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,2) \
+ pm(3,4) \
+ pu(4,SS(a_,RS4),cx)
+#undef lpcpsc_1
+#define lpcpsc_1(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,2)
+#undef dpcpsc_1
+#define dpcpsc_1(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pm(3,2) \
+ pu(2,a_,cx) \
+ pm(3,4) \
+ pu(4,SS(a_,RS4),cx)
+#undef plcpsc_1
+#define plcpsc_1 8
+
+#undef p1_4_cpsc_2
+#define p1_4_cpsc_2(a_) \
+ pls(a_,ax,2) \
+ pmsr(3,2) \
+ pus(2,a_,cx)
+#undef p1_2_cpsc_2
+#define p1_2_cpsc_2(a_) \
+ pld(a_,ax,2) \
+ pm(3,2) \
+ pud(2,a_,cx)
+#undef p1_cpsc_2
+#define p1_cpsc_2(a_) \
+ plq(a_,ax,2) \
+ pm(3,2) \
+ pu(2,a_,cx)
+#undef p2_cpsc_2
+#define p2_cpsc_2(a_) \
+ plq(a_,ax,2) \
+ plq(SS(a_,RS4),ax,4) \
+ pm(3,2) \
+ pm(3,4) \
+ pu(2,a_,cx) \
+ pu(4,SS(a_,RS4),cx)
+#undef p4_cpsc_2
+#define p4_cpsc_2(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,7) \
+ pm(3,6) \
+ pu(4,a_,cx) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(3,7) \
+ pu(6,SS(a_,RS4),cx) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+ plq(SS(a_,MM(4,RS4)),ax,4) \
+ pm(3,2) \
+ pu(7,SS(a_,MM(2,RS4)),cx) \
+ plq(SS(a_,MM(5,RS4)),ax,6) \
+ pm(3,4) \
+ pu(2,SS(a_,MM(3,RS4)),cx)
+#undef lpcpsc_2
+#define lpcpsc_2(a_) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+ plq(SS(a_,MM(0,RS4)),ax,4) \
+ plq(SS(a_,MM(1,RS4)),ax,6) \
+ pm(3,4)
+#undef dpcpsc_2
+#define dpcpsc_2(a_) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,7) \
+ pm(3,6) \
+ pu(4,a_,cx) \
+ plq(SS(a_,MM(3,RS4)),ax,2) \
+ pm(3,7) \
+ pu(6,SS(a_,RS4),cx) \
+ pm(3,2) \
+ pu(7,SS(a_,MM(2,RS4)),cx) \
+ pu(2,SS(a_,MM(3,RS4)),cx)
+#undef plcpsc_2
+#define plcpsc_2 RS4
+
+
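+/*
+ * iamax kernels: the register usage is roughly reg0 = indices of the
+ * elements currently loaded, reg1 = index increment, reg2 = sign mask for
+ * the absolute value, reg3 = running maxima, reg6 = indices of those
+ * maxima; pcm (cmpps) builds the mask used to merge old and new indices.
+ */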
+#undef p1_4_iamax_1
+#define p1_4_iamax_1(a_) \
+ px(4) \
+ pls(a_,ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ paxs(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pasr(5,6) \
+ pasr(1,0) \
+ ps(57,0,0)
+#undef p1_2_iamax_1
+#define p1_2_iamax_1(a_) \
+ px(4) \
+ pld(a_,ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pasr(1,0) \
+ ps(57,0,0)\
+ pasr(1,0) \
+ ps(57,0,0)
+#undef p1_iamax_1
+#define p1_iamax_1(a_) \
+ plq(a_,ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0)
+#undef p2_iamax_1
+#define p2_iamax_1(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0) \
+ f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0)
+#undef lpiamax_1
+#define lpiamax_1(a_) \
+ f(nta,SS(a_,MM(CL,RS4)),ax) \
+ plq(a_,ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0)
+#undef dpiamax_1
+#define dpiamax_1(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ pan(2,4) \
+ pc(3,5) \
+ pcm(6,4,5) \
+ pax(4,3) \
+ pan(5,6) \
+ pann(0,5) \
+ pa(5,6) \
+ pa(1,0)
+#undef pliamax_1
+#define pliamax_1 8
+
+#undef p1_4_iamax_1d
+#define p1_4_iamax_1d(a_)
+#undef p1_2_iamax_1d
+#define p1_2_iamax_1d(a_) \
+ px(4) \
+ pld(a_,ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pasr(1,0) \
+ dbg(0) \
+ ps(1,0,0)
+#undef p1_iamax_1d
+#define p1_iamax_1d(a_) \
+ plq(a_,ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0)
+#undef p2_iamax_1d
+#define p2_iamax_1d(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0) \
+ dbg(0) \
+ f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \
+ plq(SS(a_,MM(2,RS4)),ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0)
+#undef lpiamax_1d
+#define lpiamax_1d(a_) \
+ f(nta,SS(a_,MM(CL,RS4)),ax) \
+ plq(a_,ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0)
+#undef dpiamax_1d
+#define dpiamax_1d(a_) \
+ plq(SS(a_,RS4),ax,4) \
+ dbg(2) \
+ pan(2,4) \
+ dbg(4) \
+ pc(3,5) \
+ dbg(5) \
+ pcm(6,4,5) \
+ dbg(5) \
+ pax(4,3) \
+ dbg(3) \
+ pan(5,6) \
+ dbg(6) \
+ pann(0,5) \
+ dbg(5) \
+ pa(5,6) \
+ dbg(6) \
+ pa(1,0)
+#undef pliamax_1d
+#define pliamax_1d 8
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h
new file mode 100644
index 0000000..03486cf
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h
@@ -0,0 +1,331 @@
+/***************************************
+ $Header: /cvsroot/math-atlas/AtlasBase/kernel/CammMaguire/camm_tpipe.h,v 1.2 2003/10/18 18:13:30 yycamm Exp $
+
+
+***************************************/
+
+
+/* #ifndef CAMM_TPIPE_H */
+/* #define CAMM_TPIPE_H */ /*+ To stop multiple inclusions. +*/
+
+#ifndef BITS
+#error BITS must be defined in camm_tpipe.h
+#endif
+#ifndef DIV
+#error DIV must be defined in camm_tpipe.h
+#endif
+#ifndef INC
+#error INC(a_) must be defined in camm_tpipe.h
+#endif
+#ifndef LR
+#error LR must be defined in camm_tpipe.h
+#endif
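+/*
+ * This fragment is textually included inside the including kernel's asm
+ * statement.  As an illustration only (the actual names are chosen by the
+ * includer), a caller typically provides something like
+ *     #define BITS 4                        log2 of the unrolled block size
+ *     #define LR   di                       register holding the loop count
+ *     #define INC(a_) a(a_,ax) a(a_,cx)     advance the data pointers
+ * while DIV comes from the precision selection in camm_util.h.
+ */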
+
+#ifdef ALIGN
+
+#if defined(SREAL)
+
+ test(4,ax)
+ je(a2)
+
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+
+ KB_block
+ INC(4)
+ sub(1,LR)
+
+ lab(a2)
+
+#endif
+
+#if defined(SREAL) || defined(DREAL)
+
+ test(8,ax)
+ je(a4)
+ test(-2,LR)
+ je(a4)
+
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(8)
+ sub(2,LR)
+
+ lab(a4)
+
+#endif
+#endif
+
+/* "movl %%edx,%%edi\n\t" */
+ push(LR)
+ shr(BITS,LR)
+ shl(BITS,LR)
+ m(4,LR)
+ ra(ax,LR)
+
+#if defined(ALIGN) && ( defined(SCPLX) || defined(DCPLX) )
+ test(12,ax)
+ je(loopa)
+#endif
+
+#if !defined(ALIGN) || defined(SCPLX) || defined(DCPLX)
+#undef plq
+#define plq(a_,b_,c_) pl(a_,b_,c_)
+#undef puq
+#define puq(a_,b_,c_) pu(a_,b_,c_)
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plx(a_,b_,c_,d_,e_)
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_) pux(a_,b_,c_,d_,e_)
+#else
+#undef plq
+#define plq(a_,b_,c_) pla(a_,b_,c_)
+#undef puq
+#define puq(a_,b_,c_) punt(a_,b_,c_)
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_)
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_)
+#endif
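+/* From here on plq/puq/plqx/puqx are either plain unaligned moves or, when
+   ALIGN is set for a real type, aligned loads paired with non-temporal
+   stores. */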
+
+ align
+ lab(loop)
+ cmp(ax,LR)
+ je(stop)
+
+#undef KB
+#define KB ( (1 << BITS) /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(4*KB/**DIV*/)
+
+ jmp(loop)
+
+ lab(stop)
+ pop(LR)
+
+#if ( 1 << BITS ) > 128
+ test(128,LR)
+ je(64)
+#undef KB
+#define KB ( 128 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(512)
+
+ lab(64)
+#endif
+
+#if ( 1 << BITS ) > 64
+ test(64,LR)
+ je(32)
+#undef KB
+#define KB ( 64 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(256)
+
+ lab(32)
+#endif
+
+#if ( 1 << BITS ) > 32
+ test(32,LR)
+ je(16)
+#undef KB
+#define KB ( 32 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(128)
+
+ lab(16)
+#endif
+
+#if ( 1 << BITS ) > 16
+ test(16,LR)
+ je(8)
+#undef KB
+#define KB ( 16 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(64)
+
+ lab(8)
+#endif
+
+#if ( 1 << BITS ) > 8
+ test(8,LR)
+ je(4)
+#undef KB
+#define KB ( 8 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(32)
+
+ lab(4)
+#endif
+
+#if ( 1 << BITS ) > 4
+ test(4,LR)
+ je(2)
+#undef KB
+#define KB ( 4 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(16)
+
+ lab(2)
+#endif
+
+#if DIV != 4 && ( 1 << BITS ) > 2
+ test(2,LR)
+ je(1)
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(8)
+
+ lab(1)
+#endif
+
+#if DIV == 1 && ( 1 << BITS ) > 1
+ test(1,LR)
+ je(end)
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ lab(end)
+#endif
+
+#if defined (ALIGN) && ( defined(SCPLX) || defined(DCPLX) )
+
+ jmp(tend)
+
+#undef plq
+#define plq(a_,b_,c_) pla(a_,b_,c_)
+#undef puq
+#define puq(a_,b_,c_) punt(a_,b_,c_)
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_)
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_)
+
+ align
+ lab(loopa)
+ cmp(ax,LR)
+ je(stopa)
+
+#undef KB
+#define KB ( (1 << BITS) /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(4*KB/**DIV*/)
+
+ jmp(loopa)
+
+ lab(stopa)
+ pop(LR)
+
+#if ( 1 << BITS ) > 128
+ test(128,LR)
+ je(64a)
+#undef KB
+#define KB ( 128 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(512)
+
+ lab(64a)
+#endif
+
+#if ( 1 << BITS ) > 64
+ test(64,LR)
+ je(32a)
+#undef KB
+#define KB ( 64 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(256)
+
+ lab(32a)
+#endif
+
+#if ( 1 << BITS ) > 32
+ test(32,LR)
+ je(16a)
+#undef KB
+#define KB ( 32 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(128)
+
+ lab(16a)
+#endif
+
+#if ( 1 << BITS ) > 16
+ test(16,LR)
+ je(8a)
+#undef KB
+#define KB ( 16 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(64)
+
+ lab(8a)
+#endif
+
+#if ( 1 << BITS ) > 8
+ test(8,LR)
+ je(4a)
+#undef KB
+#define KB ( 8 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(32)
+
+ lab(4a)
+#endif
+
+#if ( 1 << BITS ) > 4
+ test(4,LR)
+ je(2a)
+#undef KB
+#define KB ( 4 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(16)
+
+ lab(2a)
+#endif
+
+#if DIV != 4 && ( 1 << BITS ) > 2
+ test(2,LR)
+ je(1a)
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(8)
+
+ lab(1a)
+#endif
+
+#if DIV == 1 && ( 1 << BITS ) > 1
+ test(1,LR)
+ je(enda)
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ lab(enda)
+#endif
+
+ lab(tend)
+
+#endif
+
+/* #endif */ /* CAMM_TPIPE_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h
new file mode 100644
index 0000000..6b150d3
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h
@@ -0,0 +1,508 @@
+#ifndef CAMM_UTIL_H
+#define CAMM_UTIL_H /*+ To stop multiple inclusions. +*/
+
+typedef struct {
+ float r,i;
+} Complex;
+
+typedef struct {
+ double r,i;
+} Dcomplex;
+
+#undef str
+#define str(a_) xstr(a_)
+#undef xstr
+#define xstr(a_) #a_
+
+#undef val
+#define val(a_) xval(a_)
+#undef xval
+#define xval(a_) a_
+
+#ifndef Mjoin
+#define Mjoin(a,b) mjoin(a,b)
+#ifdef mjoin
+ #undef mjoin
+#endif
+#define mjoin(a,b) a ## b
+#endif
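+/* Mjoin pastes two tokens together after both arguments have been macro
+   expanded. */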
+
+#undef VOLATILE
+#define VOLATILE __volatile__
+#undef ASM
+#define ASM __asm__ VOLATILE
+
+#ifdef BETA0
+#undef BL
+#define BL b0
+#endif
+#ifdef BETA1
+#undef BL
+#define BL b1
+#endif
+#ifdef BETAX
+#undef BL
+#define BL bX
+#endif
+#ifdef BETAXI0
+#undef BL
+#define BL bXi0
+#endif
+
+#ifdef NO_TRANSPOSE
+#ifdef GER
+#ifdef Conj_
+#undef FEXT
+#define FEXT Gc
+#else
+#undef FEXT
+#define FEXT Gu
+#endif
+#else
+#ifdef Conj_
+#undef FEXT
+#define FEXT Nc
+#else
+#undef FEXT
+#define FEXT N
+#endif
+#endif
+#else
+#ifdef Conj_
+#undef FEXT
+#define FEXT C
+#else
+#undef FEXT
+#define FEXT T
+#endif
+#endif
+
+#undef BLC
+#define BLC Mjoin(FEXT,BL)
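+/* e.g. with FEXT == N and BL == b1, BLC expands to Nb1, so it records both
+   the transpose/conjugate variant and the beta case. */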
+
+#ifdef __GNUC__
+#undef NO_INLINE
+#define NO_INLINE double sq(double x) {return x*x;}
+#else
+#undef NO_INLINE
+#define NO_INLINE
+#endif
+
+#undef lab
+#define lab(a_) "\n" str(MY_FUNCTION) "_" str(N) "_" str(a_) ":\n\t"
+#undef jmp
+#define jmp(a_) "jmp " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef je
+#define je(a_) "je " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jge
+#define jge(a_) "jge " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jle
+#define jle(a_) "jle " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jl
+#define jl(a_) "jl " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jne
+#define jne(a_) "jne " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef align
+#define align ".align 16\n\t"
+#undef test
+#define test(a_,b_) "testl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef and
+#define and(a_,b_) "andl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef sub
+#define sub(a_,b_) "subl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef SS
+#define SS(a_,b_) a_ + b_
+#undef MM
+#define MM(a_,b_) a_ * b_
+#undef E4
+#define E4(a_) (( a_ >> 2 ) << 2 )
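+/* SS and MM build compile-time address arithmetic (sum and product of
+   their arguments); E4(n) rounds n down to a multiple of 4. */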
+
+#undef TYPE
+#undef SCALAR
+#undef PREC
+#undef CSHUF
+#undef LSHUF
+#undef HSHUF
+#undef ISHUF
+#undef RSHUF
+#undef SINGLE
+#undef REAL
+#undef DIV
+
+#ifdef SCPLX
+#define TYPE Complex
+#define SCALAR Complex *
+#define PREC c
+#define CSHUF 177
+#define LSHUF 160
+#define HSHUF 245
+#define ISHUF 13*17
+#define RSHUF 8*17
+#define SINGLE
+#define DIV 2
+/* #ifdef Conj_ */
+/* static const TYPE signd[2]={{-1.0,1.0},{-1.0,1.0}}; */
+/* #else */
+ static const TYPE signd[2]={{1.0,-1.0},{1.0,-1.0}};
+/* #endif */
+#endif
+
+#ifdef SREAL
+#define TYPE float
+#define SCALAR float
+#define PREC s
+#define SINGLE
+#define REAL
+#define DIV 1
+#endif
+
+#ifdef DREAL
+#define TYPE double
+#define SCALAR double
+#define PREC d
+#define REAL
+#define DIV 2
+#endif
+
+#ifdef DCPLX
+#define TYPE Dcomplex
+#define SCALAR Dcomplex *
+#define PREC z
+#define CSHUF 1
+#define LSHUF 0
+#define HSHUF 3
+#define ISHUF 3
+#define RSHUF 0
+#define DIV 4
+/* #ifdef Conj_ */
+/* static const TYPE signd[1]={{-1.0,1.0}}; */
+/* #else */
+ static const TYPE signd[1]={{1.0,-1.0}};
+/* #endif */
+#endif
+
+#undef M11
+#define M11 0
+#undef M12
+#define M12 1
+#undef M13
+#define M13 2
+#undef M14
+#define M14 3
+#undef M15
+#define M15 4
+#undef M16
+#define M16 5
+#undef M17
+#define M17 6
+#undef M18
+#define M18 7
+
+#undef M23
+#define M23 1
+#undef M24
+#define M24 2
+#undef M25
+#define M25 3
+#undef M26
+#define M26 4
+#undef M27
+#define M27 5
+#undef M28
+#define M28 6
+
+#undef M33
+#define M33 0
+#undef M34
+#define M34 1
+#undef M35
+#define M35 2
+#undef M36
+#define M36 3
+#undef M37
+#define M37 4
+#undef M38
+#define M38 5
+
+#undef P10
+#define P10 1
+#undef P11
+#define P11 2
+#undef P12
+#define P12 3
+#undef P13
+#define P13 4
+#undef P14
+#define P14 5
+#undef P15
+#define P15 6
+#undef P16
+#define P16 7
+
+#undef XM
+#define XM(a_,b_) M ## b_ ## a_
+#undef M
+#define M(a_,b_) XM(a_,b_)
+
+#undef XP
+#define XP(a_,b_) P ## b_ ## a_
+#undef P
+#define P(a_,b_) XP(a_,b_)
+
+#undef mex
+#define mex(a_) str(%%e ## a_)
+#undef msx
+#define msx(a_) "%%st(" str(a_) ")"
+
+#undef cmp
+#define cmp(a_,b_) "cmp " mex(a_) "," mex(b_) "\n\t"
+#undef icmpr
+#define icmpr(a_,b_) "cmp " mex(a_) ",(" mex(b_) ")\n\t"
+#undef f
+#define f(a_,b_,c_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ")\n\t"
+#undef pfx
+#define pfx(a_,b_,c_,d_,e_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ",%%e" #d_ "," str(e_) ")\n\t"
+#undef a
+#define a(a_,b_) "addl $" str(a_) "," mex(b_) "\n\t"
+#undef m
+#define m(a_,b_) "imul $" str(a_) "," mex(b_) "\n\t"
+#undef pop
+#define pop(a_) "popl %%e" str(a_) "\n\t"
+#undef push
+#define push(a_) "pushl %%e" str(a_) "\n\t"
+#undef d
+#define d(a_,b_) "idiv $" str(a_) "," mex(b_) "\n\t"
+#undef shl
+#define shl(a_,b_) "shl $" str(a_) "," mex(b_) "\n\t"
+#undef shr
+#define shr(a_,b_) "shr $" str(a_) "," mex(b_) "\n\t"
+#undef mm
+#define mm(a_,b_) "mov $" str(a_) "," mex(b_) "\n\t"
+#undef ra
+#define ra(a_,b_) "addl %%e" str(a_) "," mex(b_) "\n\t"
+#undef rs
+#define rs(a_,b_) "subl %%e" str(a_) "," mex(b_) "\n\t"
+
+#undef fl
+#define fl(a_,b_) "fldl " str(a_) "(" mex(b_) ")\n\t"
+#undef fp
+#define fp(a_,b_) "fstpl " str(a_) "(" mex(b_) ")\n\t"
+#undef fd
+#define fd(a_) "fld " msx(a_) "\n\t"
+#undef fap
+#define fap(a_,b_) "faddp " msx(a_) "," msx(b_) "\n\t"
+/* #define fsp(a_) fx(a_) "fsubp %%st," msx(a_) "\n\t" */
+#undef fsp
+#define fsp(a_) "fsubrp %%st," msx(a_) "\n\t"
+#undef fmp
+#define fmp(a_,b_) "fmulp " msx(a_) "," msx(b_) "\n\t"
+#undef fa
+#define fa(a_,b_) "fadd " msx(a_) "," msx(b_) "\n\t"
+#undef fm
+#define fm(a_,b_) "fmul " msx(a_) "," msx(b_) "\n\t"
+#undef faa
+#define faa(a_,b_) "faddl " str(a_) "(" mex(b_) ")\n\t"
+#undef fma
+#define fma(a_,b_) "fmull " str(a_) "(" mex(b_) ")\n\t"
+#undef fz
+#define fz "fldz\n\t"
+#undef fx
+#define fx(a_) "fxch " msx(a_) "\n\t"
+#undef fx1
+#define fx1 "fxch\n\t"
+#undef fc
+#define fc(a_) "fstp " msx(a_) "\n\t"
+
+
+#ifndef ATHLON
+
+
+#if defined(DREAL) || defined(DCPLX)
+#undef SSESUF
+#define SSESUF "d "
+#undef RS4
+#define RS4 16
+#undef RS
+#define RS 4
+#else
+#undef SSESUF
+#define SSESUF "s "
+#undef RS4
+#define RS4 16
+#undef RS
+#define RS 4
+#endif
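+/* SSESUF supplies the packed/scalar mnemonic suffix (s or d) used below;
+   RS4 is the byte stride of one vector register (16 for XMM here, 8 for
+   the MMX registers in the ATHLON branch) and RS that stride in 4-byte
+   words. */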
+
+#undef mxx
+#define mxx(a_) str(%%xmm ## a_)
+#undef prp
+#define prp(a_,b_) "rcpp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef prps
+#define prps(a_,b_) "rcps" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pann
+#define pann(a_,b_) "andnp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef psqs
+#define psqs(a_,b_) "sqrts" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef por
+#define por(a_,b_) "orp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pan
+#define pan(a_,b_) "andp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pcm
+#define pcm(a_,b_,c_) "cmpp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef pcms
+#define pcms(a_,b_,c_) "cmps" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef pax
+#define pax(a_,b_) "maxp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef paxs
+#define paxs(a_,b_) "maxs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pd
+#define pd(a_,b_) "divp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pdsr
+#define pdsr(a_,b_) "divs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pxx
+#define pxx(a_,b_) "xorp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef px
+#define px(a_) "xorp" SSESUF mxx(a_) "," mxx(a_) "\n\t"
+#undef pm
+#define pm(a_,b_) "mulp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pa
+#define pa(a_,b_) "addp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pmm
+#define pmm(a_,b_,c_) "mulp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pam
+#define pam(a_,b_,c_) "addp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pl
+#define pl(a_,b_,c_) "movup" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pla
+#define pla(a_,b_,c_) "movap" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pu
+#define pu(a_,b_,c_) "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef punt
+#define punt(a_,b_,c_) "movntp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pua
+#define pua(a_,b_,c_) "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pud
+#define pud(a_,b_,c_) "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pudr
+#define pudr(a_,b_) "movlp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pc
+#define pc(a_,b_) "movap" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef ps
+#define ps(a_,b_,c_) "shufp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef phl
+#define phl(a_,b_) "movhlp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pus
+#define pus(a_,b_,c_) "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pls
+#define pls(a_,b_,c_) "movs" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pld
+#define pld(a_,b_,c_) "movlp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef plh
+#define plh(a_,b_) "movlhp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pas
+#define pas(a_,b_,c_) "adds" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pms
+#define pms(a_,b_,c_) "muls" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pcs
+#define pcs(a_,b_) "movs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pasr
+#define pasr(a_,b_) "adds" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pmsr
+#define pmsr(a_,b_) "muls" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pul
+#define pul(a_,b_) "unpcklp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef puh
+#define puh(a_,b_) "unpckhp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+
+#undef plsx
+#define plsx(a_,b_,c_,d_,e_) \
+ "movs" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plx
+#define plx(a_,b_,c_,d_,e_) \
+ "movup" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plax
+#define plax(a_,b_,c_,d_,e_) \
+ "movap" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pasx
+#define pasx(a_,b_,c_,d_,e_) \
+ "adds" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pusx
+#define pusx(a_,b_,c_,d_,e_) \
+ "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pux
+#define pux(a_,b_,c_,d_,e_) \
+ "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef puax
+#define puax(a_,b_,c_,d_,e_) \
+ "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pudx
+#define pudx(a_,b_,c_,d_,e_) \
+ "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+
+#undef pldx
+#define pldx(a_,b_,c_,d_,e_) \
+ "movlp" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+
+#else
+
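+/* ATHLON (3DNow!) path: the same macro names are mapped onto 8-byte %mm
+   registers and pfmul/pfadd/pswapd, so the kernel macros in these headers
+   generate 3DNow! code instead of SSE. */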
+#undef RS4
+#define RS4 8
+#undef RS
+#define RS 2
+
+#undef mxx
+#define mxx(a_) str(%%mm ## a_)
+#undef pul
+#define pul(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t"
+#undef puh
+#define puh(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t"
+
+#undef px
+#define px(a_) "pxor " mxx(a_) "," mxx(a_) "\n\t"
+#undef pm
+#define pm(a_,b_) "pfmul " mxx(a_) "," mxx(b_) "\n\t"
+#undef pa
+#define pa(a_,b_) "pfadd " mxx(a_) "," mxx(b_) "\n\t"
+#undef pac
+#define pac(a_,b_) "pfacc " mxx(a_) "," mxx(b_) "\n\t"
+#undef pmm
+#define pmm(a_,b_,c_) "pfmul " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pam
+#define pam(a_,b_,c_) "pfadd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pl
+#define pl(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pla
+#define pla(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pu
+#define pu(a_,b_,c_) "movq " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pc
+#define pc(a_,b_) "movq " mxx(a_) "," mxx(b_) "\n\t"
+#undef ps
+#define ps(a_,b_,c_) "pswapd " mxx(b_) "," mxx(c_) "\n\t"
+#undef phl
+#define phl(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t"
+#undef plh
+#define plh(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t"
+#undef pus
+#define pus(a_,b_,c_) "movd " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pls
+#define pls(a_,b_,c_) "movd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+
+#undef plsx
+#define plsx(a_,b_,c_,d_,e_) \
+ "movd " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plx
+#define plx(a_,b_,c_,d_,e_) \
+ "movq " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pasx
+#define pasx(a_,b_,c_,d_,e_) \
+ "addss " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pusx
+#define pusx(a_,b_,c_,d_,e_) \
+ "movd " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pux
+#define pux(a_,b_,c_,d_,e_) \
+ "movq " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#endif
+
+#endif /* CAMM_UTIL_H */