From 96a32415ab43377cf1575bd3f4f2980f58028209 Mon Sep 17 00:00:00 2001 From: Determinant Date: Fri, 14 Aug 2015 11:51:42 +0800 Subject: add implementation for kaldi io (by ymz) --- .../tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h | 188 ++ kaldi_io/src/tools/ATLAS/include/contrib/Make.ext | 39 + .../src/tools/ATLAS/include/contrib/SSE3Dnow.h | 709 +++++ .../src/tools/ATLAS/include/contrib/camm_dpa.h | 1626 +++++++++++ .../src/tools/ATLAS/include/contrib/camm_pipe3.h | 295 ++ .../src/tools/ATLAS/include/contrib/camm_scale.h | 215 ++ .../src/tools/ATLAS/include/contrib/camm_strat1.h | 2982 ++++++++++++++++++++ .../src/tools/ATLAS/include/contrib/camm_tpipe.h | 331 +++ .../src/tools/ATLAS/include/contrib/camm_util.h | 508 ++++ 9 files changed, 6893 insertions(+) create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/Make.ext create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h (limited to 'kaldi_io/src/tools/ATLAS/include/contrib') diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h new file mode 100644 index 0000000..118d3de --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h @@ -0,0 +1,188 @@ +#ifdef GER +#undef NO_TRANSPOSE +#define NO_TRANSPOSE +#endif + + +#if NDPM > 4 +#error Max NDPM is 4 +#endif + +#if !defined(ATL_SSE1) && ( defined(SREAL) || defined(SCPLX) ) +#error This routine needs ATL_SSE1 defined +#endif + +#if 
!defined(ATL_SSE2) && ( defined(DREAL) || defined(DCPLX) ) +#error This routine needs ATL_SSE2 defined +#endif + +#include +#include + +#include "camm_util.h" + +#ifndef GER +#if defined(BETAX) || defined(BETAXI0) +#include "camm_scale.h" +#endif +#endif + +#if NDPM >= 4 +#define EXT4 Mjoin(4dp,BLC) +#undef NDP +#define NDP 4 +#undef EXT +#define EXT EXT4 +#include "camm_dpa.h" +#endif + +#if NDPM >= 3 +#define EXT3 Mjoin(3dp,BLC) +#undef NDP +#define NDP 3 +#undef EXT +#define EXT EXT3 +#include "camm_dpa.h" +#endif + +#if NDPM >= 2 +#define EXT2 Mjoin(2dp,BLC) +#undef NDP +#define NDP 2 +#undef EXT +#define EXT EXT2 +#include "camm_dpa.h" +#endif + +#define EXT1 Mjoin(1dp,BLC) +#undef NDP +#define NDP 1 +#undef EXT +#define EXT EXT1 +#include "camm_dpa.h" + +#undef NDP +#define NDP NDPM +#undef EXT +#define EXT Mjoin(Mjoin(NDP,Mjoin(dp,BLC)),m) +#include "camm_dpa.h" + +#ifdef GER +#if defined(SCPLX) || defined(DCPLX) +#ifdef Conj_ +#define IM 1c +#else +#define IM 1u +#endif +#else +#define IM 1 +#endif + + +#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),Mjoin(ger,IM)),_a1_x1_yX) + +#undef MY_FUNCTION +#define MY_FUNCTION FN + +void +MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *c, + int cinc,const TYPE *b,int binc, + TYPE *a,int lda) { + +#else + + +#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),gemv),Mjoin(FEXT,Mjoin(_a1_x1_,Mjoin(BL,_y1)))) + +#undef MY_FUNCTION +#define MY_FUNCTION FN + +void +MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *a, + int lda,const TYPE *b,int binc, + const SCALAR beta,TYPE *c,int cinc) { + +#endif + + int i,mm,nn; + const TYPE *ae; +#ifdef NO_TRANSPOSE + int len=m,w=n; +#define zz b +#else + int len=n,w=m; +#define zz c +#endif + +#ifdef GER +#define zzinc binc +#else +#define zzinc 1 + + +#if defined(NO_TRANSPOSE) && defined(BETA0) + memset(c,0,m*sizeof(*c)); +#endif + +#if defined(BETAX) || defined(BETAXI0) +#if defined(SCPLX) || defined(DCPLX) + SCALE(beta,c,m); +#endif +#if defined(SREAL) || defined(DREAL) + 
SCALE(&beta,c,m); +#endif +#endif + +#endif + + ae=a+w*lda; + nn=STRIDE*lda; + + +#if NDPM == 1 + for (;a 1 + if (((ae-a)/lda)%STRIDE) + mm++; +#endif + + if (mm == 1) + Mjoin(dp,EXT1)(a,nn,b,c,STRIDE*zzinc,len); + +#if ( NDPM == 2 && STRIDE > 1 ) || NDPM > 2 + else if (mm == 2) + Mjoin(dp,EXT2)(a,nn,b,c,STRIDE*zzinc,len); +#endif + +#if ( NDPM == 3 && STRIDE > 1 ) || NDPM > 3 + else if (mm == 3) + Mjoin(dp,EXT3)(a,nn,b,c,STRIDE*zzinc,len); +#endif + +#if ( NDPM == 4 && STRIDE > 1 ) || NDPM > 4 + else if (mm == 4) + Mjoin(dp,EXT4)(a,nn,b,c,STRIDE*zzinc,len); +#endif + + + } + +#endif + +} + diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext new file mode 100644 index 0000000..f7f9a0a --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext @@ -0,0 +1,39 @@ + +topd = /home/whaley/atlas3.8/AtlasBase +incs = -def topd /home/whaley/atlas3.8/AtlasBase \ + -def incd /home/whaley/atlas3.8/AtlasBase/Clint \ + -def BASEdir /home/whaley/atlas3.8/AtlasBase/Antoine/ \ + -def basd /home/whaley/atlas3.8/AtlasBase/Clint +ext = extract +extF = $(ext) -langF -lnlen71 -Remtblank -llwarn2 -LAPACK1 $(incs) +extC = $(ext) -langC -lnlen79 -Remtblank -llwarn2 $(incs) +extM = $(ext) -langM -lnlen79 -llwarn2 $(incs) + +default: all +force_build: +basd = /home/whaley/atlas3.8/AtlasBase/Clint +basdRCW = /home/whaley/atlas3.8/AtlasBase/Clint +basdAPP = /home/whaley/atlas3.8/AtlasBase/Antoine +incf = /home/whaley/atlas3.8/AtlasBase/gen.inc + +files = ATL_gemv_ger_SSE.h SSE3Dnow.h camm_dpa.h camm_pipe3.h camm_scale.h \ + camm_strat1.h camm_tpipe.h camm_util.h + +all : $(files) + +camm_strat1.h : $(topd)/kernel/CammMaguire/camm_strat1.h + cp $(topd)/kernel/CammMaguire/camm_strat1.h . +camm_tpipe.h : $(topd)/kernel/CammMaguire/camm_tpipe.h + cp $(topd)/kernel/CammMaguire/camm_tpipe.h . +camm_pipe3.h : $(topd)/kernel/CammMaguire/camm_pipe3.h + cp $(topd)/kernel/CammMaguire/camm_pipe3.h . 
+ATL_gemv_ger_SSE.h : $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h + cp $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h . +camm_util.h : $(topd)/kernel/CammMaguire/camm_util.h + cp $(topd)/kernel/CammMaguire/camm_util.h . +camm_scale.h : $(topd)/kernel/CammMaguire/camm_scale.h + cp $(topd)/kernel/CammMaguire/camm_scale.h . +camm_dpa.h : $(topd)/kernel/CammMaguire/camm_dpa.h + cp $(topd)/kernel/CammMaguire/camm_dpa.h . +SSE3Dnow.h : $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h + cp $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h . diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h new file mode 100644 index 0000000..a783749 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h @@ -0,0 +1,709 @@ +#if !defined(ATL_GAS_x8632) && !defined(ATL_GAS_x8664) + #error "This kernel requires gas x86 assembler!" +#endif +#ifndef Mstr /* Added by RCW to make multiline macros work */ + #define Mstr2(m) # m + #define Mstr(m) Mstr2(m) +#endif +/* The mening of the defined macros is as follows: + * VECLEN: The length of a singleprecision vector register + * vec_add: Add to single precision vectors. + * vec_mul: Multiply to single precision vectors. + * vec_mov: Moves data around + * vec_mov1: Load one element in a vector and zero all other entries! + * vec_splat: Load one element relpicated in all positions in the vector. + * vec_load_apart: Load elements from different memory positions into a register. + * vec_sum: Sums a register. + * vec_store_one: Stores lowest element in vector to memory, no zero-extend! + * Meaning of suffixes is as follows: + * mr means memory to register + * rr means register to register + * rm means register to memory + * a means that instruction needs aligned data + * 1 means that the instructions only operates on the lowest element of the + * vector. + * + * The _1 instructions work under one important assumption: That you never mix + * them with regular instructions, e.g. 
loading into a register with a normal + * mov, and then using add_rr_1 will not work under 3dnow! since it is in + * reality a normal add. However, if using a mov_1 first, the upper part of + * the register will be zeroed, and it will therefore work. The _1 system is + * more robust under SSE, but other architectures might be implemented the + * same way as 3dnow! + * + * RCW: I added the following functionality for SSE only (note that vw may + * be overwritten with intermediate results, but is not used as input, + * and that all input array may be overwritten wt intermediate results. + * VL : vector length -1): + * vec_red(vd, vw) : vd[0] = sum(vd[0:VL]) + * vec_red2(v1, v2, vw) : v1[0] = sum(v1[0:VL]); v1[1] = sum(v2[0:VL]) + * vec_red4(v0, v1, v2, v3 vw1, vw2) : + * v0[0] = sum(v0[0:VL]); v0[1] = sum(v1[0:VL]) + * if type = double: + * v2[0] = sum(v2[0:VL]); v2[1] = sum(v3[0:VL]) + * else + * v0[2] = sum(v2[0:VL]); v0[3] = sum(v3[0:VL]) + * vec_zero(vd) : vd[0:VL] = 0.0 + */ + + +/* Things to try: + * Non-temporal stores + * Sequences of instructions instead of movups + * + * + * + * + */ + + + +#define gen_vec_rr(op,reg1,reg2) \ + __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \ + : /* nothing */ \ + : /* nothing */) + + +#define w(p) p + +#define nop() __asm__ __volatile__ ("nop") + +#define rep() __asm__ __volatile__ ("rep") + +#define align() __asm__ __volatile__ (".align 16") + + +#ifdef x87double + +#define st0 %%st(0) +#define st1 %%st(1) +#define st2 %%st(2) +#define st3 %%st(3) +#define st4 %%st(4) +#define st5 %%st(5) +#define st6 %%st(6) +#define st7 %%st(7) + + +#define gen_stack_rt(op,reg) \ + __asm__ __volatile__ (#op " " #reg \ + : /* nothing */ \ + : /* nothing */) + +#define gen_stack_tr(op,reg) \ + __asm__ __volatile__ (#op " %%st(0)," #reg \ + : \ + : ) + + +#define gen_stack_rr(op,reg1,reg2) \ + __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \ + : /* nothing */ \ + : /* nothing */) + +#define gen_stack_t(op) \ + __asm__ __volatile__ (#op \ + 
: /* nothing */ \ + : /* nothing */) + + +#define gen_stack_tm(op,mem) \ + __asm__ __volatile__ (#op " %0" \ + : "=m" (((mem)[0])) \ + : ) + +#define gen_stack_mt(op,mem) \ + __asm__ __volatile__ (#op " %0" \ + : \ + : "m" (((mem)[0]))) + + +#define stack_mov_mt_push(mem) gen_stack_mt(fldl,mem) + +#define stack_add_tr_pop(reg) gen_stack_tr(faddp,reg) +#define stack_add_mt(mem) gen_stack_mt(faddl,mem) + +#define stack_mul_tr(reg) gen_stack_tr(fmul,reg) +#define stack_mul_tr_pop(reg) gen_stack_tr(fmulp,reg) +#define stack_mul_mt(mem) gen_stack_mt(fmul,mem) + +#define stack_mov_tm_pop(mem) gen_stack_tm(fstpl,mem) + +#define stack_zero_push() gen_stack_t(fldz) + +#endif /* x87double */ + +#ifdef SSE + +/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to + * load/store from misaligned adresses using movups at a cost of some cycles. Loading + * using mul/add must always be aligned. Alignment is 16 bytes. + * No muladd. + */ + + + +#define gen_vec_mr(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, " #reg \ + : /* nothing */ \ + : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3]))) + + +#define gen_vec_rm(op,reg,mem) \ + __asm__ __volatile__ (#op " " #reg ", %0" \ + : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \ + : /* nothing */ ) + + + + +#define VECLEN 4 + +#define reg0 %%xmm0 +#define reg1 %%xmm1 +#define reg2 %%xmm2 +#define reg3 %%xmm3 +#define reg4 %%xmm4 +#define reg5 %%xmm5 +#define reg6 %%xmm6 +#define reg7 %%xmm7 +#ifdef ATL_GAS_x8664 + #define reg8 %%xmm8 + #define reg9 %%xmm9 + #define reg10 %%xmm10 + #define reg11 %%xmm11 + #define reg12 %%xmm12 + #define reg13 %%xmm13 + #define reg14 %%xmm14 + #define reg15 %%xmm15 +#endif + +#define vec_mov_mr(mem,reg) gen_vec_mr(movups,mem,reg) +#define vec_mov_rm(reg,mem) gen_vec_rm(movups,reg,mem) +#define vec_mov_mr_a(mem,reg) gen_vec_mr(movaps,mem,reg) +#define vec_mov_rm_a(reg,mem) gen_vec_rm(movaps,reg,mem) +#define vec_mov_rr(reg1,reg2) 
gen_vec_rr(movaps,reg1,reg2) + +#define vec_add_mr_a(mem,reg) gen_vec_mr(addps,mem,reg) +#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulps,mem,reg) + +#define vec_add_rr(mem,reg) gen_vec_rr(addps,mem,reg) +#define vec_mul_rr(mem,reg) gen_vec_rr(mulps,mem,reg) + +#define vec_mov_mr_1(mem,reg) gen_vec_mr(movss,mem,reg) +#define vec_mov_rm_1(reg,mem) gen_vec_rm(movss,reg,mem) +#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movss,reg1,reg2) + +#define vec_add_mr_1(mem,reg) gen_vec_mr(addss,mem,reg) +#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addss,reg1,reg2) + +#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulss,mem,reg) +#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulss,reg1,reg2) + +#define vec_unpack_low(reg1,reg2) gen_vec_rr(unpcklps,reg1,reg2) +#define vec_unpack_high(reg1,reg2) gen_vec_rr(unpckhps,reg1,reg2) +#define vec_shuffle(mode,reg1,reg2) vec_shuffle_wrap(mode,reg1,reg2) +#define vec_shuffle_wrap(mode,reg1,reg2) \ + __asm__ __volatile__ ("shufps " #mode ", " #reg1 ", " #reg2 \ + : /* nothing */\ + : /* nothing */) + +/* Hack! */ +/* To use this instruction be sure that register 7 is not in use!!! */ +/* It must be possible to reduce this sequence to only four instructions. + * please tell me how! 
*/ +#define vec_sum(reg) vec_sum_wrap(reg) +#define vec_sum_wrap(reg) \ + __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\ + "addps " #reg ", %%xmm7\n"\ + "movaps %%xmm7, " #reg "\n"\ + "shufps $1, " #reg ", %%xmm7\n"\ + "addss %%xmm7, " #reg "\n"\ + : /* nothing */\ + : /* nothing */) + +/* RCW: added to safely replace vec_sum (vec reduce), and use SSE3 when avail */ +#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::) +#ifdef ATL_SSE3 + #define vec_red(vr, vwrk) \ + __asm__ __volatile__("haddps " Mstr(vr) ", " Mstr(vr) "\n"\ + "haddps " Mstr(vr) ", " Mstr(vr) "\n" ::) +/* + * haddps v1 v0 # v0 = {v1cd, v1ab, v0cd, v0ab} + * haddps v0 v0 # v0 = {v1abcd, v0abcd, v1abcd, v0abcd} + */ + #define vec_red2(v0, v1, vwork) \ + __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\ + "haddps " Mstr(v0) ", " Mstr(v0) "\n" ::) +/* + * haddps v1, v0 # v0 = {v1cd,v1ab,v0cd,v0ab} + * haddps v3, v2 # v2 = {v3cd,v3ab,v2cd,v2ab} + * haddps v2, v0 # v0 = {v3abcd,v2abcd,v1abcd, v0abcd} + */ + #define vec_red4(v0, v1, v2, v3, w0, w1) \ + __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\ + "haddps " Mstr(v3) ", " Mstr(v2) "\n"\ + "haddps " Mstr(v2) ", " Mstr(v0) "\n" ::) +#elif defined(ATL_SSE2) + #define vec_red(vr, vwrk) \ + __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\ + "pshufd $0xE5, " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\ + ::) +#else + #define vec_red(vr, vwrk) \ + __asm__ __volatile__ ("movhlps " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\ + "movaps " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "shufps $0xE5, " Mstr(vr) ", " Mstr(vr) "\n"\ + "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\ + ::) +#endif +#ifndef ATL_SSE3 /* codes that are the same for SSE2 and SSE1 */ +/* + # v0 = {v0d,v0c,v0b,v0a} + # v1 = {v1d,v1c,v1b,v1a} + movaps v0, vw # vw = {v0d,v0c,v0b,v0a} + unpacklps v1, v0 # v0 = {v1b,v0b,v1a,v0a} + unpackhps v1, 
vw # vw = {v1d,v0d,v1c,v0c} + addps vw, v0 # v0 = {v1bd,v0bd,v1ac,v0ac} + movhlps v0, vw # vw = {X , X,v1bd,v0bd} + addps vw, v0 # v0 = {X , X,v1abcd,v0abcd} +*/ + #define vec_red2(v0, v1, vw) \ + __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(vw) "\n"\ + "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\ + "unpckhps " Mstr(v1) ", " Mstr(vw) "\n"\ + "addps " Mstr(vw) ", " Mstr(v0) "\n"\ + "movhlps " Mstr(v0) ", " Mstr(vw) "\n"\ + "addps " Mstr(vw) ", " Mstr(v0) "\n"\ + ::) +/* + * movaps v0, w0 # w0 = {v0d, v0c, v0b, v0a} + * unpcklps v1, v0 # v0 = {v1b, v0b, v1a, v0a} + * movaps v2, w1 # w1 = {v2d, v2c, v2b, v2a} + * unpckhps v1, w0 # w0 = {v1d, v0d, v1c, v0c} + * unpcklps v3, v2 # v2 = {v3b, v2b, v3a, v2a} + * addps w0, v0 # v0 = {v1bd, v0bd, v1ac, v0ac} + * unpckhps v3, w1 # w1 = {v3d, v2d, v3c, v2c} + * movaps v0, w0 # w0 = {v1bd, v0bd, v1ac, v0ac} + * addps w1, v2 # v2 = {v3bd, v2bd, v3ac, v2ac} + * shufps $0x44,v2,v0 # v0 = {v3ac, v2ac, v1ac, v0ac} + * shufps $0xEE,v2,w0 # w0 = {v3bd, v2bd, v1bd, v0bd} + * addps w0, v0 # v0 = {v3abcd, v2abcd, v1abcd, v0abcd} + */ + #define vec_red4(v0, v1, v2, v3, w0, w1) \ + __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(w0) "\n"\ + "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\ + "movaps " Mstr(v2) ", " Mstr(w1) "\n"\ + "unpckhps " Mstr(v1) ", " Mstr(w0) "\n"\ + "unpcklps " Mstr(v3) ", " Mstr(v2) "\n"\ + "addps " Mstr(w0) ", " Mstr(v0) "\n"\ + "unpckhps " Mstr(v3) ", " Mstr(w1) "\n"\ + "movaps " Mstr(v0) ", " Mstr(w0) "\n"\ + "addps " Mstr(w1) ", " Mstr(v2) "\n"\ + "shufps $0x44, " Mstr(v2) ", " Mstr(v0) "\n"\ + "shufps $0xEE, " Mstr(v2) ", " Mstr(w0) "\n"\ + "addps " Mstr(w0) ", " Mstr(v0) "\n"\ + ::) +#endif + +#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) +#define vec_splat_wrap(mem,reg) \ + __asm__ __volatile__ ("movss %0, " #reg "\n"\ + "unpcklps " #reg ", " #reg "\n"\ + "movlhps " #reg ", " #reg "\n"\ + : /* nothing */ \ + : "m" ((mem)[0])) + + +/* This instruction sequence appears courtesy of Camm Maguire. 
*/ +#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) +#define vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) \ + __asm__ __volatile__ ("movaps " #reg0 "," #empty0 "\n"\ + "unpcklps " #reg1 "," #reg0 "\n"\ + "movaps " #reg2 "," #empty1 "\n"\ + "unpckhps " #reg1 "," #empty0 "\n"\ + "unpcklps " #reg3 "," #reg2 "\n"\ + "addps " #empty0 "," #reg0 "\n"\ + "unpckhps " #reg3 "," #empty1 "\n"\ + "movaps " #reg0 "," #regout "\n"\ + "addps " #empty1 "," #reg2 "\n"\ + "shufps $0x44," #reg2 "," #reg0 "\n"\ + "shufps $0xee," #reg2 "," #regout "\n"\ + "addps " #reg0 "," #regout "\n"\ + : /* nothing */ \ + : /* nothing */) + + + +typedef float vector[VECLEN]; + +#endif /* end ifdef SSE */ + + +#ifdef SSE2 + +/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to + * load/store from misaligned adresses using movups at a cost of some cycles. Loading + * using mul/add must always be aligned. Alignment is 16 bytes. + * No muladd. 
+ */ + + + +#define gen_vec_mr(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, " #reg \ + : /* nothing */ \ + : "m" (((mem)[0])), "m" (((mem)[1]))) + + +#define gen_vec_rm(op,reg,mem) \ + __asm__ __volatile__ (#op " " #reg ", %0" \ + : "=m" (((mem)[0])), "=m" (((mem)[1])) \ + : /* nothing */ ) + + + + +#define VECLEN 2 + +#define reg0 %%xmm0 +#define reg1 %%xmm1 +#define reg2 %%xmm2 +#define reg3 %%xmm3 +#define reg4 %%xmm4 +#define reg5 %%xmm5 +#define reg6 %%xmm6 +#define reg7 %%xmm7 +#ifdef ATL_GAS_x8664 + #define reg8 %%xmm8 + #define reg9 %%xmm9 + #define reg10 %%xmm10 + #define reg11 %%xmm11 + #define reg12 %%xmm12 + #define reg13 %%xmm13 + #define reg14 %%xmm14 + #define reg15 %%xmm15 +#endif + + +#define vec_mov_mr(mem,reg) gen_vec_mr(movupd,mem,reg) +#define vec_mov_rm(reg,mem) gen_vec_rm(movupd,reg,mem) +#define vec_mov_mr_a(mem,reg) gen_vec_mr(movapd,mem,reg) +#define vec_mov_rm_a(reg,mem) gen_vec_rm(movapd,reg,mem) +#define vec_mov_rr(reg1,reg2) gen_vec_rr(movapd,reg1,reg2) + +#define vec_add_mr_a(mem,reg) gen_vec_mr(addpd,mem,reg) +#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulpd,mem,reg) + +#define vec_add_rr(mem,reg) gen_vec_rr(addpd,mem,reg) +#define vec_mul_rr(mem,reg) gen_vec_rr(mulpd,mem,reg) + +#define vec_mov_mr_1(mem,reg) gen_vec_mr(movsd,mem,reg) +#define vec_mov_rm_1(reg,mem) gen_vec_rm(movsd,reg,mem) +#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movsd,reg1,reg2) + +#define vec_add_mr_1(mem,reg) gen_vec_mr(addsd,mem,reg) +#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addsd,reg1,reg2) + +#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulsd,mem,reg) +#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulsd,reg1,reg2) + +#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) +#define vec_splat_wrap(mem,reg) \ + __asm__ __volatile__ ("movsd %0, " #reg "\n"\ + "unpcklpd " #reg ", " #reg \ + : /* nothing */ \ + : "m" ((mem)[0])) + +/* Hack! */ +/* To use this instruction be sure that register 7 is not in use!!! 
*/ +#define vec_sum(reg) vec_sum_wrap(reg) +#define vec_sum_wrap(reg) \ + __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\ + "addpd %%xmm7, " #reg "\n"\ + : /* nothing */\ + : /* nothing */) +/* + * Added by RCW to improve performance and avoid xmm7 hack (replace vec_sum) + */ +#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::) +#ifdef ATL_SSE3 + #define vec_red(vr, vwrk) \ + __asm__ __volatile__("haddpd " Mstr(vr) ", " Mstr(vr) "\n" ::) + #define vec_red2(v0, v1, vw) \ + __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n" ::) + #define vec_red4(v0, v1, v2, v3, w0, w1) \ + __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n"\ + "haddpd " Mstr(v3) ", " Mstr(v2) "\n"\ + ::) +#else + #define vec_red(vr, vwrk) \ + __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\ + "addsd " Mstr(vwrk) ", " Mstr(vr) "\n" ::) +/* + * movapd v0, vw # vw = {v0b, v0a} + * unpcklpd v1,v0 # v0 = {v1a, v0a} + * unpckhpd v1, vw # vw = {v1b, v0b} + * addpd vw, v0 # v0 = {v1ab,v0ab} + */ + #define vec_red2(v0, v1, vw) \ + __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(vw) "\n"\ + "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\ + "unpckhpd " Mstr(v1) ", " Mstr(vw) "\n"\ + "addpd " Mstr(vw) ", " Mstr(v0) "\n"\ + ::) +/* + * movapd v0, w0 # w0 = {v0b, v0a} + * movapd v2, w1 # w1 = {v2b, v2a} + * unpcklpd v1, v0 # v0 = {v1a, v0a} + * unpcklpd v3, v2 # v2 = {v3a, v2a} + * unpckhpd v1, w0 # w0 = {v1b, v0b} + * unpckhpd v3, w1 # w1 = {v3b, v2b} + * addpd w0, v0 # v0 = {v1ab, v0ab} + * addpd w1, v2 # v2 = {v3ab, v2ab} + */ + #define vec_red4(v0, v1, v2, v3, w0, w1) \ + __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(w0) "\n"\ + "movapd " Mstr(v2) ", " Mstr(w1) "\n"\ + "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\ + "unpcklpd " Mstr(v3) ", " Mstr(v2) "\n"\ + "unpckhpd " Mstr(v1) ", " Mstr(w0) "\n"\ + "unpckhpd " Mstr(v3) ", " Mstr(w1) "\n"\ + "addpd " Mstr(w0) ", " Mstr(v0) "\n"\ + "addpd " Mstr(w1) ", " Mstr(v2) "\n"\ + ::) +#endif + +#define 
vec_sum_full(reg1,reg2,empty1) vec_sum_full_wrap(reg1,reg2,empty1) +#define vec_sum_full_wrap(reg1,reg2,empty1) \ + __asm__ __volatile__ ("movhlps " #reg2 ", " #empty1 "\n"\ + "movlhps " #reg2 ", " #empty1 "\n"\ + "addpd " #empty1 ", " #reg1 "\n"\ + : /* nothing */\ + : /* nothing */) + + +typedef double vector[VECLEN]; + +#endif /* end ifdef SSE2 */ + + +#ifdef THREEDNOW + +/* Peculiarities of 3DNOW. Alignment is not an issue, + * all alignments are legal, however alignment gives a speed increase. + * The vec_acc instruction can be used to sum to registers at once more efficiently + * than a series of vec_sum and vec_store_one + * No muladd. + */ + + +#define gen_vec_mr(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, " #reg \ + : /* nothing */ \ + : "m" (((mem)[0])), "m" (((mem)[1]))) + +#define gen_vec_rm(op,reg,mem) \ + __asm__ __volatile__ (#op " " #reg ", %0" \ + : "=m" (((mem)[0])), "=m" (((mem)[1])) \ + : /* nothing */ ) + + + + +#define VECLEN 2 + +#define reg0 %%mm0 +#define reg1 %%mm1 +#define reg2 %%mm2 +#define reg3 %%mm3 +#define reg4 %%mm4 +#define reg5 %%mm5 +#define reg6 %%mm6 +#define reg7 %%mm7 + +#define vec_add_mr(mem,reg) gen_vec_mr(pfadd,mem,reg) +#define vec_mul_mr(mem,reg) gen_vec_mr(pfmul,mem,reg) +#define vec_mov_mr(mem,reg) gen_vec_mr(movq,mem,reg) +#define vec_mov_rm(reg,mem) gen_vec_rm(movq,reg,mem) +#define vec_add_rr(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2) +#define vec_mul_rr(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2) +#define vec_acc_rr(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2) +#define vec_mov_rr(reg1,reg2) gen_vec_rr(movq,reg1,reg2) + +#define vec_sum(reg) gen_vec_rr(pfacc,reg,reg) +#define vec_sum_full(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2) + +#define vec_mov_mr_1(mem,reg) gen_vec_mr(movd,mem,reg) +#define vec_mov_rm_1(reg,mem) gen_vec_rm(movd,reg,mem) +#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movd,reg1,reg2) + +#define vec_add_rr_1(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2) +#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2) + + 
+#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) +#define vec_splat_wrap(mem,reg) \ + __asm__ __volatile__ ("movd %0, " #reg "\n"\ + "punpckldq " #reg ", " #reg \ + : /* nothing */ \ + : "m" ((mem)[0])) + + +#define vec_load_apart(mem1,mem2,reg) vec_load_apart_wrap(mem1,mem2,reg) +#define vec_load_apart_wrap(mem1,mem2,reg) \ + __asm__ __volatile__ ("movd %0, " #reg "\n"\ + "punpckldq %1, " #reg \ + : /* nothing */ \ + : "m" ((mem1)[0]), "m" (((mem2)[0]))) + + +#define vec_zero(reg) gen_vec_rr(pxor,reg,reg) + +#define vec_enter() __asm__ __volatile__ ("femms") +#define vec_exit() __asm__ __volatile__ ("femms") + +#define align() __asm__ __volatile__ (".align 16") + + +typedef float vector[VECLEN]; + +#endif + + + + + +#ifdef ALTIVEC + +#define VECLEN 4 + +#define reg0 %%vr0 +#define reg1 %%vr1 +#define reg2 %%vr2 +#define reg3 %%vr3 +#define reg4 %%vr4 +#define reg5 %%vr5 +#define reg6 %%vr6 +#define reg7 %%vr7 +#define reg8 %%vr8 +#define reg9 %%vr9 +#define reg10 %%vr10 +#define reg11 %%vr11 +#define reg12 %%vr12 +#define reg13 %%vr13 +#define reg14 %%vr14 +#define reg15 %%vr15 +#define reg16 %%vr16 +#define reg17 %%vr17 +#define reg18 %%vr18 +#define reg19 %%vr19 +#define reg20 %%vr20 +#define reg21 %%vr21 +#define reg22 %%vr22 +#define reg23 %%vr23 +#define reg24 %%vr24 +#define reg25 %%vr25 +#define reg26 %%vr26 +#define reg27 %%vr27 +#define reg28 %%vr28 +#define reg29 %%vr29 +#define reg30 %%vr30 +#define reg31 %%vr31 + +#define gen_vec_mr(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, " #reg \ + : /* nothing */ \ + : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3]))) + + +#define gen_vec_rm(op,reg,mem) \ + __asm__ __volatile__ (#op " " #reg ", %0" \ + : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \ + : /* nothing */ ) + + +#define gen_alti3(op,reg1,reg2,regout) \ + __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout \ + : /* nothing */ \ + : /* nothing */) + +#define 
gen_alti_muladd(op,reg1,reg2,regout) \ + __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout ", " #regout \ + : /* nothing */ \ + : /* nothing */) + + + +#define vec_mov_mr_a(mem,reg) gen_vec_mr(lvx,mem,reg) +#define vec_mov_rm_a(reg,mem) gen_vec_rm(svx,reg,mem) +#define vec_muladd(reg1,reg2,regout) gen_alti3(vmaddfp,reg1,reg2,regout) + +#define vec_zero(reg) gen_alti3(vxor,reg,reg,reg) + + +typedef float vector[VECLEN]; + +#endif + + +#ifdef ALTIVEC_C + +/* These macros have been written by, or greatly inspired by, + * Nicholas A. Coult . Thanks. + */ + +/* assumes that last four registers are not in use! */ +#define transpose(x0,x1,x2,x3) \ +reg28 = vec_mergeh(x0,x2); \ +reg29 = vec_mergeh(x1,x3); \ +reg30 = vec_mergel(x0,x2); \ +reg31 = vec_mergel(x1,x3); \ +x0 = vec_mergeh(reg28,reg29); \ +x1 = vec_mergel(reg28,reg29); \ +x2 = vec_mergeh(reg30,reg31); \ +x3 = vec_mergel(reg30,reg31) + +#define vec_mov_rm(v, where) \ +low = vec_ld(0, (where)); \ +high = vec_ld(16, (where)); \ +p_vector = vec_lvsr(0, (int *)(where)); \ +mask = vec_perm((vector unsigned char)(0), (vector unsigned char)(-1), p_vector); \ +v = vec_perm(v, v, p_vector); \ +low = vec_sel(low, v, mask); \ +high = vec_sel(v, high, mask); \ +vec_st(low, 0, (where)); \ +vec_st(high, 16, (where)) + +#define vec_mov_mr_a(mem,reg) reg = vec_ld(0, mem) + +#define vec_mov_mr(u,v) \ +p_vector = (vector unsigned char)vec_lvsl(0, (int*)(v)); \ +low = (vector unsigned char)vec_ld(0, (v)); \ +high = (vector unsigned char)vec_ld(16, (v)); \ +u=(vector float)vec_perm(low, high, p_vector) + +#define vec_muladd(reg1,reg2,regout) regout = vec_madd(reg1,reg2,regout) +#define vec_add_rr(reg1,reg2) reg2 = vec_add(reg1,reg2) + +#define vec_zero(reg) reg = vec_xor(reg,reg) + +#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) \ +transpose(reg0, reg1,reg2,reg3,regout,empty0,empty1); \ +empty0 = vec_add(reg0,reg1); \ +empty1 = vec_add(reg2,reg3); \ +regout = vec_add(empty0,empty1) + + +#endif /* ALTIVEC_C 
*/ + + + + + + + + diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h new file mode 100644 index 0000000..af9c6b1 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h @@ -0,0 +1,1626 @@ +#include +#include +#include + +#include "camm_util.h" + + +#if defined(ALIGN) +#if( defined(SCPLX) || defined(DCPLX)) +#error Cannot align complex routines +#endif +#if defined(SREAL) && ( NDPM != 1 ) && ( STRIDE % 4 != 0) +#error Can only align SREAL with NDPM 1 or STRIDE % 4 = 0 +#endif +#if defined(DREAL) && ( NDPM != 1 ) && ( STRIDE % 2 != 0) +#error Can only align DREAL with NDPM 1 or STRIDE % 2 = 0 +#endif +#endif + +/****************************************************************************** + * Single Precision Complex Macros + ******************************************************************************/ + +#ifdef SCPLX + +#ifdef NO_TRANSPOSE + +#if NDPM > 3 +#error Max NDPM is 3 for SCPLX NO_TRANSPOSE +#endif + +#undef plax +#define plax + +#undef R1 +#define R1 2 +#undef R2 +#define R2 4 +#undef R3 +#define R3 6 +#undef R4 +#define R4 6 + +#undef TREG +#define TREG 1 +#undef SREG +#define SREG 0 +#undef CREG +#define CREG 0 + +#ifdef GER +#undef AREG +#define AREG 0 +#undef targ +#define targ(a_) AREG +#undef wb +#define wb(a_,b_) pu(AREG,a_,b_) +#undef wbd +#define wbd(a_,b_) pud(AREG,a_,b_) +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#else +#undef AREG +#define AREG TREG +#undef targ +#define targ(a_) CREG +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef w +#define w(a_) pu(CREG,a_ ## 0,si) +#undef w1_2 +#define w1_2(a_) pud(CREG,a_ ## 0,si) +#endif + +#undef src +#define src(a_) a_ +#undef mpx +#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(4,si,P(a_,1)) \ + ps(0,P(a_,1),P(a_,1)) sign(a_) +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#undef ulfa +#define ulfa(a_) + +#else + +#undef R1 +#define R1 4 +#undef R2 +#define R2 5 +#undef R3 
+#define R3 6 +#undef R4 +#define R4 7 + +#undef TREG +#define TREG 3 +#undef SREG +#define SREG 2 +#undef CREG +#define CREG 0 +#undef targ +#define targ(a_) a_ +#undef src +#define src(a_) 0 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef mpx +#define mpx(a_) px(a_) +#ifdef BETA0 +#undef ulfa +#define ulfa(a_) phl(a_,0) pa(0,a_) pud(a_,0,si) +#else +#undef ulfa +#define ulfa(a_) pld(0,si,TREG) phl(a_,0) pa(0,a_) pa(TREG,a_) pud(a_,0,si) +#endif +#undef AREG +#define AREG TREG +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_) + + +#undef plax +#define plax pc(CREG,1) ps(160,CREG,CREG) ps(245,1,1) sign(CREG) + + + +#endif + +#if defined(Conj_) && ! defined(GER) +#undef sign +#define sign(a_) pm(SREG,a_) +#else +#undef sign +#define sign(a_) pm(SREG,P(a_,1)) +#endif + + + +#undef plb +#define plb(a_,b_) pl(a_,b_,AREG) +#undef plbd +#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) + +#undef dpr +#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprp +#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dpi +#define dpi(a_) pm(P(src(a_),1),TREG) ps(177,TREG,TREG) pa(TREG,targ(a_)) + +#ifndef GER + + +#undef plaa +#define plaa(a_) pl(a_ ## 0,si,CREG) plax +#undef wa +#define wa(a_) w(a_) +#undef dp +#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax +#undef wa1_2 +#define wa1_2(a_) w1_2(a_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) +#undef dpp1_2 +#define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) +#undef ddpp1_2 +#define ddpp1_2(a_,b_,c_,d_,e_) 
dpp1_2(a_,b_,c_,d_,e_) + + +#else +/* GER: lqc (full width) and lqc1_2 (half width) load TREG before dpr and again before dpi; wb/wbd follow each update. The #undef for the half-width loader names lqc1_2, the macro that is actually defined on the next line. */ +#undef lqc +#define lqc(a_) pl(a_ ## 0,si,TREG) +#undef lqc1_2 +#define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG) + + +#undef plaa +#define plaa(a_) +#undef wa +#define wa(a_) +#undef dp +#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \ + lqc(a_) dpi(c_) wb(a_ ## 0,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ + lqc(a_) dpi(c_) wb(a_ ## 0,b_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +#define plaa1_2(a_) +#undef wa1_2 +#define wa1_2(a_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ + lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) +#undef dpp1_2 +#define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ + lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) +#undef ddpp1_2 +#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) + +#endif + +#endif + +/****************************************************************************** + * Single Precision Real Macros + ******************************************************************************/ + +#ifdef SREAL + +#ifdef NO_TRANSPOSE +/* SREAL with NO_TRANSPOSE: src() selects the per-operand id and targ() is fixed at 0. In GER builds cp copies CREG to TREG and wb/wbd/wbs expand to pu/pud/pus on AREG. */ +#undef mpx +#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#undef TREG +#define TREG 1 +#undef targ +#define targ(a_) 0 +#undef src +#define src(a_) a_ +#undef ulfa +#define ulfa(a_) + +#ifdef GER +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef w1_4 +#define w1_4(a_) +#undef CREG +#define CREG 2 +#undef AREG +#define AREG 0 +#undef cp +#define cp pc(CREG,TREG) +#undef wb +#define wb(a_,b_) pu(AREG,a_,b_) +#undef wbd +#define wbd(a_,b_) pud(AREG,a_,b_) +#undef wbs +#define wbs(a_,b_) pus(AREG,a_,b_) +#else +#undef CREG +#define CREG 0 +#undef AREG +#define AREG TREG +#undef cp +#define cp +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_)
+#undef w +#define w(a_) pu(CREG,a_ ## 0,si) +#undef w1_2 +#define w1_2(a_) pud(CREG,a_ ## 0,si) +#undef w1_4 +#define w1_4(a_) pus(CREG,a_ ## 0,si) +#endif + +#else +/* Transposed SREAL: madd is empty when BETA0 is defined and expands to pas otherwise; ulfa uses madd and then pus. The w and wb families are all empty. */ +#undef mpx +#define mpx(a_) px(a_) +#ifdef BETA0 +#undef madd +#define madd(a_,b_,c_) +#else +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#endif +#undef TREG +#define TREG 3 +#undef targ +#define targ(a_) a_ +#undef src +#define src(a_) 0 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef w1_4 +#define w1_4(a_) +#undef ulfa +#undef ulfa +#define ulfa(a_) phl(a_,0) pa(0,a_) pc(a_,0) ps(1,0,0) pa(0,a_) \ + madd(0,si,a_) pus(a_,0,si) + +#undef CREG +#define CREG 0 +#undef AREG +#define AREG TREG +#undef cp +#define cp +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_) + +#endif +/* Loads into AREG: plb uses pla when ALIGN is defined and pl otherwise; plbd and plbs use pld and pls. */ +#if defined(ALIGN) +#undef plb +#define plb(a_,b_) pla(a_,b_,AREG) +#else +#undef plb +#define plb(a_,b_) pl(a_,b_,AREG) +#endif +#undef plbd +#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) +#undef plbs +#define plbs(a_,b_) pls(a_,b_,AREG) +#undef dpr +#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprp +#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprs +#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) +#undef dprps +#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) +/* Full-width step: dp is cp, plb, dpr, wb; dpp uses dprp (with its pf) in place of dpr. The 1_2 forms below use plbd and wbd instead. */ +#undef plaa +#define plaa(a_) pl(a_ ## 0,si,CREG) +#undef wa +#define wa(a_) w(a_) +#undef dp +#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) +#undef wa1_2 +#define wa1_2(a_) w1_2(a_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dpr(c_) wbd(a_ ## 0,b_) +#undef dpp1_2 +#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprp(c_,d_,e_)
wbd(a_ ## 0,b_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) +#undef ddpp1_2 +#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) +/* 1_4 step for SREAL: same shape as dp/dpp but built from plbs, dprs/dprps and wbs. */ +#undef plaa1_4 +#define plaa1_4(a_) pls(a_ ## 0,si,CREG) +#undef wa1_4 +#define wa1_4(a_) w1_4(a_) +#undef dp1_4 +#define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) +#undef dpp1_4 +#define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) +#undef ddp1_4 +#define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_) +#undef ddpp1_4 +#define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) + + +/* SREAL uses operand ids 4..7 for R1..R4 in both the transposed and the NO_TRANSPOSE configuration. */ +#undef R1 +#define R1 4 +#undef R2 +#define R2 5 +#undef R3 +#define R3 6 +#undef R4 +#define R4 7 + +#endif + +/****************************************************************************** + * Double Precision Real Macros + ******************************************************************************/ + +#ifdef DREAL + +#ifdef ATL_SSE2 +/* DREAL with ATL_SSE2: same layout as the SREAL section, but the half-width forms (wbd, w1_2) use pus and the wbs / w1_4 definitions are left commented out after their #undef. */ +#ifdef NO_TRANSPOSE + +#undef mpx +#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#undef TREG +#define TREG 1 +#undef targ +#define targ(a_) 0 +#undef src +#define src(a_) a_ +#undef ulfa +#define ulfa(a_) + +#ifdef GER +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef w1_4 +#define w1_4(a_) +#undef CREG +#define CREG 2 +#undef AREG +#define AREG 0 +#undef cp +#define cp pc(CREG,TREG) +#undef wb +#define wb(a_,b_) pu(AREG,a_,b_) +#undef wbd +#define wbd(a_,b_) pus(AREG,a_,b_) +#undef wbs +/* #define wbs(a_,b_) pus(AREG,a_,b_) */ +#else +#undef CREG +#define CREG 0 +#undef AREG +#define AREG TREG +#undef cp +#define cp +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +/* #define wbs(a_,b_) */ +#undef w +#define w(a_) pu(CREG,a_ ## 0,si) +#undef w1_2 +#define w1_2(a_) pus(CREG,a_ ## 0,si) +#undef w1_4 +/* #define w1_4(a_) pus(CREG,a_ ## 0,si) */ +#endif + +#else + +#undef mpx +#define mpx(a_) px(a_) +#ifdef BETA0 +#undef madd +#define madd(a_,b_,c_) +#else +#undef madd +#define
madd(a_,b_,c_) pas(a_,b_,c_) +#endif +#undef TREG +#define TREG 3 +#undef targ +#define targ(a_) a_ +#undef src +#define src(a_) 0 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef w1_4 +#define w1_4(a_) +#undef ulfa +#undef ulfa +#define ulfa(a_) /* phl(a_,0) pa(0,a_) */ pc(a_,0) ps(1,0,0) pa(0,a_) \ + madd(0,si,a_) pus(a_,0,si) + +#undef CREG +#define CREG 0 +#undef AREG +#define AREG TREG +#undef cp +#define cp +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_) + +#endif +/* Loads for DREAL/SSE2: plb is pla (ALIGN) or pl; the half-width plbd uses pls with the px(AREG) step commented out; plbs is undefined. */ +#if defined(ALIGN) +#undef plb +#define plb(a_,b_) pla(a_,b_,AREG) +#else +#undef plb +#define plb(a_,b_) pl(a_,b_,AREG) +#endif +#undef plbd +#define plbd(a_,b_) /* px(AREG) */pls(a_,b_,AREG) +#undef plbs +/* #define plbs(a_,b_) pls(a_,b_,AREG) */ +#undef dpr +#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprp +#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprs +#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) +#undef dprps +#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) + +#undef plaa +#define plaa(a_) pl(a_ ## 0,si,CREG) +#undef wa +#define wa(a_) w(a_) +#undef dp +#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) +/* The 1_2 step for DREAL uses pls with dprs/dprps, where the SREAL section uses pld with dpr/dprp. The 1_4 family is undefined for DREAL (definitions commented out). */ +#undef plaa1_2 +#define plaa1_2(a_) /* px(CREG) */pls(a_ ## 0,si,CREG) +#undef wa1_2 +#define wa1_2(a_) w1_2(a_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dprs(c_) wbd(a_ ## 0,b_) +#undef dpp1_2 +#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprps(c_,d_,e_) wbd(a_ ## 0,b_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) +#undef ddpp1_2 +#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) + +#undef plaa1_4 +/* #define plaa1_4(a_) pls(a_ ## 0,si,CREG) */ +#undef wa1_4 +/* #define wa1_4(a_) w1_4(a_) */
+#undef dp1_4 +/* #define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) */ +#undef dpp1_4 +/* #define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) */ +#undef ddp1_4 +/* #define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_) */ +#undef ddpp1_4 +/* #define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) */ + + +/* DREAL with ATL_SSE2 uses operand ids 4..7 for R1..R4. */ +#undef R1 +#define R1 4 +#undef R2 +#define R2 5 +#undef R3 +#define R3 6 +#undef R4 +#define R4 7 + +#else +/* DREAL without ATL_SSE2: built from the f* primitives (fl, fm, fap, fp, ...) instead of the p* ones; presumably x87 stack code -- confirm in camm_util.h. t0/t8 and s0/s8 choose the target and source operand used with the a_##0 and a_##8 elements. */ +#ifdef NO_TRANSPOSE + +#undef t0 +#define t0(a_) 1 +#undef s0 +#define s0(a_) a_ +#undef t8 +#define t8(a_) 2 +#undef s8 +#define s8(a_) a_ +#undef w +#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef w1_2 +#define w1_2(a_) fp(a_ ## 0,si) +#undef mpx +#define mpx(a_) fl(0,si) fc(M(a_,2)) +#undef madd +#define madd(a_,b_,c_) faa(a_,b_) +#undef ulfa +#define ulfa(a_) fc(0) + +#else + +#undef t0 +#define t0(a_) a_ +#undef s0 +#define s0(a_) 1 +#undef t8 +#define t8(a_) a_ +#undef s8 +#define s8(a_) 2 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef mpx +#define mpx(a_) fz +#ifdef BETA0 +#undef madd +#define madd(a_,b_,c_) +#else +#undef madd +#define madd(a_,b_,c_) faa(a_,b_) +#endif +#undef ulfa +#define ulfa(a_) madd(0,si,a_) fp(0,si) + +#endif + +/* 1_2 step: one fl/fm/fap sequence on the a_##0 element for non-GER builds; GER builds instead use fm, faa and fp on that element. */ +#ifndef GER + +#undef plaa1_2 +#define plaa1_2(a_) fl(a_ ## 0,si) +#undef wa1_2 +#define wa1_2(a_) w1_2(a_) +#ifdef NO_TRANSPOSE +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(M(s0(c_),1),0) fap(0,t0(c_)) +#undef dp1_2 +#define dp1_2(a_,b_,c_) ddp1_2(a_,b_,c_) +#else +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fap(0,M(t0(c_),1)) +#undef dp1_2 +#define dp1_2(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fap(0,M(t0(c_),2)) +#endif + +#else + +#undef plaa1_2 +#define plaa1_2(a_) fl(a_ ## 0,si) +#undef wa1_2 +#define wa1_2(a_) +#undef ddp1_2 +#define ddp1_2(a_,b_,c_) fd(M(s0(c_),2)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) +#undef dp1_2 +#define dp1_2(a_,b_,c_) fm(M(s0(c_),2),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) + +#endif + + +
+#undef plaa +#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) fx1 + +#ifndef GER + +/* Non-GER: ddp/ddpp process the a_##0 and a_##8 elements as a pair; ddpp additionally issues pf(d_,e_). In NO_TRANSPOSE builds dp/dpp are aliases for ddp/ddpp; otherwise they use fmp in place of fm. */ +#undef wa +#define wa(a_) w(a_) + + +#undef ddp +#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \ + fm(P(s8(c_),1),0) fx1 fap(0,P(t0(c_),1)) \ + fap(0,t8(c_)) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \ + fm(P(s8(c_),1),0) pf(d_,e_) fx1 fap(0,P(t0(c_),1)) \ + fap(0,t8(c_)) + +/* #define ddp(a_,b_,c_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */ +/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) */ +/* #define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */ +/* \ */ +/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) pf(d_,e_) */ + +#ifdef NO_TRANSPOSE + +#undef dp +#define dp(a_,b_,c_) ddp(a_,b_,c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_) + +#else + +#undef dp +#define dp(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \ + fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2)) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) pf(d_ ,e_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \ + fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2)) + +/* #define dp(a_,b_,c_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */ +/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) */ +/* #define dpp(a_,b_,c_,d_,e_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */ +/* \ */ +/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) pf(d_,e_) */ + +#endif + + +#else +/* GER: wa is empty; ddp/dp themselves apply faa and then fp to the a_##0 and a_##8 elements. */ +#undef wa +#define wa(a_) +#undef ddp +#define ddp(a_,b_,c_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ + fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ + fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_) + +#undef dp +#define dp(a_,b_,c_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ + fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ +
fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_) + +#endif + +/* DREAL without ATL_SSE2 uses operand ids 3..6 for R1..R4. */ +#undef R1 +#define R1 3 +#undef R2 +#define R2 4 +#undef R3 +#define R3 5 +#undef R4 +#define R4 6 + +#endif + +#endif + +/****************************************************************************** + * Double Precision Complex Macros + ******************************************************************************/ + +#ifdef DCPLX +/* DCPLX with ATL_SSE2 follows the SCPLX layout earlier in this header; the half-width forms (wbd, w1_2) are undefined with their definitions commented out in the NO_TRANSPOSE branch, and mpx reads its second element at offset 8 rather than 4. */ +#ifdef ATL_SSE2 +#ifdef NO_TRANSPOSE + +#if NDPM > 3 +#error Max NDPM is 3 for DCPLX NO_TRANSPOSE +#endif + +#undef plax +#define plax + +#undef R1 +#define R1 2 +#undef R2 +#define R2 4 +#undef R3 +#define R3 6 +#undef R4 +#define R4 6 + +#undef TREG +#define TREG 1 +#undef SREG +#define SREG 0 +#undef CREG +#define CREG 0 + +#ifdef GER +#undef AREG +#define AREG 0 +#undef targ +#define targ(a_) AREG +#undef wb +#define wb(a_,b_) pu(AREG,a_,b_) +#undef wbd +/* #define wbd(a_,b_) pud(AREG,a_,b_) */ +#undef w +#define w(a_) +#undef w1_2 +/* #define w1_2(a_) */ +#else +#undef AREG +#define AREG TREG +#undef targ +#define targ(a_) CREG +#undef wb +#define wb(a_,b_) +#undef wbd +/* #define wbd(a_,b_) */ +#undef w +#define w(a_) pu(CREG,a_ ## 0,si) +#undef w1_2 +/* #define w1_2(a_) pud(CREG,a_ ## 0,si) */ +#endif + +#undef src +#define src(a_) a_ +#undef mpx +#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(8,si,P(a_,1)) \ + ps(0,P(a_,1),P(a_,1)) sign(a_) +#undef madd +#define madd(a_,b_,c_) pas(a_,b_,c_) +#undef ulfa +#define ulfa(a_) + +#else +/* Transposed DCPLX/SSE2: operand ids 4..7 for R1..R4; ulfa uses the full-width pl/pu, with the phl/pa reduction commented out. */ +#undef R1 +#define R1 4 +#undef R2 +#define R2 5 +#undef R3 +#define R3 6 +#undef R4 +#define R4 7 + +#undef TREG +#define TREG 3 +#undef SREG +#define SREG 2 +#undef CREG +#define CREG 0 +#undef targ +#define targ(a_) a_ +#undef src +#define src(a_) 0 +#undef w +#define w(a_) +#undef w1_2 +#define w1_2(a_) +#undef mpx +#define mpx(a_) px(a_) +#ifdef BETA0 +#undef ulfa +#define ulfa(a_) /* phl(a_,0) pa(0,a_) */pu(a_,0,si) +#else +#undef ulfa +#define ulfa(a_) pl(0,si,TREG) /* phl(a_,0) pa(0,a_) */ pa(TREG,a_) pu(a_,0,si)
+#endif +#undef AREG +#define AREG TREG +#undef wb +#define wb(a_,b_) +#undef wbd +#define wbd(a_,b_) +#undef wbs +#define wbs(a_,b_) + + +#undef plax +#define plax pc(CREG,1) ps(0,CREG,CREG) ps(3,1,1) sign(CREG) + + + +#endif + +#if defined(Conj_) && ! defined(GER) +#undef sign +#define sign(a_) pm(SREG,a_) +#else +#undef sign +#define sign(a_) pm(SREG,P(a_,1)) +#endif + + + +#undef plb +#define plb(a_,b_) pl(a_,b_,AREG) +#undef plbd +/* #define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) */ + +#undef dpr +#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dprp +#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) +#undef dpi +#define dpi(a_) pm(P(src(a_),1),TREG) ps(1,TREG,TREG) pa(TREG,targ(a_)) + +#ifndef GER + +#undef plaa +#define plaa(a_) pl(a_ ## 0,si,CREG) plax +#undef wa +#define wa(a_) w(a_) +#undef dp +#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +/* #define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax */ +#undef wa1_2 +/* #define wa1_2(a_) w1_2(a_) */ +#undef dp1_2 +/* #define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) */ +#undef dpp1_2 +/* #define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) */ +#undef ddp1_2 +/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */ +#undef ddpp1_2 +/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */ + + +#else + +#undef lqc +#define lqc(a_) pl(a_ ## 0,si,TREG) +#undef lqc1 +/* #define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG) */ + + +#undef plaa +#define plaa(a_) +#undef wa +#define wa(a_) +#undef dp +#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \ + lqc(a_) dpi(c_) wb(a_ ## 0,b_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ + lqc(a_) dpi(c_) wb(a_ ## 
0,b_) +#undef ddp +#define ddp(a_,b_,c_) dp(a_,b_,c_) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) + +#undef plaa1_2 +/* #define plaa1_2(a_) */ +#undef wa1_2 +/* #define wa1_2(a_) */ +#undef dp1_2 +/* #define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ */ +/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */ +#undef dpp1_2 +/* #define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ */ +/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */ +#undef ddp1_2 +/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */ +#undef ddpp1_2 +/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */ + +#endif + +#else +/* DCPLX without ATL_SSE2: NDPM is capped at 2. srr/sri/sir/sii select source operands and trr/tri/tir/tii select targets for the four products used by ddp/dp; reading them as real/imaginary pairings is based on the names only -- confirm. */ +#if NDPM > 2 +#error Max NDPM is 2 for DCPLX +#endif + +#undef TREG +#define TREG 2 + +#ifdef NO_TRANSPOSE + +#undef w +#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef plax +#define plax fx1 +#undef srr +#define srr(a_) a_ +#undef sri +#define sri(a_) a_ +#undef sir +#define sir(a_) a_ +#undef sii +#define sii(a_) a_ +#undef trr +#define trr(a_) P(TREG,1) +#undef tri +#define tri(a_) M(TREG,1) +#undef tir +#define tir(a_) TREG +#undef tii +#define tii(a_) TREG +#undef mpx +#define mpx(a_) fl(0,si) fl(8,si) fc(M(a_,2)) fc(M(a_,2)) +#undef madd +#define madd(a_,b_,c_) faa(a_,b_) +#undef ulfa +#define ulfa(a_) fc(0) fc(0) + +#else + +#undef srr +#define srr(a_) P(TREG,1) +#undef sri +#define sri(a_) M(TREG,1) +#undef sir +#define sir(a_) TREG +#undef sii +#define sii(a_) TREG +#undef trr +#define trr(a_) a_ +#undef tri +#define tri(a_) a_ +#undef tir +#define tir(a_) a_ +#undef tii +#define tii(a_) a_ +#undef w +#define w(a_) +#undef plax +#define plax +#undef mpx +#define mpx(a_) fz fz +#ifdef BETA0 +#undef madd +#define madd(a_,b_,c_) +#else +#undef madd +#define madd(a_,b_,c_) faa(a_,b_) +#endif +#undef ulfa +#define ulfa(a_) madd(0,si,a_) fp(0,si) madd(8,si,a_) fp(8,si) + +#endif + + +/* Conj_ swaps fapi and fspi: with Conj_ defined fapi expands to fsp and fspi to fap; without it fapi is fap and fspi is fsp. */ +#ifdef Conj_ +#undef fapi +#define fapi(a_,b_) fsp(b_) +#undef fspi +#define fspi(a_,b_) fap(a_,b_) +#else +#undef fapi +#define fapi(a_,b_) fap(a_,b_)
+#undef fspi +#define fspi(a_,b_) fsp(b_) +#endif + +#ifndef GER + +/* Non-GER: ddp/ddpp issue four fm products per a_##0 / a_##8 pair, combined with fap and the Conj_-dependent fspi/fapi; ddpp also issues pf(d_,e_). In NO_TRANSPOSE builds dp/dpp are aliases for ddp/ddpp. */ +#undef plaa +#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax +#undef wa +#define wa(a_) w(a_) +#undef ddp +#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ + fm(sri(c_),0) fap(0,tri(c_))\ + fl(a_ ## 8,b_) fd(0) fm(sir(c_),0) fspi(0,tir(c_)) \ + fm(sii(c_),0) fapi(0,tii(c_)) +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ + fm(sri(c_),0) fap(0,tri(c_))\ + fl(a_ ## 8,b_) fd(0) pf(d_,e_) fm(sir(c_),0) fspi(0,tir(c_))\ + fm(sii(c_),0) fapi(0,tii(c_)) + + + +#ifdef NO_TRANSPOSE + + + +#undef dp +#define dp(a_,b_,c_) ddp(a_,b_,c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_) + + + +#else + +#undef dp +#define dp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ + fm(sri(c_),0) fap(0,tri(c_))\ + fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \ + fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2)) + +#undef dpp +#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ + pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\ + fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \ + fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2)) + + +#endif + +#else +/* GER: the update is split into ddprr (loads and writes the a_##0 element) and ddpri or dpri (loads and writes the a_##8 element); ddp/dp/ddpp/dpp are compositions of those pieces, with pf(d_,e_) between them in the pp forms. */ +#undef plaa +#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax +#undef wa +#define wa(a_) + +#undef ddprr +#define ddprr(a_,b_,c_) fl(a_ ## 0,b_) \ + fd(tri(c_)) fm(P(sri(c_),1),0) fap(0,1) \ + fd(M(trr(c_),1)) fm(srr(c_),0) fspi(0,1) \ + fp(a_ ## 0,b_) +#undef ddpri +#define ddpri(a_,b_,c_) fl(a_ ## 8,b_) \ + fd(tii(c_)) fm(P(sii(c_),1),0) fap(0,1) \ + fd(M(tir(c_),1)) fm(sir(c_),0) fapi(0,1) \ + fp(a_ ## 8,b_) +#undef dpri +#define dpri(a_,b_,c_) fl(a_ ## 8,b_) \ + fx(2) fm(sir(c_),0) fap(0,2) \ + fm(M(sii(c_),2),0) fapi(0,1) \ + fp(a_ ## 8,b_) + + +#undef ddpp +#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_) +#undef ddp +#define ddp(a_,b_,c_) ddprr(a_,b_,c_) ddpri(a_,b_,c_) +#undef dpp +#define dpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_)
pf(d_,e_) dpri(a_,b_,c_) +#undef dp +#define dp(a_,b_,c_) ddprr(a_,b_,c_) dpri(a_,b_,c_) + +#endif + +/* DCPLX without ATL_SSE2: R1 is 4 and R2..R4 are all 6 (NDPM is capped at 2 in this configuration). */ +#undef R1 +#define R1 4 +#undef R2 +#define R2 6 +#undef R3 +#define R3 6 +#undef R4 +#define R4 6 + +#endif + +#endif + + +/****************************************************************************** + * General Macros + ******************************************************************************/ + + + +/* blaN / blbN: one step over N operands: plaa, then ddp/ddpp for every operand but the last and dp/dpp for the last, then wa. Operand k is addressed through ax, bx, cx, dx and paired with R1..R4. The a and b forms differ only in which steps carry a pf argument pair and where it points. bla/blb dispatch on NDP. */ +#undef bla1 +#define bla1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_) +#undef blb1 +#define blb1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_) + +#undef bla2 +#undef bla2 +#define bla2(a_,b_) pf(b_,si) plaa(a_) ddp(a_,ax,R1) pf(b_,ax) dp(a_,bx,R2) wa(a_) +#undef blb2 +#undef blb2 +#define blb2(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) dp(a_,bx,R2) wa(a_) + +#undef bla3 +#define bla3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \ + dpp(a_,cx,R3,b_,ax) wa(a_) +#undef blb3 +#define blb3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \ + dpp(a_,cx,R3,b_,cx) wa(a_) + +#undef bla4 +#define bla4(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \ + ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_) +#undef blb4 +#define blb4(a_,b_) plaa(a_) ddp(a_,ax,R1) ddpp(a_,bx,R2,b_,cx) \ + ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_) + +#undef bla +#define bla(a_,b_) Mjoin(bla,NDP)(a_,b_) +#undef blb +#define blb(a_,b_) Mjoin(blb,NDP)(a_,b_) + + +/* blaN1_2 and blaN1_4: the same composition built from the 1_2 and 1_4 macro variants, which take no pf arguments. */ +#undef bla11_2 +#define bla11_2(a_) plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_) +#undef bla21_2 +#define bla21_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_) +#undef bla31_2 +#define bla31_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \ + dp1_2(a_,cx,R3) wa1_2(a_) +#undef bla41_2 +#define bla41_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \ + ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_) + +#undef bla1_2 +#define bla1_2(a_) Mjoin(Mjoin(bla,NDP),1_2)(a_) + + + +#undef bla11_4 +#define bla11_4(a_) plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_) +#undef bla21_4 +#define bla21_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2)
wa1_4(a_) +#undef bla31_4 +#define bla31_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \ + dp1_4(a_,cx,R3) wa1_4(a_) +#undef bla41_4 +#define bla41_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \ + ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_) + +#undef bla1_4 +#define bla1_4(a_) Mjoin(Mjoin(bla,NDP),1_4)(a_) + + +/* incN: apply a(a_, reg) to si and to each of the first N operand registers (ax, bx, cx, dx); inc dispatches on NDP. */ +#undef inc1 +#define inc1(a_) a(a_,si) a(a_,ax) +#undef inc2 +#define inc2(a_) inc1(a_) a(a_,bx) +#undef inc3 +#define inc3(a_) inc2(a_) a(a_,cx) +#undef inc4 +#define inc4(a_) inc3(a_) a(a_,dx) + +#undef inc +#define inc(a_) Mjoin(inc,NDP)(a_) + +/* PF1..PF8: the offsets passed to pf by the bl macros, spaced 32 apart. They start at PREFETCH when it is defined and at 64 otherwise. */ +#ifdef PREFETCH +/* #include "camm_arith.h" */ +#undef S +#define S(a_,b_) (a_) + (b_) +#undef PF1 +#define PF1 PREFETCH +#undef PF2 +#define PF2 S(PF1,32) +#undef PF3 +#define PF3 S(PF1,64) +#undef PF4 +#define PF4 S(PF1,96) +#undef PF5 +#define PF5 S(PF1,128) +#undef PF6 +#define PF6 S(PF1,160) +#undef PF7 +#define PF7 S(PF1,192) +#undef PF8 +#define PF8 S(PF1,224) +#else +#undef PF1 +#define PF1 64 +#undef PF2 +#define PF2 96 +#undef PF3 +#define PF3 128 +#undef PF4 +#define PF4 160 +#undef PF5 +#define PF5 192 +#undef PF6 +#define PF6 224 +#undef PF7 +#define PF7 256 +#undef PF8 +#define PF8 288 +#endif + +/* pf: f() with the t0 hint for non-GER, non-SREAL NO_TRANSPOSE builds and with the nta hint in every other configuration. */ +#if defined(NO_TRANSPOSE) && !defined(SREAL) && !defined(GER) +#undef pf +#define pf(a_,b_) f(t0,a_,b_) +#else +#undef pf +#define pf(a_,b_) f(nta,a_,b_) +#endif +/* blK: loop body for a count of K: bl1 and bl2 use the 1_4 and 1_2 steps, larger K alternate bla and blb steps. Each body ends with a single inc of 4*K. */ +#undef bl1 +#define bl1 bla1_4(0x0) inc(4) +#undef bl2 +#define bl2 bla1_2(0x0) inc(8) +#undef bl4 +#define bl4 bla(0x0,PF1) inc(16) +#undef bl8 +#define bl8 bla(0x0,PF1) blb(0x1,PF1) inc(32) +#undef bl16 +#define bl16 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64) +#undef bl32 +#define bl32 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \ + bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128) +#undef bl64 +#define bl64 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \ + bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \ + bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \ + bla(0xc,PF7)
blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256) + +/* #define in2 inc(8) */ +/* #define in4 inc(16) */ +/* #define in8 inc(32) */ +/* #define in16 inc(64) */ + +#undef in2 +#define in2 +#undef in4 +#define in4 +#undef in8 +#define in8 +#undef in16 +#define in16 + +#ifdef NO_TRANSPOSE +#undef incf +#define incf ra(di,si) +#else +#undef incf +#define incf +#endif +/* lfN applies mpx to R1..RN (used before the loop in the kernel function) and ulfN applies ulfa to R1..RN (used after it). ulfN always puts ra(di,si) between operands; lfN does so only in NO_TRANSPOSE builds, through incf. */ +#undef lf1 +#define lf1 mpx(R1) +#undef lf2 +#define lf2 lf1 incf mpx(R2) +#undef lf3 +#define lf3 lf2 incf mpx(R3) +#undef lf4 +#define lf4 lf3 incf mpx(R4) + +#undef lf +#define lf Mjoin(lf,NDP) + + +#undef ulf1 +#define ulf1 ulfa(R1) +#undef ulf2 +#define ulf2 ulf1 ra(di,si) ulfa(R2) +#undef ulf3 +#define ulf3 ulf2 ra(di,si) ulfa(R3) +#undef ulf4 +#define ulf4 ulf3 ra(di,si) ulfa(R4) + +#undef ulf +#define ulf Mjoin(ulf,NDP) +/* lpbN: copy esi into eax, ebx, ecx, edx in turn, with ra(di,si) between copies. ipfN: issue pf at offset a_ for si and each of those registers. */ +#undef lpba +#define lpba(a_) "movl %%esi,%%e" #a_ "\n\t" + +#undef lpb1 +#define lpb1 lpba(ax) +#undef lpb2 +#define lpb2 lpb1 ra(di,si) lpba(bx) +#undef lpb3 +#define lpb3 lpb2 ra(di,si) lpba(cx) +#undef lpb4 +#define lpb4 lpb3 ra(di,si) lpba(dx) + +#undef lpb +#define lpb Mjoin(lpb,NDP) + +#undef ipf1 +#define ipf1(a_) pf(a_,si) pf(a_,ax) +#undef ipf2 +#define ipf2(a_) ipf1(a_) pf(a_,bx) +#undef ipf3 +#define ipf3(a_) ipf2(a_) pf(a_,cx) +#undef ipf4 +#define ipf4(a_) ipf3(a_) pf(a_,dx) + +#undef ipf +#define ipf(a_) Mjoin(ipf,NDP)(a_) +/* UNROLL is LUNROLL scaled by 1 (SREAL), 2 (DREAL or SCPLX) or 4 (DCPLX), and defaults to 16 when LUNROLL is not defined. blUNROLL and UNROLL1_2 then select the matching blK body and the name of the next smaller label. */ +#ifdef LUNROLL +#undef UNROLL +#ifdef SREAL +#undef UNROLL +#define UNROLL LUNROLL +#elif defined(DREAL) || defined(SCPLX) +#undef UNROLL +#define UNROLL LUNROLL*2 +#elif defined(DCPLX) +#undef UNROLL +#define UNROLL LUNROLL*4 +#endif +#else +#undef UNROLL +#define UNROLL 16 +#endif + +#undef UNROLL1_2 +#if UNROLL == 64 +#undef blUNROLL +#define blUNROLL bl64 +#undef UNROLL1_2 +#define UNROLL1_2 32 +#elif UNROLL == 32 +#undef blUNROLL +#define blUNROLL bl32 +#undef UNROLL1_2 +#define UNROLL1_2 16 +#elif UNROLL == 16 +#undef blUNROLL +#define blUNROLL bl16 +#undef UNROLL1_2 +#define UNROLL1_2 8 +#elif UNROLL == 8 +#undef blUNROLL +#define
blUNROLL bl8 +#undef UNROLL1_2 +#define UNROLL1_2 4 +#elif UNROLL == 4 +#undef blUNROLL +#define blUNROLL bl4 +#undef UNROLL1_2 +#define UNROLL1_2 2 +#elif UNROLL == 2 +#undef blUNROLL +#define blUNROLL bl2 +#undef UNROLL1_2 +#define UNROLL1_2 1 +#elif UNROLL == 1 +#undef blUNROLL +#define blUNROLL bl1 +#undef UNROLL1_2 +#define UNROLL1_2 stop +#endif +#ifndef UNROLL1_2 +#error UNROLL must be set to power of 2 < 128 +#endif + +/* aconst/cconst: in GER builds a is writable and c is const; in all other builds a is const and c is writable. */ +#ifdef GER +#undef aconst +#define aconst +#undef cconst +#define cconst const +#else +#undef aconst +#define aconst const +#undef cconst +#define cconst +#endif + +#undef MY_FUNCTION +#define MY_FUNCTION Mjoin(dp,EXT) +/* Kernel named dp<EXT>, emitted once for each NDP/EXT setting this header is included with. a, lda: matrix pointer and its leading dimension in elements. b, c: the two vector operands. stride: element stride of the fixed operand (fixm). len: element count of the streamed operand (movm). */ +static void +MY_FUNCTION(aconst TYPE *a,int lda, + const TYPE *b, + cconst TYPE *c,int stride,int len) { +/* w points at a constant sign pattern; the asm block loads it into SREG first (SCPLX, and DCPLX with ATL_SSE2, only). The signs are flipped for conjugated GER builds. */ +#ifdef SCPLX +#if defined(GER) && defined(Conj_) + const TYPE w1[2]={{-1.0,1.0},{-1.0,1.0}},*w=w1; +#else + const TYPE w1[2]={{1.0,-1.0},{1.0,-1.0}},*w=w1; +#endif +#endif + +#if defined(DCPLX) && defined(ATL_SSE2) +#if defined(GER) && defined(Conj_) + const TYPE w1[1]={{-1.0,1.0}},*w=w1; +#else + const TYPE w1[1]={{1.0,-1.0}},*w=w1; +#endif +#endif +/* movm is the operand loaded into esi for the main loop; fixm is the operand handled by lf and ulf. NO_TRANSPOSE: movm is c and fixm is b; otherwise movm is b and fixm is c. */ +#ifdef NO_TRANSPOSE +#undef movm +#define movm c +#undef fixm +#define fixm b +#else +#undef movm +#define movm b +#undef fixm +#define fixm c +#endif + NO_INLINE + unsigned u1=stride*sizeof(*fixm),u2=lda*sizeof(*a),u3=len*sizeof(*movm)/sizeof(float); +/* u1 and u2 are stride and lda converted to bytes; u3 is len expressed in float-sized units. */ + ASM ( +/* NOTE(review): a(4,sp) after pushl and a(-4,sp) before popl bracket the whole body; presumably they keep the stack pointer where the memory operands expect it -- confirm the semantics of a() in camm_util.h. */ + "pushl %%ebx\n\t" + a(4,sp) + +#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) + "movl %6,%%esi\n\t" + pl(0,si,SREG) +#endif + +#ifdef NO_TRANSPOSE + "movl %1,%%esi\n\t" /* fixm */ + "movl %2,%%edi\n\t" /* fixm2fixm */ +#endif + + lf + + "movl %3,%%esi\n\t" /* a */ + "movl %4,%%edi\n\t" /* a2a */ + + lpb + + ipf(0) + + "movl %0,%%esi\n\t" /* movm */ + "movl %5,%%edi\n\t" /* len */ + +#if defined(ALIGN) + +#if defined(SREAL) + + test(4,ax) + je(Mjoin(a1,EXT)) + test(-1,di) + je(Mjoin(a1,EXT)) + sub(1,di) + bl1 + + lab(Mjoin(a1,EXT)) + +#endif + +#if defined(DREAL) || defined(SREAL) + + test(8,ax)
+ je(Mjoin(as,EXT)) + test(-2,di) + je(Mjoin(as,EXT)) + sub(2,di) + bl2 + + lab(Mjoin(as,EXT)) + +#endif + +#endif + + + ipf(32) +/* Main loop: test di against -UNROLL and leave for the UNROLL1_2 label when the result is zero; otherwise subtract UNROLL, run blUNROLL and repeat. The cascaded test/je blocks after the loop handle the remaining smaller power-of-two pieces. */ + lab(Mjoin(loop,EXT)) + + test(-UNROLL,di) + je(Mjoin(UNROLL1_2,EXT)) + sub(UNROLL,di) + + blUNROLL + + jmp(Mjoin(loop,EXT)) + +#if UNROLL > 32 + lab(Mjoin(32,EXT)) + test(32,di) + je(Mjoin(16,EXT)) + bl32 +#endif + +#if UNROLL > 16 + lab(Mjoin(16,EXT)) + test(16,di) + je(Mjoin(8,EXT)) + bl16 +#endif + +#if UNROLL > 8 + lab(Mjoin(8,EXT)) + test(8,di) + je(Mjoin(4,EXT)) + bl8 +#endif + +#if UNROLL > 4 + lab(Mjoin(4,EXT)) + test(4,di) + je(Mjoin(2,EXT)) + bl4 +#endif + +#if UNROLL > 2 + lab(Mjoin(2,EXT)) +#ifndef DCPLX + test(2,di) + je(Mjoin(1,EXT)) + bl2 +#endif +#endif + +#if UNROLL > 1 + lab(Mjoin(1,EXT)) +#ifdef SREAL + test(1,di) + je(Mjoin(stop,EXT)) + bl1 +#endif +#endif + + lab(Mjoin(stop,EXT)) +/* Tail: transposed builds reload fixm and its byte stride here (NO_TRANSPOSE builds loaded them before lf), then ulf runs. */ +#ifndef NO_TRANSPOSE + "movl %1,%%esi\n\t" /* fixm */ + "movl %2,%%edi\n\t" /* fixm2fixm */ +#endif + + ulf + + a(-4,sp) + "popl %%ebx\n\t" + + + ::"m" (movm),"m" (fixm),"m" (u1),"m" (a),"m" (u2),"m" (u3) + +#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) + ,"m" (w) +#endif + :"ax","bx","cx","dx","si","di"); + + +} + diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h new file mode 100644 index 0000000..7fd1404 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h @@ -0,0 +1,295 @@ +#include "camm_util.h" +/* camm_pipe3.h requires N and KB to be defined. p1, p2, p4, load_pipe, drain_pipe and pipe_len are all resolved by pasting N onto a fixed prefix with Mjoin. */ +#ifndef N +#error N must be defined in camm_pipe3.h +#endif +#ifndef KB +#error KB must be defined in camm_pipe3.h +#endif + +#undef p1 +#define p1(a_) Mjoin(p1_4_,N)(a_) +#undef p2 +#define p2(a_) Mjoin(p1_2_,N)(a_) +#undef p4 +#define p4(a_) Mjoin(p1_,N)(a_) +#undef load_pipe +#define load_pipe(a_) Mjoin(lp,N)(a_) +#undef drain_pipe +#define drain_pipe(a_) Mjoin(dp,N)(a_) +#undef pipe_len +#define pipe_len Mjoin(pl,N) + +#undef p8 +#if pipe_len > 4 +#define p8(a_) Mjoin(p2_,N)(a_) +#else +#define p8(a_) p4(a_) p4(SS(a_,16))
+#endif +/* pM for M = 16..256: Mjoin(p<M/4>_,N) when pipe_len is greater than M/2; otherwise p<M/2> twice, the second call offset by 2*M through SS. */ +#undef p16 +#if pipe_len > 8 +#define p16(a_) Mjoin(p4_,N)(a_) +#else +#define p16(a_) p8(a_) p8(SS(a_,32)) +#endif + +#undef p32 +#if pipe_len > 16 +#define p32(a_) Mjoin(p8_,N)(a_) +#else +#define p32(a_) p16(a_) p16(SS(a_,64)) +#endif + +#undef p64 +#if pipe_len > 32 +#define p64(a_) Mjoin(p16_,N)(a_) +#else +#define p64(a_) p32(a_) p32(SS(a_,128)) +#endif + +#undef p128 +#if pipe_len > 64 +#define p128(a_) Mjoin(p32_,N)(a_) +#else +#define p128(a_) p64(a_) p64(SS(a_,256)) +#endif + +#undef p256 +#if pipe_len > 128 +#define p256(a_) Mjoin(p64_,N)(a_) +#else +#define p256(a_) p128(a_) p128(SS(a_,512)) +#endif + +#if KB < pipe_len +#undef pipe_len +#define pipe_len 0 +#undef load_pipe +#define load_pipe(a_) +#undef drain_pipe +#define drain_pipe(a_) +#endif + +/* When KB is smaller than pipe_len, the block directly above sets pipe_len to 0 and empties load_pipe and drain_pipe. MKB is currently just KB; the per-type scaling is commented out. */ +#undef MKB +/* #ifdef SREAL */ +#define MKB KB +/* #elif defined (DCPLX) */ +/* #define MKB ( KB * 4 ) */ +/* #else */ +/* #define MKB ( KB * 2 ) */ +/* #endif */ + +#if MKB >= 512 +#error MKB must be less than 512 +#endif +/* x/o chain: each x macro extends the previous one with another stage and each o macro carries the running offset, advanced through SS by 4 per unit of the stage size. A '( MKBB / P ) % 2' level appends pP when that bit is set; a 'pipe_len == P' level appends drain_pipe and resets MKBB to MKB. */ +#undef x0 +#undef o0 +#define x0 load_pipe(0) +#define o0 0 + +#undef MKBB +#define MKBB ( MKB - pipe_len ) + +#undef xx1 +#undef oo1 +#if MKBB >= 256 +#define xx1 x0 p256(o0) +#define oo1 SS(1024,o0) +#else +#define xx1 x0 +#define oo1 o0 +#endif + +#undef xx1a +#undef oo1a +#if pipe_len == 256 +#define xx1a xx1 drain_pipe(oo1) +#define oo1a SS(1024,oo1) +#undef MKBB +#define MKBB MKB +#else +#define xx1a xx1 +#define oo1a oo1 +#endif + +#undef x1 +#undef o1 +#if ( MKBB / 128 ) % 2 +#define x1 xx1a p128(oo1a) +#define o1 SS(512,oo1a) +#else +#define x1 xx1a +#define o1 oo1a +#endif + +#undef x1a +#undef o1a +#if pipe_len == 128 +#define x1a x1 drain_pipe(o1) +#define o1a SS(512,o1) +#undef MKBB +#define MKBB MKB +#else +#define x1a x1 +#define o1a o1 +#endif + +#undef x2 +#undef o2 +#if ( MKBB / 64 ) % 2 +#define x2 x1a p64(o1a) +#define o2 SS(256,o1a) +#else +#define x2 x1a +#define o2 o1a +#endif + +#undef x2a +#undef o2a +#if pipe_len == 64 +#define x2a x2
drain_pipe(o2) +#define o2a SS(256,o2) +#undef MKBB +#define MKBB MKB +#else +#define x2a x2 +#define o2a o2 +#endif +/* Levels 32 down to 4 repeat the same two-step pattern: append pP when '( MKBB / P ) % 2' is non-zero, then append drain_pipe and reset MKBB to MKB when pipe_len equals P. */ +#undef x3 +#undef o3 +#if ( MKBB / 32 ) % 2 +#define x3 x2a p32(o2a) +#define o3 SS(128,o2a) +#else +#define x3 x2a +#define o3 o2a +#endif + +#undef x3a +#undef o3a +#if pipe_len == 32 +#define x3a x3 drain_pipe(o3) +#define o3a SS(128,o3) +#undef MKBB +#define MKBB MKB +#else +#define x3a x3 +#define o3a o3 +#endif + +#undef x4 +#undef o4 +#if ( MKBB / 16 ) % 2 +#define x4 x3a p16(o3a) +#define o4 SS(64,o3a) +#else +#define x4 x3a +#define o4 o3a +#endif + +#undef x4a +#undef o4a +#if pipe_len == 16 +#define x4a x4 drain_pipe(o4) +#define o4a SS(64,o4) +#undef MKBB +#define MKBB MKB +#else +#define x4a x4 +#define o4a o4 +#endif + +#undef x5 +#undef o5 +#if ( MKBB / 8 ) % 2 +#define x5 x4a p8(o4a) +#define o5 SS(32,o4a) +#else +#define x5 x4a +#define o5 o4a +#endif + +#undef x5a +#undef o5a +#if pipe_len == 8 +#define x5a x5 drain_pipe(o5) +#define o5a SS(32,o5) +#undef MKBB +#define MKBB MKB +#else +#define x5a x5 +#define o5a o5 +#endif + +#undef x6 +#undef o6 +#if ( MKBB / 4 ) % 2 +#define x6 x5a p4(o5a) +#define o6 SS(16,o5a) +#else +#define x6 x5a +#define o6 o5a +#endif + +#undef x6a +#undef o6a +#if pipe_len == 4 +#define x6a x6 drain_pipe(o6) +#define o6a SS(16,o6) +#undef MKBB +#define MKBB MKB +#else +#define x6a x6 +#define o6a o6 +#endif +/* NOTE(review): the two lowest levels (x7 and x8) test MKB, not MKBB as the levels above do. The two can disagree while MKBB still equals MKB - pipe_len, i.e. before any 'pipe_len ==' level has reset it -- confirm this is intended. */ +#undef x7 +#undef o7 +#if ( MKB / 2 ) % 2 +#define x7 x6a p2(o6a) +#define o7 SS(8,o6a) +#else +#define x7 x6a +#define o7 o6a +#endif + +#undef x7a +#undef o7a +#if pipe_len == 2 +#define x7a x7 drain_pipe(o7) +#define o7a SS(8,o7) +#undef MKBB +#define MKBB MKB +#else +#define x7a x7 +#define o7a o7 +#endif + +#undef x8 +#undef o8 +#if ( MKB / 1 ) % 2 +#define x8 x7a p1(o7a) +#define o8 SS(4,o7a) +#else +#define x8 x7a +#define o8 o7a +#endif + +#undef x8a +#undef o8a +#if pipe_len == 1 +#define x8a x8 drain_pipe(o8) +#define o8a SS(4,o8) +#undef MKBB +#define MKBB MKB +#else +#define x8a x8
+#define o8a o8 +#endif +/* KB_block is the fully assembled x<k> chain (x8a). */ +#undef KB_block +#define KB_block x8a diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h new file mode 100644 index 0000000..35e9e59 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h @@ -0,0 +1,215 @@ +#ifndef CAMM_SCALE_H +#define CAMM_SCALE_H /*+ To stop multiple inclusions. +*/ + +#include "camm_util.h" +/* spf(a_,b_) wraps f(t0,a_,b_). NOTE(review): presumably a prefetch hint - confirm against camm_util.h. */ +#undef spf +#define spf(a_,b_) f(t0,a_,b_) +/* One section per precision (SCPLX, SREAL, DREAL, DCPLX). Each defines lb (emitted once before the loop), c/cp/c1_2 and, for SREAL only, c1_4 (one chunk addressed through si), and ub (emitted once after the loop). cp is c with an extra spf(b_,si). */ +#ifdef SCPLX +#ifdef BETAX +#undef SSREG +#define SSREG 2 +#undef lbx +#define lbx pls(4,ax,1) ps(0,1,1) pm(SSREG,1) +#undef cxx +#define cxx pm(1,3) ps(177,3,3) pa(3,2) +#undef pcx +#define pcx pc(2,3) +#else +#undef lbx +#define lbx +#undef cxx +#define cxx +#undef pcx +#define pcx +#endif +#undef lb +#define lb pls(0,ax,0) ps(0,0,0) lbx +#undef c +#define c(a_) pl(a_ ## 0,si,2) pcx pm(0,2) cxx pu(2,a_ ## 0,si) +#undef cp +#define cp(a_,b_) pl(a_ ## 0,si,2) pcx pm(0,2) spf(b_,si) cxx pu(2,a_ ## 0,si) +#undef c1_2 +#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pcx pm(0,2) cxx pud(2,a_ ## 0,si) +#undef ub +#define ub +#endif + +#ifdef SREAL +#undef lb +#define lb pls(0,ax,0) ps(0,0,0) +#undef c +#define c(a_) pl(a_ ## 0,si,2) pm(0,2) pu(2,a_ ## 0,si) +#undef cp +#define cp(a_,b_) pl(a_ ## 0,si,2) spf(b_,si) pm(0,2) pu(2,a_ ## 0,si) +#undef c1_2 +#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pm(0,2) pud(2,a_ ## 0,si) +#undef c1_4 +#define c1_4(a_) pls(a_ ## 0,si,2) pm(0,2) pus(2,a_ ## 0,si) +#undef ub +#define ub +#endif + +#ifdef DREAL +#undef lb +#define lb fl(0,ax) +#undef c +#define c(a_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) fm(2,0) fx1 \ + fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef cp +#define cp(a_,b_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) spf(b_,si) fm(2,0) fx1 \ + fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef c1_2 +#define c1_2(a_) fl(a_ ## 0,si) fm(1,0) fp(a_ ## 0,si) +#undef ub +#define ub fc(0) +#endif + +#ifdef DCPLX +#undef lb +#define lb fl(0,ax) fl(8,ax) +#undef c +#define c(a_) fl(a_ ## 0,si) fl(a_ ## 
8,si) fd(3) fm(2,0) fd(3) \ + fm(2,0) fx(3) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) fsp(2) fx1 \ + fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef cp +#define cp(a_,b_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \ + fm(2,0) fx(3) spf(b_,si) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) \ + fsp(2) fx1 fp(a_ ## 0,si) fp(a_ ## 8,si) +#undef ub +#define ub fc(0) fc(0) +#endif +/* sblN processes N units at si and sincN advances si by 4*N bytes; sbl16 is four chunks (0x0..0x3), two of them through cp. */ +#undef sbl1 +#define sbl1 c1_4(0x0) +#undef sbl2 +#define sbl2 c1_2(0x0) +#undef sbl4 +#define sbl4 cp(0x0,0x40) +#undef sbl8 +#define sbl8 sbl4 c(0x1) +#undef sbl16 +#define sbl16 sbl8 cp(0x2,0x60) c(0x3) + +#undef sinc16 +#define sinc16 a(0x40,si) +#undef sinc8 +#define sinc8 a(0x20,si) +#undef sinc4 +#define sinc4 a(0x10,si) +#undef sinc2 +#define sinc2 a(0x8,si) +#undef sinc1 +#define sinc1 a(0x4,si) + +#undef SCALE +#define SCALE Mjoin(Mjoin(PREC,Mjoin(scale,BLC)),FEXT) + +#undef MY_FUNCTION +#define MY_FUNCTION SCALE +/* SCALE(b,c,len): the asm loads c into esi, b into eax and len into edi, emits lb once, then runs sbl16/sinc16 while len has bits above 15 set and finishes with the 8/4/2/1 remainders before ub. len is doubled for every precision except SREAL and doubled again for DCPLX. NOTE(review): this appears to multiply c in place by the scalar at b - confirm against the pl/pm/pu definitions in camm_util.h. */ +static void +MY_FUNCTION(const TYPE *b,TYPE *c,int len) { +/* ce is not referenced by the asm below. z1 is the 4-value sign pattern read through ebx by pl(0,bx,SSREG); it was declared as z1[2] with nested braces, which left only two elements for that read. NOTE(review): if pl needs aligned memory, z1 also needs an alignment attribute - confirm. */ + const TYPE *ce=c+len; +#if defined(BETAX) && defined(SCPLX) + const TYPE z1[4]={1.0,-1.0,1.0,-1.0},*z=z1; +#endif + NO_INLINE + +#ifndef SREAL + len+=len; +#endif +#ifdef DCPLX + len+=len; +#endif + + + ASM( + + "pushl %%ebx\n\t" + a(4,sp) + + + "movl %0,%%esi\n\t" + + spf(0x00,si) + spf(0x20,si) + + "movl %1,%%eax\n\t" + "movl %2,%%edi\n\t" + +#if defined(BETAX) && defined(SCPLX) + "movl %3,%%ebx\n\t" + pl(0,bx,SSREG) +#endif + + lb + + lab(loop) + + test(-16,di) + je(8) + sub(16,di) + align + + sbl16 + sinc16 + + jmp(loop) + align + + lab(8) + + test(8,di) + je(4) + + sbl8 + sinc8 + + lab(4) + + test(4,di) + je(2) + + sbl4 + sinc4 + + lab(2) + +#ifndef DCPLX + test(2,di) + je(1) + + sbl2 + sinc2 + + lab(1) + +#ifdef SREAL + test(1,di) + je(stop) + + sbl1 + sinc1 + + lab(stop) +#endif +#endif + + ub + + a(-4,sp) + "popl %%ebx\n\t" + + + ::"m" (c),"m" (b), "m" (len) +#if defined(BETAX) && defined(SCPLX) + ,"m" (z) +#endif + : "si","ax","di","memory","cc"); + + +} +#endif /* CAMM_SCALE_H */ diff --git 
a/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h new file mode 100644 index 0000000..4a92006 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h @@ -0,0 +1,2982 @@ +#include "camm_util.h" +/* Each operation <op> is a family: p1_4_<op>, p1_2_<op>, p1_<op>, p2_<op> (and p4_/p8_ for some) are step macros taking a byte offset a_, lp<op>/dp<op> are companion pieces and pl<op> is a numeric constant. NOTE(review): lp/dp appear to be the lead-in and drain of a pipelined loop and the pl*/pu* helpers loads/stores - confirm against camm_util.h. swap_1: values read through ax and cx are written back through the other pointer. */ +#undef p1_4_swap_1 +#define p1_4_swap_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,0) \ + pus(0,a_,ax) \ + pus(1,a_,cx) +#undef p1_2_swap_1 +#define p1_2_swap_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(0) \ + pld(a_,cx,0) \ + pud(0,a_,ax) \ + pud(1,a_,cx) +#undef p1_swap_1 +#define p1_swap_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,0) \ + puq(0,a_,ax) \ + pu(1,a_,cx) +#undef p2_swap_1 +#define p2_swap_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,2) \ + puq(0,a_,ax) \ + pu(1,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,0) \ + puq(2,SS(a_,RS4),ax) \ + pu(3,SS(a_,RS4),cx) +#undef lpswap_1 +#define lpswap_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) +#undef dpswap_1 +#define dpswap_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,2) \ + puq(0,a_,ax) \ + pu(1,a_,cx) \ + puq(2,SS(a_,RS4),ax) \ + pu(3,SS(a_,RS4),cx) +#undef plswap_1 +#define plswap_1 8 +/* scal_3 family: each value read through ax is multiplied by register 6 (pm(6,x) / pmsr(6,x)) and written back to the same offset. */ + +#undef p1_4_scal_3 +#define p1_4_scal_3(a_) \ + pls(a_,ax,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_scal_3 +#define p1_2_scal_3(a_) \ + pld(a_,ax,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_scal_3 +#define p1_scal_3(a_) \ + plq(a_,ax,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_scal_3 +#define p2_scal_3(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_3 +#define p4_scal_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + 
f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(6,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpscal_3 +#define lpscal_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(6,1) +#undef dpscal_3 +#define dpscal_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plscal_3 +#define plscal_3 16 +/* scal_3c family: the p1_4_ variant expands to nothing. The others multiply the value by register 6, multiply a CSHUF-shuffled copy by register 7 and add the two. NOTE(review): presumably the complex counterpart of scal_3 - confirm. */ +#undef p1_4_scal_3c +#define p1_4_scal_3c(a_) +#undef p1_2_scal_3c +#define p1_2_scal_3c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_scal_3c +#define p1_scal_3c(a_) \ + plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_scal_3c +#define p2_scal_3c(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pc(0,2) \ + pm(6,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + puq(0,a_,ax) \ + pc(1,3) \ + pm(6,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_3c +#define p4_scal_3c(a_) \ + pm(7,5) \ + pa(5,1) \ + puq(0,a_,ax) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pc(3,5) \ + pm(6,3) \ + pm(7,4) \ + pa(4,2) \ + puq(1,SS(a_,RS4),ax) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + pm(7,5) \ + pa(5,3) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pc(1,5) \ + pm(6,1) \ + pm(7,4) \ + pa(4,0) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(7,RS4)),ax,3) \ + pc(2,4) \ + pm(6,2) +#undef lpscal_3c +#define lpscal_3c(a_) \ + 
plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(1,5) \ + pm(6,1) \ + pm(7,4) \ + pa(4,0) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pc(2,4) \ + pm(6,2) +#undef dpscal_3c +#define dpscal_3c(a_) \ + pm(7,5) \ + pa(5,1) \ + ps(CSHUF,4,4) \ + puq(0,a_,ax) \ + pm(7,4) \ + pa(4,2) \ + pc(3,5) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + ps(CSHUF,5,5) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,5) \ + pa(5,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plscal_3c +#define plscal_3c 16 + +#undef p1_4_scal_4 +#define p1_4_scal_4(a_) \ + pls(SS(a_,MM(0,RS4)),ax,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_scal_4 +#define p1_2_scal_4(a_) \ + pld(SS(a_,MM(0,RS4)),ax,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_scal_4 +#define p1_scal_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_scal_4 +#define p2_scal_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_4 +#define p4_scal_4(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_scal_4 +#define p8_scal_4(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + plq(SS(a_,MM(4,RS4)),ax,4) \ + plq(SS(a_,MM(5,RS4)),ax,5) \ + plq(SS(a_,MM(6,RS4)),ax,7) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + pm(6,4) \ + pm(6,5) \ + plq(SS(a_,MM(7,RS4)),ax,0) \ + pm(6,7) \ + pm(6,0) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + 
puq(5,SS(a_,MM(5,RS4)),ax) \ + puq(7,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpscal_4 +#define lpscal_4(a_) +#undef dpscal_4 +#define dpscal_4(a_) p4_scal_4(a_) +#undef plscal_4 +#define plscal_4 16 +/* scal_4c family: same arithmetic as scal_3c (register 6 on the value, register 7 on a CSHUF-shuffled copy, then add), with an empty lpscal_4c and dpscal_4c defined as p4_scal_4c. The p1_4_ variant expands to nothing. */ +#undef p1_4_scal_4c +#define p1_4_scal_4c(a_) +#undef p1_2_scal_4c +#define p1_2_scal_4c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_scal_4c +#define p1_scal_4c(a_) \ + plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_scal_4c +#define p2_scal_4c(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pc(1,5) \ + pm(6,0) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,0) \ + pm(7,5) \ + pa(5,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_4c +#define p4_scal_4c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pc(0,4) \ + pc(1,5) \ + pm(6,0) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,0) \ + pc(2,4) \ + pm(7,5) \ + pa(5,1) \ + pc(3,5) \ + pm(6,2) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,2) \ + pm(7,5) \ + pa(5,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpscal_4c +#define lpscal_4c(a_) +#undef dpscal_4c +#define dpscal_4c(a_) p4_scal_4c(a_) +#undef plscal_4c +#define plscal_4c 16 +/* scal_1 family: the multiplier is register 0 (pm(0,x) / pmsr(0,x)); values are read and written back through ax. */ +#undef p1_4_scal_1 +#define p1_4_scal_1(a_) \ + pls(a_,ax,1) \ + pmsr(0,1) \ + pus(1,a_,ax) +#undef p1_2_scal_1 +#define p1_2_scal_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pm(0,1) \ + pud(1,a_,ax) +#undef p1_scal_1 +#define p1_scal_1(a_) \ + plq(a_,ax,1) \ + pm(0,1) \ + puq(1,a_,ax) +#undef p2_scal_1 +#define p2_scal_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pm(0,1) \ + pm(0,2) \ + puq(1,a_,ax) \ + puq(2,SS(a_,RS4),ax) +#undef p4_scal_1 
+#define p4_scal_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lpscal_1 +#define lpscal_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pm(0,7) +#undef dpscal_1 +#define dpscal_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef plscal_1 +#define plscal_1 RS4 +/* set_1 family: same register rotation as scal_1 (7, 3, 1, 2), with pc(0,x) / pcs(0,x) in place of the multiply, so register 0 is copied over each value read through ax before it is written back. */ + +#undef p1_4_set_1 +#define p1_4_set_1(a_) \ + pls(a_,ax,1) \ + pcs(0,1) \ + pus(1,a_,ax) +#undef p1_2_set_1 +#define p1_2_set_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pc(0,1) \ + pud(1,a_,ax) +#undef p1_set_1 +#define p1_set_1(a_) \ + plq(a_,ax,1) \ + pc(0,1) \ + puq(1,a_,ax) +#undef p2_set_1 +#define p2_set_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pc(0,1) \ + pc(0,2) \ + puq(1,a_,ax) \ + puq(2,SS(a_,RS4),ax) +#undef p4_set_1 +#define p4_set_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pc(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pc(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pc(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pc(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lpset_1 +#define lpset_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pc(0,7) +#undef dpset_1 +#define dpset_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pc(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pc(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + pc(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef plset_1 +#define plset_1 RS4 + + +#undef p1_4_set_2 
+#define p1_4_set_2(a_) \ + pus(0,a_,ax) +#undef p1_2_set_2 +#define p1_2_set_2(a_) \ + pud(0,a_,ax) +#undef p1_set_2 +#define p1_set_2(a_) \ + puq(0,a_,ax) +#undef p2_set_2 +#define p2_set_2(a_) \ + puq(0,a_,ax) \ + puq(0,SS(a_,RS4),ax) +#undef p4_set_2 +#define p4_set_2(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + puq(0,a_,ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef lpset_2 +#define lpset_2(a_) +#undef dpset_2 +#define dpset_2(a_) \ + puq(0,a_,ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef plset_2 +#define plset_2 RS4 +/* set_3 family: like set_2 it only writes register 0 through ax, without reading first; it adds a p8_ variant, which dpset_3 reuses, and has an empty lpset_3. */ + +#undef p1_4_set_3 +#define p1_4_set_3(a_) \ + pus(0,a_,ax) +#undef p1_2_set_3 +#define p1_2_set_3(a_) \ + pud(0,a_,ax) +#undef p1_set_3 +#define p1_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) +#undef p2_set_3 +#define p2_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) +#undef p4_set_3 +#define p4_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef p8_set_3 +#define p8_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) \ + puq(0,SS(a_,MM(4,RS4)),ax) \ + puq(0,SS(a_,MM(5,RS4)),ax) \ + puq(0,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpset_3 +#define lpset_3(a_) +#undef dpset_3 +#define dpset_3(a_) p8_set_3(a_) +#undef plset_3 +#define plset_3 32 +/* 0x1_nrm2_1 family: each value read through ax is multiplied by itself (pm(x,x)) and added into register 0. p2_0x1_nrm2_1 used to finish with pm(2,0), which multiplied the accumulator by the second square; it now adds it with pa(2,0), as the other variants in this family do. */ + +#undef p1_4_0x1_nrm2_1 +#define p1_4_0x1_nrm2_1(a_) \ + pls(a_,ax,1) \ + pmsr(1,1) \ + pasr(1,0) +#undef p1_2_0x1_nrm2_1 +#define p1_2_0x1_nrm2_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pm(1,1) \ + pa(1,0) +#undef p1_0x1_nrm2_1 +#define p1_0x1_nrm2_1(a_) \ + plq(a_,ax,1) \ + pm(1,1) \ + pa(1,0) +#undef p2_0x1_nrm2_1 +#define p2_0x1_nrm2_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pm(1,1) \ + pm(2,2) \ + pa(1,0) \ + pa(2,0) +#undef 
p4_0x1_nrm2_1 +#define p4_0x1_nrm2_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(3,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(1,1) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(2,2) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(7,7) \ + pa(2,0) +#undef lp0x1_nrm2_1 +#define lp0x1_nrm2_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pm(7,7) +#undef dp0x1_nrm2_1 +#define dp0x1_nrm2_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(3,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(1,1) \ + pa(3,0) \ + pm(2,2) \ + pa(1,0) \ + pa(2,0) +#undef pl0x1_nrm2_1 +#define pl0x1_nrm2_1 RS4 +/* nrm2_2 family: one fixed 17-step sequence on registers 0..7 per value read, each step followed by dbg(reg). NOTE(review): dbg presumably expands to nothing outside debug builds, and the pax/pd steps look like a running-maximum rescale - confirm against camm_util.h. p2_nrm2_2 is defined without a preceding #undef, unlike its siblings. */ + +#undef p1_4_nrm2_2 +#define p1_4_nrm2_2(a_) \ + pls(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pcs(5,6) dbg(6) \ + pcs(5,7) dbg(7) \ + paxs(1,5) dbg(5) \ + prps(5,2) dbg(2) \ + px(3) \ + pcms(0,2,3) dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pasr(3,7) dbg(7) \ + pcs(7,5) dbg(5) \ + pdsr(5,6) dbg(6) \ + pdsr(5,1) dbg(1) \ + pmsr(6,6) dbg(6) \ + pmsr(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pasr(1,0) dbg(0) +#undef p1_2_nrm2_2 +#define p1_2_nrm2_2(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef p1_nrm2_2 +#define p1_nrm2_2(a_) \ + plq(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#define p2_nrm2_2(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) 
\ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef lpnrm2_2 +#define lpnrm2_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef dpnrm2_2 +#define dpnrm2_2(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef plnrm2_2 +#define plnrm2_2 8 +/* nrm2_3 family: a shorter sequence than nrm2_2 - after pax(1,5) it goes straight to the pd(5,6)/pd(5,1) steps, with no prp/pcm/pann block in between. */ + +#undef p1_4_nrm2_3 +#define p1_4_nrm2_3(a_) \ + pls(a_,ax,1) dbg(1) \ + pcs(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + paxs(1,5) dbg(5) \ + pdsr(5,6) dbg(6) \ + pdsr(5,1) dbg(1) \ + pmsr(6,6) dbg(6) \ + pmsr(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pasr(1,0) dbg(0) +#undef p1_2_nrm2_3 +#define p1_2_nrm2_3(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) 
dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef p1_nrm2_3 +#define p1_nrm2_3(a_) \ + plq(a_,ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#define p2_nrm2_3(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef lpnrm2_3 +#define lpnrm2_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef dpnrm2_3 +#define dpnrm2_3(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef plnrm2_3 +#define plnrm2_3 8 +/* block_nrm2_4(a_,b_) is the shared body of the nrm2_4 family: a_ is pasted onto pc/pax and b_ onto pd/pm/pa, so (s,sr) selects pcs/paxs and pdsr/pmsr/pasr while (,) selects the unsuffixed forms. It is defined without a preceding #undef. */ +#define block_nrm2_4(a_,b_) \ + Mjoin(pc,a_)(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + Mjoin(pax,a_)(1,5) dbg(5) \ + Mjoin(pc,a_)(2,7) dbg(7) \ + Mjoin(pd,b_)(5,7) dbg(7) \ + Mjoin(pm,b_)(7,6) dbg(6) \ + Mjoin(pm,b_)(7,1) dbg(1) \ + Mjoin(pm,b_)(6,6) dbg(6) \ + Mjoin(pm,b_)(6,0) dbg(0) \ + Mjoin(pm,b_)(1,1) dbg(1) \ + Mjoin(pa,b_)(1,0) dbg(0) + + +/* #undef p1_4_nrm2_4 */ +/* #define p1_4_nrm2_4(a_) \ */ +/* pls(a_,ax,1) dbg(1) \ */ +/* pcs(5,6) dbg(6) \ */ +/* pan(4,1) dbg(1) \ */ +/* paxs(1,5) dbg(5) \ */ +/* pcs(2,7) dbg(7) \ */ +/* pdsr(5,7) dbg(7) \ */ +/* pmsr(7,6) dbg(6) \ */ +/* pmsr(7,1) dbg(1) \ */ +/* pmsr(6,6) dbg(6) \ */ +/* pmsr(6,0) dbg(0) \ */ 
+/* pmsr(1,1) dbg(1) \ */ +/* pasr(1,0) dbg(0) */ +#undef p1_4_nrm2_4 +#define p1_4_nrm2_4(a_) \ + pls(a_,ax,1) dbg(1) \ + block_nrm2_4(s,sr) +#undef p1_2_nrm2_4 +#define p1_2_nrm2_4(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + block_nrm2_4(,) +#undef p1_nrm2_4 +#define p1_nrm2_4(a_) \ + plq(a_,ax,1) dbg(1) \ + block_nrm2_4(,) +#define p2_nrm2_4(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + block_nrm2_4(,) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + block_nrm2_4(,) +#undef lpnrm2_4 +#define lpnrm2_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + block_nrm2_4(,) +#undef dpnrm2_4 +#define dpnrm2_4(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + block_nrm2_4(,) +#undef plnrm2_4 +#define plnrm2_4 8 +/* 1x1_1 family: the p1_4_/p1_2_/p1_/p2_ variants read matching offsets through ax and bx, multiply the pair and add the product into register 6. Two fixes: p1_1x1_1 now adds the product register 1 (it added register 0, the raw bx operand), and dp1x1_1 no longer has a stray leading comma inside SS(...). NOTE(review): p4_1x1_1 issues the same plq twice and, like lp1x1_1/dp1x1_1, has the shape of the scal_1 macros (it writes back through ax) - confirm whether these three are used at all. */ + +#undef p1_4_1x1_1 +#define p1_4_1x1_1(a_) \ + pls(a_,ax,1) \ + pls(a_,bx,0) \ + pm(0,1) \ + pa(1,6) +#undef p1_2_1x1_1 +#define p1_2_1x1_1(a_) \ + pld(a_,ax,1) \ + pld(a_,bx,0) \ + pm(0,1) \ + pa(1,6) +#undef p1_1x1_1 +#define p1_1x1_1(a_) \ + plq(a_,ax,1) \ + plq(a_,bx,0) \ + pm(0,1) \ + pa(1,6) +#undef p2_1x1_1 +#define p2_1x1_1(a_) \ + plq(a_,ax,1) \ + plq(a_,bx,0) \ + plq(SS(a_,RS4),ax,2) \ + plq(SS(a_,RS4),bx,3) \ + pm(0,1) \ + pm(2,3) \ + pa(1,6) \ + pa(3,6) +#undef p4_1x1_1 +#define p4_1x1_1(a_) \ + f(nta,SS(a_,MM(4,RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM(6,RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lp1x1_1 +#define lp1x1_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,RS4),ax,3) \ + pm(0,7) +#undef dp1x1_1 +#define dp1x1_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,RS4),ax) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef pl1x1_1 +#define 
pl1x1_1 RS4 +/* 0x1_asum_1 family: same skeleton and register rotation as 0x1_nrm2_1, with pan(4,x) applied to each value in place of the self-multiply before it is added into register 0. NOTE(review): pan with register 4 is presumably a sign-bit mask giving the absolute value - confirm. */ + +#undef p1_4_0x1_asum_1 +#define p1_4_0x1_asum_1(a_) \ + pls(a_,ax,1) \ + pan(4,1) \ + pasr(1,0) +#undef p1_2_0x1_asum_1 +#define p1_2_0x1_asum_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pan(4,1) \ + pa(1,0) +#undef p1_0x1_asum_1 +#define p1_0x1_asum_1(a_) \ + plq(a_,ax,1) \ + pan(4,1) \ + pa(1,0) +#undef p2_0x1_asum_1 +#define p2_0x1_asum_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pan(4,1) \ + pan(4,2) \ + pa(1,0) \ + pa(2,0) +#undef p4_0x1_asum_1 +#define p4_0x1_asum_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pan(4,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pan(4,1) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pan(4,2) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pan(4,7) \ + pa(2,0) +#undef lp0x1_asum_1 +#define lp0x1_asum_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pan(4,7) +#undef dp0x1_asum_1 +#define dp0x1_asum_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pan(4,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pan(4,1) \ + pa(3,0) \ + pan(4,2) \ + pa(1,0) \ + pa(2,0) +#undef pl0x1_asum_1 +#define pl0x1_asum_1 RS4 +/* sum_1 family: the asum_1 sequence without the pan(4,x) step - each value read through ax is added into register 0 unchanged. */ + +#undef p1_4_sum_1 +#define p1_4_sum_1(a_) \ + pls(a_,ax,1) \ + pasr(1,0) +#undef p1_2_sum_1 +#define p1_2_sum_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pa(1,0) +#undef p1_sum_1 +#define p1_sum_1(a_) \ + plq(a_,ax,1) \ + pa(1,0) +#undef p2_sum_1 +#define p2_sum_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pa(1,0) \ + pa(2,0) +#undef p4_sum_1 +#define p4_sum_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pa(2,0) +#undef lpsum_1 +#define lpsum_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) +#undef dpsum_1 +#define dpsum_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pa(3,0) \ + pa(1,0) \ 
+ pa(2,0) +#undef plsum_1 +#define plsum_1 RS4 +/* dot_1 family: matching offsets are read through ax and cx, the pair is multiplied and the product added into register 0. p2_dot_1 first finishes the pair held in registers 3/4, which lpdot_1 or the previous p2_dot_1 left there. */ + +#undef p1_4_dot_1 +#define p1_4_dot_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,2) \ + pmsr(2,1) \ + pasr(1,0) +#undef p1_2_dot_1 +#define p1_2_dot_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pm(2,1) \ + pa(1,0) +#undef p1_dot_1 +#define p1_dot_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pm(2,1) \ + pa(1,0) +#undef p2_dot_1 +#define p2_dot_1(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pm(4,3) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(2,1) \ + pa(1,0) +#undef lpdot_1 +#define lpdot_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(a_,ax,3) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_1 +#define dpdot_1(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pm(4,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,0) +#undef pldot_1 +#define pldot_1 8 +/* dot_1c family: the p1_4_ variant expands to nothing. The ax operand is copied, the original and the copy are shuffled with HSHUF and LSHUF, the HSHUF one is multiplied by register 7, and the two products with the cx operand are added into registers 6 and 0 respectively. NOTE(review): presumably a complex dot product with register 7 as a sign pattern - confirm. */ +#undef p1_4_dot_1c +#define p1_4_dot_1c(a_) +#undef p1_2_dot_1c +#define p1_2_dot_1c(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pc(1,3) \ + ps(HSHUF,1,1) \ + ps(LSHUF,3,3) \ + pm(7,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p1_dot_1c +#define p1_dot_1c(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pc(1,3) \ + ps(HSHUF,1,1) \ + ps(LSHUF,3,3) \ + pm(7,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p2_dot_1c +#define p2_dot_1c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(HSHUF,3,3) \ + ps(LSHUF,5,5) \ + pm(7,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + pc(1,5) \ + ps(HSHUF,1,1) \ + ps(LSHUF,5,5) \ + pm(7,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef lpdot_1c +#define lpdot_1c(a_) \ + 
f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_1c +#define dpdot_1c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(HSHUF,3,3) \ + ps(LSHUF,5,5) \ + pm(7,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + pc(1,5) \ + ps(HSHUF,1,1) \ + ps(LSHUF,5,5) \ + pm(7,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef pldot_1c +#define pldot_1c 8 + +#undef p1_4_dot_2c +#define p1_4_dot_2c(a_) +#undef p1_2_dot_2c +#define p1_2_dot_2c(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pc(1,3) \ + ps(CSHUF,1,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p1_dot_2c +#define p1_dot_2c(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pc(1,3) \ + ps(CSHUF,1,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p2_dot_2c +#define p2_dot_2c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(CSHUF,3,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + pc(1,5) \ + ps(CSHUF,1,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef lpdot_2c +#define lpdot_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(a_,ax,3) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_2c +#define dpdot_2c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(CSHUF,3,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + pc(1,5) \ + ps(CSHUF,1,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef pldot_2c +#define pldot_2c 8 + +#undef p1_4_axpby_3 +#define p1_4_axpby_3(a_) \ + pls(a_,ax,0) \ + pls(a_,cx,3) \ + pmsr(5,0) \ + pmsr(6,3) \ + pasr(3,0) \ + pus(0,a_,ax) +#undef p1_2_axpby_3 +#define p1_2_axpby_3(a_) \ + pld(a_,ax,0) \ + pld(a_,cx,3) \ + pm(5,0) \ + pm(6,3) \ + pa(3,0) \ + pud(0,a_,ax) +#undef p1_axpby_3 +#define p1_axpby_3(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,3) \ + pm(5,0) \ + 
pm(6,3) \ + pa(3,0) \ + punt(0,a_,ax) +#undef p2_axpby_3 +#define p2_axpby_3(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,3) \ + plq(SS(a_,RS4),ax,1) \ + pm(5,0) \ + pm(6,3) \ + pa(3,0) \ + pl(SS(a_,RS4),cx,3) \ + punt(0,a_,ax) \ + pm(5,1) \ + pm(6,3) \ + pa(3,1) \ + punt(1,SS(a_,RS4),ax) +#undef p4_axpby_3 +#define p4_axpby_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(5,2) \ + pl(SS(a_,MM(3,RS4)),cx,7) \ + pm(6,4) \ + pa(4,2) \ + punt(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(5,3) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,7) \ + pa(7,3) \ + punt(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(5,0) \ + pl(SS(a_,MM(5,RS4)),cx,7) \ + pm(6,4) \ + pa(4,0) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(5,1) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,7) \ + pa(7,1) \ + punt(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_3 +#define lpaxpby_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,7) \ + pm(5,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(5,1) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,7) \ + pa(7,1) +#undef dpaxpby_3 +#define dpaxpby_3(a_) \ + pl(SS(a_,MM(3,RS4)),cx,7) \ + pm(5,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + pm(5,3) \ + punt(0,a_,ax) \ + pm(6,7) \ + pa(7,3) \ + punt(1,SS(a_,RS4),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_3 +#define plaxpby_3 16 +/* axpby_3c family: the p1_4_ variant expands to nothing. Registers 5 and 6 multiply the ax and cx operands, registers 4 and 7 multiply CSHUF-shuffled copies of them, and the four products are summed before the result is written back through ax. */ +#undef p1_4_axpby_3c +#define p1_4_axpby_3c(a_) +#undef p1_2_axpby_3c +#define p1_2_axpby_3c(a_) \ + pld(a_,ax,0) \ + pld(a_,cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + pud(0,a_,ax) +#undef p1_axpby_3c +#define 
p1_axpby_3c(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + puq(0,a_,ax) +#undef p2_axpby_3c +#define p2_axpby_3c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,3) \ + pc(1,2) \ + pm(5,1) \ + ps(CSHUF,2,2) \ + pm(4,2) \ + pa(2,1) \ + pc(3,2) \ + pm(6,3) \ + pa(3,1) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,1) \ + puq(0,a_,ax) \ + plq(SS(a_,MM(2,RS4)),ax,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + puq(1,SS(a_,RS4),ax) +#undef lpaxpby_3c +#define lpaxpby_3c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) +#undef dpaxpby_3c +#define dpaxpby_3c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,3) \ + pc(1,2) \ + pm(5,1) \ + ps(CSHUF,2,2) \ + pm(4,2) \ + pa(2,1) \ + pc(3,2) \ + pm(6,3) \ + pa(3,1) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef plaxpby_3c +#define plaxpby_3c 8 +/* axpby_2 family: only the cx operand is scaled (by register 6); it is added to the unscaled ax operand and the sum is written back through ax. */ +#undef p1_4_axpby_2 +#define p1_4_axpby_2(a_) \ + pls(a_,cx,5) \ + pls(a_,ax,0) \ + pmsr(6,5) \ + pasr(5,0) \ + pus(0,a_,ax) +#undef p1_2_axpby_2 +#define p1_2_axpby_2(a_) \ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + pud(0,a_,ax) +#undef p1_axpby_2 +#define p1_axpby_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + puq(0,a_,ax) +#undef p2_axpby_2 +#define p2_axpby_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,5) \ + pa(5,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pm(6,4) \ + 
pa(4,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpby_2 +#define p4_axpby_2(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(6,4) \ + pa(4,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,5) \ + pa(5,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_2 +#define lpaxpby_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,5) \ + pa(5,1) +#undef dpaxpby_2 +#define dpaxpby_2(a_) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_2 +#define plaxpby_2 16 + +#undef p1_4_axpby_2c +#define p1_4_axpby_2c(a_) +#undef p1_2_axpby_2c +#define p1_2_axpby_2c(a_) \ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_axpby_2c +#define p1_axpby_2c(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_axpby_2c +#define p2_axpby_2c(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pc(4,3) \ + pm(6,4) \ + pa(4,1) \ + 
ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpby_2c +#define p4_axpby_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(7,0) \ + pa(0,2) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + puq(1,SS(a_,RS4),ax) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(7,1) \ + pa(1,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(7,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_2c +#define lpaxpby_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) +#undef dpaxpby_2c +#define dpaxpby_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + puq(1,SS(a_,RS4),ax) \ + pm(7,0) \ + pa(0,2) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,1) \ + pa(1,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_2c +#define plaxpby_2c 16 + +#undef p1_4_axpby_1 +#define p1_4_axpby_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,2) \ + pmsr(5,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpby_1 +#define p1_2_axpby_1(a_) \ + 
pld(a_,ax,1) \ + pld(a_,cx,2) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpby_1 +#define p1_axpby_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) +#undef p2_axpby_1 +#define p2_axpby_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,4) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(5,3) \ + pm(6,4) \ + pa(4,3) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpby_1 +#define lpaxpby_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) +#undef dpaxpby_1 +#define dpaxpby_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,4) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) \ + pm(5,3) \ + pm(6,4) \ + pa(4,3) \ + puq(3,SS(a_,RS4),ax) +#undef plaxpby_1 +#define plaxpby_1 8 + +#undef p1_4_axpy_0 +#define p1_4_axpy_0(a_) \ + pls(a_,cx,2) \ + pls(a_,ax,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpy_0 +#define p1_2_axpy_0(a_) \ + pld(a_,cx,2) \ + pld(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpy_0 +#define p1_axpy_0(a_) \ + pl(a_,cx,2) \ + plq(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) +#undef p2_axpy_0 +#define p2_axpy_0(a_) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + plq(SS(a_,RS4),ax,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + puq(1,a_,ax) \ + pm(6,4) \ + pa(4,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpy_0 +#define lpaxpy_0(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) +#undef dpaxpy_0 +#define dpaxpy_0(a_) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + plq(SS(a_,RS4),ax,3) \ + puq(1,a_,ax) \ + pm(6,4) \ + pa(4,3) 
\ + puq(3,SS(a_,RS4),ax) +#undef plaxpy_0 +#define plaxpy_0 8 + +#undef p1_4_axpy_1 +#define p1_4_axpy_1(a_) \ + pls(a_,cx,2) \ + pls(a_,ax,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpy_1 +#define p1_2_axpy_1(a_) \ + pld(a_,cx,2) \ + pld(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpy_1 +#define p1_axpy_1(a_) \ + pl(a_,cx,2) \ + pm(6,2) \ + pam(a_,ax,2) \ + puq(2,a_,ax) +#undef p2_axpy_1 +#define p2_axpy_1(a_) \ + pl(a_,cx,2) \ + pm(6,2) \ + pl(SS(a_,RS4),cx,4) \ + pam(a_,ax,2) \ + pm(6,4) \ + puq(2,a_,ax) \ + pam(SS(a_,RS4),ax,4) \ + puq(4,SS(a_,RS4),ax) +#undef p4_axpy_1 +#define p4_axpy_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + pam(SS(a_,MM(2,RS4)),ax,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + pl(SS(a_,MM(4,RS4)),cx,0) \ + pm(6,3) \ + pam(SS(a_,MM(3,RS4)),ax,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(5,RS4)),cx,1) \ + pm(6,0) \ + pam(SS(a_,MM(4,RS4)),ax,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + pl(SS(a_,MM(6,RS4)),cx,2) \ + pm(6,1) \ + pam(SS(a_,MM(5,RS4)),ax,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpaxpy_1 +#define lpaxpy_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + pl(SS(a_,RS4),cx,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pm(6,0) \ + pam(a_,ax,0) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + pm(6,1) \ + pam(SS(a_,RS4),ax,1) +#undef dpaxpy_1 +#define dpaxpy_1(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + pam(SS(a_,MM(2,RS4)),ax,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + pam(SS(a_,MM(3,RS4)),ax,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_1 +#define plaxpy_1 16 + +#undef p1_4_axpy_2 +#define p1_4_axpy_2(a_) \ + pls(a_,cx,5) \ + pls(a_,ax,0) \ + pmsr(6,5) \ + pasr(5,0) \ + pus(0,a_,ax) +#undef p1_2_axpy_2 +#define p1_2_axpy_2(a_) 
\ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + pud(0,a_,ax) +#undef p1_axpy_2 +#define p1_axpy_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + puq(0,a_,ax) +#undef p2_axpy_2 +#define p2_axpy_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,5) \ + pa(5,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pm(6,4) \ + pa(4,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpy_2 +#define p4_axpy_2(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(6,4) \ + pa(4,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,5) \ + pa(5,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpy_2 +#define lpaxpy_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,5) \ + pa(5,1) +#undef dpaxpy_2 +#define dpaxpy_2(a_) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_2 +#define plaxpy_2 16 + +#undef p1_4_axpy_2c +#define p1_4_axpy_2c(a_) +#undef p1_2_axpy_2c +#define p1_2_axpy_2c(a_) \ + pld(a_,cx,4) \ + pld(a_,ax,0) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + pud(0,a_,ax) +#undef p1_axpy_2c +#define p1_axpy_2c(a_) \ + pl(a_,cx,4) \ + plq(a_,ax,0) \ + 
pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + puq(0,a_,ax) +#undef p2_axpy_2c +#define p2_axpy_2c(a_) \ + pl(a_,cx,4) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,5) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpy_2c +#define p4_axpy_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(7,0) \ + pa(0,2) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + puq(1,SS(a_,RS4),ax) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(7,1) \ + pa(1,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(7,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpy_2c +#define lpaxpy_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) +#undef dpaxpy_2c +#define dpaxpy_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + puq(1,SS(a_,RS4),ax) \ + pm(7,0) \ + pa(0,2) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + 
ps(CSHUF,1,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,1) \ + pa(1,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_2c +#define plaxpy_2c 16 + +#undef p1_4_axpy_1c +#define p1_4_axpy_1c(a_) +#undef p1_2_axpy_1c +#define p1_2_axpy_1c(a_) \ + pld(a_,cx,2) \ + pc(2,0) \ + pld(a_,ax,1) \ + ps(CSHUF,0,0) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pud(1,a_,ax) +#undef p1_axpy_1c +#define p1_axpy_1c(a_) \ + pl(a_,cx,2) \ + pc(2,0) \ + plq(a_,ax,1) \ + ps(CSHUF,0,0) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + puq(1,a_,ax) +#undef p2_axpy_1c +#define p2_axpy_1c(a_) \ + plq(SS(a_,RS4),ax,3) \ + ps(CSHUF,0,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pc(4,0) \ + puq(1,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(6,4) \ + pa(4,3) \ + pm(7,0) \ + pa(0,3) \ + pc(2,0) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpy_1c +#define lpaxpy_1c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + pc(2,0) +#undef dpaxpy_1c +#define dpaxpy_1c(a_) \ + plq(SS(a_,RS4),ax,3) \ + ps(CSHUF,0,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pc(4,0) \ + puq(1,a_,ax) \ + ps(CSHUF,0,0) \ + pm(6,4) \ + pa(4,3) \ + pm(7,0) \ + pa(0,3) \ + puq(3,SS(a_,RS4),ax) +#undef plaxpy_1c +#define plaxpy_1c 8 + +#undef p1_4_copy_1 +#define p1_4_copy_1(a_) \ + pls(a_,cx,2) \ + pus(2,a_,ax) +#undef p1_2_copy_1 +#define p1_2_copy_1(a_) \ + pld(a_,cx,2) \ + pud(2,a_,ax) +#undef p1_copy_1 +#define p1_copy_1(a_) \ + pl(a_,cx,2) \ + puq(2,a_,ax) +#undef p2_copy_1 +#define p2_copy_1(a_) \ + pl(SS(a_,RS4),cx,4) \ + puq(2,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + puq(4,SS(a_,RS4),ax) +#undef lpcopy_1 +#define lpcopy_1(a_) \ + 
f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) +#undef dpcopy_1 +#define dpcopy_1(a_) \ + pl(SS(a_,RS4),cx,4) \ + puq(2,a_,ax) \ + puq(4,SS(a_,RS4),ax) +#undef plcopy_1 +#define plcopy_1 8 + +#undef p1_4_copy_2 +#define p1_4_copy_2(a_) \ + pls(a_,ax,2) \ + pus(2,a_,cx) +#undef p1_2_copy_2 +#define p1_2_copy_2(a_) \ + pld(a_,ax,2) \ + pud(2,a_,cx) +#undef p1_copy_2 +#define p1_copy_2(a_) \ + plq(a_,ax,2) \ + pu(2,a_,cx) +#undef p2_copy_2 +#define p2_copy_2(a_) \ + plq(SS(a_,RS4),ax,4) \ + pu(2,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pu(4,SS(a_,RS4),cx) +#undef lpcopy_2 +#define lpcopy_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,2) +#undef dpcopy_2 +#define dpcopy_2(a_) \ + plq(SS(a_,RS4),ax,4) \ + pu(2,a_,cx) \ + pu(4,SS(a_,RS4),cx) +#undef plcopy_2 +#define plcopy_2 8 + +#undef p1_4_copy_3 +#define p1_4_copy_3(a_) \ + pls(a_,cx,2) \ + pus(2,a_,ax) +#undef p1_2_copy_3 +#define p1_2_copy_3(a_) \ + pld(a_,cx,2) \ + pud(2,a_,ax) +#undef p1_copy_3 +#define p1_copy_3(a_) \ + pl(a_,cx,2) \ + punt(2,a_,ax) +#undef p2_copy_3 +#define p2_copy_3(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) +#undef p4_copy_3 +#define p4_copy_3(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) +#undef p8_copy_3 +#define p8_copy_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pl(SS(a_,MM(6,RS4)),cx,6) \ + pl(SS(a_,MM(7,RS4)),cx,7) 
\ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) \ + punt(4,SS(a_,MM(4,RS4)),ax) \ + punt(5,SS(a_,MM(5,RS4)),ax) \ + punt(6,SS(a_,MM(6,RS4)),ax) \ + punt(7,SS(a_,MM(7,RS4)),ax) +#undef lpcopy_3 +#define lpcopy_3(a_) +#undef dpcopy_3 +#define dpcopy_3(a_) p8_copy_3(a_) +#undef plcopy_3 +#define plcopy_3 32 + +#undef p1_4_cpsc_3 +#define p1_4_cpsc_3(a_) \ + pls(a_,ax,0) \ + pmsr(6,0) \ + pus(0,a_,cx) +#undef p1_2_cpsc_3 +#define p1_2_cpsc_3(a_) \ + pld(a_,ax,0) \ + pm(6,0) \ + pud(0,a_,cx) +#undef p1_cpsc_3 +#define p1_cpsc_3(a_) \ + plq(a_,ax,0) \ + pm(6,0) \ + pu(0,a_,cx) +#undef p2_cpsc_3 +#define p2_cpsc_3(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pm(6,0) \ + pm(6,1) \ + pu(0,a_,cx) \ + pu(1,SS(a_,RS4),cx) +#undef p4_cpsc_3 +#define p4_cpsc_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + pu(0,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,3) \ + pu(1,SS(a_,RS4),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(6,0) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,1) \ + pu(3,SS(a_,MM(3,RS4)),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) +#undef lpcpsc_3 +#define lpcpsc_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(6,1) +#undef dpcpsc_3 +#define dpcpsc_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + pu(0,a_,cx) \ + pm(6,3) \ + pu(1,SS(a_,RS4),cx) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pu(3,SS(a_,MM(3,RS4)),cx) +#undef plcpsc_3 +#define plcpsc_3 16 + +#undef p1_4_cpsc_3c +#define p1_4_cpsc_3c(a_) +#undef p1_2_cpsc_3c +#define p1_2_cpsc_3c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,cx) +#undef p1_cpsc_3c +#define p1_cpsc_3c(a_) \ + 
plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pu(0,a_,cx) +#undef p2_cpsc_3c +#define p2_cpsc_3c(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pc(0,2) \ + pm(6,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + pu(0,a_,cx) \ + pc(1,3) \ + pm(6,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + pu(1,SS(a_,RS4),cx) +#undef p4_cpsc_3c +#define p4_cpsc_3c(a_) \ + pu(0,a_,cx) \ + pc(2,4) \ + pm(6,2) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(7,4) \ + pa(4,2) \ + pu(1,SS(a_,RS4),cx) \ + pc(3,4) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(7,4) \ + pa(4,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(7,4) \ + pa(4,0) \ + pu(3,SS(a_,MM(3,RS4)),cx) \ + pc(1,4) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(7,RS4)),ax,3) \ + pm(7,4) \ + pa(4,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) +#undef lpcpsc_3c +#define lpcpsc_3c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(7,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pc(1,4) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(7,4) \ + pa(4,1) +#undef dpcpsc_3c +#define dpcpsc_3c(a_) \ + pu(0,a_,cx) \ + pc(2,4) \ + pm(6,2) \ + ps(CSHUF,4,4) \ + pu(1,SS(a_,RS4),cx) \ + pm(7,4) \ + pa(4,2) \ + pc(3,4) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pm(7,4) \ + pa(4,3) \ + pu(3,SS(a_,MM(3,RS4)),cx) +#undef plcpsc_3c +#define plcpsc_3c 16 + +#undef p1_4_cpsc_4 +#define p1_4_cpsc_4(a_) \ + pls(a_,cx,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_cpsc_4 +#define p1_2_cpsc_4(a_) \ + pld(a_,cx,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_cpsc_4 +#define p1_cpsc_4(a_) \ + pl(a_,cx,0) \ + pm(6,0) \ + 
puq(0,a_,ax) +#undef p2_cpsc_4 +#define p2_cpsc_4(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_4 +#define p4_cpsc_4(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,0) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + pl(SS(a_,MM(5,RS4)),cx,1) \ + pm(6,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,2) \ + pm(6,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpcpsc_4 +#define lpcpsc_4(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pm(6,0) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(6,1) +#undef dpcpsc_4 +#define dpcpsc_4(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plcpsc_4 +#define plcpsc_4 16 + +#undef p1_4_cpsc_5 +#define p1_4_cpsc_5(a_) \ + pls(a_,cx,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_cpsc_5 +#define p1_2_cpsc_5(a_) \ + pld(a_,cx,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_cpsc_5 +#define p1_cpsc_5(a_) \ + pl(a_,cx,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_cpsc_5 +#define p2_cpsc_5(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_5 +#define p4_cpsc_5(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_cpsc_5 +#define p8_cpsc_5(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + 
pl(SS(a_,MM(6,RS4)),cx,7) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + pl(SS(a_,MM(7,RS4)),cx,0) \ + pm(6,4) \ + pm(6,5) \ + pm(6,7) \ + pm(6,0) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + puq(5,SS(a_,MM(5,RS4)),ax) \ + puq(7,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpcpsc_5 +#define lpcpsc_5(a_) +#undef dpcpsc_5 +#define dpcpsc_5(a_) p8_cpsc_5(a_) +#undef plcpsc_5 +#define plcpsc_5 32 + +#undef cpsc_cdp +#define cpsc_cdp(a_) pc(a_,5) pm(6,a_) ps(CSHUF,5,5) pm(7,5) pa(5,a_) +#undef p1_4_cpsc_5c +#define p1_4_cpsc_5c(a_) +#undef p1_2_cpsc_5c +#define p1_2_cpsc_5c(a_) \ + pld(a_,cx,0) \ + cpsc_cdp(0) \ + pud(0,a_,ax) +#undef p1_cpsc_5c +#define p1_cpsc_5c(a_) \ + pl(a_,cx,0) \ + cpsc_cdp(0) \ + puq(0,a_,ax) +#undef p2_cpsc_5c +#define p2_cpsc_5c(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_5c +#define p4_cpsc_5c(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + cpsc_cdp(2) \ + cpsc_cdp(3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_cpsc_5c +#define p8_cpsc_5c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + puq(0,a_,ax) \ + pl(SS(a_,MM(5,RS4)),cx,0) \ + cpsc_cdp(2) \ + cpsc_cdp(3) \ + puq(1,SS(a_,RS4),ax) \ + pl(SS(a_,MM(6,RS4)),cx,1) \ + cpsc_cdp(4) \ + cpsc_cdp(0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pl(SS(a_,MM(7,RS4)),cx,2) \ + cpsc_cdp(1) \ + cpsc_cdp(2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + puq(0,SS(a_,MM(5,RS4)),ax) \ + puq(1,SS(a_,MM(6,RS4)),ax) \ + 
puq(2,SS(a_,MM(7,RS4)),ax) +#undef lpcpsc_5c +#define lpcpsc_5c(a_) +#undef dpcpsc_5c +#define dpcpsc_5c(a_) p8_cpsc_5c(a_) +#undef plcpsc_5c +#define plcpsc_5c 32 + +#undef p1_4_cpsc_1 +#define p1_4_cpsc_1(a_) \ + pls(a_,ax,2) \ + pmsr(3,2) \ + pus(2,a_,cx) +#undef p1_2_cpsc_1 +#define p1_2_cpsc_1(a_) \ + pld(a_,ax,2) \ + pm(3,2) \ + pud(2,a_,cx) +#undef p1_cpsc_1 +#define p1_cpsc_1(a_) \ + plq(a_,ax,2) \ + pm(3,2) \ + pu(2,a_,cx) +#undef p2_cpsc_1 +#define p2_cpsc_1(a_) \ + plq(SS(a_,RS4),ax,4) \ + pm(3,2) \ + pu(2,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(3,4) \ + pu(4,SS(a_,RS4),cx) +#undef lpcpsc_1 +#define lpcpsc_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,2) +#undef dpcpsc_1 +#define dpcpsc_1(a_) \ + plq(SS(a_,RS4),ax,4) \ + pm(3,2) \ + pu(2,a_,cx) \ + pm(3,4) \ + pu(4,SS(a_,RS4),cx) +#undef plcpsc_1 +#define plcpsc_1 8 + +#undef p1_4_cpsc_2 +#define p1_4_cpsc_2(a_) \ + pls(a_,ax,2) \ + pmsr(3,2) \ + pus(2,a_,cx) +#undef p1_2_cpsc_2 +#define p1_2_cpsc_2(a_) \ + pld(a_,ax,2) \ + pm(3,2) \ + pud(2,a_,cx) +#undef p1_cpsc_2 +#define p1_cpsc_2(a_) \ + plq(a_,ax,2) \ + pm(3,2) \ + pu(2,a_,cx) +#undef p2_cpsc_2 +#define p2_cpsc_2(a_) \ + plq(a_,ax,2) \ + plq(SS(a_,RS4),ax,4) \ + pm(3,2) \ + pm(3,4) \ + pu(2,a_,cx) \ + pu(4,SS(a_,RS4),cx) +#undef p4_cpsc_2 +#define p4_cpsc_2(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,7) \ + pm(3,6) \ + pu(4,a_,cx) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(3,7) \ + pu(6,SS(a_,RS4),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,4) \ + pm(3,2) \ + pu(7,SS(a_,MM(2,RS4)),cx) \ + plq(SS(a_,MM(5,RS4)),ax,6) \ + pm(3,4) \ + pu(2,SS(a_,MM(3,RS4)),cx) +#undef lpcpsc_2 +#define lpcpsc_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + 
f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,4) \ + plq(SS(a_,MM(1,RS4)),ax,6) \ + pm(3,4) +#undef dpcpsc_2 +#define dpcpsc_2(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,7) \ + pm(3,6) \ + pu(4,a_,cx) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(3,7) \ + pu(6,SS(a_,RS4),cx) \ + pm(3,2) \ + pu(7,SS(a_,MM(2,RS4)),cx) \ + pu(2,SS(a_,MM(3,RS4)),cx) +#undef plcpsc_2 +#define plcpsc_2 RS4 /* NOTE(review): the sibling pl* constants in this table are literal 8, 16 or 32; this one is the RS4 macro - confirm that is intended */ + + +#undef p1_4_iamax_1 +#define p1_4_iamax_1(a_) \ + px(4) \ + pls(a_,ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + paxs(4,3) \ + pan(5,6) \ + pann(0,5) \ + pasr(5,6) \ + pasr(1,0) \ + ps(57,0,0) +#undef p1_2_iamax_1 +#define p1_2_iamax_1(a_) \ + px(4) \ + pld(a_,ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pasr(1,0) \ + ps(57,0,0)\ + pasr(1,0) \ + ps(57,0,0) +#undef p1_iamax_1 +#define p1_iamax_1(a_) \ + plq(a_,ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) +#undef p2_iamax_1 /* added so this macro follows the same #undef-then-#define pattern as every other entry in the table */ +#define p2_iamax_1(a_) \ + plq(SS(a_,RS4),ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) \ + f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) +#undef lpiamax_1 +#define lpiamax_1(a_) \ + f(nta,SS(a_,MM(CL,RS4)),ax) \ + plq(a_,ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) +#undef dpiamax_1 +#define dpiamax_1(a_) \ + plq(SS(a_,RS4),ax,4) \ + pan(2,4) \ + pc(3,5) \ + pcm(6,4,5) \ + pax(4,3) \ + pan(5,6) \ + pann(0,5) \ + pa(5,6) \ + pa(1,0) +#undef pliamax_1 +#define pliamax_1 8 + +#undef p1_4_iamax_1d +#define p1_4_iamax_1d(a_) /* expands to nothing */ +#undef p1_2_iamax_1d +#define p1_2_iamax_1d(a_) \ + px(4) \ + pld(a_,ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ +
pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pasr(1,0) \ + dbg(0) \ + ps(1,0,0) +#undef p1_iamax_1d /* same operation sequence as p1_iamax_1, with a dbg(n) call inserted after each step */ +#define p1_iamax_1d(a_) \ + plq(a_,ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) +#undef p2_iamax_1d /* added so this macro follows the same #undef-then-#define pattern as every other entry in the table */ +#define p2_iamax_1d(a_) \ + plq(SS(a_,RS4),ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) \ + dbg(0) \ + f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) +#undef lpiamax_1d +#define lpiamax_1d(a_) \ + f(nta,SS(a_,MM(CL,RS4)),ax) \ + plq(a_,ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) +#undef dpiamax_1d +#define dpiamax_1d(a_) \ + plq(SS(a_,RS4),ax,4) \ + dbg(2) \ + pan(2,4) \ + dbg(4) \ + pc(3,5) \ + dbg(5) \ + pcm(6,4,5) \ + dbg(5) \ + pax(4,3) \ + dbg(3) \ + pan(5,6) \ + dbg(6) \ + pann(0,5) \ + dbg(5) \ + pa(5,6) \ + dbg(6) \ + pa(1,0) +#undef pliamax_1d +#define pliamax_1d 8 + diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h new file mode 100644 index 0000000..03486cf --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h @@ -0,0 +1,331 @@ +/*************************************** + $Header: /cvsroot/math-atlas/AtlasBase/kernel/CammMaguire/camm_tpipe.h,v 1.2 2003/10/18 18:13:30 yycamm Exp $ + + +***************************************/ + + +/* #ifndef CAMM_TPIPE_H */ +/* #define
CAMM_TPIPE_H */ /*+ To stop multiple inclusions. +*/ /* This fragment requires BITS, DIV, INC(a_) and LR to be defined by the includer; each is checked below. */ + +#ifndef BITS +#error BITS must be defined in camm_tpipe.h +#endif +#ifndef DIV +#error DIV must be defined in camm_tpipe.h +#endif +#ifndef INC +#error INC(a_) must be defined in camm_tpipe.h +#endif +#ifndef LR +#error LR must be defined in camm_tpipe.h +#endif + +#ifdef ALIGN /* alignment peel: optional blocks emitted ahead of the main loop, selected by low bits of ax */ + +#if defined(SREAL) + + test(4,ax) /* skip to a2 when (ax & 4) is zero; otherwise run one KB=1 block, INC(4), and subtract 1 from LR */ + je(a2) + +#undef KB +#define KB ( 1 /* / DIV */ ) +#include "camm_pipe3.h" + + KB_block + INC(4) + sub(1,LR) + + lab(a2) + +#endif + +#if defined(SREAL) || defined(DREAL) + + test(8,ax) /* skip to a4 when (ax & 8) is zero or when (LR & -2) is zero; otherwise run one KB=2 block, INC(8), and subtract 2 from LR */ + je(a4) + test(-2,LR) + je(a4) + +#undef KB +#define KB ( 2 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(8) + sub(2,LR) + + lab(a4) + +#endif +#endif + +/* "movl %%edx,%%edi\n\t" */ + push(LR) /* NOTE(review): push, shr, shl, m and ra are defined outside this view; the sequence appears to save LR and then turn it into the end marker that the loop compares against ax (ax plus 4 times LR rounded down to a multiple of 1 << BITS) - confirm */ + shr(BITS,LR) + shl(BITS,LR) + m(4,LR) + ra(ax,LR) + +#if defined(ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) /* complex aligned build: branch to the loopa copy when (ax & 12) is zero */ + test(12,ax) + je(loopa) +#endif + +#if !defined(ALIGN) || defined(SCPLX) || defined(DCPLX) /* in this branch the q load/store variants are aliases of pl, pu, plx, pux */ +#undef plq +#define plq(a_,b_,c_) pl(a_,b_,c_) +#undef puq +#define puq(a_,b_,c_) pu(a_,b_,c_) +#undef plqx +#define plqx(a_,b_,c_,d_,e_) plx(a_,b_,c_,d_,e_) +#undef puqx +#define puqx(a_,b_,c_,d_,e_) pux(a_,b_,c_,d_,e_) +#else /* ALIGN defined and neither SCPLX nor DCPLX: the q variants are aliases of pla, punt, plax, puax */ +#undef plq +#define plq(a_,b_,c_) pla(a_,b_,c_) +#undef puq +#define puq(a_,b_,c_) punt(a_,b_,c_) +#undef plqx +#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_) +#undef puqx +#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_) +#endif + + align + lab(loop) /* main loop: one block of KB = 1 << BITS per pass, then INC(4*KB), repeated until the cmp(ax,LR) test is equal */ + cmp(ax,LR) + je(stop) + +#undef KB +#define KB ( (1 << BITS) /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(4*KB/**DIV*/) + + jmp(loop) + + lab(stop) + pop(LR) /* restores the value pushed before the loop; the tests below pick tail blocks from its individual bits */ + +#if ( 1 << BITS ) > 128 /* tail: when bit 128 of LR is set, run one KB=128 block and INC(512); the same pattern repeats for each smaller power of two */ + test(128,LR) + je(64) +#undef KB +#define KB ( 128 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(512) + + lab(64) +#endif + +#if ( 1 << BITS ) > 64 + test(64,LR) + je(32) +#undef KB +#define KB ( 64 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(256) + + lab(32) +#endif + +#if ( 1 << BITS ) > 32 + test(32,LR) + je(16)
+#undef KB /* KB=32 tail block, skipped by the preceding je(16) when bit 32 of LR is clear */ +#define KB ( 32 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(128) + + lab(16) +#endif + +#if ( 1 << BITS ) > 16 + test(16,LR) + je(8) +#undef KB +#define KB ( 16 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(64) + + lab(8) +#endif + +#if ( 1 << BITS ) > 8 + test(8,LR) + je(4) +#undef KB +#define KB ( 8 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(32) + + lab(4) +#endif + +#if ( 1 << BITS ) > 4 + test(4,LR) + je(2) +#undef KB +#define KB ( 4 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(16) + + lab(2) +#endif + +#if DIV != 4 && ( 1 << BITS ) > 2 /* the KB=2 tail block is compiled out when DIV is 4 */ + test(2,LR) + je(1) +#undef KB +#define KB ( 2 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(8) + + lab(1) +#endif + +#if DIV == 1 && ( 1 << BITS ) > 1 /* the KB=1 tail block is compiled only when DIV is 1; it is not followed by an INC */ + test(1,LR) + je(end) +#undef KB +#define KB ( 1 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + lab(end) +#endif + +#if defined (ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) /* second copy of the loop and tail, entered at loopa; here the q variants are aliases of pla, punt, plax, puax */ + + jmp(tend) /* the first copy jumps over the second one */ + +#undef plq +#define plq(a_,b_,c_) pla(a_,b_,c_) +#undef puq +#define puq(a_,b_,c_) punt(a_,b_,c_) +#undef plqx +#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_) +#undef puqx +#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_) + + align + lab(loopa) /* same structure as the loop label above: one KB = 1 << BITS block per pass until the cmp(ax,LR) test is equal */ + cmp(ax,LR) + je(stopa) + +#undef KB +#define KB ( (1 << BITS) /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(4*KB/**DIV*/) + + jmp(loopa) + + lab(stopa) + pop(LR) /* restores the pushed LR; the tests below pick tail blocks from its individual bits */ + +#if ( 1 << BITS ) > 128 + test(128,LR) + je(64a) +#undef KB +#define KB ( 128 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(512) + + lab(64a) +#endif + +#if ( 1 << BITS ) > 64 + test(64,LR) + je(32a) +#undef KB +#define KB ( 64 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(256) + + lab(32a) +#endif + +#if ( 1 << BITS ) > 32 + test(32,LR) + je(16a) +#undef KB +#define KB ( 32 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block + INC(128) + + lab(16a) +#endif + +#if ( 1 << BITS ) > 16 + test(16,LR) + je(8a) +#undef KB +#define KB ( 16 /* / DIV */ ) +#include "camm_pipe3.h" + KB_block
+ INC(64)
+
+ lab(8a)
+#endif
+/* Remaining "a"-suffixed sections share one shape: testl one bit of LR, je past the section when it is clear, otherwise expand KB_block once at that KB and apply INC(). KB_block and INC are defined outside this view. */
+#if ( 1 << BITS ) > 8
+ test(8,LR)
+ je(4a)
+#undef KB
+#define KB ( 8 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(32)
+
+ lab(4a)
+#endif
+
+#if ( 1 << BITS ) > 4
+ test(4,LR)
+ je(2a)
+#undef KB
+#define KB ( 4 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(16)
+
+ lab(2a)
+#endif
+/* The KB=2 section is compiled out when DIV is 4. */
+#if DIV != 4 && ( 1 << BITS ) > 2
+ test(2,LR)
+ je(1a)
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ INC(8)
+
+ lab(1a)
+#endif
+/* The KB=1 section exists only when DIV is 1; it has no INC() after KB_block. */
+#if DIV == 1 && ( 1 << BITS ) > 1
+ test(1,LR)
+ je(enda)
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+ KB_block
+ lab(enda)
+#endif
+
+ lab(tend)
+
+#endif
+
+/* #endif */ /* CAMM_TPIPE_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h
new file mode 100644
index 0000000..6b150d3
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h
@@ -0,0 +1,508 @@
+#ifndef CAMM_UTIL_H
+#define CAMM_UTIL_H /*+ To stop multiple inclusions. +*/
+/* Two-field structs used as TYPE for the SCPLX (float) and DCPLX (double) cases further down; r and i are presumably the real and imaginary parts. */
+typedef struct {
+ float r,i;
+} Complex;
+
+typedef struct {
+ double r,i;
+} Dcomplex;
+/* str() macro-expands its argument and then stringifies it (through xstr); val() only forces one extra round of expansion. */
+#undef str
+#define str(a_) xstr(a_)
+#undef xstr
+#define xstr(a_) #a_
+
+#undef val
+#define val(a_) xval(a_)
+#undef xval
+#define xval(a_) a_
+/* Mjoin(a,b) pastes a and b into one token after expanding them; it is only defined here when no earlier definition exists. */
+#ifndef Mjoin
+#define Mjoin(a,b) mjoin(a,b)
+#ifdef mjoin
+ #undef mjoin
+#endif
+#define mjoin(a,b) a ## b
+#endif
+/* ASM expands to __asm__ __volatile__. */
+#undef VOLATILE
+#define VOLATILE __volatile__
+#undef ASM
+#define ASM __asm__ VOLATILE
+/* BL: name fragment picked by the BETA* macro that is defined (b0, b1, bX, bXi0). The checks run in sequence, so if several are defined the last one listed wins. */
+#ifdef BETA0
+#undef BL
+#define BL b0
+#endif
+#ifdef BETA1
+#undef BL
+#define BL b1
+#endif
+#ifdef BETAX
+#undef BL
+#define BL bX
+#endif
+#ifdef BETAXI0
+#undef BL
+#define BL bXi0
+#endif
+/* FEXT: name fragment picked from NO_TRANSPOSE / GER / Conj_. With NO_TRANSPOSE: Gc or Gu under GER, else Nc or N. Without NO_TRANSPOSE: C or T. */
+#ifdef NO_TRANSPOSE
+#ifdef GER
+#ifdef Conj_
+#undef FEXT
+#define FEXT Gc
+#else
+#undef FEXT
+#define FEXT Gu
+#endif
+#else
+#ifdef Conj_
+#undef FEXT
+#define FEXT Nc
+#else
+#undef FEXT
+#define FEXT N
+#endif
+#endif
+#else
+#ifdef Conj_
+#undef FEXT
+#define FEXT C
+#else
+#undef FEXT
+#define FEXT T
+#endif
+#endif
+/* BLC: FEXT and BL pasted into a single token. */
+#undef BLC
+#define BLC Mjoin(FEXT,BL)
+/* NO_INLINE expands to a definition of sq() under __GNUC__ and to nothing otherwise. NOTE(review): why a function definition is used is not visible here - presumably to keep the enclosing routine from being inlined; confirm at the use sites. */
+#ifdef __GNUC__
+#undef NO_INLINE
+#define NO_INLINE double sq(double x) {return x*x;}
+#else
+#undef NO_INLINE
+#define NO_INLINE
+#endif
+/* Assembler text builders. Labels and jump targets are spelled <MY_FUNCTION>_<N>_<a_>; MY_FUNCTION and N must be supplied as macros by the including file. test/and/sub emit the "l"-suffixed instruction with immediate a_ on "%%e" b_. */
+#undef lab
+#define lab(a_) "\n" str(MY_FUNCTION) "_" str(N) "_" str(a_) ":\n\t"
+#undef jmp
+#define jmp(a_) "jmp " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef je
+#define je(a_) "je " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jge
+#define jge(a_) "jge " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jle
+#define jle(a_) "jle " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jl
+#define jl(a_) "jl " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jne
+#define jne(a_) "jne " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef align
+#define align ".align 16\n\t"
+#undef test
+#define test(a_,b_) "testl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef and
+#define and(a_,b_) "andl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef sub
+#define sub(a_,b_) "subl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef SS /* SS and MM expand to an unparenthesized sum / product of their arguments; E4 shifts its argument right by 2 and back left by 2. */
+#define SS(a_,b_) a_ + b_
+#undef MM
+#define MM(a_,b_) a_ * b_
+#undef E4
+#define E4(a_) (( a_ >> 2 ) << 2 )
+/* Clear every per-precision macro, then define the set that matches whichever of SCPLX / SREAL / DREAL / DCPLX is defined. */
+#undef TYPE
+#undef SCALAR
+#undef PREC
+#undef CSHUF
+#undef LSHUF
+#undef HSHUF
+#undef ISHUF
+#undef RSHUF
+#undef SINGLE
+#undef REAL
+#undef DIV
+/* SCPLX: TYPE is Complex, SCALAR is a pointer to Complex, PREC c, DIV 2; also defines a file-static two-entry signd table. */
+#ifdef SCPLX
+#define TYPE Complex
+#define SCALAR Complex *
+#define PREC c
+#define CSHUF 177
+#define LSHUF 160
+#define HSHUF 245
+#define ISHUF 13*17
+#define RSHUF 8*17
+#define SINGLE
+#define DIV 2
+/* #ifdef Conj_ */
+/* static const TYPE signd[2]={{-1.0,1.0},{-1.0,1.0}}; */
+/* #else */
+ static const TYPE signd[2]={{1.0,-1.0},{1.0,-1.0}};
+/* #endif */
+#endif
+/* SREAL: TYPE and SCALAR are float, PREC s, DIV 1. */
+#ifdef SREAL
+#define TYPE float
+#define SCALAR float
+#define PREC s
+#define SINGLE
+#define REAL
+#define DIV 1
+#endif
+/* DREAL: TYPE and SCALAR are double, PREC d, DIV 2. SINGLE is not defined. */
+#ifdef DREAL
+#define TYPE double
+#define SCALAR double
+#define PREC d
+#define REAL
+#define DIV 2
+#endif
+/* DCPLX: TYPE is Dcomplex, SCALAR is a pointer to Dcomplex, PREC z, DIV 4; also defines a file-static one-entry signd table. */
+#ifdef DCPLX
+#define TYPE Dcomplex
+#define SCALAR Dcomplex *
+#define PREC z
+#define CSHUF 1
+#define LSHUF 0
+#define HSHUF 3
+#define ISHUF 3
+#define RSHUF 0
+#define DIV 4
+/* #ifdef Conj_ */
+/* static const TYPE signd[1]={{-1.0,1.0}}; */
+/* #else */
+ static const TYPE signd[1]={{1.0,-1.0}};
+/* #endif */
+#endif
+/* Tables read through M(a_,b_), which names M<b_><a_>. For every pair listed, the value equals a_ - b_. */
+#undef M11
+#define M11 0
+#undef M12
+#define M12 1
+#undef M13
+#define M13 2
+#undef M14
+#define M14 3
+#undef M15
+#define M15 4
+#undef M16
+#define M16 5
+#undef M17
+#define M17 6
+#undef M18
+#define M18 7
+
+#undef M23
+#define M23 1
+#undef M24
+#define M24 2
+#undef M25
+#define M25 3
+#undef M26
+#define M26 4
+#undef M27
+#define M27 5
+#undef M28
+#define M28 6
+
+#undef M33
+#define M33 0
+#undef M34
+#define M34 1
+#undef M35
+#define M35 2
+#undef M36
+#define M36 3
+#undef M37
+#define M37 4
+#undef M38
+#define M38 5
+/* Table read through P(a_,b_), which names P<b_><a_>. Only b_ = 1 is listed, and each value equals a_ + 1. */
+#undef P10
+#define P10 1
+#undef P11
+#define P11 2
+#undef P12
+#define P12 3
+#undef P13
+#define P13 4
+#undef P14
+#define P14 5
+#undef P15
+#define P15 6
+#undef P16
+#define P16 7
+/* M() and P() expand their arguments first, then paste them onto the M / P prefix (second argument before the first). */
+#undef XM
+#define XM(a_,b_) M ## b_ ## a_
+#undef M
+#define M(a_,b_) XM(a_,b_)
+
+#undef XP
+#define XP(a_,b_) P ## b_ ## a_
+#undef P
+#define P(a_,b_) XP(a_,b_)
+/* Register-name strings: mex(a_) gives "%%e<a_>", msx(a_) gives "%%st(<a_>)". */
+#undef mex
+#define mex(a_) str(%%e ## a_)
+#undef msx
+#define msx(a_) "%%st(" str(a_) ")"
+/* Integer, stack and prefetch instruction strings. f and pfx stringify their register arguments with a bare #, so those arguments are not macro-expanded; the other macros go through str()/mex(). */
+#undef cmp
+#define cmp(a_,b_) "cmp " mex(a_) "," mex(b_) "\n\t"
+#undef icmpr
+#define icmpr(a_,b_) "cmp " mex(a_) ",(" mex(b_) ")\n\t"
+#undef f
+#define f(a_,b_,c_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ")\n\t"
+#undef pfx
+#define pfx(a_,b_,c_,d_,e_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ",%%e" #d_ "," str(e_) ")\n\t"
+#undef a
+#define a(a_,b_) "addl $" str(a_) "," mex(b_) "\n\t"
+#undef m
+#define m(a_,b_) "imul $" str(a_) "," mex(b_) "\n\t"
+#undef pop
+#define pop(a_) "popl %%e" str(a_) "\n\t"
+#undef push
+#define push(a_) "pushl %%e" str(a_) "\n\t"
+#undef d
+#define d(a_,b_) "idiv $" str(a_) "," mex(b_) "\n\t"
+#undef shl
+#define shl(a_,b_) "shl $" str(a_) "," mex(b_) "\n\t"
+#undef shr
+#define shr(a_,b_) "shr $" str(a_) "," mex(b_) "\n\t"
+#undef mm
+#define mm(a_,b_) "mov $" str(a_) "," mex(b_) "\n\t"
+#undef ra
+#define ra(a_,b_) "addl %%e" str(a_) "," mex(b_) "\n\t"
+#undef rs
+#define rs(a_,b_) "subl %%e" str(a_) "," mex(b_) "\n\t"
+/* x87 instruction strings. The memory forms (fl, fp, faa, fma) take displacement a_ and base register b_; the others address %%st registers. */
+#undef fl
+#define fl(a_,b_) "fldl " str(a_) "(" mex(b_) ")\n\t"
+#undef fp
+#define fp(a_,b_) "fstpl " str(a_) "(" mex(b_) ")\n\t"
+#undef fd
+#define fd(a_) "fld " msx(a_) "\n\t"
+#undef fap
+#define fap(a_,b_) "faddp " msx(a_) "," msx(b_) "\n\t"
+/* #define fsp(a_) fx(a_) "fsubp %%st," msx(a_) "\n\t" */
+#undef fsp
+#define fsp(a_) "fsubrp %%st," msx(a_) "\n\t"
+#undef fmp
+#define fmp(a_,b_) "fmulp " msx(a_) "," msx(b_) "\n\t"
+#undef fa
+#define fa(a_,b_) "fadd " msx(a_) "," msx(b_) "\n\t"
+#undef fm
+#define fm(a_,b_) "fmul " msx(a_) "," msx(b_) "\n\t"
+#undef faa
+#define faa(a_,b_) "faddl " str(a_) "(" mex(b_) ")\n\t"
+#undef fma
+#define fma(a_,b_) "fmull " str(a_) "(" mex(b_) ")\n\t"
+#undef fz
+#define fz "fldz\n\t"
+#undef fx
+#define fx(a_) "fxch " msx(a_) "\n\t"
+#undef fx1 /* fx1: fxch with no operand; fc: fstp into an %%st register. */
+#define fx1 "fxch\n\t"
+#undef fc
+#define fc(a_) "fstp " msx(a_) "\n\t"
+/* SIMD instruction strings. Without ATHLON: SSE mnemonics on %%xmm registers with SSESUF appended to the mnemonic stem. With ATHLON (the #else branch): MMX / 3DNow! mnemonics on %%mm registers. */
+
+#ifndef ATHLON
+/* SSESUF is "d " for DREAL/DCPLX and "s " otherwise. RS4 and RS receive the same values (16 and 4) in both branches. */
+
+#if defined(DREAL) || defined(DCPLX)
+#undef SSESUF
+#define SSESUF "d "
+#undef RS4
+#define RS4 16
+#undef RS
+#define RS 4
+#else
+#undef SSESUF
+#define SSESUF "s "
+#undef RS4
+#define RS4 16
+#undef RS
+#define RS 4
+#endif
+/* mxx(a_) gives "%%xmm<a_>". Stems ending in p take the packed suffix, stems ending in s the scalar one. Two-argument macros are register,register; three-argument macros take either (displacement, base, xmm) for loads or (xmm, displacement, base) for stores. */
+#undef mxx
+#define mxx(a_) str(%%xmm ## a_)
+#undef prp
+#define prp(a_,b_) "rcpp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef prps
+#define prps(a_,b_) "rcps" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pann
+#define pann(a_,b_) "andnp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef psqs
+#define psqs(a_,b_) "sqrts" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef por
+#define por(a_,b_) "orp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pan
+#define pan(a_,b_) "andp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pcm
+#define pcm(a_,b_,c_) "cmpp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef pcms
+#define pcms(a_,b_,c_) "cmps" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef pax
+#define pax(a_,b_) "maxp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef paxs
+#define paxs(a_,b_) "maxs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pd
+#define pd(a_,b_) "divp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pdsr
+#define pdsr(a_,b_) "divs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pxx
+#define pxx(a_,b_) "xorp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef px /* px xors a register with itself. */
+#define px(a_) "xorp" SSESUF mxx(a_) "," mxx(a_) "\n\t"
+#undef pm
+#define pm(a_,b_) "mulp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pa
+#define pa(a_,b_) "addp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pmm
+#define pmm(a_,b_,c_) "mulp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pam
+#define pam(a_,b_,c_) "addp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pl /* pl / pu use movup (unaligned); pla / pua use movap (aligned); punt uses movntp. */
+#define pl(a_,b_,c_) "movup" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pla
+#define pla(a_,b_,c_) "movap" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pu
+#define pu(a_,b_,c_) "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef punt
+#define punt(a_,b_,c_) "movntp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pua
+#define pua(a_,b_,c_) "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pud
+#define pud(a_,b_,c_) "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pudr
+#define pudr(a_,b_) "movlp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pc
+#define pc(a_,b_) "movap" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef ps
+#define ps(a_,b_,c_) "shufp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef phl
+#define phl(a_,b_) "movhlp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pus
+#define pus(a_,b_,c_) "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pls
+#define pls(a_,b_,c_) "movs" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pld
+#define pld(a_,b_,c_) "movlp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef plh
+#define plh(a_,b_) "movlhp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pas
+#define pas(a_,b_,c_) "adds" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pms
+#define pms(a_,b_,c_) "muls" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pcs
+#define pcs(a_,b_) "movs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pasr
+#define pasr(a_,b_) "adds" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pmsr
+#define pmsr(a_,b_) "muls" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pul
+#define pul(a_,b_) "unpcklp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef puh
+#define puh(a_,b_) "unpckhp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+/* Indexed-addressing variants (names ending in x). Loads take (displacement, base, index, scale, xmm); stores take (xmm, displacement, base, index, scale). The scale is stringified with a bare #, so it is not macro-expanded. */
+#undef plsx
+#define plsx(a_,b_,c_,d_,e_) \
+ "movs" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plx
+#define plx(a_,b_,c_,d_,e_) \
+ "movup" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plax
+#define plax(a_,b_,c_,d_,e_) \
+ "movap" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pasx
+#define pasx(a_,b_,c_,d_,e_) \
+ "adds" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pusx
+#define pusx(a_,b_,c_,d_,e_) \
+ "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pux
+#define pux(a_,b_,c_,d_,e_) \
+ "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef puax
+#define puax(a_,b_,c_,d_,e_) \
+ "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pudx
+#define pudx(a_,b_,c_,d_,e_) \
+ "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+
+#undef pldx
+#define pldx(a_,b_,c_,d_,e_) \
+ "movlp" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+
+#else
+/* ATHLON branch: RS4 = 8, RS = 2, registers are %%mm. Only a subset of the macro names from the other branch is defined here, and SSESUF is not used. */
+#undef RS4
+#define RS4 8
+#undef RS
+#define RS 2
+
+#undef mxx
+#define mxx(a_) str(%%mm ## a_)
+#undef pul
+#define pul(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t"
+#undef puh
+#define puh(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t"
+
+#undef px
+#define px(a_) "pxor " mxx(a_) "," mxx(a_) "\n\t"
+#undef pm
+#define pm(a_,b_) "pfmul " mxx(a_) "," mxx(b_) "\n\t"
+#undef pa
+#define pa(a_,b_) "pfadd " mxx(a_) "," mxx(b_) "\n\t"
+#undef pac
+#define pac(a_,b_) "pfacc " mxx(a_) "," mxx(b_) "\n\t"
+#undef pmm
+#define pmm(a_,b_,c_) "pfmul " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pam
+#define pam(a_,b_,c_) "pfadd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pl /* pl and pla expand to the same movq load in this branch. */
+#define pl(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pla
+#define pla(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pu
+#define pu(a_,b_,c_) "movq " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pc
+#define pc(a_,b_) "movq " mxx(a_) "," mxx(b_) "\n\t"
+#undef ps /* The immediate a_ is accepted but not used in this branch. */
+#define ps(a_,b_,c_) "pswapd " mxx(b_) "," mxx(c_) "\n\t"
+#undef phl /* phl / plh expand to the same instructions as puh / pul in this branch. */
+#define phl(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t"
+#undef plh
+#define plh(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t"
+#undef pus
+#define pus(a_,b_,c_) "movd " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pls
+#define pls(a_,b_,c_) "movd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+/* Indexed-addressing variants for the ATHLON branch; argument order matches the SSE branch. */
+#undef plsx
+#define plsx(a_,b_,c_,d_,e_) \
+ "movd " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plx
+#define plx(a_,b_,c_,d_,e_) \
+ "movq " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pasx /* NOTE(review): this emits "addss" with an %%mm destination, unlike every other macro in this branch - looks like a leftover from the SSE set; confirm before relying on it. */
+#define pasx(a_,b_,c_,d_,e_) \
+ "addss " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pusx
+#define pusx(a_,b_,c_,d_,e_) \
+ "movd " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pux
+#define pux(a_,b_,c_,d_,e_) \
+ "movq " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#endif
+
+#endif /* CAMM_UTIL_H */
-- cgit v1.2.3-70-g09d2