diff options
author | Ted Yin <[email protected]> | 2015-08-14 17:42:26 +0800 |
---|---|---|
committer | Ted Yin <[email protected]> | 2015-08-14 17:42:26 +0800 |
commit | c3cffb58b9921d78753336421b52b9ffdaa5515c (patch) | |
tree | bfea20e97c200cf734021e3756d749c892e658a4 /kaldi_io/src/tools/ATLAS/include/contrib | |
parent | 10cce5f6a5c9e2f8e00d5a2a4d87c9cb7c26bf4c (diff) | |
parent | dfdd17afc2e984ec6c32ea01290f5c76309a456a (diff) |
Merge pull request #2 from yimmon/master
remove needless files
Diffstat (limited to 'kaldi_io/src/tools/ATLAS/include/contrib')
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h | 188 | ||||
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/Make.ext | 39 | ||||
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h | 709 | ||||
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h | 1626 | ||||
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h | 295 | ||||
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h | 215 | ||||
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h | 2982 | ||||
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h | 331 | ||||
-rw-r--r-- | kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h | 508 |
9 files changed, 0 insertions, 6893 deletions
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h deleted file mode 100644 index 118d3de..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h +++ /dev/null @@ -1,188 +0,0 @@ -#ifdef GER -#undef NO_TRANSPOSE -#define NO_TRANSPOSE -#endif - - -#if NDPM > 4 -#error Max NDPM is 4 -#endif - -#if !defined(ATL_SSE1) && ( defined(SREAL) || defined(SCPLX) ) -#error This routine needs ATL_SSE1 defined -#endif - -#if !defined(ATL_SSE2) && ( defined(DREAL) || defined(DCPLX) ) -#error This routine needs ATL_SSE2 defined -#endif - -#include <stdio.h> -#include <stdlib.h> - -#include "camm_util.h" - -#ifndef GER -#if defined(BETAX) || defined(BETAXI0) -#include "camm_scale.h" -#endif -#endif - -#if NDPM >= 4 -#define EXT4 Mjoin(4dp,BLC) -#undef NDP -#define NDP 4 -#undef EXT -#define EXT EXT4 -#include "camm_dpa.h" -#endif - -#if NDPM >= 3 -#define EXT3 Mjoin(3dp,BLC) -#undef NDP -#define NDP 3 -#undef EXT -#define EXT EXT3 -#include "camm_dpa.h" -#endif - -#if NDPM >= 2 -#define EXT2 Mjoin(2dp,BLC) -#undef NDP -#define NDP 2 -#undef EXT -#define EXT EXT2 -#include "camm_dpa.h" -#endif - -#define EXT1 Mjoin(1dp,BLC) -#undef NDP -#define NDP 1 -#undef EXT -#define EXT EXT1 -#include "camm_dpa.h" - -#undef NDP -#define NDP NDPM -#undef EXT -#define EXT Mjoin(Mjoin(NDP,Mjoin(dp,BLC)),m) -#include "camm_dpa.h" - -#ifdef GER -#if defined(SCPLX) || defined(DCPLX) -#ifdef Conj_ -#define IM 1c -#else -#define IM 1u -#endif -#else -#define IM 1 -#endif - - -#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),Mjoin(ger,IM)),_a1_x1_yX) - -#undef MY_FUNCTION -#define MY_FUNCTION FN - -void -MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *c, - int cinc,const TYPE *b,int binc, - TYPE *a,int lda) { - -#else - - -#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),gemv),Mjoin(FEXT,Mjoin(_a1_x1_,Mjoin(BL,_y1)))) - -#undef MY_FUNCTION -#define MY_FUNCTION FN - -void -MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *a, - int lda,const TYPE *b,int binc, - const SCALAR beta,TYPE *c,int cinc) { - -#endif - - int i,mm,nn; - const TYPE *ae; -#ifdef NO_TRANSPOSE - int len=m,w=n; -#define zz b -#else - int len=n,w=m; -#define zz c -#endif - -#ifdef GER -#define zzinc binc -#else -#define zzinc 1 - - -#if defined(NO_TRANSPOSE) && defined(BETA0) - memset(c,0,m*sizeof(*c)); -#endif - -#if defined(BETAX) || defined(BETAXI0) -#if defined(SCPLX) || defined(DCPLX) - SCALE(beta,c,m); -#endif -#if defined(SREAL) || defined(DREAL) - SCALE(&beta,c,m); -#endif -#endif - -#endif - - ae=a+w*lda; - nn=STRIDE*lda; - - -#if NDPM == 1 - for (;a<ae;a+=lda,zz+=zzinc) - Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len); - -#else - - while (a+NDPM*nn<=ae) { - for (i=0;i<STRIDE;i++,a+=lda,zz+=zzinc) - Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len); - - a+=(NDPM-1)*nn; - zz+=(NDPM-1)*STRIDE*zzinc; - } - - for (i=0;a<ae && i<STRIDE;i++,a+=lda,zz+=zzinc) { - - mm=(ae-a)/nn; -#if STRIDE > 1 - if (((ae-a)/lda)%STRIDE) - mm++; -#endif - - if (mm == 1) - Mjoin(dp,EXT1)(a,nn,b,c,STRIDE*zzinc,len); - -#if ( NDPM == 2 && STRIDE > 1 ) || NDPM > 2 - else if (mm == 2) - Mjoin(dp,EXT2)(a,nn,b,c,STRIDE*zzinc,len); -#endif - -#if ( NDPM == 3 && STRIDE > 1 ) || NDPM > 3 - else if (mm == 3) - Mjoin(dp,EXT3)(a,nn,b,c,STRIDE*zzinc,len); -#endif - -#if ( NDPM == 4 && STRIDE > 1 ) || NDPM > 4 - else if (mm == 4) - Mjoin(dp,EXT4)(a,nn,b,c,STRIDE*zzinc,len); -#endif - - - } - -#endif - -} - diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext deleted file mode 100644 index f7f9a0a..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext +++ /dev/null @@ -1,39 +0,0 @@ - -topd = /home/whaley/atlas3.8/AtlasBase -incs = -def topd /home/whaley/atlas3.8/AtlasBase \ - -def incd /home/whaley/atlas3.8/AtlasBase/Clint \ - -def BASEdir /home/whaley/atlas3.8/AtlasBase/Antoine/ \ - -def basd /home/whaley/atlas3.8/AtlasBase/Clint -ext = extract -extF = $(ext) -langF -lnlen71 -Remtblank -llwarn2 -LAPACK1 $(incs) -extC = $(ext) -langC -lnlen79 -Remtblank -llwarn2 $(incs) -extM = $(ext) -langM -lnlen79 -llwarn2 $(incs) - -default: all -force_build: -basd = /home/whaley/atlas3.8/AtlasBase/Clint -basdRCW = /home/whaley/atlas3.8/AtlasBase/Clint -basdAPP = /home/whaley/atlas3.8/AtlasBase/Antoine -incf = /home/whaley/atlas3.8/AtlasBase/gen.inc - -files = ATL_gemv_ger_SSE.h SSE3Dnow.h camm_dpa.h camm_pipe3.h camm_scale.h \ - camm_strat1.h camm_tpipe.h camm_util.h - -all : $(files) - -camm_strat1.h : $(topd)/kernel/CammMaguire/camm_strat1.h - cp $(topd)/kernel/CammMaguire/camm_strat1.h . -camm_tpipe.h : $(topd)/kernel/CammMaguire/camm_tpipe.h - cp $(topd)/kernel/CammMaguire/camm_tpipe.h . -camm_pipe3.h : $(topd)/kernel/CammMaguire/camm_pipe3.h - cp $(topd)/kernel/CammMaguire/camm_pipe3.h . -ATL_gemv_ger_SSE.h : $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h - cp $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h . -camm_util.h : $(topd)/kernel/CammMaguire/camm_util.h - cp $(topd)/kernel/CammMaguire/camm_util.h . -camm_scale.h : $(topd)/kernel/CammMaguire/camm_scale.h - cp $(topd)/kernel/CammMaguire/camm_scale.h . -camm_dpa.h : $(topd)/kernel/CammMaguire/camm_dpa.h - cp $(topd)/kernel/CammMaguire/camm_dpa.h . -SSE3Dnow.h : $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h - cp $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h . diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h deleted file mode 100644 index a783749..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h +++ /dev/null @@ -1,709 +0,0 @@ -#if !defined(ATL_GAS_x8632) && !defined(ATL_GAS_x8664) - #error "This kernel requires gas x86 assembler!" -#endif -#ifndef Mstr /* Added by RCW to make multiline macros work */ - #define Mstr2(m) # m - #define Mstr(m) Mstr2(m) -#endif -/* The mening of the defined macros is as follows: - * VECLEN: The length of a singleprecision vector register - * vec_add: Add to single precision vectors. - * vec_mul: Multiply to single precision vectors. - * vec_mov: Moves data around - * vec_mov1: Load one element in a vector and zero all other entries! - * vec_splat: Load one element relpicated in all positions in the vector. - * vec_load_apart: Load elements from different memory positions into a register. - * vec_sum: Sums a register. - * vec_store_one: Stores lowest element in vector to memory, no zero-extend! - * Meaning of suffixes is as follows: - * mr means memory to register - * rr means register to register - * rm means register to memory - * a means that instruction needs aligned data - * 1 means that the instructions only operates on the lowest element of the - * vector. - * - * The _1 instructions work under one important assumption: That you never mix - * them with regular instructions, e.g. loading into a register with a normal - * mov, and then using add_rr_1 will not work under 3dnow! since it is in - * reality a normal add. However, if using a mov_1 first, the upper part of - * the register will be zeroed, and it will therefore work. The _1 system is - * more robust under SSE, but other architectures might be implemented the - * same way as 3dnow! - * - * RCW: I added the following functionality for SSE only (note that vw may - * be overwritten with intermediate results, but is not used as input, - * and that all input array may be overwritten wt intermediate results. - * VL : vector length -1): - * vec_red(vd, vw) : vd[0] = sum(vd[0:VL]) - * vec_red2(v1, v2, vw) : v1[0] = sum(v1[0:VL]); v1[1] = sum(v2[0:VL]) - * vec_red4(v0, v1, v2, v3 vw1, vw2) : - * v0[0] = sum(v0[0:VL]); v0[1] = sum(v1[0:VL]) - * if type = double: - * v2[0] = sum(v2[0:VL]); v2[1] = sum(v3[0:VL]) - * else - * v0[2] = sum(v2[0:VL]); v0[3] = sum(v3[0:VL]) - * vec_zero(vd) : vd[0:VL] = 0.0 - */ - - -/* Things to try: - * Non-temporal stores - * Sequences of instructions instead of movups - * - * - * - * - */ - - - -#define gen_vec_rr(op,reg1,reg2) \ - __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \ - : /* nothing */ \ - : /* nothing */) - - -#define w(p) p - -#define nop() __asm__ __volatile__ ("nop") - -#define rep() __asm__ __volatile__ ("rep") - -#define align() __asm__ __volatile__ (".align 16") - - -#ifdef x87double - -#define st0 %%st(0) -#define st1 %%st(1) -#define st2 %%st(2) -#define st3 %%st(3) -#define st4 %%st(4) -#define st5 %%st(5) -#define st6 %%st(6) -#define st7 %%st(7) - - -#define gen_stack_rt(op,reg) \ - __asm__ __volatile__ (#op " " #reg \ - : /* nothing */ \ - : /* nothing */) - -#define gen_stack_tr(op,reg) \ - __asm__ __volatile__ (#op " %%st(0)," #reg \ - : \ - : ) - - -#define gen_stack_rr(op,reg1,reg2) \ - __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \ - : /* nothing */ \ - : /* nothing */) - -#define gen_stack_t(op) \ - __asm__ __volatile__ (#op \ - : /* nothing */ \ - : /* nothing */) - - -#define gen_stack_tm(op,mem) \ - __asm__ __volatile__ (#op " %0" \ - : "=m" (((mem)[0])) \ - : ) - -#define gen_stack_mt(op,mem) \ - __asm__ __volatile__ (#op " %0" \ - : \ - : "m" (((mem)[0]))) - - -#define stack_mov_mt_push(mem) gen_stack_mt(fldl,mem) - -#define stack_add_tr_pop(reg) gen_stack_tr(faddp,reg) -#define stack_add_mt(mem) gen_stack_mt(faddl,mem) - -#define stack_mul_tr(reg) gen_stack_tr(fmul,reg) -#define stack_mul_tr_pop(reg) gen_stack_tr(fmulp,reg) -#define stack_mul_mt(mem) gen_stack_mt(fmul,mem) - -#define stack_mov_tm_pop(mem) gen_stack_tm(fstpl,mem) - -#define stack_zero_push() gen_stack_t(fldz) - -#endif /* x87double */ - -#ifdef SSE - -/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to - * load/store from misaligned adresses using movups at a cost of some cycles. Loading - * using mul/add must always be aligned. Alignment is 16 bytes. - * No muladd. - */ - - - -#define gen_vec_mr(op,mem,reg) \ - __asm__ __volatile__ (#op " %0, " #reg \ - : /* nothing */ \ - : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3]))) - - -#define gen_vec_rm(op,reg,mem) \ - __asm__ __volatile__ (#op " " #reg ", %0" \ - : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \ - : /* nothing */ ) - - - - -#define VECLEN 4 - -#define reg0 %%xmm0 -#define reg1 %%xmm1 -#define reg2 %%xmm2 -#define reg3 %%xmm3 -#define reg4 %%xmm4 -#define reg5 %%xmm5 -#define reg6 %%xmm6 -#define reg7 %%xmm7 -#ifdef ATL_GAS_x8664 - #define reg8 %%xmm8 - #define reg9 %%xmm9 - #define reg10 %%xmm10 - #define reg11 %%xmm11 - #define reg12 %%xmm12 - #define reg13 %%xmm13 - #define reg14 %%xmm14 - #define reg15 %%xmm15 -#endif - -#define vec_mov_mr(mem,reg) gen_vec_mr(movups,mem,reg) -#define vec_mov_rm(reg,mem) gen_vec_rm(movups,reg,mem) -#define vec_mov_mr_a(mem,reg) gen_vec_mr(movaps,mem,reg) -#define vec_mov_rm_a(reg,mem) gen_vec_rm(movaps,reg,mem) -#define vec_mov_rr(reg1,reg2) gen_vec_rr(movaps,reg1,reg2) - -#define vec_add_mr_a(mem,reg) gen_vec_mr(addps,mem,reg) -#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulps,mem,reg) - -#define vec_add_rr(mem,reg) gen_vec_rr(addps,mem,reg) -#define vec_mul_rr(mem,reg) gen_vec_rr(mulps,mem,reg) - -#define vec_mov_mr_1(mem,reg) gen_vec_mr(movss,mem,reg) -#define vec_mov_rm_1(reg,mem) gen_vec_rm(movss,reg,mem) -#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movss,reg1,reg2) - -#define vec_add_mr_1(mem,reg) gen_vec_mr(addss,mem,reg) -#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addss,reg1,reg2) - -#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulss,mem,reg) -#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulss,reg1,reg2) - -#define vec_unpack_low(reg1,reg2) gen_vec_rr(unpcklps,reg1,reg2) -#define vec_unpack_high(reg1,reg2) gen_vec_rr(unpckhps,reg1,reg2) -#define vec_shuffle(mode,reg1,reg2) vec_shuffle_wrap(mode,reg1,reg2) -#define vec_shuffle_wrap(mode,reg1,reg2) \ - __asm__ __volatile__ ("shufps " #mode ", " #reg1 ", " #reg2 \ - : /* nothing */\ - : /* nothing */) - -/* Hack! */ -/* To use this instruction be sure that register 7 is not in use!!! */ -/* It must be possible to reduce this sequence to only four instructions. - * please tell me how! */ -#define vec_sum(reg) vec_sum_wrap(reg) -#define vec_sum_wrap(reg) \ - __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\ - "addps " #reg ", %%xmm7\n"\ - "movaps %%xmm7, " #reg "\n"\ - "shufps $1, " #reg ", %%xmm7\n"\ - "addss %%xmm7, " #reg "\n"\ - : /* nothing */\ - : /* nothing */) - -/* RCW: added to safely replace vec_sum (vec reduce), and use SSE3 when avail */ -#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::) -#ifdef ATL_SSE3 - #define vec_red(vr, vwrk) \ - __asm__ __volatile__("haddps " Mstr(vr) ", " Mstr(vr) "\n"\ - "haddps " Mstr(vr) ", " Mstr(vr) "\n" ::) -/* - * haddps v1 v0 # v0 = {v1cd, v1ab, v0cd, v0ab} - * haddps v0 v0 # v0 = {v1abcd, v0abcd, v1abcd, v0abcd} - */ - #define vec_red2(v0, v1, vwork) \ - __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\ - "haddps " Mstr(v0) ", " Mstr(v0) "\n" ::) -/* - * haddps v1, v0 # v0 = {v1cd,v1ab,v0cd,v0ab} - * haddps v3, v2 # v2 = {v3cd,v3ab,v2cd,v2ab} - * haddps v2, v0 # v0 = {v3abcd,v2abcd,v1abcd, v0abcd} - */ - #define vec_red4(v0, v1, v2, v3, w0, w1) \ - __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\ - "haddps " Mstr(v3) ", " Mstr(v2) "\n"\ - "haddps " Mstr(v2) ", " Mstr(v0) "\n" ::) -#elif defined(ATL_SSE2) - #define vec_red(vr, vwrk) \ - __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\ - "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\ - "pshufd $0xE5, " Mstr(vr) ", " Mstr(vwrk) "\n"\ - "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\ - ::) -#else - #define vec_red(vr, vwrk) \ - __asm__ __volatile__ ("movhlps " Mstr(vr) ", " Mstr(vwrk) "\n"\ - "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\ - "movaps " Mstr(vr) ", " Mstr(vwrk) "\n"\ - "shufps $0xE5, " Mstr(vr) ", " Mstr(vr) "\n"\ - "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\ - ::) -#endif -#ifndef ATL_SSE3 /* codes that are the same for SSE2 and SSE1 */ -/* - # v0 = {v0d,v0c,v0b,v0a} - # v1 = {v1d,v1c,v1b,v1a} - movaps v0, vw # vw = {v0d,v0c,v0b,v0a} - unpacklps v1, v0 # v0 = {v1b,v0b,v1a,v0a} - unpackhps v1, vw # vw = {v1d,v0d,v1c,v0c} - addps vw, v0 # v0 = {v1bd,v0bd,v1ac,v0ac} - movhlps v0, vw # vw = {X , X,v1bd,v0bd} - addps vw, v0 # v0 = {X , X,v1abcd,v0abcd} -*/ - #define vec_red2(v0, v1, vw) \ - __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(vw) "\n"\ - "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\ - "unpckhps " Mstr(v1) ", " Mstr(vw) "\n"\ - "addps " Mstr(vw) ", " Mstr(v0) "\n"\ - "movhlps " Mstr(v0) ", " Mstr(vw) "\n"\ - "addps " Mstr(vw) ", " Mstr(v0) "\n"\ - ::) -/* - * movaps v0, w0 # w0 = {v0d, v0c, v0b, v0a} - * unpcklps v1, v0 # v0 = {v1b, v0b, v1a, v0a} - * movaps v2, w1 # w1 = {v2d, v2c, v2b, v2a} - * unpckhps v1, w0 # w0 = {v1d, v0d, v1c, v0c} - * unpcklps v3, v2 # v2 = {v3b, v2b, v3a, v2a} - * addps w0, v0 # v0 = {v1bd, v0bd, v1ac, v0ac} - * unpckhps v3, w1 # w1 = {v3d, v2d, v3c, v2c} - * movaps v0, w0 # w0 = {v1bd, v0bd, v1ac, v0ac} - * addps w1, v2 # v2 = {v3bd, v2bd, v3ac, v2ac} - * shufps $0x44,v2,v0 # v0 = {v3ac, v2ac, v1ac, v0ac} - * shufps $0xEE,v2,w0 # w0 = {v3bd, v2bd, v1bd, v0bd} - * addps w0, v0 # v0 = {v3abcd, v2abcd, v1abcd, v0abcd} - */ - #define vec_red4(v0, v1, v2, v3, w0, w1) \ - __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(w0) "\n"\ - "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\ - "movaps " Mstr(v2) ", " Mstr(w1) "\n"\ - "unpckhps " Mstr(v1) ", " Mstr(w0) "\n"\ - "unpcklps " Mstr(v3) ", " Mstr(v2) "\n"\ - "addps " Mstr(w0) ", " Mstr(v0) "\n"\ - "unpckhps " Mstr(v3) ", " Mstr(w1) "\n"\ - "movaps " Mstr(v0) ", " Mstr(w0) "\n"\ - "addps " Mstr(w1) ", " Mstr(v2) "\n"\ - "shufps $0x44, " Mstr(v2) ", " Mstr(v0) "\n"\ - "shufps $0xEE, " Mstr(v2) ", " Mstr(w0) "\n"\ - "addps " Mstr(w0) ", " Mstr(v0) "\n"\ - ::) -#endif - -#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) -#define vec_splat_wrap(mem,reg) \ - __asm__ __volatile__ ("movss %0, " #reg "\n"\ - "unpcklps " #reg ", " #reg "\n"\ - "movlhps " #reg ", " #reg "\n"\ - : /* nothing */ \ - : "m" ((mem)[0])) - - -/* This instruction sequence appears courtesy of Camm Maguire. */ -#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) -#define vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) \ - __asm__ __volatile__ ("movaps " #reg0 "," #empty0 "\n"\ - "unpcklps " #reg1 "," #reg0 "\n"\ - "movaps " #reg2 "," #empty1 "\n"\ - "unpckhps " #reg1 "," #empty0 "\n"\ - "unpcklps " #reg3 "," #reg2 "\n"\ - "addps " #empty0 "," #reg0 "\n"\ - "unpckhps " #reg3 "," #empty1 "\n"\ - "movaps " #reg0 "," #regout "\n"\ - "addps " #empty1 "," #reg2 "\n"\ - "shufps $0x44," #reg2 "," #reg0 "\n"\ - "shufps $0xee," #reg2 "," #regout "\n"\ - "addps " #reg0 "," #regout "\n"\ - : /* nothing */ \ - : /* nothing */) - - - -typedef float vector[VECLEN]; - -#endif /* end ifdef SSE */ - - -#ifdef SSE2 - -/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to - * load/store from misaligned adresses using movups at a cost of some cycles. Loading - * using mul/add must always be aligned. Alignment is 16 bytes. - * No muladd. - */ - - - -#define gen_vec_mr(op,mem,reg) \ - __asm__ __volatile__ (#op " %0, " #reg \ - : /* nothing */ \ - : "m" (((mem)[0])), "m" (((mem)[1]))) - - -#define gen_vec_rm(op,reg,mem) \ - __asm__ __volatile__ (#op " " #reg ", %0" \ - : "=m" (((mem)[0])), "=m" (((mem)[1])) \ - : /* nothing */ ) - - - - -#define VECLEN 2 - -#define reg0 %%xmm0 -#define reg1 %%xmm1 -#define reg2 %%xmm2 -#define reg3 %%xmm3 -#define reg4 %%xmm4 -#define reg5 %%xmm5 -#define reg6 %%xmm6 -#define reg7 %%xmm7 -#ifdef ATL_GAS_x8664 - #define reg8 %%xmm8 - #define reg9 %%xmm9 - #define reg10 %%xmm10 - #define reg11 %%xmm11 - #define reg12 %%xmm12 - #define reg13 %%xmm13 - #define reg14 %%xmm14 - #define reg15 %%xmm15 -#endif - - -#define vec_mov_mr(mem,reg) gen_vec_mr(movupd,mem,reg) -#define vec_mov_rm(reg,mem) gen_vec_rm(movupd,reg,mem) -#define vec_mov_mr_a(mem,reg) gen_vec_mr(movapd,mem,reg) -#define vec_mov_rm_a(reg,mem) gen_vec_rm(movapd,reg,mem) -#define vec_mov_rr(reg1,reg2) gen_vec_rr(movapd,reg1,reg2) - -#define vec_add_mr_a(mem,reg) gen_vec_mr(addpd,mem,reg) -#define vec_mul_mr_a(mem,reg) gen_vec_mr(mulpd,mem,reg) - -#define vec_add_rr(mem,reg) gen_vec_rr(addpd,mem,reg) -#define vec_mul_rr(mem,reg) gen_vec_rr(mulpd,mem,reg) - -#define vec_mov_mr_1(mem,reg) gen_vec_mr(movsd,mem,reg) -#define vec_mov_rm_1(reg,mem) gen_vec_rm(movsd,reg,mem) -#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movsd,reg1,reg2) - -#define vec_add_mr_1(mem,reg) gen_vec_mr(addsd,mem,reg) -#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addsd,reg1,reg2) - -#define vec_mul_mr_1(mem,reg) gen_vec_mr(mulsd,mem,reg) -#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulsd,reg1,reg2) - -#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) -#define vec_splat_wrap(mem,reg) \ - __asm__ __volatile__ ("movsd %0, " #reg "\n"\ - "unpcklpd " #reg ", " #reg \ - : /* nothing */ \ - : "m" ((mem)[0])) - -/* Hack! */ -/* To use this instruction be sure that register 7 is not in use!!! */ -#define vec_sum(reg) vec_sum_wrap(reg) -#define vec_sum_wrap(reg) \ - __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\ - "addpd %%xmm7, " #reg "\n"\ - : /* nothing */\ - : /* nothing */) -/* - * Added by RCW to improve performance and avoid xmm7 hack (replace vec_sum) - */ -#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::) -#ifdef ATL_SSE3 - #define vec_red(vr, vwrk) \ - __asm__ __volatile__("haddpd " Mstr(vr) ", " Mstr(vr) "\n" ::) - #define vec_red2(v0, v1, vw) \ - __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n" ::) - #define vec_red4(v0, v1, v2, v3, w0, w1) \ - __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n"\ - "haddpd " Mstr(v3) ", " Mstr(v2) "\n"\ - ::) -#else - #define vec_red(vr, vwrk) \ - __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\ - "addsd " Mstr(vwrk) ", " Mstr(vr) "\n" ::) -/* - * movapd v0, vw # vw = {v0b, v0a} - * unpcklpd v1,v0 # v0 = {v1a, v0a} - * unpckhpd v1, vw # vw = {v1b, v0b} - * addpd vw, v0 # v0 = {v1ab,v0ab} - */ - #define vec_red2(v0, v1, vw) \ - __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(vw) "\n"\ - "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\ - "unpckhpd " Mstr(v1) ", " Mstr(vw) "\n"\ - "addpd " Mstr(vw) ", " Mstr(v0) "\n"\ - ::) -/* - * movapd v0, w0 # w0 = {v0b, v0a} - * movapd v2, w1 # w1 = {v2b, v2a} - * unpcklpd v1, v0 # v0 = {v1a, v0a} - * unpcklpd v3, v2 # v2 = {v3a, v2a} - * unpckhpd v1, w0 # w0 = {v1b, v0b} - * unpckhpd v3, w1 # w1 = {v3b, v2b} - * addpd w0, v0 # v0 = {v1ab, v0ab} - * addpd w1, v2 # v2 = {v3ab, v2ab} - */ - #define vec_red4(v0, v1, v2, v3, w0, w1) \ - __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(w0) "\n"\ - "movapd " Mstr(v2) ", " Mstr(w1) "\n"\ - "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\ - "unpcklpd " Mstr(v3) ", " Mstr(v2) "\n"\ - "unpckhpd " Mstr(v1) ", " Mstr(w0) "\n"\ - "unpckhpd " Mstr(v3) ", " Mstr(w1) "\n"\ - "addpd " Mstr(w0) ", " Mstr(v0) "\n"\ - "addpd " Mstr(w1) ", " Mstr(v2) "\n"\ - ::) -#endif - -#define vec_sum_full(reg1,reg2,empty1) vec_sum_full_wrap(reg1,reg2,empty1) -#define vec_sum_full_wrap(reg1,reg2,empty1) \ - __asm__ __volatile__ ("movhlps " #reg2 ", " #empty1 "\n"\ - "movlhps " #reg2 ", " #empty1 "\n"\ - "addpd " #empty1 ", " #reg1 "\n"\ - : /* nothing */\ - : /* nothing */) - - -typedef double vector[VECLEN]; - -#endif /* end ifdef SSE2 */ - - -#ifdef THREEDNOW - -/* Peculiarities of 3DNOW. Alignment is not an issue, - * all alignments are legal, however alignment gives a speed increase. - * The vec_acc instruction can be used to sum to registers at once more efficiently - * than a series of vec_sum and vec_store_one - * No muladd. - */ - - -#define gen_vec_mr(op,mem,reg) \ - __asm__ __volatile__ (#op " %0, " #reg \ - : /* nothing */ \ - : "m" (((mem)[0])), "m" (((mem)[1]))) - -#define gen_vec_rm(op,reg,mem) \ - __asm__ __volatile__ (#op " " #reg ", %0" \ - : "=m" (((mem)[0])), "=m" (((mem)[1])) \ - : /* nothing */ ) - - - - -#define VECLEN 2 - -#define reg0 %%mm0 -#define reg1 %%mm1 -#define reg2 %%mm2 -#define reg3 %%mm3 -#define reg4 %%mm4 -#define reg5 %%mm5 -#define reg6 %%mm6 -#define reg7 %%mm7 - -#define vec_add_mr(mem,reg) gen_vec_mr(pfadd,mem,reg) -#define vec_mul_mr(mem,reg) gen_vec_mr(pfmul,mem,reg) -#define vec_mov_mr(mem,reg) gen_vec_mr(movq,mem,reg) -#define vec_mov_rm(reg,mem) gen_vec_rm(movq,reg,mem) -#define vec_add_rr(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2) -#define vec_mul_rr(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2) -#define vec_acc_rr(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2) -#define vec_mov_rr(reg1,reg2) gen_vec_rr(movq,reg1,reg2) - -#define vec_sum(reg) gen_vec_rr(pfacc,reg,reg) -#define vec_sum_full(reg1,reg2) gen_vec_rr(pfacc,reg1,reg2) - -#define vec_mov_mr_1(mem,reg) gen_vec_mr(movd,mem,reg) -#define vec_mov_rm_1(reg,mem) gen_vec_rm(movd,reg,mem) -#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movd,reg1,reg2) - -#define vec_add_rr_1(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2) -#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2) - - -#define vec_splat(mem,reg) vec_splat_wrap(mem,reg) -#define vec_splat_wrap(mem,reg) \ - __asm__ __volatile__ ("movd %0, " #reg "\n"\ - "punpckldq " #reg ", " #reg \ - : /* nothing */ \ - : "m" ((mem)[0])) - - -#define vec_load_apart(mem1,mem2,reg) vec_load_apart_wrap(mem1,mem2,reg) -#define vec_load_apart_wrap(mem1,mem2,reg) \ - __asm__ __volatile__ ("movd %0, " #reg "\n"\ - "punpckldq %1, " #reg \ - : /* nothing */ \ - : "m" ((mem1)[0]), "m" (((mem2)[0]))) - - -#define vec_zero(reg) gen_vec_rr(pxor,reg,reg) - -#define vec_enter() __asm__ __volatile__ ("femms") -#define vec_exit() __asm__ __volatile__ ("femms") - -#define align() __asm__ __volatile__ (".align 16") - - -typedef float vector[VECLEN]; - -#endif - - - - - -#ifdef ALTIVEC - -#define VECLEN 4 - -#define reg0 %%vr0 -#define reg1 %%vr1 -#define reg2 %%vr2 -#define reg3 %%vr3 -#define reg4 %%vr4 -#define reg5 %%vr5 -#define reg6 %%vr6 -#define reg7 %%vr7 -#define reg8 %%vr8 -#define reg9 %%vr9 -#define reg10 %%vr10 -#define reg11 %%vr11 -#define reg12 %%vr12 -#define reg13 %%vr13 -#define reg14 %%vr14 -#define reg15 %%vr15 -#define reg16 %%vr16 -#define reg17 %%vr17 -#define reg18 %%vr18 -#define reg19 %%vr19 -#define reg20 %%vr20 -#define reg21 %%vr21 -#define reg22 %%vr22 -#define reg23 %%vr23 -#define reg24 %%vr24 -#define reg25 %%vr25 -#define reg26 %%vr26 -#define reg27 %%vr27 -#define reg28 %%vr28 -#define reg29 %%vr29 -#define reg30 %%vr30 -#define reg31 %%vr31 - -#define gen_vec_mr(op,mem,reg) \ - __asm__ __volatile__ (#op " %0, " #reg \ - : /* nothing */ \ - : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3]))) - - -#define gen_vec_rm(op,reg,mem) \ - __asm__ __volatile__ (#op " " #reg ", %0" \ - : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \ - : /* nothing */ ) - - -#define gen_alti3(op,reg1,reg2,regout) \ - __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout \ - : /* nothing */ \ - : /* nothing */) - -#define gen_alti_muladd(op,reg1,reg2,regout) \ - __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout ", " #regout \ - : /* nothing */ \ - : /* nothing */) - - - -#define vec_mov_mr_a(mem,reg) gen_vec_mr(lvx,mem,reg) -#define vec_mov_rm_a(reg,mem) gen_vec_rm(svx,reg,mem) -#define vec_muladd(reg1,reg2,regout) gen_alti3(vmaddfp,reg1,reg2,regout) - -#define vec_zero(reg) gen_alti3(vxor,reg,reg,reg) - - -typedef float vector[VECLEN]; - -#endif - - -#ifdef ALTIVEC_C - -/* These macros have been written by, or greatly inspired by, - * Nicholas A. Coult . Thanks. - */ - -/* assumes that last four registers are not in use! */ -#define transpose(x0,x1,x2,x3) \ -reg28 = vec_mergeh(x0,x2); \ -reg29 = vec_mergeh(x1,x3); \ -reg30 = vec_mergel(x0,x2); \ -reg31 = vec_mergel(x1,x3); \ -x0 = vec_mergeh(reg28,reg29); \ -x1 = vec_mergel(reg28,reg29); \ -x2 = vec_mergeh(reg30,reg31); \ -x3 = vec_mergel(reg30,reg31) - -#define vec_mov_rm(v, where) \ -low = vec_ld(0, (where)); \ -high = vec_ld(16, (where)); \ -p_vector = vec_lvsr(0, (int *)(where)); \ -mask = vec_perm((vector unsigned char)(0), (vector unsigned char)(-1), p_vector); \ -v = vec_perm(v, v, p_vector); \ -low = vec_sel(low, v, mask); \ -high = vec_sel(v, high, mask); \ -vec_st(low, 0, (where)); \ -vec_st(high, 16, (where)) - -#define vec_mov_mr_a(mem,reg) reg = vec_ld(0, mem) - -#define vec_mov_mr(u,v) \ -p_vector = (vector unsigned char)vec_lvsl(0, (int*)(v)); \ -low = (vector unsigned char)vec_ld(0, (v)); \ -high = (vector unsigned char)vec_ld(16, (v)); \ -u=(vector float)vec_perm(low, high, p_vector) - -#define vec_muladd(reg1,reg2,regout) regout = vec_madd(reg1,reg2,regout) -#define vec_add_rr(reg1,reg2) reg2 = vec_add(reg1,reg2) - -#define vec_zero(reg) reg = vec_xor(reg,reg) - -#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) \ -transpose(reg0, reg1,reg2,reg3,regout,empty0,empty1); \ -empty0 = vec_add(reg0,reg1); \ -empty1 = vec_add(reg2,reg3); \ -regout = vec_add(empty0,empty1) - - -#endif /* ALTIVEC_C */ - - - - - - - - diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h deleted file mode 100644 index af9c6b1..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h +++ /dev/null @@ -1,1626 +0,0 @@ -#include <stdlib.h> -#include <sys/time.h> -#include <stdio.h> - -#include "camm_util.h" - - -#if defined(ALIGN) -#if( defined(SCPLX) || defined(DCPLX)) -#error Cannot align complex routines -#endif -#if defined(SREAL) && ( NDPM != 1 ) && ( STRIDE % 4 != 0) -#error Can only align SREAL with NDPM 1 or STRIDE % 4 = 0 -#endif -#if defined(DREAL) && ( NDPM != 1 ) && ( STRIDE % 2 != 0) -#error Can only align DREAL with NDPM 1 or STRIDE % 2 = 0 -#endif -#endif - -/****************************************************************************** - * Single Precision Complex Macros - ******************************************************************************/ - -#ifdef SCPLX - -#ifdef NO_TRANSPOSE - -#if NDPM > 3 -#error Max NDPM is 3 for SCPLX NO_TRANSPOSE -#endif - -#undef plax -#define plax - -#undef R1 -#define R1 2 -#undef R2 -#define R2 4 -#undef R3 -#define R3 6 -#undef R4 -#define R4 6 - -#undef TREG -#define TREG 1 -#undef SREG -#define SREG 0 -#undef CREG -#define CREG 0 - -#ifdef GER -#undef AREG -#define AREG 0 -#undef targ -#define targ(a_) AREG -#undef wb -#define wb(a_,b_) pu(AREG,a_,b_) -#undef wbd -#define wbd(a_,b_) pud(AREG,a_,b_) -#undef w -#define w(a_) -#undef w1_2 -#define w1_2(a_) -#else -#undef AREG -#define AREG TREG -#undef targ -#define targ(a_) CREG -#undef wb -#define wb(a_,b_) -#undef wbd -#define wbd(a_,b_) -#undef w -#define w(a_) pu(CREG,a_ ## 0,si) -#undef w1_2 -#define w1_2(a_) pud(CREG,a_ ## 0,si) -#endif - -#undef src -#define src(a_) a_ -#undef mpx -#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(4,si,P(a_,1)) \ - ps(0,P(a_,1),P(a_,1)) sign(a_) -#undef madd -#define madd(a_,b_,c_) pas(a_,b_,c_) -#undef ulfa -#define ulfa(a_) - -#else - -#undef R1 -#define R1 4 -#undef R2 -#define R2 5 -#undef R3 -#define R3 6 -#undef R4 -#define R4 7 - -#undef TREG -#define TREG 3 -#undef SREG -#define SREG 2 -#undef CREG -#define CREG 0 -#undef targ -#define targ(a_) a_ -#undef src -#define src(a_) 0 -#undef w -#define w(a_) -#undef w1_2 -#define w1_2(a_) -#undef mpx -#define mpx(a_) px(a_) -#ifdef BETA0 -#undef ulfa -#define ulfa(a_) phl(a_,0) pa(0,a_) pud(a_,0,si) -#else -#undef ulfa -#define ulfa(a_) pld(0,si,TREG) phl(a_,0) pa(0,a_) pa(TREG,a_) pud(a_,0,si) -#endif -#undef AREG -#define AREG TREG -#undef wb -#define wb(a_,b_) -#undef wbd -#define wbd(a_,b_) -#undef wbs -#define wbs(a_,b_) - - -#undef plax -#define plax pc(CREG,1) ps(160,CREG,CREG) ps(245,1,1) sign(CREG) - - - -#endif - -#if defined(Conj_) && ! defined(GER) -#undef sign -#define sign(a_) pm(SREG,a_) -#else -#undef sign -#define sign(a_) pm(SREG,P(a_,1)) -#endif - - - -#undef plb -#define plb(a_,b_) pl(a_,b_,AREG) -#undef plbd -#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) - -#undef dpr -#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) -#undef dprp -#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) -#undef dpi -#define dpi(a_) pm(P(src(a_),1),TREG) ps(177,TREG,TREG) pa(TREG,targ(a_)) - -#ifndef GER - - -#undef plaa -#define plaa(a_) pl(a_ ## 0,si,CREG) plax -#undef wa -#define wa(a_) w(a_) -#undef dp -#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_) -#undef ddp -#define ddp(a_,b_,c_) dp(a_,b_,c_) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) - -#undef plaa1_2 -#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax -#undef wa1_2 -#define wa1_2(a_) w1_2(a_) -#undef dp1_2 -#define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) -#undef dpp1_2 -#define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) -#undef ddp1_2 -#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) -#undef ddpp1_2 -#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) - - -#else - -#undef lqc -#define lqc(a_) pl(a_ ## 0,si,TREG) -#undef lqc1 -#define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG) - - -#undef plaa -#define plaa(a_) -#undef wa -#define wa(a_) -#undef dp -#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \ - lqc(a_) dpi(c_) wb(a_ ## 0,b_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ - lqc(a_) dpi(c_) wb(a_ ## 0,b_) -#undef ddp -#define ddp(a_,b_,c_) dp(a_,b_,c_) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) - -#undef plaa1_2 -#define plaa1_2(a_) -#undef wa1_2 -#define wa1_2(a_) -#undef dp1_2 -#define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ - lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) -#undef dpp1_2 -#define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ - lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) -#undef ddp1_2 -#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) -#undef ddpp1_2 -#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) - -#endif - -#endif - -/****************************************************************************** - * Single Precision Real Macros - ******************************************************************************/ - -#ifdef SREAL - -#ifdef NO_TRANSPOSE - -#undef mpx -#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) -#undef madd -#define madd(a_,b_,c_) pas(a_,b_,c_) -#undef TREG -#define TREG 1 -#undef targ -#define targ(a_) 0 -#undef src -#define src(a_) a_ -#undef ulfa -#define ulfa(a_) - -#ifdef GER -#undef w -#define w(a_) -#undef w1_2 -#define w1_2(a_) -#undef w1_4 -#define w1_4(a_) -#undef CREG -#define CREG 2 -#undef AREG -#define AREG 0 -#undef cp -#define cp pc(CREG,TREG) -#undef wb -#define wb(a_,b_) pu(AREG,a_,b_) -#undef wbd -#define wbd(a_,b_) pud(AREG,a_,b_) -#undef wbs -#define wbs(a_,b_) pus(AREG,a_,b_) -#else -#undef CREG -#define CREG 0 -#undef AREG -#define AREG TREG -#undef cp -#define cp -#undef wb -#define wb(a_,b_) -#undef wbd -#define wbd(a_,b_) -#undef wbs -#define wbs(a_,b_) -#undef w -#define w(a_) pu(CREG,a_ ## 0,si) -#undef w1_2 -#define w1_2(a_) pud(CREG,a_ ## 0,si) -#undef w1_4 -#define w1_4(a_) pus(CREG,a_ ## 0,si) -#endif - -#else - -#undef mpx -#define mpx(a_) px(a_) -#ifdef BETA0 -#undef madd -#define madd(a_,b_,c_) -#else -#undef madd -#define madd(a_,b_,c_) pas(a_,b_,c_) -#endif -#undef TREG -#define TREG 3 -#undef targ -#define targ(a_) a_ -#undef src -#define src(a_) 0 -#undef w -#define w(a_) -#undef w1_2 -#define w1_2(a_) -#undef w1_4 -#define w1_4(a_) -#undef ulfa -#undef ulfa -#define ulfa(a_) phl(a_,0) pa(0,a_) pc(a_,0) ps(1,0,0) pa(0,a_) \ - madd(0,si,a_) pus(a_,0,si) - -#undef CREG -#define CREG 0 -#undef AREG -#define AREG TREG -#undef cp -#define cp -#undef wb -#define wb(a_,b_) -#undef wbd -#define wbd(a_,b_) -#undef wbs -#define wbs(a_,b_) - -#endif - -#if defined(ALIGN) -#undef plb -#define plb(a_,b_) pla(a_,b_,AREG) -#else -#undef plb -#define plb(a_,b_) pl(a_,b_,AREG) -#endif -#undef plbd -#define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) -#undef plbs -#define plbs(a_,b_) pls(a_,b_,AREG) -#undef dpr -#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) -#undef dprp -#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) -#undef dprs -#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) -#undef dprps -#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) - -#undef plaa -#define plaa(a_) pl(a_ ## 0,si,CREG) -#undef wa -#define wa(a_) w(a_) -#undef dp -#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_) -#undef ddp -#define ddp(a_,b_,c_) dp(a_,b_,c_) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) - -#undef plaa1_2 -#define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) -#undef wa1_2 -#define wa1_2(a_) w1_2(a_) -#undef dp1_2 -#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dpr(c_) wbd(a_ ## 0,b_) -#undef dpp1_2 -#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprp(c_,d_,e_) wbd(a_ ## 0,b_) -#undef ddp1_2 -#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) -#undef ddpp1_2 -#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) - -#undef plaa1_4 -#define plaa1_4(a_) pls(a_ ## 0,si,CREG) -#undef wa1_4 -#define wa1_4(a_) w1_4(a_) -#undef dp1_4 -#define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) -#undef dpp1_4 -#define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) -#undef ddp1_4 -#define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_) -#undef ddpp1_4 -#define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) - - - -#undef R1 -#define R1 4 -#undef R2 -#define R2 5 -#undef R3 -#define R3 6 -#undef R4 -#define R4 7 - -#endif - -/****************************************************************************** - * Double Precision Real Macros - ******************************************************************************/ - -#ifdef DREAL - -#ifdef ATL_SSE2 - -#ifdef NO_TRANSPOSE - -#undef mpx -#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) -#undef madd -#define madd(a_,b_,c_) pas(a_,b_,c_) -#undef TREG -#define TREG 1 -#undef targ -#define targ(a_) 0 -#undef src -#define src(a_) a_ -#undef ulfa -#define ulfa(a_) - -#ifdef GER -#undef w -#define w(a_) -#undef w1_2 -#define w1_2(a_) -#undef w1_4 -#define w1_4(a_) -#undef CREG -#define CREG 2 -#undef AREG -#define AREG 0 -#undef cp -#define cp pc(CREG,TREG) -#undef wb -#define wb(a_,b_) pu(AREG,a_,b_) -#undef wbd -#define wbd(a_,b_) pus(AREG,a_,b_) -#undef wbs -/* #define wbs(a_,b_) pus(AREG,a_,b_) */ -#else -#undef CREG -#define CREG 0 -#undef AREG -#define AREG TREG -#undef cp -#define cp -#undef wb -#define wb(a_,b_) -#undef wbd -#define wbd(a_,b_) -#undef wbs -/* #define wbs(a_,b_) */ -#undef w -#define w(a_) pu(CREG,a_ ## 0,si) -#undef w1_2 -#define w1_2(a_) pus(CREG,a_ ## 0,si) -#undef w1_4 -/* #define w1_4(a_) pus(CREG,a_ ## 0,si) */ -#endif - -#else - -#undef mpx -#define mpx(a_) px(a_) -#ifdef BETA0 -#undef madd -#define madd(a_,b_,c_) -#else -#undef madd -#define madd(a_,b_,c_) pas(a_,b_,c_) -#endif -#undef TREG -#define TREG 3 -#undef targ -#define targ(a_) a_ -#undef src -#define src(a_) 0 -#undef w -#define w(a_) -#undef w1_2 -#define w1_2(a_) -#undef w1_4 -#define w1_4(a_) -#undef ulfa -#undef ulfa -#define ulfa(a_) /* phl(a_,0) pa(0,a_) */ pc(a_,0) ps(1,0,0) pa(0,a_) \ - madd(0,si,a_) pus(a_,0,si) - -#undef CREG -#define CREG 0 -#undef AREG -#define AREG TREG -#undef cp -#define cp -#undef wb -#define wb(a_,b_) -#undef wbd -#define wbd(a_,b_) -#undef wbs -#define wbs(a_,b_) - -#endif - -#if defined(ALIGN) -#undef plb -#define plb(a_,b_) pla(a_,b_,AREG) -#else -#undef plb -#define plb(a_,b_) pl(a_,b_,AREG) -#endif -#undef plbd -#define plbd(a_,b_) /* px(AREG) */pls(a_,b_,AREG) -#undef plbs -/* #define plbs(a_,b_) pls(a_,b_,AREG) */ -#undef dpr -#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) -#undef dprp -#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) -#undef dprs -#define dprs(a_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) -#undef dprps -#define dprps(a_,b_,c_) pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_)) - -#undef plaa -#define plaa(a_) pl(a_ ## 0,si,CREG) -#undef wa -#define wa(a_) w(a_) -#undef dp -#define dp(a_,b_,c_) cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_) -#undef ddp -#define ddp(a_,b_,c_) dp(a_,b_,c_) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) - -#undef plaa1_2 -#define plaa1_2(a_) /* px(CREG) */pls(a_ ## 0,si,CREG) -#undef wa1_2 -#define wa1_2(a_) w1_2(a_) -#undef dp1_2 -#define dp1_2(a_,b_,c_) cp plbd(a_ ## 0,b_) dprs(c_) wbd(a_ ## 0,b_) -#undef dpp1_2 -#define dpp1_2(a_,b_,c_,d_,e_) cp plbd(a_ ## 0,b_) dprps(c_,d_,e_) wbd(a_ ## 0,b_) -#undef ddp1_2 -#define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) -#undef ddpp1_2 -#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) - -#undef plaa1_4 -/* #define plaa1_4(a_) pls(a_ ## 0,si,CREG) */ -#undef wa1_4 -/* #define wa1_4(a_) w1_4(a_) */ -#undef dp1_4 -/* #define dp1_4(a_,b_,c_) cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) */ -#undef dpp1_4 -/* #define dpp1_4(a_,b_,c_,d_,e_) cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) */ -#undef ddp1_4 -/* #define ddp1_4(a_,b_,c_) dp1_4(a_,b_,c_) */ -#undef ddpp1_4 -/* #define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) */ - - - -#undef R1 -#define R1 4 -#undef R2 -#define R2 5 -#undef R3 -#define R3 6 -#undef R4 -#define R4 7 - -#else - -#ifdef NO_TRANSPOSE - -#undef t0 -#define t0(a_) 1 -#undef s0 -#define s0(a_) a_ -#undef t8 -#define t8(a_) 2 -#undef s8 -#define s8(a_) a_ -#undef w -#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si) -#undef w1_2 -#define w1_2(a_) fp(a_ ## 0,si) -#undef mpx -#define mpx(a_) fl(0,si) fc(M(a_,2)) -#undef madd -#define madd(a_,b_,c_) faa(a_,b_) -#undef ulfa -#define ulfa(a_) fc(0) - -#else - -#undef t0 -#define t0(a_) a_ -#undef s0 -#define s0(a_) 1 -#undef t8 -#define t8(a_) a_ -#undef s8 -#define s8(a_) 2 -#undef w -#define w(a_) -#undef w1_2 -#define w1_2(a_) -#undef mpx -#define mpx(a_) fz -#ifdef BETA0 -#undef madd -#define madd(a_,b_,c_) -#else -#undef madd -#define madd(a_,b_,c_) faa(a_,b_) -#endif -#undef ulfa -#define ulfa(a_) madd(0,si,a_) fp(0,si) - -#endif - - -#ifndef GER - -#undef plaa1_2 -#define plaa1_2(a_) fl(a_ ## 0,si) -#undef wa1_2 -#define wa1_2(a_) w1_2(a_) -#ifdef NO_TRANSPOSE -#undef ddp1_2 -#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(M(s0(c_),1),0) fap(0,t0(c_)) -#undef dp1_2 -#define dp1_2(a_,b_,c_) ddp1_2(a_,b_,c_) -#else -#undef ddp1_2 -#define ddp1_2(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fap(0,M(t0(c_),1)) -#undef dp1_2 -#define dp1_2(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fap(0,M(t0(c_),2)) -#endif - -#else - -#undef plaa1_2 -#define plaa1_2(a_) fl(a_ ## 0,si) -#undef wa1_2 -#define wa1_2(a_) -#undef ddp1_2 -#define ddp1_2(a_,b_,c_) fd(M(s0(c_),2)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) -#undef dp1_2 -#define dp1_2(a_,b_,c_) fm(M(s0(c_),2),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) - -#endif - - - -#undef plaa -#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) fx1 - -#ifndef GER - - -#undef wa -#define wa(a_) w(a_) - - -#undef ddp -#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \ - fm(P(s8(c_),1),0) fx1 fap(0,P(t0(c_),1)) \ - fap(0,t8(c_)) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \ - fm(P(s8(c_),1),0) pf(d_,e_) fx1 fap(0,P(t0(c_),1)) \ - fap(0,t8(c_)) - -/* #define ddp(a_,b_,c_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */ -/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) */ -/* #define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */ -/* \ */ -/* fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) pf(d_,e_) */ - -#ifdef NO_TRANSPOSE - -#undef dp -#define dp(a_,b_,c_) ddp(a_,b_,c_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_) - -#else - -#undef dp -#define dp(a_,b_,c_) fl(a_ ## 0,b_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \ - fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2)) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) pf(d_ ,e_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \ - fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2)) - -/* #define dp(a_,b_,c_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */ -/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) */ -/* #define dpp(a_,b_,c_,d_,e_) fma(a_ ## 0,b_) fap(0,M(t0(c_),1)) \ */ -/* \ */ -/* fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) pf(d_,e_) */ - -#endif - - -#else - -#undef wa -#define wa(a_) -#undef ddp -#define ddp(a_,b_,c_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ - fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ - fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_) - -#undef dp -#define dp(a_,b_,c_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ - fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \ - fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_) - -#endif - - -#undef R1 -#define R1 3 -#undef R2 -#define R2 4 -#undef R3 -#define R3 5 -#undef R4 -#define R4 6 - -#endif - -#endif - -/****************************************************************************** - * Double Precision Complex Macros - ******************************************************************************/ - -#ifdef DCPLX - -#ifdef ATL_SSE2 -#ifdef NO_TRANSPOSE - -#if NDPM > 3 -#error Max NDPM is 3 for DCPLX NO_TRANSPOSE -#endif - -#undef plax -#define plax - -#undef R1 -#define R1 2 -#undef R2 -#define R2 4 -#undef R3 -#define R3 6 -#undef R4 -#define R4 6 - -#undef TREG -#define TREG 1 -#undef SREG -#define SREG 0 -#undef CREG -#define CREG 0 - -#ifdef GER -#undef AREG -#define AREG 0 -#undef targ -#define targ(a_) AREG -#undef wb -#define wb(a_,b_) pu(AREG,a_,b_) -#undef wbd -/* #define wbd(a_,b_) pud(AREG,a_,b_) */ -#undef w -#define w(a_) -#undef w1_2 -/* #define w1_2(a_) */ -#else -#undef AREG -#define AREG TREG -#undef targ -#define targ(a_) CREG -#undef wb -#define wb(a_,b_) -#undef wbd -/* #define wbd(a_,b_) */ -#undef w -#define w(a_) pu(CREG,a_ ## 0,si) -#undef w1_2 -/* #define w1_2(a_) pud(CREG,a_ ## 0,si) */ -#endif - -#undef src -#define src(a_) a_ -#undef mpx -#define mpx(a_) pls(0,si,a_) ps(0,a_,a_) pls(8,si,P(a_,1)) \ - ps(0,P(a_,1),P(a_,1)) sign(a_) -#undef madd -#define madd(a_,b_,c_) pas(a_,b_,c_) -#undef ulfa -#define ulfa(a_) - -#else - -#undef R1 -#define R1 4 -#undef R2 -#define R2 5 -#undef R3 -#define R3 6 -#undef R4 -#define R4 7 - -#undef TREG -#define TREG 3 -#undef SREG -#define SREG 2 -#undef CREG -#define CREG 0 -#undef targ -#define targ(a_) a_ -#undef src -#define src(a_) 0 -#undef w -#define w(a_) -#undef w1_2 -#define w1_2(a_) -#undef mpx -#define mpx(a_) px(a_) -#ifdef BETA0 -#undef ulfa -#define ulfa(a_) /* phl(a_,0) pa(0,a_) */pu(a_,0,si) -#else -#undef ulfa -#define ulfa(a_) pl(0,si,TREG) /* phl(a_,0) pa(0,a_) */ pa(TREG,a_) pu(a_,0,si) -#endif -#undef AREG -#define AREG TREG -#undef wb -#define wb(a_,b_) -#undef wbd -#define wbd(a_,b_) -#undef wbs -#define wbs(a_,b_) - - -#undef plax -#define plax pc(CREG,1) ps(0,CREG,CREG) ps(3,1,1) sign(CREG) - - - -#endif - -#if defined(Conj_) && ! defined(GER) -#undef sign -#define sign(a_) pm(SREG,a_) -#else -#undef sign -#define sign(a_) pm(SREG,P(a_,1)) -#endif - - - -#undef plb -#define plb(a_,b_) pl(a_,b_,AREG) -#undef plbd -/* #define plbd(a_,b_) px(AREG) pld(a_,b_,AREG) */ - -#undef dpr -#define dpr(a_) pm(src(a_),TREG) pa(TREG,targ(a_)) -#undef dprp -#define dprp(a_,b_,c_) pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_)) -#undef dpi -#define dpi(a_) pm(P(src(a_),1),TREG) ps(1,TREG,TREG) pa(TREG,targ(a_)) - -#ifndef GER - -#undef plaa -#define plaa(a_) pl(a_ ## 0,si,CREG) plax -#undef wa -#define wa(a_) w(a_) -#undef dp -#define dp(a_,b_,c_) plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_) -#undef ddp -#define ddp(a_,b_,c_) dp(a_,b_,c_) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) - -#undef plaa1_2 -/* #define plaa1_2(a_) px(CREG) pld(a_ ## 0,si,CREG) plax */ -#undef wa1_2 -/* #define wa1_2(a_) w1_2(a_) */ -#undef dp1_2 -/* #define dp1_2(a_,b_,c_) plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) */ -#undef dpp1_2 -/* #define dpp1_2(a_,b_,c_,d_,e_) plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) */ -#undef ddp1_2 -/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */ -#undef ddpp1_2 -/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */ - - -#else - -#undef lqc -#define lqc(a_) pl(a_ ## 0,si,TREG) -#undef lqc1 -/* #define lqc1_2(a_) px(TREG) pld(a_ ## 0,si,TREG) */ - - -#undef plaa -#define plaa(a_) -#undef wa -#define wa(a_) -#undef dp -#define dp(a_,b_,c_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) \ - lqc(a_) dpi(c_) wb(a_ ## 0,b_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ - lqc(a_) dpi(c_) wb(a_ ## 0,b_) -#undef ddp -#define ddp(a_,b_,c_) dp(a_,b_,c_) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_) - -#undef plaa1_2 -/* #define plaa1_2(a_) */ -#undef wa1_2 -/* #define wa1_2(a_) */ -#undef dp1_2 -/* #define dp1_2(a_,b_,c_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ */ -/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */ -#undef dpp1_2 -/* #define dpp1_2(a_,b_,c_,d_,e_) lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ */ -/* lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */ -#undef ddp1_2 -/* #define ddp1_2(a_,b_,c_) dp1_2(a_,b_,c_) */ -#undef ddpp1_2 -/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */ - -#endif - -#else - -#if NDPM > 2 -#error Max NDPM is 2 for DCPLX -#endif - -#undef TREG -#define TREG 2 - -#ifdef NO_TRANSPOSE - -#undef w -#define w(a_) fp(a_ ## 0,si) fp(a_ ## 8,si) -#undef plax -#define plax fx1 -#undef srr -#define srr(a_) a_ -#undef sri -#define sri(a_) a_ -#undef sir -#define sir(a_) a_ -#undef sii -#define sii(a_) a_ -#undef trr -#define trr(a_) P(TREG,1) -#undef tri -#define tri(a_) M(TREG,1) -#undef tir -#define tir(a_) TREG -#undef tii -#define tii(a_) TREG -#undef mpx -#define mpx(a_) fl(0,si) fl(8,si) fc(M(a_,2)) fc(M(a_,2)) -#undef madd -#define madd(a_,b_,c_) faa(a_,b_) -#undef ulfa -#define ulfa(a_) fc(0) fc(0) - -#else - -#undef srr -#define srr(a_) P(TREG,1) -#undef sri -#define sri(a_) M(TREG,1) -#undef sir -#define sir(a_) TREG -#undef sii -#define sii(a_) TREG -#undef trr -#define trr(a_) a_ -#undef tri -#define tri(a_) a_ -#undef tir -#define tir(a_) a_ -#undef tii -#define tii(a_) a_ -#undef w -#define w(a_) -#undef plax -#define plax -#undef mpx -#define mpx(a_) fz fz -#ifdef BETA0 -#undef madd -#define madd(a_,b_,c_) -#else -#undef madd -#define madd(a_,b_,c_) faa(a_,b_) -#endif -#undef ulfa -#define ulfa(a_) madd(0,si,a_) fp(0,si) madd(8,si,a_) fp(8,si) - -#endif - - - -#ifdef Conj_ -#undef fapi -#define fapi(a_,b_) fsp(b_) -#undef fspi -#define fspi(a_,b_) fap(a_,b_) -#else -#undef fapi -#define fapi(a_,b_) fap(a_,b_) -#undef fspi -#define fspi(a_,b_) fsp(b_) -#endif - -#ifndef GER - - -#undef plaa -#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax -#undef wa -#define wa(a_) w(a_) -#undef ddp -#define ddp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ - fm(sri(c_),0) fap(0,tri(c_))\ - fl(a_ ## 8,b_) fd(0) fm(sir(c_),0) fspi(0,tir(c_)) \ - fm(sii(c_),0) fapi(0,tii(c_)) -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ - fm(sri(c_),0) fap(0,tri(c_))\ - fl(a_ ## 8,b_) fd(0) pf(d_,e_) fm(sir(c_),0) fspi(0,tir(c_))\ - fm(sii(c_),0) fapi(0,tii(c_)) - - - -#ifdef NO_TRANSPOSE - - - -#undef dp -#define dp(a_,b_,c_) ddp(a_,b_,c_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) ddpp(a_,b_,c_,d_,e_) - - - -#else - -#undef dp -#define dp(a_,b_,c_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ - fm(sri(c_),0) fap(0,tri(c_))\ - fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \ - fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2)) - -#undef dpp -#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \ - pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\ - fl(a_ ## 8,b_) fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \ - fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2)) - - -#endif - -#else - -#undef plaa -#define plaa(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) plax -#undef wa -#define wa(a_) - -#undef ddprr -#define ddprr(a_,b_,c_) fl(a_ ## 0,b_) \ - fd(tri(c_)) fm(P(sri(c_),1),0) fap(0,1) \ - fd(M(trr(c_),1)) fm(srr(c_),0) fspi(0,1) \ - fp(a_ ## 0,b_) -#undef ddpri -#define ddpri(a_,b_,c_) fl(a_ ## 8,b_) \ - fd(tii(c_)) fm(P(sii(c_),1),0) fap(0,1) \ - fd(M(tir(c_),1)) fm(sir(c_),0) fapi(0,1) \ - fp(a_ ## 8,b_) -#undef dpri -#define dpri(a_,b_,c_) fl(a_ ## 8,b_) \ - fx(2) fm(sir(c_),0) fap(0,2) \ - fm(M(sii(c_),2),0) fapi(0,1) \ - fp(a_ ## 8,b_) - - -#undef ddpp -#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_) -#undef ddp -#define ddp(a_,b_,c_) ddprr(a_,b_,c_) ddpri(a_,b_,c_) -#undef dpp -#define dpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) dpri(a_,b_,c_) -#undef dp -#define dp(a_,b_,c_) ddprr(a_,b_,c_) dpri(a_,b_,c_) - -#endif - - -#undef R1 -#define R1 4 -#undef R2 -#define R2 6 -#undef R3 -#define R3 6 -#undef R4 -#define R4 6 - -#endif - -#endif - - -/****************************************************************************** - * General Macros - ******************************************************************************/ - - - - -#undef bla1 -#define bla1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_) -#undef blb1 -#define blb1(a_,b_) plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_) - -#undef bla2 -#undef bla2 -#define bla2(a_,b_) pf(b_,si) plaa(a_) ddp(a_,ax,R1) pf(b_,ax) dp(a_,bx,R2) wa(a_) -#undef blb2 -#undef blb2 -#define blb2(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) dp(a_,bx,R2) wa(a_) - -#undef bla3 -#define bla3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \ - dpp(a_,cx,R3,b_,ax) wa(a_) -#undef blb3 -#define blb3(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \ - dpp(a_,cx,R3,b_,cx) wa(a_) - -#undef bla4 -#define bla4(a_,b_) plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \ - ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_) -#undef blb4 -#define blb4(a_,b_) plaa(a_) ddp(a_,ax,R1) ddpp(a_,bx,R2,b_,cx) \ - ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_) - -#undef bla -#define bla(a_,b_) Mjoin(bla,NDP)(a_,b_) -#undef blb -#define blb(a_,b_) Mjoin(blb,NDP)(a_,b_) - - - -#undef bla11_2 -#define bla11_2(a_) plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_) -#undef bla21_2 -#define bla21_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_) -#undef bla31_2 -#define bla31_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \ - dp1_2(a_,cx,R3) wa1_2(a_) -#undef bla41_2 -#define bla41_2(a_) plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \ - ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_) - -#undef bla1_2 -#define bla1_2(a_) Mjoin(Mjoin(bla,NDP),1_2)(a_) - - - -#undef bla11_4 -#define bla11_4(a_) plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_) -#undef bla21_4 -#define bla21_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2) wa1_4(a_) -#undef bla31_4 -#define bla31_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \ - dp1_4(a_,cx,R3) wa1_4(a_) -#undef bla41_4 -#define bla41_4(a_) plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \ - ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_) - -#undef bla1_4 -#define bla1_4(a_) Mjoin(Mjoin(bla,NDP),1_4)(a_) - - - -#undef inc1 -#define inc1(a_) a(a_,si) a(a_,ax) -#undef inc2 -#define inc2(a_) inc1(a_) a(a_,bx) -#undef inc3 -#define inc3(a_) inc2(a_) a(a_,cx) -#undef inc4 -#define inc4(a_) inc3(a_) a(a_,dx) - -#undef inc -#define inc(a_) Mjoin(inc,NDP)(a_) - - -#ifdef PREFETCH -/* #include "camm_arith.h" */ -#undef S -#define S(a_,b_) (a_) + (b_) -#undef PF1 -#define PF1 PREFETCH -#undef PF2 -#define PF2 S(PF1,32) -#undef PF3 -#define PF3 S(PF1,64) -#undef PF4 -#define PF4 S(PF1,96) -#undef PF5 -#define PF5 S(PF1,128) -#undef PF6 -#define PF6 S(PF1,160) -#undef PF7 -#define PF7 S(PF1,192) -#undef PF8 -#define PF8 S(PF1,224) -#else -#undef PF1 -#define PF1 64 -#undef PF2 -#define PF2 96 -#undef PF3 -#define PF3 128 -#undef PF4 -#define PF4 160 -#undef PF5 -#define PF5 192 -#undef PF6 -#define PF6 224 -#undef PF7 -#define PF7 256 -#undef PF8 -#define PF8 288 -#endif - - -#if defined(NO_TRANSPOSE) && !defined(SREAL) && !defined(GER) -#undef pf -#define pf(a_,b_) f(t0,a_,b_) -#else -#undef pf -#define pf(a_,b_) f(nta,a_,b_) -#endif - -#undef bl1 -#define bl1 bla1_4(0x0) inc(4) -#undef bl2 -#define bl2 bla1_2(0x0) inc(8) -#undef bl4 -#define bl4 bla(0x0,PF1) inc(16) -#undef bl8 -#define bl8 bla(0x0,PF1) blb(0x1,PF1) inc(32) -#undef bl16 -#define bl16 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64) -#undef bl32 -#define bl32 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \ - bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128) -#undef bl64 -#define bl64 bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \ - bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \ - bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \ - bla(0xc,PF7) blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256) - -/* #define in2 inc(8) */ -/* #define in4 inc(16) */ -/* #define in8 inc(32) */ -/* #define in16 inc(64) */ - -#undef in2 -#define in2 -#undef in4 -#define in4 -#undef in8 -#define in8 -#undef in16 -#define in16 - -#ifdef NO_TRANSPOSE -#undef incf -#define incf ra(di,si) -#else -#undef incf -#define incf -#endif - -#undef lf1 -#define lf1 mpx(R1) -#undef lf2 -#define lf2 lf1 incf mpx(R2) -#undef lf3 -#define lf3 lf2 incf mpx(R3) -#undef lf4 -#define lf4 lf3 incf mpx(R4) - -#undef lf -#define lf Mjoin(lf,NDP) - - -#undef ulf1 -#define ulf1 ulfa(R1) -#undef ulf2 -#define ulf2 ulf1 ra(di,si) ulfa(R2) -#undef ulf3 -#define ulf3 ulf2 ra(di,si) ulfa(R3) -#undef ulf4 -#define ulf4 ulf3 ra(di,si) ulfa(R4) - -#undef ulf -#define ulf Mjoin(ulf,NDP) - -#undef lpba -#define lpba(a_) "movl %%esi,%%e" #a_ "\n\t" - -#undef lpb1 -#define lpb1 lpba(ax) -#undef lpb2 -#define lpb2 lpb1 ra(di,si) lpba(bx) -#undef lpb3 -#define lpb3 lpb2 ra(di,si) lpba(cx) -#undef lpb4 -#define lpb4 lpb3 ra(di,si) lpba(dx) - -#undef lpb -#define lpb Mjoin(lpb,NDP) - -#undef ipf1 -#define ipf1(a_) pf(a_,si) pf(a_,ax) -#undef ipf2 -#define ipf2(a_) ipf1(a_) pf(a_,bx) -#undef ipf3 -#define ipf3(a_) ipf2(a_) pf(a_,cx) -#undef ipf4 -#define ipf4(a_) ipf3(a_) pf(a_,dx) - -#undef ipf -#define ipf(a_) Mjoin(ipf,NDP)(a_) - -#ifdef LUNROLL -#undef UNROLL -#ifdef SREAL -#undef UNROLL -#define UNROLL LUNROLL -#elif defined(DREAL) || defined(SCPLX) -#undef UNROLL -#define UNROLL LUNROLL*2 -#elif defined(DCPLX) -#undef UNROLL -#define UNROLL LUNROLL*4 -#endif -#else -#undef UNROLL -#define UNROLL 16 -#endif - -#undef UNROLL1_2 -#if UNROLL == 64 -#undef blUNROLL -#define blUNROLL bl64 -#undef UNROLL1_2 -#define UNROLL1_2 32 -#elif UNROLL == 32 -#undef blUNROLL -#define blUNROLL bl32 -#undef UNROLL1_2 -#define UNROLL1_2 16 -#elif UNROLL == 16 -#undef blUNROLL -#define blUNROLL bl16 -#undef UNROLL1_2 -#define UNROLL1_2 8 -#elif UNROLL == 8 -#undef blUNROLL -#define blUNROLL bl8 -#undef UNROLL1_2 -#define UNROLL1_2 4 -#elif UNROLL == 4 -#undef blUNROLL -#define blUNROLL bl4 -#undef UNROLL1_2 -#define UNROLL1_2 2 -#elif UNROLL == 2 -#undef blUNROLL -#define blUNROLL bl2 -#undef UNROLL1_2 -#define UNROLL1_2 1 -#elif UNROLL == 1 -#undef blUNROLL -#define blUNROLL bl1 -#undef UNROLL1_2 -#define UNROLL1_2 stop -#endif -#ifndef UNROLL1_2 -#error UNROLL must be set to power of 2 < 128 -#endif - - -#ifdef GER -#undef aconst -#define aconst -#undef cconst -#define cconst const -#else -#undef aconst -#define aconst const -#undef cconst -#define cconst -#endif - -#undef MY_FUNCTION -#define MY_FUNCTION Mjoin(dp,EXT) - -static void -MY_FUNCTION(aconst TYPE *a,int lda, - const TYPE *b, - cconst TYPE *c,int stride,int len) { - -#ifdef SCPLX -#if defined(GER) && defined(Conj_) - const TYPE w1[2]={{-1.0,1.0},{-1.0,1.0}},*w=w1; -#else - const TYPE w1[2]={{1.0,-1.0},{1.0,-1.0}},*w=w1; -#endif -#endif - -#if defined(DCPLX) && defined(ATL_SSE2) -#if defined(GER) && defined(Conj_) - const TYPE w1[1]={{-1.0,1.0}},*w=w1; -#else - const TYPE w1[1]={{1.0,-1.0}},*w=w1; -#endif -#endif - -#ifdef NO_TRANSPOSE -#undef movm -#define movm c -#undef fixm -#define fixm b -#else -#undef movm -#define movm b -#undef fixm -#define fixm c -#endif - NO_INLINE - unsigned u1=stride*sizeof(*fixm),u2=lda*sizeof(*a),u3=len*sizeof(*movm)/sizeof(float); - - ASM ( - - "pushl %%ebx\n\t" - a(4,sp) - -#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) - "movl %6,%%esi\n\t" - pl(0,si,SREG) -#endif - -#ifdef NO_TRANSPOSE - "movl %1,%%esi\n\t" /* fixm */ - "movl %2,%%edi\n\t" /* fixm2fixm */ -#endif - - lf - - "movl %3,%%esi\n\t" /* a */ - "movl %4,%%edi\n\t" /* a2a */ - - lpb - - ipf(0) - - "movl %0,%%esi\n\t" /* movm */ - "movl %5,%%edi\n\t" /* len */ - -#if defined(ALIGN) - -#if defined(SREAL) - - test(4,ax) - je(Mjoin(a1,EXT)) - test(-1,di) - je(Mjoin(a1,EXT)) - sub(1,di) - bl1 - - lab(Mjoin(a1,EXT)) - -#endif - -#if defined(DREAL) || defined(SREAL) - - test(8,ax) - je(Mjoin(as,EXT)) - test(-2,di) - je(Mjoin(as,EXT)) - sub(2,di) - bl2 - - lab(Mjoin(as,EXT)) - -#endif - -#endif - - - ipf(32) - - lab(Mjoin(loop,EXT)) - - test(-UNROLL,di) - je(Mjoin(UNROLL1_2,EXT)) - sub(UNROLL,di) - - blUNROLL - - jmp(Mjoin(loop,EXT)) - -#if UNROLL > 32 - lab(Mjoin(32,EXT)) - test(32,di) - je(Mjoin(16,EXT)) - bl32 -#endif - -#if UNROLL > 16 - lab(Mjoin(16,EXT)) - test(16,di) - je(Mjoin(8,EXT)) - bl16 -#endif - -#if UNROLL > 8 - lab(Mjoin(8,EXT)) - test(8,di) - je(Mjoin(4,EXT)) - bl8 -#endif - -#if UNROLL > 4 - lab(Mjoin(4,EXT)) - test(4,di) - je(Mjoin(2,EXT)) - bl4 -#endif - -#if UNROLL > 2 - lab(Mjoin(2,EXT)) -#ifndef DCPLX - test(2,di) - je(Mjoin(1,EXT)) - bl2 -#endif -#endif - -#if UNROLL > 1 - lab(Mjoin(1,EXT)) -#ifdef SREAL - test(1,di) - je(Mjoin(stop,EXT)) - bl1 -#endif -#endif - - lab(Mjoin(stop,EXT)) - -#ifndef NO_TRANSPOSE - "movl %1,%%esi\n\t" /* fixm */ - "movl %2,%%edi\n\t" /* fixm2fixm */ -#endif - - ulf - - a(-4,sp) - "popl %%ebx\n\t" - - - ::"m" (movm),"m" (fixm),"m" (u1),"m" (a),"m" (u2),"m" (u3) - -#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2)) - ,"m" (w) -#endif - :"ax","bx","cx","dx","si","di"); - - -} - diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h deleted file mode 100644 index 7fd1404..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h +++ /dev/null @@ -1,295 +0,0 @@ -#include "camm_util.h" - -#ifndef N -#error N must be defined in camm_pipe3.h -#endif -#ifndef KB -#error KB must be defined in camm_pipe3.h -#endif - -#undef p1 -#define p1(a_) Mjoin(p1_4_,N)(a_) -#undef p2 -#define p2(a_) Mjoin(p1_2_,N)(a_) -#undef p4 -#define p4(a_) Mjoin(p1_,N)(a_) -#undef load_pipe -#define load_pipe(a_) Mjoin(lp,N)(a_) -#undef drain_pipe -#define drain_pipe(a_) Mjoin(dp,N)(a_) -#undef pipe_len -#define pipe_len Mjoin(pl,N) - -#undef p8 -#if pipe_len > 4 -#define p8(a_) Mjoin(p2_,N)(a_) -#else -#define p8(a_) p4(a_) p4(SS(a_,16)) -#endif - -#undef p16 -#if pipe_len > 8 -#define p16(a_) Mjoin(p4_,N)(a_) -#else -#define p16(a_) p8(a_) p8(SS(a_,32)) -#endif - -#undef p32 -#if pipe_len > 16 -#define p32(a_) Mjoin(p8_,N)(a_) -#else -#define p32(a_) p16(a_) p16(SS(a_,64)) -#endif - -#undef p64 -#if pipe_len > 32 -#define p64(a_) Mjoin(p16_,N)(a_) -#else -#define p64(a_) p32(a_) p32(SS(a_,128)) -#endif - -#undef p128 -#if pipe_len > 64 -#define p128(a_) Mjoin(p32_,N)(a_) -#else -#define p128(a_) p64(a_) p64(SS(a_,256)) -#endif - -#undef p256 -#if pipe_len > 128 -#define p256(a_) Mjoin(p64_,N)(a_) -#else -#define p256(a_) p128(a_) p128(SS(a_,512)) -#endif - -#if KB < pipe_len -#undef pipe_len -#define pipe_len 0 -#undef load_pipe -#define load_pipe(a_) -#undef drain_pipe -#define drain_pipe(a_) -#endif - - -#undef MKB -/* #ifdef SREAL */ -#define MKB KB -/* #elif defined (DCPLX) */ -/* #define MKB ( KB * 4 ) */ -/* #else */ -/* #define MKB ( KB * 2 ) */ -/* #endif */ - -#if MKB >= 512 -#error MKB must be less than 512 -#endif - -#undef x0 -#undef o0 -#define x0 load_pipe(0) -#define o0 0 - -#undef MKBB -#define MKBB ( MKB - pipe_len ) - -#undef xx1 -#undef oo1 -#if MKBB >= 256 -#define xx1 x0 p256(o0) -#define oo1 SS(1024,o0) -#else -#define xx1 x0 -#define oo1 o0 -#endif - -#undef xx1a -#undef oo1a -#if pipe_len == 256 -#define xx1a xx1 drain_pipe(oo1) -#define oo1a SS(1024,oo1) -#undef MKBB -#define MKBB MKB -#else -#define xx1a xx1 -#define oo1a oo1 -#endif - -#undef x1 -#undef o1 -#if ( MKBB / 128 ) % 2 -#define x1 xx1a p128(oo1a) -#define o1 SS(512,oo1a) -#else -#define x1 xx1a -#define o1 oo1a -#endif - -#undef x1a -#undef o1a -#if pipe_len == 128 -#define x1a x1 drain_pipe(o1) -#define o1a SS(512,o1) -#undef MKBB -#define MKBB MKB -#else -#define x1a x1 -#define o1a o1 -#endif - -#undef x2 -#undef o2 -#if ( MKBB / 64 ) % 2 -#define x2 x1a p64(o1a) -#define o2 SS(256,o1a) -#else -#define x2 x1a -#define o2 o1a -#endif - -#undef x2a -#undef o2a -#if pipe_len == 64 -#define x2a x2 drain_pipe(o2) -#define o2a SS(256,o2) -#undef MKBB -#define MKBB MKB -#else -#define x2a x2 -#define o2a o2 -#endif - -#undef x3 -#undef o3 -#if ( MKBB / 32 ) % 2 -#define x3 x2a p32(o2a) -#define o3 SS(128,o2a) -#else -#define x3 x2a -#define o3 o2a -#endif - -#undef x3a -#undef o3a -#if pipe_len == 32 -#define x3a x3 drain_pipe(o3) -#define o3a SS(128,o3) -#undef MKBB -#define MKBB MKB -#else -#define x3a x3 -#define o3a o3 -#endif - -#undef x4 -#undef o4 -#if ( MKBB / 16 ) % 2 -#define x4 x3a p16(o3a) -#define o4 SS(64,o3a) -#else -#define x4 x3a -#define o4 o3a -#endif - -#undef x4a -#undef o4a -#if pipe_len == 16 -#define x4a x4 drain_pipe(o4) -#define o4a SS(64,o4) -#undef MKBB -#define MKBB MKB -#else -#define x4a x4 -#define o4a o4 -#endif - -#undef x5 -#undef o5 -#if ( MKBB / 8 ) % 2 -#define x5 x4a p8(o4a) -#define o5 SS(32,o4a) -#else -#define x5 x4a -#define o5 o4a -#endif - -#undef x5a -#undef o5a -#if pipe_len == 8 -#define x5a x5 drain_pipe(o5) -#define o5a SS(32,o5) -#undef MKBB -#define MKBB MKB -#else -#define x5a x5 -#define o5a o5 -#endif - -#undef x6 -#undef o6 -#if ( MKBB / 4 ) % 2 -#define x6 x5a p4(o5a) -#define o6 SS(16,o5a) -#else -#define x6 x5a -#define o6 o5a -#endif - -#undef x6a -#undef o6a -#if pipe_len == 4 -#define x6a x6 drain_pipe(o6) -#define o6a SS(16,o6) -#undef MKBB -#define MKBB MKB -#else -#define x6a x6 -#define o6a o6 -#endif - -#undef x7 -#undef o7 -#if ( MKB / 2 ) % 2 -#define x7 x6a p2(o6a) -#define o7 SS(8,o6a) -#else -#define x7 x6a -#define o7 o6a -#endif - -#undef x7a -#undef o7a -#if pipe_len == 2 -#define x7a x7 drain_pipe(o7) -#define o7a SS(8,o7) -#undef MKBB -#define MKBB MKB -#else -#define x7a x7 -#define o7a o7 -#endif - -#undef x8 -#undef o8 -#if ( MKB / 1 ) % 2 -#define x8 x7a p1(o7a) -#define o8 SS(4,o7a) -#else -#define x8 x7a -#define o8 o7a -#endif - -#undef x8a -#undef o8a -#if pipe_len == 1 -#define x8a x8 drain_pipe(o8) -#define o8a SS(4,o8) -#undef MKBB -#define MKBB MKB -#else -#define x8a x8 -#define o8a o8 -#endif - -#undef KB_block -#define KB_block x8a diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h deleted file mode 100644 index 35e9e59..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h +++ /dev/null @@ -1,215 +0,0 @@ -#ifndef CAMM_SCALE_H -#define CAMM_SCALE_H /*+ To stop multiple inclusions. +*/ - -#include "camm_util.h" - -#undef spf -#define spf(a_,b_) f(t0,a_,b_) - -#ifdef SCPLX -#ifdef BETAX -#undef SSREG -#define SSREG 2 -#undef lbx -#define lbx pls(4,ax,1) ps(0,1,1) pm(SSREG,1) -#undef cxx -#define cxx pm(1,3) ps(177,3,3) pa(3,2) -#undef pcx -#define pcx pc(2,3) -#else -#undef lbx -#define lbx -#undef cxx -#define cxx -#undef pcx -#define pcx -#endif -#undef lb -#define lb pls(0,ax,0) ps(0,0,0) lbx -#undef c -#define c(a_) pl(a_ ## 0,si,2) pcx pm(0,2) cxx pu(2,a_ ## 0,si) -#undef cp -#define cp(a_,b_) pl(a_ ## 0,si,2) pcx pm(0,2) spf(b_,si) cxx pu(2,a_ ## 0,si) -#undef c1_2 -#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pcx pm(0,2) cxx pud(2,a_ ## 0,si) -#undef ub -#define ub -#endif - -#ifdef SREAL -#undef lb -#define lb pls(0,ax,0) ps(0,0,0) -#undef c -#define c(a_) pl(a_ ## 0,si,2) pm(0,2) pu(2,a_ ## 0,si) -#undef cp -#define cp(a_,b_) pl(a_ ## 0,si,2) spf(b_,si) pm(0,2) pu(2,a_ ## 0,si) -#undef c1_2 -#define c1_2(a_) px(2) pld(a_ ## 0,si,2) pm(0,2) pud(2,a_ ## 0,si) -#undef c1_4 -#define c1_4(a_) pls(a_ ## 0,si,2) pm(0,2) pus(2,a_ ## 0,si) -#undef ub -#define ub -#endif - -#ifdef DREAL -#undef lb -#define lb fl(0,ax) -#undef c -#define c(a_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) fm(2,0) fx1 \ - fp(a_ ## 0,si) fp(a_ ## 8,si) -#undef cp -#define cp(a_,b_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) spf(b_,si) fm(2,0) fx1 \ - fp(a_ ## 0,si) fp(a_ ## 8,si) -#undef c1_2 -#define c1_2(a_) fl(a_ ## 0,si) fm(1,0) fp(a_ ## 0,si) -#undef ub -#define ub fc(0) -#endif - -#ifdef DCPLX -#undef lb -#define lb fl(0,ax) fl(8,ax) -#undef c -#define c(a_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \ - fm(2,0) fx(3) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) fsp(2) fx1 \ - fp(a_ ## 0,si) fp(a_ ## 8,si) -#undef cp -#define cp(a_,b_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \ - fm(2,0) fx(3) spf(b_,si) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) \ - fsp(2) fx1 fp(a_ ## 0,si) fp(a_ ## 8,si) -#undef ub -#define ub fc(0) fc(0) -#endif - -#undef sbl1 -#define sbl1 c1_4(0x0) -#undef sbl2 -#define sbl2 c1_2(0x0) -#undef sbl4 -#define sbl4 cp(0x0,0x40) -#undef sbl8 -#define sbl8 sbl4 c(0x1) -#undef sbl16 -#define sbl16 sbl8 cp(0x2,0x60) c(0x3) - -#undef sinc16 -#define sinc16 a(0x40,si) -#undef sinc8 -#define sinc8 a(0x20,si) -#undef sinc4 -#define sinc4 a(0x10,si) -#undef sinc2 -#define sinc2 a(0x8,si) -#undef sinc1 -#define sinc1 a(0x4,si) - -#undef SCALE -#define SCALE Mjoin(Mjoin(PREC,Mjoin(scale,BLC)),FEXT) - -#undef MY_FUNCTION -#define MY_FUNCTION SCALE - -static void -MY_FUNCTION(const TYPE *b,TYPE *c,int len) { - - const TYPE *ce=c+len; -#if defined(BETAX) && defined(SCPLX) - const TYPE z1[2]={{1.0,-1.0},{1.0,-1.0}},*z=z1; -#endif - NO_INLINE - -#ifndef SREAL - len+=len; -#endif -#ifdef DCPLX - len+=len; -#endif - - - ASM( - - "pushl %%ebx\n\t" - a(4,sp) - - - "movl %0,%%esi\n\t" - - spf(0x00,si) - spf(0x20,si) - - "movl %1,%%eax\n\t" - "movl %2,%%edi\n\t" - -#if defined(BETAX) && defined(SCPLX) - "movl %3,%%ebx\n\t" - pl(0,bx,SSREG) -#endif - - lb - - lab(loop) - - test(-16,di) - je(8) - sub(16,di) - align - - sbl16 - sinc16 - - jmp(loop) - align - - lab(8) - - test(8,di) - je(4) - - sbl8 - sinc8 - - lab(4) - - test(4,di) - je(2) - - sbl4 - sinc4 - - lab(2) - -#ifndef DCPLX - test(2,di) - je(1) - - sbl2 - sinc2 - - lab(1) - -#ifdef SREAL - test(1,di) - je(stop) - - sbl1 - sinc1 - - lab(stop) -#endif -#endif - - ub - - a(-4,sp) - "popl %%ebx\n\t" - - - ::"m" (c),"m" (b), "m" (len) -#if defined(BETAX) && defined(SCPLX) - ,"m" (z) -#endif - : "si","ax","di"); - - -} -#endif /* CAMM_SCALE_H */ diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h deleted file mode 100644 index 4a92006..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h +++ /dev/null @@ -1,2982 +0,0 @@ -#include "camm_util.h" - -#undef p1_4_swap_1 -#define p1_4_swap_1(a_) \ - pls(a_,ax,1) \ - pls(a_,cx,0) \ - pus(0,a_,ax) \ - pus(1,a_,cx) -#undef p1_2_swap_1 -#define p1_2_swap_1(a_) \ - px(1) \ - pld(a_,ax,1) \ - px(0) \ - pld(a_,cx,0) \ - pud(0,a_,ax) \ - pud(1,a_,cx) -#undef p1_swap_1 -#define p1_swap_1(a_) \ - plq(a_,ax,1) \ - pl(a_,cx,0) \ - puq(0,a_,ax) \ - pu(1,a_,cx) -#undef p2_swap_1 -#define p2_swap_1(a_) \ - plq(SS(a_,RS4),ax,3) \ - pl(SS(a_,RS4),cx,2) \ - puq(0,a_,ax) \ - pu(1,a_,cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,0) \ - puq(2,SS(a_,RS4),ax) \ - pu(3,SS(a_,RS4),cx) -#undef lpswap_1 -#define lpswap_1(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,1) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,0) -#undef dpswap_1 -#define dpswap_1(a_) \ - plq(SS(a_,RS4),ax,3) \ - pl(SS(a_,RS4),cx,2) \ - puq(0,a_,ax) \ - pu(1,a_,cx) \ - puq(2,SS(a_,RS4),ax) \ - pu(3,SS(a_,RS4),cx) -#undef plswap_1 -#define plswap_1 8 - - -#undef p1_4_scal_3 -#define p1_4_scal_3(a_) \ - pls(a_,ax,0) \ - pmsr(6,0) \ - pus(0,a_,ax) -#undef p1_2_scal_3 -#define p1_2_scal_3(a_) \ - pld(a_,ax,0) \ - pm(6,0) \ - pud(0,a_,ax) -#undef p1_scal_3 -#define p1_scal_3(a_) \ - plq(a_,ax,0) \ - pm(6,0) \ - puq(0,a_,ax) -#undef p2_scal_3 -#define p2_scal_3(a_) \ - plq(a_,ax,0) \ - plq(SS(a_,RS4),ax,1) \ - pm(6,0) \ - pm(6,1) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) -#undef p4_scal_3 -#define p4_scal_3(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(6,2) \ - puq(0,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - pm(6,3) \ - puq(1,SS(a_,RS4),ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - pm(6,0) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - pm(6,1) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) -#undef lpscal_3 -#define lpscal_3(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pm(6,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pm(6,1) -#undef dpscal_3 -#define dpscal_3(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(6,2) \ - puq(0,a_,ax) \ - pm(6,3) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef plscal_3 -#define plscal_3 16 - -#undef p1_4_scal_3c -#define p1_4_scal_3c(a_) -#undef p1_2_scal_3c -#define p1_2_scal_3c(a_) \ - pld(a_,ax,0) \ - pc(0,1) \ - pm(6,0) \ - ps(CSHUF,1,1) \ - pm(7,1) \ - pa(1,0) \ - pud(0,a_,ax) -#undef p1_scal_3c -#define p1_scal_3c(a_) \ - plq(a_,ax,0) \ - pc(0,1) \ - pm(6,0) \ - ps(CSHUF,1,1) \ - pm(7,1) \ - pa(1,0) \ - puq(0,a_,ax) -#undef p2_scal_3c -#define p2_scal_3c(a_) \ - plq(a_,ax,0) \ - plq(SS(a_,RS4),ax,1) \ - pc(0,2) \ - pm(6,0) \ - ps(CSHUF,2,2) \ - pm(7,2) \ - pa(2,0) \ - puq(0,a_,ax) \ - pc(1,3) \ - pm(6,1) \ - ps(CSHUF,3,3) \ - pm(7,3) \ - pa(3,1) \ - puq(1,SS(a_,RS4),ax) -#undef p4_scal_3c -#define p4_scal_3c(a_) \ - pm(7,5) \ - pa(5,1) \ - puq(0,a_,ax) \ - ps(CSHUF,4,4) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - pc(3,5) \ - pm(6,3) \ - pm(7,4) \ - pa(4,2) \ - puq(1,SS(a_,RS4),ax) \ - ps(CSHUF,5,5) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - pc(0,4) \ - pm(6,0) \ - pm(7,5) \ - pa(5,3) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - ps(CSHUF,4,4) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - pc(1,5) \ - pm(6,1) \ - pm(7,4) \ - pa(4,0) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - ps(CSHUF,5,5) \ - plq(SS(a_,MM(7,RS4)),ax,3) \ - pc(2,4) \ - pm(6,2) -#undef lpscal_3c -#define lpscal_3c(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pc(0,4) \ - pm(6,0) \ - ps(CSHUF,4,4) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pc(1,5) \ - pm(6,1) \ - pm(7,4) \ - pa(4,0) \ - ps(CSHUF,5,5) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pc(2,4) \ - pm(6,2) -#undef dpscal_3c -#define dpscal_3c(a_) \ - pm(7,5) \ - pa(5,1) \ - ps(CSHUF,4,4) \ - puq(0,a_,ax) \ - pm(7,4) \ - pa(4,2) \ - pc(3,5) \ - pm(6,3) \ - puq(1,SS(a_,RS4),ax) \ - ps(CSHUF,5,5) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - pm(7,5) \ - pa(5,3) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef plscal_3c -#define plscal_3c 16 - -#undef p1_4_scal_4 -#define p1_4_scal_4(a_) \ - pls(SS(a_,MM(0,RS4)),ax,0) \ - pmsr(6,0) \ - pus(0,a_,ax) -#undef p1_2_scal_4 -#define p1_2_scal_4(a_) \ - pld(SS(a_,MM(0,RS4)),ax,0) \ - pm(6,0) \ - pud(0,a_,ax) -#undef p1_scal_4 -#define p1_scal_4(a_) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - pm(6,0) \ - puq(0,a_,ax) -#undef p2_scal_4 -#define p2_scal_4(a_) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pm(6,0) \ - pm(6,1) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) -#undef p4_scal_4 -#define p4_scal_4(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(6,0) \ - pm(6,1) \ - pm(6,2) \ - pm(6,3) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef p8_scal_4 -#define p8_scal_4(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - plq(SS(a_,MM(4,RS4)),ax,4) \ - plq(SS(a_,MM(5,RS4)),ax,5) \ - plq(SS(a_,MM(6,RS4)),ax,7) \ - pm(6,0) \ - pm(6,1) \ - pm(6,2) \ - puq(0,a_,ax) \ - pm(6,3) \ - pm(6,4) \ - pm(6,5) \ - plq(SS(a_,MM(7,RS4)),ax,0) \ - pm(6,7) \ - pm(6,0) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - puq(4,SS(a_,MM(4,RS4)),ax) \ - puq(5,SS(a_,MM(5,RS4)),ax) \ - puq(7,SS(a_,MM(6,RS4)),ax) \ - puq(0,SS(a_,MM(7,RS4)),ax) -#undef lpscal_4 -#define lpscal_4(a_) -#undef dpscal_4 -#define dpscal_4(a_) p4_scal_4(a_) -#undef plscal_4 -#define plscal_4 16 - -#undef p1_4_scal_4c -#define p1_4_scal_4c(a_) -#undef p1_2_scal_4c -#define p1_2_scal_4c(a_) \ - pld(a_,ax,0) \ - pc(0,1) \ - pm(6,0) \ - ps(CSHUF,1,1) \ - pm(7,1) \ - pa(1,0) \ - pud(0,a_,ax) -#undef p1_scal_4c -#define p1_scal_4c(a_) \ - plq(a_,ax,0) \ - pc(0,1) \ - pm(6,0) \ - ps(CSHUF,1,1) \ - pm(7,1) \ - pa(1,0) \ - puq(0,a_,ax) -#undef p2_scal_4c -#define p2_scal_4c(a_) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pc(0,4) \ - pc(1,5) \ - pm(6,0) \ - pm(6,1) \ - ps(CSHUF,4,4) \ - ps(CSHUF,5,5) \ - pm(7,4) \ - pa(4,0) \ - pm(7,5) \ - pa(5,1) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) -#undef p4_scal_4c -#define p4_scal_4c(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pc(0,4) \ - pc(1,5) \ - pm(6,0) \ - pm(6,1) \ - ps(CSHUF,4,4) \ - ps(CSHUF,5,5) \ - pm(7,4) \ - pa(4,0) \ - pc(2,4) \ - pm(7,5) \ - pa(5,1) \ - pc(3,5) \ - pm(6,2) \ - pm(6,3) \ - ps(CSHUF,4,4) \ - ps(CSHUF,5,5) \ - pm(7,4) \ - pa(4,2) \ - pm(7,5) \ - pa(5,3) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef lpscal_4c -#define lpscal_4c(a_) -#undef dpscal_4c -#define dpscal_4c(a_) p4_scal_4c(a_) -#undef plscal_4c -#define plscal_4c 16 - -#undef p1_4_scal_1 -#define p1_4_scal_1(a_) \ - pls(a_,ax,1) \ - pmsr(0,1) \ - pus(1,a_,ax) -#undef p1_2_scal_1 -#define p1_2_scal_1(a_) \ - px(1) \ - pld(a_,ax,1) \ - pm(0,1) \ - pud(1,a_,ax) -#undef p1_scal_1 -#define p1_scal_1(a_) \ - plq(a_,ax,1) \ - pm(0,1) \ - puq(1,a_,ax) -#undef p2_scal_1 -#define p2_scal_1(a_) \ - plq(a_,ax,1) \ - plq(SS(a_,RS4),ax,2) \ - pm(0,1) \ - pm(0,2) \ - puq(1,a_,ax) \ - puq(2,SS(a_,RS4),ax) -#undef p4_scal_1 -#define p4_scal_1(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pm(0,3) \ - puq(7,a_,ax) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pm(0,1) \ - puq(3,SS(a_,MM(1,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,7) \ - pm(0,2) \ - puq(1,SS(a_,MM(2,RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,3) \ - pm(0,7) \ - puq(2,SS(a_,MM(3,RS4)),ax) -#undef lpscal_1 -#define lpscal_1(a_) \ - plq(a_,ax,7) \ - plq(SS(a_,MM(1,RS4)),ax,3) \ - pm(0,7) -#undef dpscal_1 -#define dpscal_1(a_) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pm(0,3) \ - puq(7,a_,ax) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pm(0,1) \ - puq(3,SS(a_,MM(1,RS4)),ax) \ - pm(0,2) \ - puq(1,SS(a_,MM(2,RS4)),ax) \ - puq(2,SS(a_,MM(3,RS4)),ax) -#undef plscal_1 -#define plscal_1 RS4 - - -#undef p1_4_set_1 -#define p1_4_set_1(a_) \ - pls(a_,ax,1) \ - pcs(0,1) \ - pus(1,a_,ax) -#undef p1_2_set_1 -#define p1_2_set_1(a_) \ - px(1) \ - pld(a_,ax,1) \ - pc(0,1) \ - pud(1,a_,ax) -#undef p1_set_1 -#define p1_set_1(a_) \ - plq(a_,ax,1) \ - pc(0,1) \ - puq(1,a_,ax) -#undef p2_set_1 -#define p2_set_1(a_) \ - plq(a_,ax,1) \ - plq(SS(a_,RS4),ax,2) \ - pc(0,1) \ - pc(0,2) \ - puq(1,a_,ax) \ - puq(2,SS(a_,RS4),ax) -#undef p4_set_1 -#define p4_set_1(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pc(0,3) \ - puq(7,a_,ax) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pc(0,1) \ - puq(3,SS(a_,MM(1,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,7) \ - pc(0,2) \ - puq(1,SS(a_,MM(2,RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,3) \ - pc(0,7) \ - puq(2,SS(a_,MM(3,RS4)),ax) -#undef lpset_1 -#define lpset_1(a_) \ - plq(a_,ax,7) \ - plq(SS(a_,MM(1,RS4)),ax,3) \ - pc(0,7) -#undef dpset_1 -#define dpset_1(a_) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pc(0,3) \ - puq(7,a_,ax) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pc(0,1) \ - puq(3,SS(a_,MM(1,RS4)),ax) \ - pc(0,2) \ - puq(1,SS(a_,MM(2,RS4)),ax) \ - puq(2,SS(a_,MM(3,RS4)),ax) -#undef plset_1 -#define plset_1 RS4 - - -#undef p1_4_set_2 -#define p1_4_set_2(a_) \ - pus(0,a_,ax) -#undef p1_2_set_2 -#define p1_2_set_2(a_) \ - pud(0,a_,ax) -#undef p1_set_2 -#define p1_set_2(a_) \ - puq(0,a_,ax) -#undef p2_set_2 -#define p2_set_2(a_) \ - puq(0,a_,ax) \ - puq(0,SS(a_,RS4),ax) -#undef p4_set_2 -#define p4_set_2(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - puq(0,a_,ax) \ - puq(0,SS(a_,MM(1,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - puq(0,SS(a_,MM(2,RS4)),ax) \ - puq(0,SS(a_,MM(3,RS4)),ax) -#undef lpset_2 -#define lpset_2(a_) -#undef dpset_2 -#define dpset_2(a_) \ - puq(0,a_,ax) \ - puq(0,SS(a_,MM(1,RS4)),ax) \ - puq(0,SS(a_,MM(2,RS4)),ax) \ - puq(0,SS(a_,MM(3,RS4)),ax) -#undef plset_2 -#define plset_2 RS4 - - -#undef p1_4_set_3 -#define p1_4_set_3(a_) \ - pus(0,a_,ax) -#undef p1_2_set_3 -#define p1_2_set_3(a_) \ - pud(0,a_,ax) -#undef p1_set_3 -#define p1_set_3(a_) \ - puq(0,SS(a_,MM(0,RS4)),ax) -#undef p2_set_3 -#define p2_set_3(a_) \ - puq(0,SS(a_,MM(0,RS4)),ax) \ - puq(0,SS(a_,MM(1,RS4)),ax) -#undef p4_set_3 -#define p4_set_3(a_) \ - puq(0,SS(a_,MM(0,RS4)),ax) \ - puq(0,SS(a_,MM(1,RS4)),ax) \ - puq(0,SS(a_,MM(2,RS4)),ax) \ - puq(0,SS(a_,MM(3,RS4)),ax) -#undef p8_set_3 -#define p8_set_3(a_) \ - puq(0,SS(a_,MM(0,RS4)),ax) \ - puq(0,SS(a_,MM(1,RS4)),ax) \ - puq(0,SS(a_,MM(2,RS4)),ax) \ - puq(0,SS(a_,MM(3,RS4)),ax) \ - puq(0,SS(a_,MM(4,RS4)),ax) \ - puq(0,SS(a_,MM(5,RS4)),ax) \ - puq(0,SS(a_,MM(6,RS4)),ax) \ - puq(0,SS(a_,MM(7,RS4)),ax) -#undef lpset_3 -#define lpset_3(a_) -#undef dpset_3 -#define dpset_3(a_) p8_set_3(a_) -#undef plset_3 -#define plset_3 32 - - -#undef p1_4_0x1_nrm2_1 -#define p1_4_0x1_nrm2_1(a_) \ - pls(a_,ax,1) \ - pmsr(1,1) \ - pasr(1,0) -#undef p1_2_0x1_nrm2_1 -#define p1_2_0x1_nrm2_1(a_) \ - px(1) \ - pld(a_,ax,1) \ - pm(1,1) \ - pa(1,0) -#undef p1_0x1_nrm2_1 -#define p1_0x1_nrm2_1(a_) \ - plq(a_,ax,1) \ - pm(1,1) \ - pa(1,0) -#undef p2_0x1_nrm2_1 -#define p2_0x1_nrm2_1(a_) \ - plq(a_,ax,1) \ - plq(SS(a_,RS4),ax,2) \ - pm(1,1) \ - pm(2,2) \ - pa(1,0) \ - pm(2,0) -#undef p4_0x1_nrm2_1 -#define p4_0x1_nrm2_1(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pm(3,3) \ - pa(7,0) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pm(1,1) \ - pa(3,0) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,7) \ - pm(2,2) \ - pa(1,0) \ - plq(SS(a_,MM(5,RS4)),ax,3) \ - pm(7,7) \ - pa(2,0) -#undef lp0x1_nrm2_1 -#define lp0x1_nrm2_1(a_) \ - plq(a_,ax,7) \ - plq(SS(a_,MM(1,RS4)),ax,3) \ - pm(7,7) -#undef dp0x1_nrm2_1 -#define dp0x1_nrm2_1(a_) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pm(3,3) \ - pa(7,0) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pm(1,1) \ - pa(3,0) \ - pm(2,2) \ - pa(1,0) \ - pa(2,0) -#undef pl0x1_nrm2_1 -#define pl0x1_nrm2_1 RS4 - - -#undef p1_4_nrm2_2 -#define p1_4_nrm2_2(a_) \ - pls(a_,ax,1) dbg(1) \ - pan(4,1) dbg(1) \ - pcs(5,6) dbg(6) \ - pcs(5,7) dbg(7) \ - paxs(1,5) dbg(5) \ - prps(5,2) dbg(2) \ - px(3) \ - pcms(0,2,3) dbg(3) \ - pan(3,7) dbg(7) \ - pann(5,3) dbg(3) \ - pasr(3,7) dbg(7) \ - pcs(7,5) dbg(5) \ - pdsr(5,6) dbg(6) \ - pdsr(5,1) dbg(1) \ - pmsr(6,6) dbg(6) \ - pmsr(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pasr(1,0) dbg(0) -#undef p1_2_nrm2_2 -#define p1_2_nrm2_2(a_) \ - px(1) pld(a_,ax,1) dbg(1) \ - pan(4,1) dbg(1) \ - pc(5,6) dbg(6) \ - pc(5,7) dbg(7) \ - pax(1,5) dbg(5) \ - prp(5,2) dbg(2) \ - px(3) \ - pcm(0,2,3)dbg(3) \ - pan(3,7) dbg(7) \ - pann(5,3) dbg(3) \ - pa(3,7) dbg(7) \ - pc(7,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#undef p1_nrm2_2 -#define p1_nrm2_2(a_) \ - plq(a_,ax,1) dbg(1) \ - pan(4,1) dbg(1) \ - pc(5,6) dbg(6) \ - pc(5,7) dbg(7) \ - pax(1,5) dbg(5) \ - prp(5,2) dbg(2) \ - px(3) \ - pcm(0,2,3)dbg(3) \ - pan(3,7) dbg(7) \ - pann(5,3) dbg(3) \ - pa(3,7) dbg(7) \ - pc(7,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#define p2_nrm2_2(a_) \ - plq(SS(a_,RS4),ax,1) dbg(1) \ - pan(4,1) dbg(1) \ - pc(5,6) dbg(6) \ - pc(5,7) dbg(7) \ - pax(1,5) dbg(5) \ - prp(5,2) dbg(2) \ - px(3) \ - pcm(0,2,3)dbg(3) \ - pan(3,7) dbg(7) \ - pann(5,3) dbg(3) \ - pa(3,7) dbg(7) \ - pc(7,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ - pan(4,1) dbg(1) \ - pc(5,6) dbg(6) \ - pc(5,7) dbg(7) \ - pax(1,5) dbg(5) \ - prp(5,2) dbg(2) \ - px(3) \ - pcm(0,2,3)dbg(3) \ - pan(3,7) dbg(7) \ - pann(5,3) dbg(3) \ - pa(3,7) dbg(7) \ - pc(7,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#undef lpnrm2_2 -#define lpnrm2_2(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ - pan(4,1) dbg(1) \ - pc(5,6) dbg(6) \ - pc(5,7) dbg(7) \ - pax(1,5) dbg(5) \ - prp(5,2) dbg(2) \ - px(3) \ - pcm(0,2,3)dbg(3) \ - pan(3,7) dbg(7) \ - pann(5,3) dbg(3) \ - pa(3,7) dbg(7) \ - pc(7,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#undef dpnrm2_2 -#define dpnrm2_2(a_) \ - plq(SS(a_,RS4),ax,1) dbg(1) \ - pan(4,1) dbg(1) \ - pc(5,6) dbg(6) \ - pc(5,7) dbg(7) \ - pax(1,5) dbg(5) \ - prp(5,2) dbg(2) \ - px(3) \ - pcm(0,2,3)dbg(3) \ - pan(3,7) dbg(7) \ - pann(5,3) dbg(3) \ - pa(3,7) dbg(7) \ - pc(7,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#undef plnrm2_2 -#define plnrm2_2 8 - - -#undef p1_4_nrm2_3 -#define p1_4_nrm2_3(a_) \ - pls(a_,ax,1) dbg(1) \ - pcs(5,6) dbg(6) \ - pan(4,1) dbg(1) \ - paxs(1,5) dbg(5) \ - pdsr(5,6) dbg(6) \ - pdsr(5,1) dbg(1) \ - pmsr(6,6) dbg(6) \ - pmsr(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pasr(1,0) dbg(0) -#undef p1_2_nrm2_3 -#define p1_2_nrm2_3(a_) \ - px(1) pld(a_,ax,1) dbg(1) \ - pc(5,6) dbg(6) \ - pan(4,1) dbg(1) \ - pax(1,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#undef p1_nrm2_3 -#define p1_nrm2_3(a_) \ - plq(a_,ax,1) dbg(1) \ - pc(5,6) dbg(6) \ - pan(4,1) dbg(1) \ - pax(1,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#define p2_nrm2_3(a_) \ - plq(SS(a_,RS4),ax,1) dbg(1) \ - pc(5,6) dbg(6) \ - pan(4,1) dbg(1) \ - pax(1,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ - pc(5,6) dbg(6) \ - pan(4,1) dbg(1) \ - pax(1,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#undef lpnrm2_3 -#define lpnrm2_3(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ - pc(5,6) dbg(6) \ - pan(4,1) dbg(1) \ - pax(1,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#undef dpnrm2_3 -#define dpnrm2_3(a_) \ - plq(SS(a_,RS4),ax,1) dbg(1) \ - pc(5,6) dbg(6) \ - pan(4,1) dbg(1) \ - pax(1,5) dbg(5) \ - pd(5,6) dbg(6) \ - pd(5,1) dbg(1) \ - pm(6,6) dbg(6) \ - pm(1,1) dbg(1) \ - pm(6,0) dbg(0) \ - pa(1,0) dbg(0) -#undef plnrm2_3 -#define plnrm2_3 8 - -#define block_nrm2_4(a_,b_) \ - Mjoin(pc,a_)(5,6) dbg(6) \ - pan(4,1) dbg(1) \ - Mjoin(pax,a_)(1,5) dbg(5) \ - Mjoin(pc,a_)(2,7) dbg(7) \ - Mjoin(pd,b_)(5,7) dbg(7) \ - Mjoin(pm,b_)(7,6) dbg(6) \ - Mjoin(pm,b_)(7,1) dbg(1) \ - Mjoin(pm,b_)(6,6) dbg(6) \ - Mjoin(pm,b_)(6,0) dbg(0) \ - Mjoin(pm,b_)(1,1) dbg(1) \ - Mjoin(pa,b_)(1,0) dbg(0) - - -/* #undef p1_4_nrm2_4 */ -/* #define p1_4_nrm2_4(a_) \ */ -/* pls(a_,ax,1) dbg(1) \ */ -/* pcs(5,6) dbg(6) \ */ -/* pan(4,1) dbg(1) \ */ -/* paxs(1,5) dbg(5) \ */ -/* pcs(2,7) dbg(7) \ */ -/* pdsr(5,7) dbg(7) \ */ -/* pmsr(7,6) dbg(6) \ */ -/* pmsr(7,1) dbg(1) \ */ -/* pmsr(6,6) dbg(6) \ */ -/* pmsr(6,0) dbg(0) \ */ -/* pmsr(1,1) dbg(1) \ */ -/* pasr(1,0) dbg(0) */ -#undef p1_4_nrm2_4 -#define p1_4_nrm2_4(a_) \ - pls(a_,ax,1) dbg(1) \ - block_nrm2_4(s,sr) -#undef p1_2_nrm2_4 -#define p1_2_nrm2_4(a_) \ - px(1) pld(a_,ax,1) dbg(1) \ - block_nrm2_4(,) -#undef p1_nrm2_4 -#define p1_nrm2_4(a_) \ - plq(a_,ax,1) dbg(1) \ - block_nrm2_4(,) -#define p2_nrm2_4(a_) \ - plq(SS(a_,RS4),ax,1) dbg(1) \ - block_nrm2_4(,) \ - plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - block_nrm2_4(,) -#undef lpnrm2_4 -#define lpnrm2_4(a_) \ - plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - block_nrm2_4(,) -#undef dpnrm2_4 -#define dpnrm2_4(a_) \ - plq(SS(a_,RS4),ax,1) dbg(1) \ - block_nrm2_4(,) -#undef plnrm2_4 -#define plnrm2_4 8 - - -#undef p1_4_1x1_1 -#define p1_4_1x1_1(a_) \ - pls(a_,ax,1) \ - pls(a_,bx,0) \ - pm(0,1) \ - pa(1,6) -#undef p1_2_1x1_1 -#define p1_2_1x1_1(a_) \ - pld(a_,ax,1) \ - pld(a_,bx,0) \ - pm(0,1) \ - pa(1,6) -#undef p1_1x1_1 -#define p1_1x1_1(a_) \ - plq(a_,ax,1) \ - plq(a_,bx,0) \ - pm(0,1) \ - pa(0,6) -#undef p2_1x1_1 -#define p2_1x1_1(a_) \ - plq(a_,ax,1) \ - plq(a_,bx,0) \ - plq(SS(a_,RS4),ax,2) \ - plq(SS(a_,RS4),bx,3) \ - pm(0,1) \ - pm(2,3) \ - pa(1,6) \ - pa(3,6) -#undef p4_1x1_1 -#define p4_1x1_1(a_) \ - f(nta,SS(a_,MM(4,RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pm(0,3) \ - puq(7,a_,ax) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pm(0,1) \ - puq(3,SS(a_,RS4),ax) \ - f(nta,SS(a_,MM(6,RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,7) \ - pm(0,2) \ - puq(1,SS(a_,MM(2,RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,3) \ - pm(0,7) \ - puq(2,SS(a_,MM(3,RS4)),ax) -#undef lp1x1_1 -#define lp1x1_1(a_) \ - plq(a_,ax,7) \ - plq(SS(a_,RS4),ax,3) \ - pm(0,7) -#undef dp1x1_1 -#define dp1x1_1(a_) \ - plq(SS(,a_,MM(2,RS4)),ax,1) \ - pm(0,3) \ - puq(7,a_,ax) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pm(0,1) \ - puq(3,SS(a_,RS4),ax) \ - pm(0,2) \ - puq(1,SS(a_,MM(2,RS4)),ax) \ - puq(2,SS(a_,MM(3,RS4)),ax) -#undef pl1x1_1 -#define pl1x1_1 RS4 - - -#undef p1_4_0x1_asum_1 -#define p1_4_0x1_asum_1(a_) \ - pls(a_,ax,1) \ - pan(4,1) \ - pasr(1,0) -#undef p1_2_0x1_asum_1 -#define p1_2_0x1_asum_1(a_) \ - px(1) \ - pld(a_,ax,1) \ - pan(4,1) \ - pa(1,0) -#undef p1_0x1_asum_1 -#define p1_0x1_asum_1(a_) \ - plq(a_,ax,1) \ - pan(4,1) \ - pa(1,0) -#undef p2_0x1_asum_1 -#define p2_0x1_asum_1(a_) \ - plq(a_,ax,1) \ - plq(SS(a_,RS4),ax,2) \ - pan(4,1) \ - pan(4,2) \ - pa(1,0) \ - pa(2,0) -#undef p4_0x1_asum_1 -#define p4_0x1_asum_1(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pan(4,3) \ - pa(7,0) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pan(4,1) \ - pa(3,0) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,7) \ - pan(4,2) \ - pa(1,0) \ - plq(SS(a_,MM(5,RS4)),ax,3) \ - pan(4,7) \ - pa(2,0) -#undef lp0x1_asum_1 -#define lp0x1_asum_1(a_) \ - plq(a_,ax,7) \ - plq(SS(a_,MM(1,RS4)),ax,3) \ - pan(4,7) -#undef dp0x1_asum_1 -#define dp0x1_asum_1(a_) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pan(4,3) \ - pa(7,0) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pan(4,1) \ - pa(3,0) \ - pan(4,2) \ - pa(1,0) \ - pa(2,0) -#undef pl0x1_asum_1 -#define pl0x1_asum_1 RS4 - - -#undef p1_4_sum_1 -#define p1_4_sum_1(a_) \ - pls(a_,ax,1) \ - pasr(1,0) -#undef p1_2_sum_1 -#define p1_2_sum_1(a_) \ - px(1) \ - pld(a_,ax,1) \ - pa(1,0) -#undef p1_sum_1 -#define p1_sum_1(a_) \ - plq(a_,ax,1) \ - pa(1,0) -#undef p2_sum_1 -#define p2_sum_1(a_) \ - plq(a_,ax,1) \ - plq(SS(a_,RS4),ax,2) \ - pa(1,0) \ - pa(2,0) -#undef p4_sum_1 -#define p4_sum_1(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pa(7,0) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pa(3,0) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,7) \ - pa(1,0) \ - plq(SS(a_,MM(5,RS4)),ax,3) \ - pa(2,0) -#undef lpsum_1 -#define lpsum_1(a_) \ - plq(a_,ax,7) \ - plq(SS(a_,MM(1,RS4)),ax,3) -#undef dpsum_1 -#define dpsum_1(a_) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - pa(7,0) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pa(3,0) \ - pa(1,0) \ - pa(2,0) -#undef plsum_1 -#define plsum_1 RS4 - - -#undef p1_4_dot_1 -#define p1_4_dot_1(a_) \ - pls(a_,ax,1) \ - pls(a_,cx,2) \ - pmsr(2,1) \ - pasr(1,0) -#undef p1_2_dot_1 -#define p1_2_dot_1(a_) \ - px(1) \ - pld(a_,ax,1) \ - px(2) \ - pld(a_,cx,2) \ - pm(2,1) \ - pa(1,0) -#undef p1_dot_1 -#define p1_dot_1(a_) \ - plq(a_,ax,1) \ - pl(a_,cx,2) \ - pm(2,1) \ - pa(1,0) -#undef p2_dot_1 -#define p2_dot_1(a_) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pl(SS(a_,MM(1,RS4)),cx,2) \ - pm(4,3) \ - pa(3,0) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,3) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,4) \ - pm(2,1) \ - pa(1,0) -#undef lpdot_1 -#define lpdot_1(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(a_,ax,3) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(a_,cx,4) -#undef dpdot_1 -#define dpdot_1(a_) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pl(SS(a_,MM(1,RS4)),cx,2) \ - pm(4,3) \ - pa(3,0) \ - pm(2,1) \ - pa(1,0) -#undef pldot_1 -#define pldot_1 8 - -#undef p1_4_dot_1c -#define p1_4_dot_1c(a_) -#undef p1_2_dot_1c -#define p1_2_dot_1c(a_) \ - px(1) \ - pld(a_,ax,1) \ - px(2) \ - pld(a_,cx,2) \ - pc(1,3) \ - ps(HSHUF,1,1) \ - ps(LSHUF,3,3) \ - pm(7,1) \ - pm(2,3) \ - pa(3,0) \ - pm(2,1) \ - pa(1,6) -#undef p1_dot_1c -#define p1_dot_1c(a_) \ - plq(a_,ax,1) \ - pl(a_,cx,2) \ - pc(1,3) \ - ps(HSHUF,1,1) \ - ps(LSHUF,3,3) \ - pm(7,1) \ - pm(2,3) \ - pa(3,0) \ - pm(2,1) \ - pa(1,6) -#undef p2_dot_1c -#define p2_dot_1c(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pl(SS(a_,MM(1,RS4)),cx,2) \ - pc(3,5) \ - ps(HSHUF,3,3) \ - ps(LSHUF,5,5) \ - pm(7,3) \ - pm(4,5) \ - pa(5,0) \ - pm(4,3) \ - pa(3,6) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,4) \ - plq(SS(a_,MM(2,RS4)),ax,3) \ - pc(1,5) \ - ps(HSHUF,1,1) \ - ps(LSHUF,5,5) \ - pm(7,1) \ - pm(2,5) \ - pa(5,0) \ - pm(2,1) \ - pa(1,6) -#undef lpdot_1c -#define lpdot_1c(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(a_,ax,3) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(a_,cx,4) -#undef dpdot_1c -#define dpdot_1c(a_) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pl(SS(a_,MM(1,RS4)),cx,2) \ - pc(3,5) \ - ps(HSHUF,3,3) \ - ps(LSHUF,5,5) \ - pm(7,3) \ - pm(4,5) \ - pa(5,0) \ - pm(4,3) \ - pa(3,6) \ - pc(1,5) \ - ps(HSHUF,1,1) \ - ps(LSHUF,5,5) \ - pm(7,1) \ - pm(2,5) \ - pa(5,0) \ - pm(2,1) \ - pa(1,6) -#undef pldot_1c -#define pldot_1c 8 - -#undef p1_4_dot_2c -#define p1_4_dot_2c(a_) -#undef p1_2_dot_2c -#define p1_2_dot_2c(a_) \ - px(1) \ - pld(a_,ax,1) \ - px(2) \ - pld(a_,cx,2) \ - pc(1,3) \ - ps(CSHUF,1,1) \ - pm(2,3) \ - pa(3,0) \ - pm(2,1) \ - pa(1,6) -#undef p1_dot_2c -#define p1_dot_2c(a_) \ - plq(a_,ax,1) \ - pl(a_,cx,2) \ - pc(1,3) \ - ps(CSHUF,1,1) \ - pm(2,3) \ - pa(3,0) \ - pm(2,1) \ - pa(1,6) -#undef p2_dot_2c -#define p2_dot_2c(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pl(SS(a_,MM(1,RS4)),cx,2) \ - pc(3,5) \ - ps(CSHUF,3,3) \ - pm(4,5) \ - pa(5,0) \ - pm(4,3) \ - pa(3,6) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,4) \ - plq(SS(a_,MM(2,RS4)),ax,3) \ - pc(1,5) \ - ps(CSHUF,1,1) \ - pm(2,5) \ - pa(5,0) \ - pm(2,1) \ - pa(1,6) -#undef lpdot_2c -#define lpdot_2c(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(a_,ax,3) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(a_,cx,4) -#undef dpdot_2c -#define dpdot_2c(a_) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pl(SS(a_,MM(1,RS4)),cx,2) \ - pc(3,5) \ - ps(CSHUF,3,3) \ - pm(4,5) \ - pa(5,0) \ - pm(4,3) \ - pa(3,6) \ - pc(1,5) \ - ps(CSHUF,1,1) \ - pm(2,5) \ - pa(5,0) \ - pm(2,1) \ - pa(1,6) -#undef pldot_2c -#define pldot_2c 8 - -#undef p1_4_axpby_3 -#define p1_4_axpby_3(a_) \ - pls(a_,ax,0) \ - pls(a_,cx,3) \ - pmsr(5,0) \ - pmsr(6,3) \ - pasr(3,0) \ - pus(0,a_,ax) -#undef p1_2_axpby_3 -#define p1_2_axpby_3(a_) \ - pld(a_,ax,0) \ - pld(a_,cx,3) \ - pm(5,0) \ - pm(6,3) \ - pa(3,0) \ - pud(0,a_,ax) -#undef p1_axpby_3 -#define p1_axpby_3(a_) \ - plq(a_,ax,0) \ - pl(a_,cx,3) \ - pm(5,0) \ - pm(6,3) \ - pa(3,0) \ - punt(0,a_,ax) -#undef p2_axpby_3 -#define p2_axpby_3(a_) \ - plq(a_,ax,0) \ - pl(a_,cx,3) \ - plq(SS(a_,RS4),ax,1) \ - pm(5,0) \ - pm(6,3) \ - pa(3,0) \ - pl(SS(a_,RS4),cx,3) \ - punt(0,a_,ax) \ - pm(5,1) \ - pm(6,3) \ - pa(3,1) \ - punt(1,SS(a_,RS4),ax) -#undef p4_axpby_3 -#define p4_axpby_3(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(5,2) \ - pl(SS(a_,MM(3,RS4)),cx,7) \ - pm(6,4) \ - pa(4,2) \ - punt(0,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(4,RS4)),cx,4) \ - pm(5,3) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - pm(6,7) \ - pa(7,3) \ - punt(1,SS(a_,RS4),ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - pm(5,0) \ - pl(SS(a_,MM(5,RS4)),cx,7) \ - pm(6,4) \ - pa(4,0) \ - punt(2,SS(a_,MM(2,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ - pl(SS(a_,MM(6,RS4)),cx,4) \ - pm(5,1) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - pm(6,7) \ - pa(7,1) \ - punt(3,SS(a_,MM(3,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) -#undef lpaxpby_3 -#define lpaxpby_3(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,4) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - pl(SS(a_,MM(1,RS4)),cx,7) \ - pm(5,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pm(6,4) \ - pa(4,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pm(5,1) \ - pl(SS(a_,MM(2,RS4)),cx,4) \ - pm(6,7) \ - pa(7,1) -#undef dpaxpby_3 -#define dpaxpby_3(a_) \ - pl(SS(a_,MM(3,RS4)),cx,7) \ - pm(5,2) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(6,4) \ - pa(4,2) \ - pm(5,3) \ - punt(0,a_,ax) \ - pm(6,7) \ - pa(7,3) \ - punt(1,SS(a_,RS4),ax) \ - punt(2,SS(a_,MM(2,RS4)),ax) \ - punt(3,SS(a_,MM(3,RS4)),ax) -#undef plaxpby_3 -#define plaxpby_3 16 - -#undef p1_4_axpby_3c -#define p1_4_axpby_3c(a_) -#undef p1_2_axpby_3c -#define p1_2_axpby_3c(a_) \ - pld(a_,ax,0) \ - pld(a_,cx,2) \ - pc(0,3) \ - pm(5,0) \ - ps(CSHUF,3,3) \ - pm(4,3) \ - pa(3,0) \ - pc(2,3) \ - pm(6,2) \ - pa(2,0) \ - ps(CSHUF,3,3) \ - pm(7,3) \ - pa(3,0) \ - pud(0,a_,ax) -#undef p1_axpby_3c -#define p1_axpby_3c(a_) \ - plq(a_,ax,0) \ - pl(a_,cx,2) \ - pc(0,3) \ - pm(5,0) \ - ps(CSHUF,3,3) \ - pm(4,3) \ - pa(3,0) \ - pc(2,3) \ - pm(6,2) \ - pa(2,0) \ - ps(CSHUF,3,3) \ - pm(7,3) \ - pa(3,0) \ - puq(0,a_,ax) -#undef p2_axpby_3c -#define p2_axpby_3c(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pl(SS(a_,MM(1,RS4)),cx,3) \ - pc(1,2) \ - pm(5,1) \ - ps(CSHUF,2,2) \ - pm(4,2) \ - pa(2,1) \ - pc(3,2) \ - pm(6,3) \ - pa(3,1) \ - ps(CSHUF,2,2) \ - pm(7,2) \ - pa(2,1) \ - puq(0,a_,ax) \ - plq(SS(a_,MM(2,RS4)),ax,0) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pc(0,3) \ - pm(5,0) \ - ps(CSHUF,3,3) \ - pm(4,3) \ - pa(3,0) \ - pc(2,3) \ - pm(6,2) \ - pa(2,0) \ - ps(CSHUF,3,3) \ - pm(7,3) \ - pa(3,0) \ - puq(1,SS(a_,RS4),ax) -#undef lpaxpby_3c -#define lpaxpby_3c(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,2) \ - pc(0,3) \ - pm(5,0) \ - ps(CSHUF,3,3) \ - pm(4,3) \ - pa(3,0) \ - pc(2,3) \ - pm(6,2) \ - pa(2,0) \ - ps(CSHUF,3,3) \ - pm(7,3) \ - pa(3,0) -#undef dpaxpby_3c -#define dpaxpby_3c(a_) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pl(SS(a_,MM(1,RS4)),cx,3) \ - pc(1,2) \ - pm(5,1) \ - ps(CSHUF,2,2) \ - pm(4,2) \ - pa(2,1) \ - pc(3,2) \ - pm(6,3) \ - pa(3,1) \ - ps(CSHUF,2,2) \ - pm(7,2) \ - pa(2,1) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) -#undef plaxpby_3c -#define plaxpby_3c 8 - -#undef p1_4_axpby_2 -#define p1_4_axpby_2(a_) \ - pls(a_,cx,5) \ - pls(a_,ax,0) \ - pmsr(6,5) \ - pasr(5,0) \ - pus(0,a_,ax) -#undef p1_2_axpby_2 -#define p1_2_axpby_2(a_) \ - pld(a_,cx,5) \ - pld(a_,ax,0) \ - pm(6,5) \ - pa(5,0) \ - pud(0,a_,ax) -#undef p1_axpby_2 -#define p1_axpby_2(a_) \ - pl(a_,cx,5) \ - plq(a_,ax,0) \ - pm(6,5) \ - pa(5,0) \ - puq(0,a_,ax) -#undef p2_axpby_2 -#define p2_axpby_2(a_) \ - pl(a_,cx,5) \ - plq(a_,ax,0) \ - pl(SS(a_,RS4),cx,4) \ - pm(6,5) \ - pa(5,0) \ - plq(SS(a_,RS4),ax,1) \ - puq(0,a_,ax) \ - pm(6,4) \ - pa(4,1) \ - puq(1,SS(a_,RS4),ax) -#undef p4_axpby_2 -#define p4_axpby_2(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pl(SS(a_,MM(3,RS4)),cx,5) \ - pm(6,4) \ - pa(4,2) \ - puq(0,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(4,RS4)),cx,4) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - pm(6,5) \ - pa(5,3) \ - puq(1,SS(a_,RS4),ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - pl(SS(a_,MM(5,RS4)),cx,5) \ - pm(6,4) \ - pa(4,0) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ - pl(SS(a_,MM(6,RS4)),cx,4) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - pm(6,5) \ - pa(5,1) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) -#undef lpaxpby_2 -#define lpaxpby_2(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,4) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - pl(SS(a_,MM(1,RS4)),cx,5) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pm(6,4) \ - pa(4,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pl(SS(a_,MM(2,RS4)),cx,4) \ - pm(6,5) \ - pa(5,1) -#undef dpaxpby_2 -#define dpaxpby_2(a_) \ - pl(SS(a_,MM(3,RS4)),cx,5) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(6,4) \ - pa(4,2) \ - puq(0,a_,ax) \ - pm(6,5) \ - pa(5,3) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef plaxpby_2 -#define plaxpby_2 16 - -#undef p1_4_axpby_2c -#define p1_4_axpby_2c(a_) -#undef p1_2_axpby_2c -#define p1_2_axpby_2c(a_) \ - pld(a_,cx,5) \ - pld(a_,ax,0) \ - pc(5,1) \ - pm(6,5) \ - pa(5,0) \ - ps(CSHUF,1,1) \ - pm(7,1) \ - pa(1,0) \ - pud(0,a_,ax) -#undef p1_axpby_2c -#define p1_axpby_2c(a_) \ - pl(a_,cx,5) \ - plq(a_,ax,0) \ - pc(5,1) \ - pm(6,5) \ - pa(5,0) \ - ps(CSHUF,1,1) \ - pm(7,1) \ - pa(1,0) \ - puq(0,a_,ax) -#undef p2_axpby_2c -#define p2_axpby_2c(a_) \ - pl(a_,cx,5) \ - plq(a_,ax,0) \ - pl(SS(a_,RS4),cx,4) \ - pc(5,1) \ - pm(6,5) \ - pa(5,0) \ - ps(CSHUF,2,2) \ - pm(7,2) \ - pa(2,0) \ - plq(SS(a_,RS4),ax,1) \ - puq(0,a_,ax) \ - pc(4,3) \ - pm(6,4) \ - pa(4,1) \ - ps(CSHUF,3,3) \ - pm(7,3) \ - pa(3,1) \ - puq(1,SS(a_,RS4),ax) -#undef p4_axpby_2c -#define p4_axpby_2c(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - puq(0,a_,ax) \ - pc(4,0) \ - pm(6,4) \ - pa(4,2) \ - ps(CSHUF,0,0) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(4,RS4)),cx,4) \ - pm(7,0) \ - pa(0,2) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - puq(1,SS(a_,RS4),ax) \ - pc(5,1) \ - pm(6,5) \ - pa(5,3) \ - ps(CSHUF,1,1) \ - pl(SS(a_,MM(5,RS4)),cx,5) \ - pm(7,1) \ - pa(1,3) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - pc(4,2) \ - pm(6,4) \ - pa(4,0) \ - ps(CSHUF,2,2) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ - pl(SS(a_,MM(6,RS4)),cx,4) \ - pm(7,2) \ - pa(2,0) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - pc(5,3) \ - pm(6,5) \ - pa(5,1) \ - ps(CSHUF,3,3) \ - pl(SS(a_,MM(7,RS4)),cx,5) \ - pm(7,3) \ - pa(3,1) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) -#undef lpaxpby_2c -#define lpaxpby_2c(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,4) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - pl(SS(a_,MM(1,RS4)),cx,5) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pc(4,2) \ - pm(6,4) \ - pa(4,0) \ - ps(CSHUF,2,2) \ - pl(SS(a_,MM(2,RS4)),cx,4) \ - pm(7,2) \ - pa(2,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pc(5,3) \ - pm(6,5) \ - pa(5,1) \ - ps(CSHUF,3,3) \ - pl(SS(a_,MM(3,RS4)),cx,5) \ - pm(7,3) \ - pa(3,1) -#undef dpaxpby_2c -#define dpaxpby_2c(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - puq(0,a_,ax) \ - pc(4,0) \ - pm(6,4) \ - pa(4,2) \ - ps(CSHUF,0,0) \ - puq(1,SS(a_,RS4),ax) \ - pm(7,0) \ - pa(0,2) \ - pc(5,1) \ - pm(6,5) \ - pa(5,3) \ - ps(CSHUF,1,1) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - pm(7,1) \ - pa(1,3) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef plaxpby_2c -#define plaxpby_2c 16 - -#undef p1_4_axpby_1 -#define p1_4_axpby_1(a_) \ - pls(a_,ax,1) \ - pls(a_,cx,2) \ - pmsr(5,1) \ - pmsr(6,2) \ - pasr(2,1) \ - pus(1,a_,ax) -#undef p1_2_axpby_1 -#define p1_2_axpby_1(a_) \ - pld(a_,ax,1) \ - pld(a_,cx,2) \ - pm(5,1) \ - pm(6,2) \ - pa(2,1) \ - pud(1,a_,ax) -#undef p1_axpby_1 -#define p1_axpby_1(a_) \ - plq(a_,ax,1) \ - pl(a_,cx,2) \ - pm(5,1) \ - pm(6,2) \ - pa(2,1) \ - puq(1,a_,ax) -#undef p2_axpby_1 -#define p2_axpby_1(a_) \ - plq(SS(a_,RS4),ax,3) \ - pl(SS(a_,RS4),cx,4) \ - pm(5,1) \ - pm(6,2) \ - pa(2,1) \ - puq(1,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pm(5,3) \ - pm(6,4) \ - pa(4,3) \ - puq(3,SS(a_,RS4),ax) -#undef lpaxpby_1 -#define lpaxpby_1(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,1) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,2) -#undef dpaxpby_1 -#define dpaxpby_1(a_) \ - plq(SS(a_,RS4),ax,3) \ - pl(SS(a_,RS4),cx,4) \ - pm(5,1) \ - pm(6,2) \ - pa(2,1) \ - puq(1,a_,ax) \ - pm(5,3) \ - pm(6,4) \ - pa(4,3) \ - puq(3,SS(a_,RS4),ax) -#undef plaxpby_1 -#define plaxpby_1 8 - -#undef p1_4_axpy_0 -#define p1_4_axpy_0(a_) \ - pls(a_,cx,2) \ - pls(a_,ax,1) \ - pmsr(6,2) \ - pasr(2,1) \ - pus(1,a_,ax) -#undef p1_2_axpy_0 -#define p1_2_axpy_0(a_) \ - pld(a_,cx,2) \ - pld(a_,ax,1) \ - pm(6,2) \ - pa(2,1) \ - pud(1,a_,ax) -#undef p1_axpy_0 -#define p1_axpy_0(a_) \ - pl(a_,cx,2) \ - plq(a_,ax,1) \ - pm(6,2) \ - pa(2,1) \ - puq(1,a_,ax) -#undef p2_axpy_0 -#define p2_axpy_0(a_) \ - pl(SS(a_,RS4),cx,4) \ - pm(6,2) \ - pa(2,1) \ - plq(SS(a_,RS4),ax,3) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - puq(1,a_,ax) \ - pm(6,4) \ - pa(4,3) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - puq(3,SS(a_,RS4),ax) -#undef lpaxpy_0 -#define lpaxpy_0(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,2) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,1) -#undef dpaxpy_0 -#define dpaxpy_0(a_) \ - pl(SS(a_,RS4),cx,4) \ - pm(6,2) \ - pa(2,1) \ - plq(SS(a_,RS4),ax,3) \ - puq(1,a_,ax) \ - pm(6,4) \ - pa(4,3) \ - puq(3,SS(a_,RS4),ax) -#undef plaxpy_0 -#define plaxpy_0 8 - -#undef p1_4_axpy_1 -#define p1_4_axpy_1(a_) \ - pls(a_,cx,2) \ - pls(a_,ax,1) \ - pmsr(6,2) \ - pasr(2,1) \ - pus(1,a_,ax) -#undef p1_2_axpy_1 -#define p1_2_axpy_1(a_) \ - pld(a_,cx,2) \ - pld(a_,ax,1) \ - pm(6,2) \ - pa(2,1) \ - pud(1,a_,ax) -#undef p1_axpy_1 -#define p1_axpy_1(a_) \ - pl(a_,cx,2) \ - pm(6,2) \ - pam(a_,ax,2) \ - puq(2,a_,ax) -#undef p2_axpy_1 -#define p2_axpy_1(a_) \ - pl(a_,cx,2) \ - pm(6,2) \ - pl(SS(a_,RS4),cx,4) \ - pam(a_,ax,2) \ - pm(6,4) \ - puq(2,a_,ax) \ - pam(SS(a_,RS4),ax,4) \ - puq(4,SS(a_,RS4),ax) -#undef p4_axpy_1 -#define p4_axpy_1(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - pm(6,2) \ - pam(SS(a_,MM(2,RS4)),ax,2) \ - puq(0,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - pl(SS(a_,MM(4,RS4)),cx,0) \ - pm(6,3) \ - pam(SS(a_,MM(3,RS4)),ax,3) \ - puq(1,SS(a_,RS4),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ - pl(SS(a_,MM(5,RS4)),cx,1) \ - pm(6,0) \ - pam(SS(a_,MM(4,RS4)),ax,0) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - pl(SS(a_,MM(6,RS4)),cx,2) \ - pm(6,1) \ - pam(SS(a_,MM(5,RS4)),ax,1) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef lpaxpy_1 -#define lpaxpy_1(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(a_,cx,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - pl(SS(a_,RS4),cx,1) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pm(6,0) \ - pam(a_,ax,0) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - pm(6,1) \ - pam(SS(a_,RS4),ax,1) -#undef dpaxpy_1 -#define dpaxpy_1(a_) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - pm(6,2) \ - pam(SS(a_,MM(2,RS4)),ax,2) \ - puq(0,a_,ax) \ - pm(6,3) \ - pam(SS(a_,MM(3,RS4)),ax,3) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef plaxpy_1 -#define plaxpy_1 16 - -#undef p1_4_axpy_2 -#define p1_4_axpy_2(a_) \ - pls(a_,cx,5) \ - pls(a_,ax,0) \ - pmsr(6,5) \ - pasr(5,0) \ - pus(0,a_,ax) -#undef p1_2_axpy_2 -#define p1_2_axpy_2(a_) \ - pld(a_,cx,5) \ - pld(a_,ax,0) \ - pm(6,5) \ - pa(5,0) \ - pud(0,a_,ax) -#undef p1_axpy_2 -#define p1_axpy_2(a_) \ - pl(a_,cx,5) \ - plq(a_,ax,0) \ - pm(6,5) \ - pa(5,0) \ - puq(0,a_,ax) -#undef p2_axpy_2 -#define p2_axpy_2(a_) \ - pl(a_,cx,5) \ - plq(a_,ax,0) \ - pl(SS(a_,RS4),cx,4) \ - pm(6,5) \ - pa(5,0) \ - plq(SS(a_,RS4),ax,1) \ - puq(0,a_,ax) \ - pm(6,4) \ - pa(4,1) \ - puq(1,SS(a_,RS4),ax) -#undef p4_axpy_2 -#define p4_axpy_2(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pl(SS(a_,MM(3,RS4)),cx,5) \ - pm(6,4) \ - pa(4,2) \ - puq(0,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(4,RS4)),cx,4) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - pm(6,5) \ - pa(5,3) \ - puq(1,SS(a_,RS4),ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - pl(SS(a_,MM(5,RS4)),cx,5) \ - pm(6,4) \ - pa(4,0) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ - pl(SS(a_,MM(6,RS4)),cx,4) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - pm(6,5) \ - pa(5,1) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) -#undef lpaxpy_2 -#define lpaxpy_2(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,4) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - pl(SS(a_,MM(1,RS4)),cx,5) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pm(6,4) \ - pa(4,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pl(SS(a_,MM(2,RS4)),cx,4) \ - pm(6,5) \ - pa(5,1) -#undef dpaxpy_2 -#define dpaxpy_2(a_) \ - pl(SS(a_,MM(3,RS4)),cx,5) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(6,4) \ - pa(4,2) \ - puq(0,a_,ax) \ - pm(6,5) \ - pa(5,3) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef plaxpy_2 -#define plaxpy_2 16 - -#undef p1_4_axpy_2c -#define p1_4_axpy_2c(a_) -#undef p1_2_axpy_2c -#define p1_2_axpy_2c(a_) \ - pld(a_,cx,4) \ - pld(a_,ax,0) \ - pc(4,2) \ - pm(6,4) \ - pa(4,0) \ - ps(CSHUF,2,2) \ - pm(7,2) \ - pa(2,0) \ - pud(0,a_,ax) -#undef p1_axpy_2c -#define p1_axpy_2c(a_) \ - pl(a_,cx,4) \ - plq(a_,ax,0) \ - pc(4,2) \ - pm(6,4) \ - pa(4,0) \ - ps(CSHUF,2,2) \ - pm(7,2) \ - pa(2,0) \ - puq(0,a_,ax) -#undef p2_axpy_2c -#define p2_axpy_2c(a_) \ - pl(a_,cx,4) \ - plq(a_,ax,0) \ - pl(SS(a_,RS4),cx,5) \ - pc(4,2) \ - pm(6,4) \ - pa(4,0) \ - ps(CSHUF,2,2) \ - pm(7,2) \ - pa(2,0) \ - plq(SS(a_,RS4),ax,1) \ - puq(0,a_,ax) \ - pc(5,3) \ - pm(6,5) \ - pa(5,1) \ - ps(CSHUF,3,3) \ - pm(7,3) \ - pa(3,1) \ - puq(1,SS(a_,RS4),ax) -#undef p4_axpy_2c -#define p4_axpy_2c(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - puq(0,a_,ax) \ - pc(4,0) \ - pm(6,4) \ - pa(4,2) \ - ps(CSHUF,0,0) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(4,RS4)),cx,4) \ - pm(7,0) \ - pa(0,2) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - puq(1,SS(a_,RS4),ax) \ - pc(5,1) \ - pm(6,5) \ - pa(5,3) \ - ps(CSHUF,1,1) \ - pl(SS(a_,MM(5,RS4)),cx,5) \ - pm(7,1) \ - pa(1,3) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - pc(4,2) \ - pm(6,4) \ - pa(4,0) \ - ps(CSHUF,2,2) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ - pl(SS(a_,MM(6,RS4)),cx,4) \ - pm(7,2) \ - pa(2,0) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - pc(5,3) \ - pm(6,5) \ - pa(5,1) \ - ps(CSHUF,3,3) \ - pl(SS(a_,MM(7,RS4)),cx,5) \ - pm(7,3) \ - pa(3,1) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) -#undef lpaxpy_2c -#define lpaxpy_2c(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,4) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - pl(SS(a_,MM(1,RS4)),cx,5) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pc(4,2) \ - pm(6,4) \ - pa(4,0) \ - ps(CSHUF,2,2) \ - pl(SS(a_,MM(2,RS4)),cx,4) \ - pm(7,2) \ - pa(2,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pc(5,3) \ - pm(6,5) \ - pa(5,1) \ - ps(CSHUF,3,3) \ - pl(SS(a_,MM(3,RS4)),cx,5) \ - pm(7,3) \ - pa(3,1) -#undef dpaxpy_2c -#define dpaxpy_2c(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - puq(0,a_,ax) \ - pc(4,0) \ - pm(6,4) \ - pa(4,2) \ - ps(CSHUF,0,0) \ - puq(1,SS(a_,RS4),ax) \ - pm(7,0) \ - pa(0,2) \ - pc(5,1) \ - pm(6,5) \ - pa(5,3) \ - ps(CSHUF,1,1) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - pm(7,1) \ - pa(1,3) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef plaxpy_2c -#define plaxpy_2c 16 - -#undef p1_4_axpy_1c -#define p1_4_axpy_1c(a_) -#undef p1_2_axpy_1c -#define p1_2_axpy_1c(a_) \ - pld(a_,cx,2) \ - pc(2,0) \ - pld(a_,ax,1) \ - ps(CSHUF,0,0) \ - pm(6,2) \ - pa(2,1) \ - pm(7,0) \ - pa(0,1) \ - pud(1,a_,ax) -#undef p1_axpy_1c -#define p1_axpy_1c(a_) \ - pl(a_,cx,2) \ - pc(2,0) \ - plq(a_,ax,1) \ - ps(CSHUF,0,0) \ - pm(6,2) \ - pa(2,1) \ - pm(7,0) \ - pa(0,1) \ - puq(1,a_,ax) -#undef p2_axpy_1c -#define p2_axpy_1c(a_) \ - plq(SS(a_,RS4),ax,3) \ - ps(CSHUF,0,0) \ - pl(SS(a_,RS4),cx,4) \ - pm(6,2) \ - pa(2,1) \ - pm(7,0) \ - pa(0,1) \ - pc(4,0) \ - puq(1,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,1) \ - ps(CSHUF,0,0) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pm(6,4) \ - pa(4,3) \ - pm(7,0) \ - pa(0,3) \ - pc(2,0) \ - puq(3,SS(a_,RS4),ax) -#undef lpaxpy_1c -#define lpaxpy_1c(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,2) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,1) \ - pc(2,0) -#undef dpaxpy_1c -#define dpaxpy_1c(a_) \ - plq(SS(a_,RS4),ax,3) \ - ps(CSHUF,0,0) \ - pl(SS(a_,RS4),cx,4) \ - pm(6,2) \ - pa(2,1) \ - pm(7,0) \ - pa(0,1) \ - pc(4,0) \ - puq(1,a_,ax) \ - ps(CSHUF,0,0) \ - pm(6,4) \ - pa(4,3) \ - pm(7,0) \ - pa(0,3) \ - puq(3,SS(a_,RS4),ax) -#undef plaxpy_1c -#define plaxpy_1c 8 - -#undef p1_4_copy_1 -#define p1_4_copy_1(a_) \ - pls(a_,cx,2) \ - pus(2,a_,ax) -#undef p1_2_copy_1 -#define p1_2_copy_1(a_) \ - pld(a_,cx,2) \ - pud(2,a_,ax) -#undef p1_copy_1 -#define p1_copy_1(a_) \ - pl(a_,cx,2) \ - puq(2,a_,ax) -#undef p2_copy_1 -#define p2_copy_1(a_) \ - pl(SS(a_,RS4),cx,4) \ - puq(2,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - puq(4,SS(a_,RS4),ax) -#undef lpcopy_1 -#define lpcopy_1(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,2) -#undef dpcopy_1 -#define dpcopy_1(a_) \ - pl(SS(a_,RS4),cx,4) \ - puq(2,a_,ax) \ - puq(4,SS(a_,RS4),ax) -#undef plcopy_1 -#define plcopy_1 8 - -#undef p1_4_copy_2 -#define p1_4_copy_2(a_) \ - pls(a_,ax,2) \ - pus(2,a_,cx) -#undef p1_2_copy_2 -#define p1_2_copy_2(a_) \ - pld(a_,ax,2) \ - pud(2,a_,cx) -#undef p1_copy_2 -#define p1_copy_2(a_) \ - plq(a_,ax,2) \ - pu(2,a_,cx) -#undef p2_copy_2 -#define p2_copy_2(a_) \ - plq(SS(a_,RS4),ax,4) \ - pu(2,a_,cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pu(4,SS(a_,RS4),cx) -#undef lpcopy_2 -#define lpcopy_2(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,2) -#undef dpcopy_2 -#define dpcopy_2(a_) \ - plq(SS(a_,RS4),ax,4) \ - pu(2,a_,cx) \ - pu(4,SS(a_,RS4),cx) -#undef plcopy_2 -#define plcopy_2 8 - -#undef p1_4_copy_3 -#define p1_4_copy_3(a_) \ - pls(a_,cx,2) \ - pus(2,a_,ax) -#undef p1_2_copy_3 -#define p1_2_copy_3(a_) \ - pld(a_,cx,2) \ - pud(2,a_,ax) -#undef p1_copy_3 -#define p1_copy_3(a_) \ - pl(a_,cx,2) \ - punt(2,a_,ax) -#undef p2_copy_3 -#define p2_copy_3(a_) \ - pl(SS(a_,MM(0,RS4)),cx,0) \ - pl(SS(a_,MM(1,RS4)),cx,1) \ - punt(0,SS(a_,MM(0,RS4)),ax) \ - punt(1,SS(a_,MM(1,RS4)),ax) -#undef p4_copy_3 -#define p4_copy_3(a_) \ - pl(SS(a_,MM(0,RS4)),cx,0) \ - pl(SS(a_,MM(1,RS4)),cx,1) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - punt(0,SS(a_,MM(0,RS4)),ax) \ - punt(1,SS(a_,MM(1,RS4)),ax) \ - punt(2,SS(a_,MM(2,RS4)),ax) \ - punt(3,SS(a_,MM(3,RS4)),ax) -#undef p8_copy_3 -#define p8_copy_3(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,0) \ - pl(SS(a_,MM(1,RS4)),cx,1) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - pl(SS(a_,MM(4,RS4)),cx,4) \ - pl(SS(a_,MM(5,RS4)),cx,5) \ - pl(SS(a_,MM(6,RS4)),cx,6) \ - pl(SS(a_,MM(7,RS4)),cx,7) \ - punt(0,SS(a_,MM(0,RS4)),ax) \ - punt(1,SS(a_,MM(1,RS4)),ax) \ - punt(2,SS(a_,MM(2,RS4)),ax) \ - punt(3,SS(a_,MM(3,RS4)),ax) \ - punt(4,SS(a_,MM(4,RS4)),ax) \ - punt(5,SS(a_,MM(5,RS4)),ax) \ - punt(6,SS(a_,MM(6,RS4)),ax) \ - punt(7,SS(a_,MM(7,RS4)),ax) -#undef lpcopy_3 -#define lpcopy_3(a_) -#undef dpcopy_3 -#define dpcopy_3(a_) p8_copy_3(a_) -#undef plcopy_3 -#define plcopy_3 32 - -#undef p1_4_cpsc_3 -#define p1_4_cpsc_3(a_) \ - pls(a_,ax,0) \ - pmsr(6,0) \ - pus(0,a_,cx) -#undef p1_2_cpsc_3 -#define p1_2_cpsc_3(a_) \ - pld(a_,ax,0) \ - pm(6,0) \ - pud(0,a_,cx) -#undef p1_cpsc_3 -#define p1_cpsc_3(a_) \ - plq(a_,ax,0) \ - pm(6,0) \ - pu(0,a_,cx) -#undef p2_cpsc_3 -#define p2_cpsc_3(a_) \ - plq(a_,ax,0) \ - plq(SS(a_,RS4),ax,1) \ - pm(6,0) \ - pm(6,1) \ - pu(0,a_,cx) \ - pu(1,SS(a_,RS4),cx) -#undef p4_cpsc_3 -#define p4_cpsc_3(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(6,2) \ - pu(0,a_,cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - pm(6,3) \ - pu(1,SS(a_,RS4),cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - pm(6,0) \ - pu(2,SS(a_,MM(2,RS4)),cx) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - pm(6,1) \ - pu(3,SS(a_,MM(3,RS4)),cx) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) -#undef lpcpsc_3 -#define lpcpsc_3(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pm(6,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pm(6,1) -#undef dpcpsc_3 -#define dpcpsc_3(a_) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(6,2) \ - pu(0,a_,cx) \ - pm(6,3) \ - pu(1,SS(a_,RS4),cx) \ - pu(2,SS(a_,MM(2,RS4)),cx) \ - pu(3,SS(a_,MM(3,RS4)),cx) -#undef plcpsc_3 -#define plcpsc_3 16 - -#undef p1_4_cpsc_3c -#define p1_4_cpsc_3c(a_) -#undef p1_2_cpsc_3c -#define p1_2_cpsc_3c(a_) \ - pld(a_,ax,0) \ - pc(0,1) \ - pm(6,0) \ - ps(CSHUF,1,1) \ - pm(7,1) \ - pa(1,0) \ - pud(0,a_,cx) -#undef p1_cpsc_3c -#define p1_cpsc_3c(a_) \ - plq(a_,ax,0) \ - pc(0,1) \ - pm(6,0) \ - ps(CSHUF,1,1) \ - pm(7,1) \ - pa(1,0) \ - pu(0,a_,cx) -#undef p2_cpsc_3c -#define p2_cpsc_3c(a_) \ - plq(a_,ax,0) \ - plq(SS(a_,RS4),ax,1) \ - pc(0,2) \ - pm(6,0) \ - ps(CSHUF,2,2) \ - pm(7,2) \ - pa(2,0) \ - pu(0,a_,cx) \ - pc(1,3) \ - pm(6,1) \ - ps(CSHUF,3,3) \ - pm(7,3) \ - pa(3,1) \ - pu(1,SS(a_,RS4),cx) -#undef p4_cpsc_3c -#define p4_cpsc_3c(a_) \ - pu(0,a_,cx) \ - pc(2,4) \ - pm(6,2) \ - ps(CSHUF,4,4) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,0) \ - pm(7,4) \ - pa(4,2) \ - pu(1,SS(a_,RS4),cx) \ - pc(3,4) \ - pm(6,3) \ - ps(CSHUF,4,4) \ - plq(SS(a_,MM(5,RS4)),ax,1) \ - pm(7,4) \ - pa(4,3) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pu(2,SS(a_,MM(2,RS4)),cx) \ - pc(0,4) \ - pm(6,0) \ - ps(CSHUF,4,4) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(6,RS4)),ax,2) \ - pm(7,4) \ - pa(4,0) \ - pu(3,SS(a_,MM(3,RS4)),cx) \ - pc(1,4) \ - pm(6,1) \ - ps(CSHUF,4,4) \ - plq(SS(a_,MM(7,RS4)),ax,3) \ - pm(7,4) \ - pa(4,1) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) -#undef lpcpsc_3c -#define lpcpsc_3c(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,0) \ - plq(SS(a_,MM(1,RS4)),ax,1) \ - pc(0,4) \ - pm(6,0) \ - ps(CSHUF,4,4) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pm(7,4) \ - pa(4,0) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pc(1,4) \ - pm(6,1) \ - ps(CSHUF,4,4) \ - plq(SS(a_,MM(3,RS4)),ax,3) \ - pm(7,4) \ - pa(4,1) -#undef dpcpsc_3c -#define dpcpsc_3c(a_) \ - pu(0,a_,cx) \ - pc(2,4) \ - pm(6,2) \ - ps(CSHUF,4,4) \ - pu(1,SS(a_,RS4),cx) \ - pm(7,4) \ - pa(4,2) \ - pc(3,4) \ - pm(6,3) \ - ps(CSHUF,4,4) \ - pu(2,SS(a_,MM(2,RS4)),cx) \ - pm(7,4) \ - pa(4,3) \ - pu(3,SS(a_,MM(3,RS4)),cx) -#undef plcpsc_3c -#define plcpsc_3c 16 - -#undef p1_4_cpsc_4 -#define p1_4_cpsc_4(a_) \ - pls(a_,cx,0) \ - pmsr(6,0) \ - pus(0,a_,ax) -#undef p1_2_cpsc_4 -#define p1_2_cpsc_4(a_) \ - pld(a_,cx,0) \ - pm(6,0) \ - pud(0,a_,ax) -#undef p1_cpsc_4 -#define p1_cpsc_4(a_) \ - pl(a_,cx,0) \ - pm(6,0) \ - puq(0,a_,ax) -#undef p2_cpsc_4 -#define p2_cpsc_4(a_) \ - pl(a_,cx,0) \ - pl(SS(a_,RS4),cx,1) \ - pm(6,0) \ - pm(6,1) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) -#undef p4_cpsc_4 -#define p4_cpsc_4(a_) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - pm(6,2) \ - puq(0,a_,ax) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(4,RS4)),cx,0) \ - pm(6,3) \ - puq(1,SS(a_,RS4),ax) \ - pl(SS(a_,MM(5,RS4)),cx,1) \ - pm(6,0) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ - pl(SS(a_,MM(6,RS4)),cx,2) \ - pm(6,1) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef lpcpsc_4 -#define lpcpsc_4(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,0) \ - pl(SS(a_,MM(1,RS4)),cx,1) \ - pm(6,0) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pm(6,1) -#undef dpcpsc_4 -#define dpcpsc_4(a_) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - pm(6,2) \ - puq(0,a_,ax) \ - pm(6,3) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef plcpsc_4 -#define plcpsc_4 16 - -#undef p1_4_cpsc_5 -#define p1_4_cpsc_5(a_) \ - pls(a_,cx,0) \ - pmsr(6,0) \ - pus(0,a_,ax) -#undef p1_2_cpsc_5 -#define p1_2_cpsc_5(a_) \ - pld(a_,cx,0) \ - pm(6,0) \ - pud(0,a_,ax) -#undef p1_cpsc_5 -#define p1_cpsc_5(a_) \ - pl(a_,cx,0) \ - pm(6,0) \ - puq(0,a_,ax) -#undef p2_cpsc_5 -#define p2_cpsc_5(a_) \ - pl(a_,cx,0) \ - pl(SS(a_,RS4),cx,1) \ - pm(6,0) \ - pm(6,1) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) -#undef p4_cpsc_5 -#define p4_cpsc_5(a_) \ - pl(SS(a_,MM(0,RS4)),cx,0) \ - pl(SS(a_,MM(1,RS4)),cx,1) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - pm(6,0) \ - pm(6,1) \ - pm(6,2) \ - pm(6,3) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef p8_cpsc_5 -#define p8_cpsc_5(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,0) \ - pl(SS(a_,MM(1,RS4)),cx,1) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - pl(SS(a_,MM(4,RS4)),cx,4) \ - pl(SS(a_,MM(5,RS4)),cx,5) \ - pl(SS(a_,MM(6,RS4)),cx,7) \ - pm(6,0) \ - pm(6,1) \ - pm(6,2) \ - pm(6,3) \ - puq(0,a_,ax) \ - pl(SS(a_,MM(7,RS4)),cx,0) \ - pm(6,4) \ - pm(6,5) \ - pm(6,7) \ - pm(6,0) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - puq(4,SS(a_,MM(4,RS4)),ax) \ - puq(5,SS(a_,MM(5,RS4)),ax) \ - puq(7,SS(a_,MM(6,RS4)),ax) \ - puq(0,SS(a_,MM(7,RS4)),ax) -#undef lpcpsc_5 -#define lpcpsc_5(a_) -#undef dpcpsc_5 -#define dpcpsc_5(a_) p8_cpsc_5(a_) -#undef plcpsc_5 -#define plcpsc_5 32 - -#undef cpsc_cdp -#define cpsc_cdp(a_) pc(a_,5) pm(6,a_) ps(CSHUF,5,5) pm(7,5) pa(5,a_) -#undef p1_4_cpsc_5c -#define p1_4_cpsc_5c(a_) -#undef p1_2_cpsc_5c -#define p1_2_cpsc_5c(a_) \ - pld(a_,cx,0) \ - cpsc_cdp(0) \ - pud(0,a_,ax) -#undef p1_cpsc_5c -#define p1_cpsc_5c(a_) \ - pl(a_,cx,0) \ - cpsc_cdp(0) \ - puq(0,a_,ax) -#undef p2_cpsc_5c -#define p2_cpsc_5c(a_) \ - pl(a_,cx,0) \ - pl(SS(a_,RS4),cx,1) \ - cpsc_cdp(0) \ - cpsc_cdp(1) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) -#undef p4_cpsc_5c -#define p4_cpsc_5c(a_) \ - pl(SS(a_,MM(0,RS4)),cx,0) \ - pl(SS(a_,MM(1,RS4)),cx,1) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - cpsc_cdp(0) \ - cpsc_cdp(1) \ - cpsc_cdp(2) \ - cpsc_cdp(3) \ - puq(0,a_,ax) \ - puq(1,SS(a_,RS4),ax) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - puq(3,SS(a_,MM(3,RS4)),ax) -#undef p8_cpsc_5c -#define p8_cpsc_5c(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - pl(SS(a_,MM(0,RS4)),cx,0) \ - pl(SS(a_,MM(1,RS4)),cx,1) \ - pl(SS(a_,MM(2,RS4)),cx,2) \ - pl(SS(a_,MM(3,RS4)),cx,3) \ - pl(SS(a_,MM(4,RS4)),cx,4) \ - cpsc_cdp(0) \ - cpsc_cdp(1) \ - puq(0,a_,ax) \ - pl(SS(a_,MM(5,RS4)),cx,0) \ - cpsc_cdp(2) \ - cpsc_cdp(3) \ - puq(1,SS(a_,RS4),ax) \ - pl(SS(a_,MM(6,RS4)),cx,1) \ - cpsc_cdp(4) \ - cpsc_cdp(0) \ - puq(2,SS(a_,MM(2,RS4)),ax) \ - pl(SS(a_,MM(7,RS4)),cx,2) \ - cpsc_cdp(1) \ - cpsc_cdp(2) \ - puq(3,SS(a_,MM(3,RS4)),ax) \ - puq(4,SS(a_,MM(4,RS4)),ax) \ - puq(0,SS(a_,MM(5,RS4)),ax) \ - puq(1,SS(a_,MM(6,RS4)),ax) \ - puq(2,SS(a_,MM(7,RS4)),ax) -#undef lpcpsc_5c -#define lpcpsc_5c(a_) -#undef dpcpsc_5c -#define dpcpsc_5c(a_) p8_cpsc_5c(a_) -#undef plcpsc_5c -#define plcpsc_5c 32 - -#undef p1_4_cpsc_1 -#define p1_4_cpsc_1(a_) \ - pls(a_,ax,2) \ - pmsr(3,2) \ - pus(2,a_,cx) -#undef p1_2_cpsc_1 -#define p1_2_cpsc_1(a_) \ - pld(a_,ax,2) \ - pm(3,2) \ - pud(2,a_,cx) -#undef p1_cpsc_1 -#define p1_cpsc_1(a_) \ - plq(a_,ax,2) \ - pm(3,2) \ - pu(2,a_,cx) -#undef p2_cpsc_1 -#define p2_cpsc_1(a_) \ - plq(SS(a_,RS4),ax,4) \ - pm(3,2) \ - pu(2,a_,cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,2) \ - pm(3,4) \ - pu(4,SS(a_,RS4),cx) -#undef lpcpsc_1 -#define lpcpsc_1(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,2) -#undef dpcpsc_1 -#define dpcpsc_1(a_) \ - plq(SS(a_,RS4),ax,4) \ - pm(3,2) \ - pu(2,a_,cx) \ - pm(3,4) \ - pu(4,SS(a_,RS4),cx) -#undef plcpsc_1 -#define plcpsc_1 8 - -#undef p1_4_cpsc_2 -#define p1_4_cpsc_2(a_) \ - pls(a_,ax,2) \ - pmsr(3,2) \ - pus(2,a_,cx) -#undef p1_2_cpsc_2 -#define p1_2_cpsc_2(a_) \ - pld(a_,ax,2) \ - pm(3,2) \ - pud(2,a_,cx) -#undef p1_cpsc_2 -#define p1_cpsc_2(a_) \ - plq(a_,ax,2) \ - pm(3,2) \ - pu(2,a_,cx) -#undef p2_cpsc_2 -#define p2_cpsc_2(a_) \ - plq(a_,ax,2) \ - plq(SS(a_,RS4),ax,4) \ - pm(3,2) \ - pm(3,4) \ - pu(2,a_,cx) \ - pu(4,SS(a_,RS4),cx) -#undef p4_cpsc_2 -#define p4_cpsc_2(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,7) \ - pm(3,6) \ - pu(4,a_,cx) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pm(3,7) \ - pu(6,SS(a_,RS4),cx) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ - f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ - plq(SS(a_,MM(4,RS4)),ax,4) \ - pm(3,2) \ - pu(7,SS(a_,MM(2,RS4)),cx) \ - plq(SS(a_,MM(5,RS4)),ax,6) \ - pm(3,4) \ - pu(2,SS(a_,MM(3,RS4)),cx) -#undef lpcpsc_2 -#define lpcpsc_2(a_) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ - f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ - plq(SS(a_,MM(0,RS4)),ax,4) \ - plq(SS(a_,MM(1,RS4)),ax,6) \ - pm(3,4) -#undef dpcpsc_2 -#define dpcpsc_2(a_) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ - f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,7) \ - pm(3,6) \ - pu(4,a_,cx) \ - plq(SS(a_,MM(3,RS4)),ax,2) \ - pm(3,7) \ - pu(6,SS(a_,RS4),cx) \ - pm(3,2) \ - pu(7,SS(a_,MM(2,RS4)),cx) \ - pu(2,SS(a_,MM(3,RS4)),cx) -#undef plcpsc_2 -#define plcpsc_2 RS4 - - -#undef p1_4_iamax_1 -#define p1_4_iamax_1(a_) \ - px(4) \ - pls(a_,ax,4) \ - pan(2,4) \ - pc(3,5) \ - pcm(6,4,5) \ - paxs(4,3) \ - pan(5,6) \ - pann(0,5) \ - pasr(5,6) \ - pasr(1,0) \ - ps(57,0,0) -#undef p1_2_iamax_1 -#define p1_2_iamax_1(a_) \ - px(4) \ - pld(a_,ax,4) \ - pan(2,4) \ - pc(3,5) \ - pcm(6,4,5) \ - pax(4,3) \ - pan(5,6) \ - pann(0,5) \ - pa(5,6) \ - pasr(1,0) \ - ps(57,0,0)\ - pasr(1,0) \ - ps(57,0,0) -#undef p1_iamax_1 -#define p1_iamax_1(a_) \ - plq(a_,ax,4) \ - pan(2,4) \ - pc(3,5) \ - pcm(6,4,5) \ - pax(4,3) \ - pan(5,6) \ - pann(0,5) \ - pa(5,6) \ - pa(1,0) -#define p2_iamax_1(a_) \ - plq(SS(a_,RS4),ax,4) \ - pan(2,4) \ - pc(3,5) \ - pcm(6,4,5) \ - pax(4,3) \ - pan(5,6) \ - pann(0,5) \ - pa(5,6) \ - pa(1,0) \ - f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,4) \ - pan(2,4) \ - pc(3,5) \ - pcm(6,4,5) \ - pax(4,3) \ - pan(5,6) \ - pann(0,5) \ - pa(5,6) \ - pa(1,0) -#undef lpiamax_1 -#define lpiamax_1(a_) \ - f(nta,SS(a_,MM(CL,RS4)),ax) \ - plq(a_,ax,4) \ - pan(2,4) \ - pc(3,5) \ - pcm(6,4,5) \ - pax(4,3) \ - pan(5,6) \ - pann(0,5) \ - pa(5,6) \ - pa(1,0) -#undef dpiamax_1 -#define dpiamax_1(a_) \ - plq(SS(a_,RS4),ax,4) \ - pan(2,4) \ - pc(3,5) \ - pcm(6,4,5) \ - pax(4,3) \ - pan(5,6) \ - pann(0,5) \ - pa(5,6) \ - pa(1,0) -#undef pliamax_1 -#define pliamax_1 8 - -#undef p1_4_iamax_1d -#define p1_4_iamax_1d(a_) -#undef p1_2_iamax_1d -#define p1_2_iamax_1d(a_) \ - px(4) \ - pld(a_,ax,4) \ - dbg(2) \ - pan(2,4) \ - dbg(4) \ - pc(3,5) \ - dbg(5) \ - pcm(6,4,5) \ - dbg(5) \ - pax(4,3) \ - dbg(3) \ - pan(5,6) \ - dbg(6) \ - pann(0,5) \ - dbg(5) \ - pa(5,6) \ - dbg(6) \ - pasr(1,0) \ - dbg(0) \ - ps(1,0,0) -#undef p1_iamax_1d -#define p1_iamax_1d(a_) \ - plq(a_,ax,4) \ - dbg(2) \ - pan(2,4) \ - dbg(4) \ - pc(3,5) \ - dbg(5) \ - pcm(6,4,5) \ - dbg(5) \ - pax(4,3) \ - dbg(3) \ - pan(5,6) \ - dbg(6) \ - pann(0,5) \ - dbg(5) \ - pa(5,6) \ - dbg(6) \ - pa(1,0) -#define p2_iamax_1d(a_) \ - plq(SS(a_,RS4),ax,4) \ - dbg(2) \ - pan(2,4) \ - dbg(4) \ - pc(3,5) \ - dbg(5) \ - pcm(6,4,5) \ - dbg(5) \ - pax(4,3) \ - dbg(3) \ - pan(5,6) \ - dbg(6) \ - pann(0,5) \ - dbg(5) \ - pa(5,6) \ - dbg(6) \ - pa(1,0) \ - dbg(0) \ - f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \ - plq(SS(a_,MM(2,RS4)),ax,4) \ - dbg(2) \ - pan(2,4) \ - dbg(4) \ - pc(3,5) \ - dbg(5) \ - pcm(6,4,5) \ - dbg(5) \ - pax(4,3) \ - dbg(3) \ - pan(5,6) \ - dbg(6) \ - pann(0,5) \ - dbg(5) \ - pa(5,6) \ - dbg(6) \ - pa(1,0) -#undef lpiamax_1d -#define lpiamax_1d(a_) \ - f(nta,SS(a_,MM(CL,RS4)),ax) \ - plq(a_,ax,4) \ - dbg(2) \ - pan(2,4) \ - dbg(4) \ - pc(3,5) \ - dbg(5) \ - pcm(6,4,5) \ - dbg(5) \ - pax(4,3) \ - dbg(3) \ - pan(5,6) \ - dbg(6) \ - pann(0,5) \ - dbg(5) \ - pa(5,6) \ - dbg(6) \ - pa(1,0) -#undef dpiamax_1d -#define dpiamax_1d(a_) \ - plq(SS(a_,RS4),ax,4) \ - dbg(2) \ - pan(2,4) \ - dbg(4) \ - pc(3,5) \ - dbg(5) \ - pcm(6,4,5) \ - dbg(5) \ - pax(4,3) \ - dbg(3) \ - pan(5,6) \ - dbg(6) \ - pann(0,5) \ - dbg(5) \ - pa(5,6) \ - dbg(6) \ - pa(1,0) -#undef pliamax_1d -#define pliamax_1d 8 - diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h deleted file mode 100644 index 03486cf..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h +++ /dev/null @@ -1,331 +0,0 @@ -/*************************************** - $Header: /cvsroot/math-atlas/AtlasBase/kernel/CammMaguire/camm_tpipe.h,v 1.2 2003/10/18 18:13:30 yycamm Exp $ - - -***************************************/ - - -/* #ifndef CAMM_TPIPE_H */ -/* #define CAMM_TPIPE_H */ /*+ To stop multiple inclusions. +*/ - -#ifndef BITS -#error BITS must be defined in camm_tpipe.h -#endif -#ifndef DIV -#error DIV must be defined in camm_tpipe.h -#endif -#ifndef INC -#error INC(a_) must be defined in camm_tpipe.h -#endif -#ifndef LR -#error LR must be defined in camm_tpipe.h -#endif - -#ifdef ALIGN - -#if defined(SREAL) - - test(4,ax) - je(a2) - -#undef KB -#define KB ( 1 /* / DIV */ ) -#include "camm_pipe3.h" - - KB_block - INC(4) - sub(1,LR) - - lab(a2) - -#endif - -#if defined(SREAL) || defined(DREAL) - - test(8,ax) - je(a4) - test(-2,LR) - je(a4) - -#undef KB -#define KB ( 2 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(8) - sub(2,LR) - - lab(a4) - -#endif -#endif - -/* "movl %%edx,%%edi\n\t" */ - push(LR) - shr(BITS,LR) - shl(BITS,LR) - m(4,LR) - ra(ax,LR) - -#if defined(ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) - test(12,ax) - je(loopa) -#endif - -#if !defined(ALIGN) || defined(SCPLX) || defined(DCPLX) -#undef plq -#define plq(a_,b_,c_) pl(a_,b_,c_) -#undef puq -#define puq(a_,b_,c_) pu(a_,b_,c_) -#undef plqx -#define plqx(a_,b_,c_,d_,e_) plx(a_,b_,c_,d_,e_) -#undef puqx -#define puqx(a_,b_,c_,d_,e_) pux(a_,b_,c_,d_,e_) -#else -#undef plq -#define plq(a_,b_,c_) pla(a_,b_,c_) -#undef puq -#define puq(a_,b_,c_) punt(a_,b_,c_) -#undef plqx -#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_) -#undef puqx -#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_) -#endif - - align - lab(loop) - cmp(ax,LR) - je(stop) - -#undef KB -#define KB ( (1 << BITS) /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(4*KB/**DIV*/) - - jmp(loop) - - lab(stop) - pop(LR) - -#if ( 1 << BITS ) > 128 - test(128,LR) - je(64) -#undef KB -#define KB ( 128 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(512) - - lab(64) -#endif - -#if ( 1 << BITS ) > 64 - test(64,LR) - je(32) -#undef KB -#define KB ( 64 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(256) - - lab(32) -#endif - -#if ( 1 << BITS ) > 32 - test(32,LR) - je(16) -#undef KB -#define KB ( 32 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(128) - - lab(16) -#endif - -#if ( 1 << BITS ) > 16 - test(16,LR) - je(8) -#undef KB -#define KB ( 16 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(64) - - lab(8) -#endif - -#if ( 1 << BITS ) > 8 - test(8,LR) - je(4) -#undef KB -#define KB ( 8 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(32) - - lab(4) -#endif - -#if ( 1 << BITS ) > 4 - test(4,LR) - je(2) -#undef KB -#define KB ( 4 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(16) - - lab(2) -#endif - -#if DIV != 4 && ( 1 << BITS ) > 2 - test(2,LR) - je(1) -#undef KB -#define KB ( 2 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(8) - - lab(1) -#endif - -#if DIV == 1 && ( 1 << BITS ) > 1 - test(1,LR) - je(end) -#undef KB -#define KB ( 1 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - lab(end) -#endif - -#if defined (ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) - - jmp(tend) - -#undef plq -#define plq(a_,b_,c_) pla(a_,b_,c_) -#undef puq -#define puq(a_,b_,c_) punt(a_,b_,c_) -#undef plqx -#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_) -#undef puqx -#define puqx(a_,b_,c_,d_,e_) puax(a_,b_,c_,d_,e_) - - align - lab(loopa) - cmp(ax,LR) - je(stopa) - -#undef KB -#define KB ( (1 << BITS) /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(4*KB/**DIV*/) - - jmp(loopa) - - lab(stopa) - pop(LR) - -#if ( 1 << BITS ) > 128 - test(128,LR) - je(64a) -#undef KB -#define KB ( 128 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(512) - - lab(64a) -#endif - -#if ( 1 << BITS ) > 64 - test(64,LR) - je(32a) -#undef KB -#define KB ( 64 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(256) - - lab(32a) -#endif - -#if ( 1 << BITS ) > 32 - test(32,LR) - je(16a) -#undef KB -#define KB ( 32 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(128) - - lab(16a) -#endif - -#if ( 1 << BITS ) > 16 - test(16,LR) - je(8a) -#undef KB -#define KB ( 16 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(64) - - lab(8a) -#endif - -#if ( 1 << BITS ) > 8 - test(8,LR) - je(4a) -#undef KB -#define KB ( 8 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(32) - - lab(4a) -#endif - -#if ( 1 << BITS ) > 4 - test(4,LR) - je(2a) -#undef KB -#define KB ( 4 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(16) - - lab(2a) -#endif - -#if DIV != 4 && ( 1 << BITS ) > 2 - test(2,LR) - je(1a) -#undef KB -#define KB ( 2 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - INC(8) - - lab(1a) -#endif - -#if DIV == 1 && ( 1 << BITS ) > 1 - test(1,LR) - je(enda) -#undef KB -#define KB ( 1 /* / DIV */ ) -#include "camm_pipe3.h" - KB_block - lab(enda) -#endif - - lab(tend) - -#endif - -/* #endif */ /* CAMM_TPIPE_H */ diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h deleted file mode 100644 index 6b150d3..0000000 --- a/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h +++ /dev/null @@ -1,508 +0,0 @@ -#ifndef CAMM_UTIL_H -#define CAMM_UTIL_H /*+ To stop multiple inclusions. +*/ - -typedef struct { - float r,i; -} Complex; - -typedef struct { - double r,i; -} Dcomplex; - -#undef str -#define str(a_) xstr(a_) -#undef xstr -#define xstr(a_) #a_ - -#undef val -#define val(a_) xval(a_) -#undef xval -#define xval(a_) a_ - -#ifndef Mjoin -#define Mjoin(a,b) mjoin(a,b) -#ifdef mjoin - #undef mjoin -#endif -#define mjoin(a,b) a ## b -#endif - -#undef VOLATILE -#define VOLATILE __volatile__ -#undef ASM -#define ASM __asm__ VOLATILE - -#ifdef BETA0 -#undef BL -#define BL b0 -#endif -#ifdef BETA1 -#undef BL -#define BL b1 -#endif -#ifdef BETAX -#undef BL -#define BL bX -#endif -#ifdef BETAXI0 -#undef BL -#define BL bXi0 -#endif - -#ifdef NO_TRANSPOSE -#ifdef GER -#ifdef Conj_ -#undef FEXT -#define FEXT Gc -#else -#undef FEXT -#define FEXT Gu -#endif -#else -#ifdef Conj_ -#undef FEXT -#define FEXT Nc -#else -#undef FEXT -#define FEXT N -#endif -#endif -#else -#ifdef Conj_ -#undef FEXT -#define FEXT C -#else -#undef FEXT -#define FEXT T -#endif -#endif - -#undef BLC -#define BLC Mjoin(FEXT,BL) - -#ifdef __GNUC__ -#undef NO_INLINE -#define NO_INLINE double sq(double x) {return x*x;} -#else -#undef NO_INLINE -#define NO_INLINE -#endif - -#undef lab -#define lab(a_) "\n" str(MY_FUNCTION) "_" str(N) "_" str(a_) ":\n\t" -#undef jmp -#define jmp(a_) "jmp " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" -#undef je -#define je(a_) "je " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" -#undef jge -#define jge(a_) "jge " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" -#undef jle -#define jle(a_) "jle " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" -#undef jl -#define jl(a_) "jl " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" -#undef jne -#define jne(a_) "jne " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t" -#undef align -#define align ".align 16\n\t" -#undef test -#define test(a_,b_) "testl $" str(a_) ",%%e" str(b_) "\n\t" -#undef and -#define and(a_,b_) "andl $" str(a_) ",%%e" str(b_) "\n\t" -#undef sub -#define sub(a_,b_) "subl $" str(a_) ",%%e" str(b_) "\n\t" -#undef SS -#define SS(a_,b_) a_ + b_ -#undef MM -#define MM(a_,b_) a_ * b_ -#undef E4 -#define E4(a_) (( a_ >> 2 ) << 2 ) - -#undef TYPE -#undef SCALAR -#undef PREC -#undef CSHUF -#undef LSHUF -#undef HSHUF -#undef ISHUF -#undef RSHUF -#undef SINGLE -#undef REAL -#undef DIV - -#ifdef SCPLX -#define TYPE Complex -#define SCALAR Complex * -#define PREC c -#define CSHUF 177 -#define LSHUF 160 -#define HSHUF 245 -#define ISHUF 13*17 -#define RSHUF 8*17 -#define SINGLE -#define DIV 2 -/* #ifdef Conj_ */ -/* static const TYPE signd[2]={{-1.0,1.0},{-1.0,1.0}}; */ -/* #else */ - static const TYPE signd[2]={{1.0,-1.0},{1.0,-1.0}}; -/* #endif */ -#endif - -#ifdef SREAL -#define TYPE float -#define SCALAR float -#define PREC s -#define SINGLE -#define REAL -#define DIV 1 -#endif - -#ifdef DREAL -#define TYPE double -#define SCALAR double -#define PREC d -#define REAL -#define DIV 2 -#endif - -#ifdef DCPLX -#define TYPE Dcomplex -#define SCALAR Dcomplex * -#define PREC z -#define CSHUF 1 -#define LSHUF 0 -#define HSHUF 3 -#define ISHUF 3 -#define RSHUF 0 -#define DIV 4 -/* #ifdef Conj_ */ -/* static const TYPE signd[1]={{-1.0,1.0}}; */ -/* #else */ - static const TYPE signd[1]={{1.0,-1.0}}; -/* #endif */ -#endif - -#undef M11 -#define M11 0 -#undef M12 -#define M12 1 -#undef M13 -#define M13 2 -#undef M14 -#define M14 3 -#undef M15 -#define M15 4 -#undef M16 -#define M16 5 -#undef M17 -#define M17 6 -#undef M18 -#define M18 7 - -#undef M23 -#define M23 1 -#undef M24 -#define M24 2 -#undef M25 -#define M25 3 -#undef M26 -#define M26 4 -#undef M27 -#define M27 5 -#undef M28 -#define M28 6 - -#undef M33 -#define M33 0 -#undef M34 -#define M34 1 -#undef M35 -#define M35 2 -#undef M36 -#define M36 3 -#undef M37 -#define M37 4 -#undef M38 -#define M38 5 - -#undef P10 -#define P10 1 -#undef P11 -#define P11 2 -#undef P12 -#define P12 3 -#undef P13 -#define P13 4 -#undef P14 -#define P14 5 -#undef P15 -#define P15 6 -#undef P16 -#define P16 7 - -#undef XM -#define XM(a_,b_) M ## b_ ## a_ -#undef M -#define M(a_,b_) XM(a_,b_) - -#undef XP -#define XP(a_,b_) P ## b_ ## a_ -#undef P -#define P(a_,b_) XP(a_,b_) - -#undef mex -#define mex(a_) str(%%e ## a_) -#undef msx -#define msx(a_) "%%st(" str(a_) ")" - -#undef cmp -#define cmp(a_,b_) "cmp " mex(a_) "," mex(b_) "\n\t" -#undef icmpr -#define icmpr(a_,b_) "cmp " mex(a_) ",(" mex(b_) ")\n\t" -#undef f -#define f(a_,b_,c_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ")\n\t" -#undef pfx -#define pfx(a_,b_,c_,d_,e_) "prefetch" str(a_) " " str(b_) "(%%e" #c_ ",%%e" #d_ "," str(e_) ")\n\t" -#undef a -#define a(a_,b_) "addl $" str(a_) "," mex(b_) "\n\t" -#undef m -#define m(a_,b_) "imul $" str(a_) "," mex(b_) "\n\t" -#undef pop -#define pop(a_) "popl %%e" str(a_) "\n\t" -#undef push -#define push(a_) "pushl %%e" str(a_) "\n\t" -#undef d -#define d(a_,b_) "idiv $" str(a_) "," mex(b_) "\n\t" -#undef shl -#define shl(a_,b_) "shl $" str(a_) "," mex(b_) "\n\t" -#undef shr -#define shr(a_,b_) "shr $" str(a_) "," mex(b_) "\n\t" -#undef mm -#define mm(a_,b_) "mov $" str(a_) "," mex(b_) "\n\t" -#undef ra -#define ra(a_,b_) "addl %%e" str(a_) "," mex(b_) "\n\t" -#undef rs -#define rs(a_,b_) "subl %%e" str(a_) "," mex(b_) "\n\t" - -#undef fl -#define fl(a_,b_) "fldl " str(a_) "(" mex(b_) ")\n\t" -#undef fp -#define fp(a_,b_) "fstpl " str(a_) "(" mex(b_) ")\n\t" -#undef fd -#define fd(a_) "fld " msx(a_) "\n\t" -#undef fap -#define fap(a_,b_) "faddp " msx(a_) "," msx(b_) "\n\t" -/* #define fsp(a_) fx(a_) "fsubp %%st," msx(a_) "\n\t" */ -#undef fsp -#define fsp(a_) "fsubrp %%st," msx(a_) "\n\t" -#undef fmp -#define fmp(a_,b_) "fmulp " msx(a_) "," msx(b_) "\n\t" -#undef fa -#define fa(a_,b_) "fadd " msx(a_) "," msx(b_) "\n\t" -#undef fm -#define fm(a_,b_) "fmul " msx(a_) "," msx(b_) "\n\t" -#undef faa -#define faa(a_,b_) "faddl " str(a_) "(" mex(b_) ")\n\t" -#undef fma -#define fma(a_,b_) "fmull " str(a_) "(" mex(b_) ")\n\t" -#undef fz -#define fz "fldz\n\t" -#undef fx -#define fx(a_) "fxch " msx(a_) "\n\t" -#undef fx1 -#define fx1 "fxch\n\t" -#undef fc -#define fc(a_) "fstp " msx(a_) "\n\t" - - -#ifndef ATHLON - - -#if defined(DREAL) || defined(DCPLX) -#undef SSESUF -#define SSESUF "d " -#undef RS4 -#define RS4 16 -#undef RS -#define RS 4 -#else -#undef SSESUF -#define SSESUF "s " -#undef RS4 -#define RS4 16 -#undef RS -#define RS 4 -#endif - -#undef mxx -#define mxx(a_) str(%%xmm ## a_) -#undef prp -#define prp(a_,b_) "rcpp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef prps -#define prps(a_,b_) "rcps" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pann -#define pann(a_,b_) "andnp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef psqs -#define psqs(a_,b_) "sqrts" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef por -#define por(a_,b_) "orp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pan -#define pan(a_,b_) "andp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pcm -#define pcm(a_,b_,c_) "cmpp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" -#undef pcms -#define pcms(a_,b_,c_) "cmps" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" -#undef pax -#define pax(a_,b_) "maxp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef paxs -#define paxs(a_,b_) "maxs" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pd -#define pd(a_,b_) "divp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pdsr -#define pdsr(a_,b_) "divs" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pxx -#define pxx(a_,b_) "xorp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef px -#define px(a_) "xorp" SSESUF mxx(a_) "," mxx(a_) "\n\t" -#undef pm -#define pm(a_,b_) "mulp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pa -#define pa(a_,b_) "addp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pmm -#define pmm(a_,b_,c_) "mulp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pam -#define pam(a_,b_,c_) "addp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pl -#define pl(a_,b_,c_) "movup" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pla -#define pla(a_,b_,c_) "movap" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pu -#define pu(a_,b_,c_) "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" -#undef punt -#define punt(a_,b_,c_) "movntp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" -#undef pua -#define pua(a_,b_,c_) "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" -#undef pud -#define pud(a_,b_,c_) "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" -#undef pudr -#define pudr(a_,b_) "movlp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pc -#define pc(a_,b_) "movap" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef ps -#define ps(a_,b_,c_) "shufp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t" -#undef phl -#define phl(a_,b_) "movhlp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pus -#define pus(a_,b_,c_) "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" -#undef pls -#define pls(a_,b_,c_) "movs" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pld -#define pld(a_,b_,c_) "movlp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef plh -#define plh(a_,b_) "movlhp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pas -#define pas(a_,b_,c_) "adds" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pms -#define pms(a_,b_,c_) "muls" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pcs -#define pcs(a_,b_) "movs" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pasr -#define pasr(a_,b_) "adds" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pmsr -#define pmsr(a_,b_) "muls" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef pul -#define pul(a_,b_) "unpcklp" SSESUF mxx(a_) "," mxx(b_) "\n\t" -#undef puh -#define puh(a_,b_) "unpckhp" SSESUF mxx(a_) "," mxx(b_) "\n\t" - -#undef plsx -#define plsx(a_,b_,c_,d_,e_) \ - "movs" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" -#undef plx -#define plx(a_,b_,c_,d_,e_) \ - "movup" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" -#undef plax -#define plax(a_,b_,c_,d_,e_) \ - "movap" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" -#undef pasx -#define pasx(a_,b_,c_,d_,e_) \ - "adds" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" -#undef pusx -#define pusx(a_,b_,c_,d_,e_) \ - "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" -#undef pux -#define pux(a_,b_,c_,d_,e_) \ - "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" -#undef puax -#define puax(a_,b_,c_,d_,e_) \ - "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" -#undef pudx -#define pudx(a_,b_,c_,d_,e_) \ - "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" - -#undef pldx -#define pldx(a_,b_,c_,d_,e_) \ - "movlp" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" - -#else - -#undef RS4 -#define RS4 8 -#undef RS -#define RS 2 - -#undef mxx -#define mxx(a_) str(%%mm ## a_) -#undef pul -#define pul(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t" -#undef puh -#define puh(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t" - -#undef px -#define px(a_) "pxor " mxx(a_) "," mxx(a_) "\n\t" -#undef pm -#define pm(a_,b_) "pfmul " mxx(a_) "," mxx(b_) "\n\t" -#undef pa -#define pa(a_,b_) "pfadd " mxx(a_) "," mxx(b_) "\n\t" -#undef pac -#define pac(a_,b_) "pfacc " mxx(a_) "," mxx(b_) "\n\t" -#undef pmm -#define pmm(a_,b_,c_) "pfmul " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pam -#define pam(a_,b_,c_) "pfadd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pl -#define pl(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pla -#define pla(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" -#undef pu -#define pu(a_,b_,c_) "movq " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" -#undef pc -#define pc(a_,b_) "movq " mxx(a_) "," mxx(b_) "\n\t" -#undef ps -#define ps(a_,b_,c_) "pswapd " mxx(b_) "," mxx(c_) "\n\t" -#undef phl -#define phl(a_,b_) "punpckhdq " mxx(a_) "," mxx(b_) "\n\t" -#undef plh -#define plh(a_,b_) "punpckldq " mxx(a_) "," mxx(b_) "\n\t" -#undef pus -#define pus(a_,b_,c_) "movd " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t" -#undef pls -#define pls(a_,b_,c_) "movd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t" - -#undef plsx -#define plsx(a_,b_,c_,d_,e_) \ - "movd " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" -#undef plx -#define plx(a_,b_,c_,d_,e_) \ - "movq " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" -#undef pasx -#define pasx(a_,b_,c_,d_,e_) \ - "addss " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t" -#undef pusx -#define pusx(a_,b_,c_,d_,e_) \ - "movd " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" -#undef pux -#define pux(a_,b_,c_,d_,e_) \ - "movq " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t" -#endif - -#endif /* CAMM_UTIL_H */ |