From 96a32415ab43377cf1575bd3f4f2980f58028209 Mon Sep 17 00:00:00 2001 From: Determinant Date: Fri, 14 Aug 2015 11:51:42 +0800 Subject: add implementation for kaldi io (by ymz) --- .../src/tools/ATLAS/include/contrib/camm_strat1.h | 2982 ++++++++++++++++++++ 1 file changed, 2982 insertions(+) create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h (limited to 'kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h') diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h new file mode 100644 index 0000000..4a92006 --- /dev/null +++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h @@ -0,0 +1,2982 @@ +#include "camm_util.h" + +#undef p1_4_swap_1 +#define p1_4_swap_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,0) \ + pus(0,a_,ax) \ + pus(1,a_,cx) +#undef p1_2_swap_1 +#define p1_2_swap_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(0) \ + pld(a_,cx,0) \ + pud(0,a_,ax) \ + pud(1,a_,cx) +#undef p1_swap_1 +#define p1_swap_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,0) \ + puq(0,a_,ax) \ + pu(1,a_,cx) +#undef p2_swap_1 +#define p2_swap_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,2) \ + puq(0,a_,ax) \ + pu(1,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,0) \ + puq(2,SS(a_,RS4),ax) \ + pu(3,SS(a_,RS4),cx) +#undef lpswap_1 +#define lpswap_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) +#undef dpswap_1 +#define dpswap_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,2) \ + puq(0,a_,ax) \ + pu(1,a_,cx) \ + puq(2,SS(a_,RS4),ax) \ + pu(3,SS(a_,RS4),cx) +#undef plswap_1 +#define plswap_1 8 + + +#undef p1_4_scal_3 +#define p1_4_scal_3(a_) \ + pls(a_,ax,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_scal_3 +#define p1_2_scal_3(a_) \ + pld(a_,ax,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_scal_3 +#define p1_scal_3(a_) \ + plq(a_,ax,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_scal_3 +#define p2_scal_3(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_3 +#define p4_scal_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(6,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpscal_3 +#define lpscal_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(6,1) +#undef dpscal_3 +#define dpscal_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plscal_3 +#define plscal_3 16 + +#undef p1_4_scal_3c +#define p1_4_scal_3c(a_) +#undef p1_2_scal_3c +#define p1_2_scal_3c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_scal_3c +#define p1_scal_3c(a_) \ + plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_scal_3c +#define p2_scal_3c(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pc(0,2) \ + pm(6,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + puq(0,a_,ax) \ + pc(1,3) \ + pm(6,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_3c +#define p4_scal_3c(a_) \ + pm(7,5) \ + pa(5,1) \ + puq(0,a_,ax) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pc(3,5) \ + pm(6,3) \ + pm(7,4) \ + pa(4,2) \ + puq(1,SS(a_,RS4),ax) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + pm(7,5) \ + pa(5,3) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pc(1,5) \ + pm(6,1) \ + pm(7,4) \ + pa(4,0) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(7,RS4)),ax,3) \ + pc(2,4) \ + pm(6,2) +#undef lpscal_3c +#define lpscal_3c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(1,5) \ + pm(6,1) \ + pm(7,4) \ + pa(4,0) \ + ps(CSHUF,5,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pc(2,4) \ + pm(6,2) +#undef dpscal_3c +#define dpscal_3c(a_) \ + pm(7,5) \ + pa(5,1) \ + ps(CSHUF,4,4) \ + puq(0,a_,ax) \ + pm(7,4) \ + pa(4,2) \ + pc(3,5) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + ps(CSHUF,5,5) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,5) \ + pa(5,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plscal_3c +#define plscal_3c 16 + +#undef p1_4_scal_4 +#define p1_4_scal_4(a_) \ + pls(SS(a_,MM(0,RS4)),ax,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_scal_4 +#define p1_2_scal_4(a_) \ + pld(SS(a_,MM(0,RS4)),ax,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_scal_4 +#define p1_scal_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_scal_4 +#define p2_scal_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_4 +#define p4_scal_4(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_scal_4 +#define p8_scal_4(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + plq(SS(a_,MM(4,RS4)),ax,4) \ + plq(SS(a_,MM(5,RS4)),ax,5) \ + plq(SS(a_,MM(6,RS4)),ax,7) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + pm(6,4) \ + pm(6,5) \ + plq(SS(a_,MM(7,RS4)),ax,0) \ + pm(6,7) \ + pm(6,0) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + puq(5,SS(a_,MM(5,RS4)),ax) \ + puq(7,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpscal_4 +#define lpscal_4(a_) +#undef dpscal_4 +#define dpscal_4(a_) p4_scal_4(a_) +#undef plscal_4 +#define plscal_4 16 + +#undef p1_4_scal_4c +#define p1_4_scal_4c(a_) +#undef p1_2_scal_4c +#define p1_2_scal_4c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_scal_4c +#define p1_scal_4c(a_) \ + plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_scal_4c +#define p2_scal_4c(a_) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pc(1,5) \ + pm(6,0) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,0) \ + pm(7,5) \ + pa(5,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_scal_4c +#define p4_scal_4c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pc(0,4) \ + pc(1,5) \ + pm(6,0) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,0) \ + pc(2,4) \ + pm(7,5) \ + pa(5,1) \ + pc(3,5) \ + pm(6,2) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + ps(CSHUF,5,5) \ + pm(7,4) \ + pa(4,2) \ + pm(7,5) \ + pa(5,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpscal_4c +#define lpscal_4c(a_) +#undef dpscal_4c +#define dpscal_4c(a_) p4_scal_4c(a_) +#undef plscal_4c +#define plscal_4c 16 + +#undef p1_4_scal_1 +#define p1_4_scal_1(a_) \ + pls(a_,ax,1) \ + pmsr(0,1) \ + pus(1,a_,ax) +#undef p1_2_scal_1 +#define p1_2_scal_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pm(0,1) \ + pud(1,a_,ax) +#undef p1_scal_1 +#define p1_scal_1(a_) \ + plq(a_,ax,1) \ + pm(0,1) \ + puq(1,a_,ax) +#undef p2_scal_1 +#define p2_scal_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pm(0,1) \ + pm(0,2) \ + puq(1,a_,ax) \ + puq(2,SS(a_,RS4),ax) +#undef p4_scal_1 +#define p4_scal_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lpscal_1 +#define lpscal_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pm(0,7) +#undef dpscal_1 +#define dpscal_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef plscal_1 +#define plscal_1 RS4 + + +#undef p1_4_set_1 +#define p1_4_set_1(a_) \ + pls(a_,ax,1) \ + pcs(0,1) \ + pus(1,a_,ax) +#undef p1_2_set_1 +#define p1_2_set_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pc(0,1) \ + pud(1,a_,ax) +#undef p1_set_1 +#define p1_set_1(a_) \ + plq(a_,ax,1) \ + pc(0,1) \ + puq(1,a_,ax) +#undef p2_set_1 +#define p2_set_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pc(0,1) \ + pc(0,2) \ + puq(1,a_,ax) \ + puq(2,SS(a_,RS4),ax) +#undef p4_set_1 +#define p4_set_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pc(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pc(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pc(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pc(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lpset_1 +#define lpset_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pc(0,7) +#undef dpset_1 +#define dpset_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pc(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pc(0,1) \ + puq(3,SS(a_,MM(1,RS4)),ax) \ + pc(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef plset_1 +#define plset_1 RS4 + + +#undef p1_4_set_2 +#define p1_4_set_2(a_) \ + pus(0,a_,ax) +#undef p1_2_set_2 +#define p1_2_set_2(a_) \ + pud(0,a_,ax) +#undef p1_set_2 +#define p1_set_2(a_) \ + puq(0,a_,ax) +#undef p2_set_2 +#define p2_set_2(a_) \ + puq(0,a_,ax) \ + puq(0,SS(a_,RS4),ax) +#undef p4_set_2 +#define p4_set_2(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + puq(0,a_,ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef lpset_2 +#define lpset_2(a_) +#undef dpset_2 +#define dpset_2(a_) \ + puq(0,a_,ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef plset_2 +#define plset_2 RS4 + + +#undef p1_4_set_3 +#define p1_4_set_3(a_) \ + pus(0,a_,ax) +#undef p1_2_set_3 +#define p1_2_set_3(a_) \ + pud(0,a_,ax) +#undef p1_set_3 +#define p1_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) +#undef p2_set_3 +#define p2_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) +#undef p4_set_3 +#define p4_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) +#undef p8_set_3 +#define p8_set_3(a_) \ + puq(0,SS(a_,MM(0,RS4)),ax) \ + puq(0,SS(a_,MM(1,RS4)),ax) \ + puq(0,SS(a_,MM(2,RS4)),ax) \ + puq(0,SS(a_,MM(3,RS4)),ax) \ + puq(0,SS(a_,MM(4,RS4)),ax) \ + puq(0,SS(a_,MM(5,RS4)),ax) \ + puq(0,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpset_3 +#define lpset_3(a_) +#undef dpset_3 +#define dpset_3(a_) p8_set_3(a_) +#undef plset_3 +#define plset_3 32 + + +#undef p1_4_0x1_nrm2_1 +#define p1_4_0x1_nrm2_1(a_) \ + pls(a_,ax,1) \ + pmsr(1,1) \ + pasr(1,0) +#undef p1_2_0x1_nrm2_1 +#define p1_2_0x1_nrm2_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pm(1,1) \ + pa(1,0) +#undef p1_0x1_nrm2_1 +#define p1_0x1_nrm2_1(a_) \ + plq(a_,ax,1) \ + pm(1,1) \ + pa(1,0) +#undef p2_0x1_nrm2_1 +#define p2_0x1_nrm2_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pm(1,1) \ + pm(2,2) \ + pa(1,0) \ + pm(2,0) +#undef p4_0x1_nrm2_1 +#define p4_0x1_nrm2_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(3,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(1,1) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(2,2) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(7,7) \ + pa(2,0) +#undef lp0x1_nrm2_1 +#define lp0x1_nrm2_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pm(7,7) +#undef dp0x1_nrm2_1 +#define dp0x1_nrm2_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(3,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(1,1) \ + pa(3,0) \ + pm(2,2) \ + pa(1,0) \ + pa(2,0) +#undef pl0x1_nrm2_1 +#define pl0x1_nrm2_1 RS4 + + +#undef p1_4_nrm2_2 +#define p1_4_nrm2_2(a_) \ + pls(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pcs(5,6) dbg(6) \ + pcs(5,7) dbg(7) \ + paxs(1,5) dbg(5) \ + prps(5,2) dbg(2) \ + px(3) \ + pcms(0,2,3) dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pasr(3,7) dbg(7) \ + pcs(7,5) dbg(5) \ + pdsr(5,6) dbg(6) \ + pdsr(5,1) dbg(1) \ + pmsr(6,6) dbg(6) \ + pmsr(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pasr(1,0) dbg(0) +#undef p1_2_nrm2_2 +#define p1_2_nrm2_2(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef p1_nrm2_2 +#define p1_nrm2_2(a_) \ + plq(a_,ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#define p2_nrm2_2(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef lpnrm2_2 +#define lpnrm2_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef dpnrm2_2 +#define dpnrm2_2(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pan(4,1) dbg(1) \ + pc(5,6) dbg(6) \ + pc(5,7) dbg(7) \ + pax(1,5) dbg(5) \ + prp(5,2) dbg(2) \ + px(3) \ + pcm(0,2,3)dbg(3) \ + pan(3,7) dbg(7) \ + pann(5,3) dbg(3) \ + pa(3,7) dbg(7) \ + pc(7,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef plnrm2_2 +#define plnrm2_2 8 + + +#undef p1_4_nrm2_3 +#define p1_4_nrm2_3(a_) \ + pls(a_,ax,1) dbg(1) \ + pcs(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + paxs(1,5) dbg(5) \ + pdsr(5,6) dbg(6) \ + pdsr(5,1) dbg(1) \ + pmsr(6,6) dbg(6) \ + pmsr(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pasr(1,0) dbg(0) +#undef p1_2_nrm2_3 +#define p1_2_nrm2_3(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef p1_nrm2_3 +#define p1_nrm2_3(a_) \ + plq(a_,ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#define p2_nrm2_3(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef lpnrm2_3 +#define lpnrm2_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef dpnrm2_3 +#define dpnrm2_3(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + pc(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + pax(1,5) dbg(5) \ + pd(5,6) dbg(6) \ + pd(5,1) dbg(1) \ + pm(6,6) dbg(6) \ + pm(1,1) dbg(1) \ + pm(6,0) dbg(0) \ + pa(1,0) dbg(0) +#undef plnrm2_3 +#define plnrm2_3 8 + +#define block_nrm2_4(a_,b_) \ + Mjoin(pc,a_)(5,6) dbg(6) \ + pan(4,1) dbg(1) \ + Mjoin(pax,a_)(1,5) dbg(5) \ + Mjoin(pc,a_)(2,7) dbg(7) \ + Mjoin(pd,b_)(5,7) dbg(7) \ + Mjoin(pm,b_)(7,6) dbg(6) \ + Mjoin(pm,b_)(7,1) dbg(1) \ + Mjoin(pm,b_)(6,6) dbg(6) \ + Mjoin(pm,b_)(6,0) dbg(0) \ + Mjoin(pm,b_)(1,1) dbg(1) \ + Mjoin(pa,b_)(1,0) dbg(0) + + +/* #undef p1_4_nrm2_4 */ +/* #define p1_4_nrm2_4(a_) \ */ +/* pls(a_,ax,1) dbg(1) \ */ +/* pcs(5,6) dbg(6) \ */ +/* pan(4,1) dbg(1) \ */ +/* paxs(1,5) dbg(5) \ */ +/* pcs(2,7) dbg(7) \ */ +/* pdsr(5,7) dbg(7) \ */ +/* pmsr(7,6) dbg(6) \ */ +/* pmsr(7,1) dbg(1) \ */ +/* pmsr(6,6) dbg(6) \ */ +/* pmsr(6,0) dbg(0) \ */ +/* pmsr(1,1) dbg(1) \ */ +/* pasr(1,0) dbg(0) */ +#undef p1_4_nrm2_4 +#define p1_4_nrm2_4(a_) \ + pls(a_,ax,1) dbg(1) \ + block_nrm2_4(s,sr) +#undef p1_2_nrm2_4 +#define p1_2_nrm2_4(a_) \ + px(1) pld(a_,ax,1) dbg(1) \ + block_nrm2_4(,) +#undef p1_nrm2_4 +#define p1_nrm2_4(a_) \ + plq(a_,ax,1) dbg(1) \ + block_nrm2_4(,) +#define p2_nrm2_4(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + block_nrm2_4(,) \ + plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + block_nrm2_4(,) +#undef lpnrm2_4 +#define lpnrm2_4(a_) \ + plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + block_nrm2_4(,) +#undef dpnrm2_4 +#define dpnrm2_4(a_) \ + plq(SS(a_,RS4),ax,1) dbg(1) \ + block_nrm2_4(,) +#undef plnrm2_4 +#define plnrm2_4 8 + + +#undef p1_4_1x1_1 +#define p1_4_1x1_1(a_) \ + pls(a_,ax,1) \ + pls(a_,bx,0) \ + pm(0,1) \ + pa(1,6) +#undef p1_2_1x1_1 +#define p1_2_1x1_1(a_) \ + pld(a_,ax,1) \ + pld(a_,bx,0) \ + pm(0,1) \ + pa(1,6) +#undef p1_1x1_1 +#define p1_1x1_1(a_) \ + plq(a_,ax,1) \ + plq(a_,bx,0) \ + pm(0,1) \ + pa(0,6) +#undef p2_1x1_1 +#define p2_1x1_1(a_) \ + plq(a_,ax,1) \ + plq(a_,bx,0) \ + plq(SS(a_,RS4),ax,2) \ + plq(SS(a_,RS4),bx,3) \ + pm(0,1) \ + pm(2,3) \ + pa(1,6) \ + pa(3,6) +#undef p4_1x1_1 +#define p4_1x1_1(a_) \ + f(nta,SS(a_,MM(4,RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM(6,RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pm(0,7) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef lp1x1_1 +#define lp1x1_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,RS4),ax,3) \ + pm(0,7) +#undef dp1x1_1 +#define dp1x1_1(a_) \ + plq(SS(,a_,MM(2,RS4)),ax,1) \ + pm(0,3) \ + puq(7,a_,ax) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pm(0,1) \ + puq(3,SS(a_,RS4),ax) \ + pm(0,2) \ + puq(1,SS(a_,MM(2,RS4)),ax) \ + puq(2,SS(a_,MM(3,RS4)),ax) +#undef pl1x1_1 +#define pl1x1_1 RS4 + + +#undef p1_4_0x1_asum_1 +#define p1_4_0x1_asum_1(a_) \ + pls(a_,ax,1) \ + pan(4,1) \ + pasr(1,0) +#undef p1_2_0x1_asum_1 +#define p1_2_0x1_asum_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pan(4,1) \ + pa(1,0) +#undef p1_0x1_asum_1 +#define p1_0x1_asum_1(a_) \ + plq(a_,ax,1) \ + pan(4,1) \ + pa(1,0) +#undef p2_0x1_asum_1 +#define p2_0x1_asum_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pan(4,1) \ + pan(4,2) \ + pa(1,0) \ + pa(2,0) +#undef p4_0x1_asum_1 +#define p4_0x1_asum_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pan(4,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pan(4,1) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pan(4,2) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pan(4,7) \ + pa(2,0) +#undef lp0x1_asum_1 +#define lp0x1_asum_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) \ + pan(4,7) +#undef dp0x1_asum_1 +#define dp0x1_asum_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pan(4,3) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pan(4,1) \ + pa(3,0) \ + pan(4,2) \ + pa(1,0) \ + pa(2,0) +#undef pl0x1_asum_1 +#define pl0x1_asum_1 RS4 + + +#undef p1_4_sum_1 +#define p1_4_sum_1(a_) \ + pls(a_,ax,1) \ + pasr(1,0) +#undef p1_2_sum_1 +#define p1_2_sum_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + pa(1,0) +#undef p1_sum_1 +#define p1_sum_1(a_) \ + plq(a_,ax,1) \ + pa(1,0) +#undef p2_sum_1 +#define p2_sum_1(a_) \ + plq(a_,ax,1) \ + plq(SS(a_,RS4),ax,2) \ + pa(1,0) \ + pa(2,0) +#undef p4_sum_1 +#define p4_sum_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,7) \ + pa(1,0) \ + plq(SS(a_,MM(5,RS4)),ax,3) \ + pa(2,0) +#undef lpsum_1 +#define lpsum_1(a_) \ + plq(a_,ax,7) \ + plq(SS(a_,MM(1,RS4)),ax,3) +#undef dpsum_1 +#define dpsum_1(a_) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + pa(7,0) \ + plq(SS(a_,MM(3,RS4)),ax,2) \ + pa(3,0) \ + pa(1,0) \ + pa(2,0) +#undef plsum_1 +#define plsum_1 RS4 + + +#undef p1_4_dot_1 +#define p1_4_dot_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,2) \ + pmsr(2,1) \ + pasr(1,0) +#undef p1_2_dot_1 +#define p1_2_dot_1(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pm(2,1) \ + pa(1,0) +#undef p1_dot_1 +#define p1_dot_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pm(2,1) \ + pa(1,0) +#undef p2_dot_1 +#define p2_dot_1(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pm(4,3) \ + pa(3,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(2,1) \ + pa(1,0) +#undef lpdot_1 +#define lpdot_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(a_,ax,3) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_1 +#define dpdot_1(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pm(4,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,0) +#undef pldot_1 +#define pldot_1 8 + +#undef p1_4_dot_1c +#define p1_4_dot_1c(a_) +#undef p1_2_dot_1c +#define p1_2_dot_1c(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pc(1,3) \ + ps(HSHUF,1,1) \ + ps(LSHUF,3,3) \ + pm(7,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p1_dot_1c +#define p1_dot_1c(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pc(1,3) \ + ps(HSHUF,1,1) \ + ps(LSHUF,3,3) \ + pm(7,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p2_dot_1c +#define p2_dot_1c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(HSHUF,3,3) \ + ps(LSHUF,5,5) \ + pm(7,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + pc(1,5) \ + ps(HSHUF,1,1) \ + ps(LSHUF,5,5) \ + pm(7,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef lpdot_1c +#define lpdot_1c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(a_,ax,3) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_1c +#define dpdot_1c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(HSHUF,3,3) \ + ps(LSHUF,5,5) \ + pm(7,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + pc(1,5) \ + ps(HSHUF,1,1) \ + ps(LSHUF,5,5) \ + pm(7,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef pldot_1c +#define pldot_1c 8 + +#undef p1_4_dot_2c +#define p1_4_dot_2c(a_) +#undef p1_2_dot_2c +#define p1_2_dot_2c(a_) \ + px(1) \ + pld(a_,ax,1) \ + px(2) \ + pld(a_,cx,2) \ + pc(1,3) \ + ps(CSHUF,1,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p1_dot_2c +#define p1_dot_2c(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pc(1,3) \ + ps(CSHUF,1,1) \ + pm(2,3) \ + pa(3,0) \ + pm(2,1) \ + pa(1,6) +#undef p2_dot_2c +#define p2_dot_2c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(CSHUF,3,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + plq(SS(a_,MM(2,RS4)),ax,3) \ + pc(1,5) \ + ps(CSHUF,1,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef lpdot_2c +#define lpdot_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(a_,ax,3) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,4) +#undef dpdot_2c +#define dpdot_2c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,2) \ + pc(3,5) \ + ps(CSHUF,3,3) \ + pm(4,5) \ + pa(5,0) \ + pm(4,3) \ + pa(3,6) \ + pc(1,5) \ + ps(CSHUF,1,1) \ + pm(2,5) \ + pa(5,0) \ + pm(2,1) \ + pa(1,6) +#undef pldot_2c +#define pldot_2c 8 + +#undef p1_4_axpby_3 +#define p1_4_axpby_3(a_) \ + pls(a_,ax,0) \ + pls(a_,cx,3) \ + pmsr(5,0) \ + pmsr(6,3) \ + pasr(3,0) \ + pus(0,a_,ax) +#undef p1_2_axpby_3 +#define p1_2_axpby_3(a_) \ + pld(a_,ax,0) \ + pld(a_,cx,3) \ + pm(5,0) \ + pm(6,3) \ + pa(3,0) \ + pud(0,a_,ax) +#undef p1_axpby_3 +#define p1_axpby_3(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,3) \ + pm(5,0) \ + pm(6,3) \ + pa(3,0) \ + punt(0,a_,ax) +#undef p2_axpby_3 +#define p2_axpby_3(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,3) \ + plq(SS(a_,RS4),ax,1) \ + pm(5,0) \ + pm(6,3) \ + pa(3,0) \ + pl(SS(a_,RS4),cx,3) \ + punt(0,a_,ax) \ + pm(5,1) \ + pm(6,3) \ + pa(3,1) \ + punt(1,SS(a_,RS4),ax) +#undef p4_axpby_3 +#define p4_axpby_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(5,2) \ + pl(SS(a_,MM(3,RS4)),cx,7) \ + pm(6,4) \ + pa(4,2) \ + punt(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(5,3) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,7) \ + pa(7,3) \ + punt(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(5,0) \ + pl(SS(a_,MM(5,RS4)),cx,7) \ + pm(6,4) \ + pa(4,0) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(5,1) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,7) \ + pa(7,1) \ + punt(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_3 +#define lpaxpby_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,7) \ + pm(5,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(5,1) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,7) \ + pa(7,1) +#undef dpaxpby_3 +#define dpaxpby_3(a_) \ + pl(SS(a_,MM(3,RS4)),cx,7) \ + pm(5,2) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + pm(5,3) \ + punt(0,a_,ax) \ + pm(6,7) \ + pa(7,3) \ + punt(1,SS(a_,RS4),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_3 +#define plaxpby_3 16 + +#undef p1_4_axpby_3c +#define p1_4_axpby_3c(a_) +#undef p1_2_axpby_3c +#define p1_2_axpby_3c(a_) \ + pld(a_,ax,0) \ + pld(a_,cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + pud(0,a_,ax) +#undef p1_axpby_3c +#define p1_axpby_3c(a_) \ + plq(a_,ax,0) \ + pl(a_,cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + puq(0,a_,ax) +#undef p2_axpby_3c +#define p2_axpby_3c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,3) \ + pc(1,2) \ + pm(5,1) \ + ps(CSHUF,2,2) \ + pm(4,2) \ + pa(2,1) \ + pc(3,2) \ + pm(6,3) \ + pa(3,1) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,1) \ + puq(0,a_,ax) \ + plq(SS(a_,MM(2,RS4)),ax,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) \ + puq(1,SS(a_,RS4),ax) +#undef lpaxpby_3c +#define lpaxpby_3c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + pc(0,3) \ + pm(5,0) \ + ps(CSHUF,3,3) \ + pm(4,3) \ + pa(3,0) \ + pc(2,3) \ + pm(6,2) \ + pa(2,0) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,0) +#undef dpaxpby_3c +#define dpaxpby_3c(a_) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pl(SS(a_,MM(1,RS4)),cx,3) \ + pc(1,2) \ + pm(5,1) \ + ps(CSHUF,2,2) \ + pm(4,2) \ + pa(2,1) \ + pc(3,2) \ + pm(6,3) \ + pa(3,1) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef plaxpby_3c +#define plaxpby_3c 8 + +#undef p1_4_axpby_2 +#define p1_4_axpby_2(a_) \ + pls(a_,cx,5) \ + pls(a_,ax,0) \ + pmsr(6,5) \ + pasr(5,0) \ + pus(0,a_,ax) +#undef p1_2_axpby_2 +#define p1_2_axpby_2(a_) \ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + pud(0,a_,ax) +#undef p1_axpby_2 +#define p1_axpby_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + puq(0,a_,ax) +#undef p2_axpby_2 +#define p2_axpby_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,5) \ + pa(5,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pm(6,4) \ + pa(4,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpby_2 +#define p4_axpby_2(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(6,4) \ + pa(4,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,5) \ + pa(5,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_2 +#define lpaxpby_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,5) \ + pa(5,1) +#undef dpaxpby_2 +#define dpaxpby_2(a_) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_2 +#define plaxpby_2 16 + +#undef p1_4_axpby_2c +#define p1_4_axpby_2c(a_) +#undef p1_2_axpby_2c +#define p1_2_axpby_2c(a_) \ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,ax) +#undef p1_axpby_2c +#define p1_axpby_2c(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + puq(0,a_,ax) +#undef p2_axpby_2c +#define p2_axpby_2c(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pc(5,1) \ + pm(6,5) \ + pa(5,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pc(4,3) \ + pm(6,4) \ + pa(4,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpby_2c +#define p4_axpby_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(7,0) \ + pa(0,2) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + puq(1,SS(a_,RS4),ax) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(7,1) \ + pa(1,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(7,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpby_2c +#define lpaxpby_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) +#undef dpaxpby_2c +#define dpaxpby_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + puq(1,SS(a_,RS4),ax) \ + pm(7,0) \ + pa(0,2) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,1) \ + pa(1,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpby_2c +#define plaxpby_2c 16 + +#undef p1_4_axpby_1 +#define p1_4_axpby_1(a_) \ + pls(a_,ax,1) \ + pls(a_,cx,2) \ + pmsr(5,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpby_1 +#define p1_2_axpby_1(a_) \ + pld(a_,ax,1) \ + pld(a_,cx,2) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpby_1 +#define p1_axpby_1(a_) \ + plq(a_,ax,1) \ + pl(a_,cx,2) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) +#undef p2_axpby_1 +#define p2_axpby_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,4) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(5,3) \ + pm(6,4) \ + pa(4,3) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpby_1 +#define lpaxpby_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) +#undef dpaxpby_1 +#define dpaxpby_1(a_) \ + plq(SS(a_,RS4),ax,3) \ + pl(SS(a_,RS4),cx,4) \ + pm(5,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) \ + pm(5,3) \ + pm(6,4) \ + pa(4,3) \ + puq(3,SS(a_,RS4),ax) +#undef plaxpby_1 +#define plaxpby_1 8 + +#undef p1_4_axpy_0 +#define p1_4_axpy_0(a_) \ + pls(a_,cx,2) \ + pls(a_,ax,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpy_0 +#define p1_2_axpy_0(a_) \ + pld(a_,cx,2) \ + pld(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpy_0 +#define p1_axpy_0(a_) \ + pl(a_,cx,2) \ + plq(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + puq(1,a_,ax) +#undef p2_axpy_0 +#define p2_axpy_0(a_) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + plq(SS(a_,RS4),ax,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + puq(1,a_,ax) \ + pm(6,4) \ + pa(4,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpy_0 +#define lpaxpy_0(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) +#undef dpaxpy_0 +#define dpaxpy_0(a_) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + plq(SS(a_,RS4),ax,3) \ + puq(1,a_,ax) \ + pm(6,4) \ + pa(4,3) \ + puq(3,SS(a_,RS4),ax) +#undef plaxpy_0 +#define plaxpy_0 8 + +#undef p1_4_axpy_1 +#define p1_4_axpy_1(a_) \ + pls(a_,cx,2) \ + pls(a_,ax,1) \ + pmsr(6,2) \ + pasr(2,1) \ + pus(1,a_,ax) +#undef p1_2_axpy_1 +#define p1_2_axpy_1(a_) \ + pld(a_,cx,2) \ + pld(a_,ax,1) \ + pm(6,2) \ + pa(2,1) \ + pud(1,a_,ax) +#undef p1_axpy_1 +#define p1_axpy_1(a_) \ + pl(a_,cx,2) \ + pm(6,2) \ + pam(a_,ax,2) \ + puq(2,a_,ax) +#undef p2_axpy_1 +#define p2_axpy_1(a_) \ + pl(a_,cx,2) \ + pm(6,2) \ + pl(SS(a_,RS4),cx,4) \ + pam(a_,ax,2) \ + pm(6,4) \ + puq(2,a_,ax) \ + pam(SS(a_,RS4),ax,4) \ + puq(4,SS(a_,RS4),ax) +#undef p4_axpy_1 +#define p4_axpy_1(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + pam(SS(a_,MM(2,RS4)),ax,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + pl(SS(a_,MM(4,RS4)),cx,0) \ + pm(6,3) \ + pam(SS(a_,MM(3,RS4)),ax,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(5,RS4)),cx,1) \ + pm(6,0) \ + pam(SS(a_,MM(4,RS4)),ax,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + pl(SS(a_,MM(6,RS4)),cx,2) \ + pm(6,1) \ + pam(SS(a_,MM(5,RS4)),ax,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpaxpy_1 +#define lpaxpy_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(a_,cx,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + pl(SS(a_,RS4),cx,1) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pm(6,0) \ + pam(a_,ax,0) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + pm(6,1) \ + pam(SS(a_,RS4),ax,1) +#undef dpaxpy_1 +#define dpaxpy_1(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + pam(SS(a_,MM(2,RS4)),ax,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + pam(SS(a_,MM(3,RS4)),ax,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_1 +#define plaxpy_1 16 + +#undef p1_4_axpy_2 +#define p1_4_axpy_2(a_) \ + pls(a_,cx,5) \ + pls(a_,ax,0) \ + pmsr(6,5) \ + pasr(5,0) \ + pus(0,a_,ax) +#undef p1_2_axpy_2 +#define p1_2_axpy_2(a_) \ + pld(a_,cx,5) \ + pld(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + pud(0,a_,ax) +#undef p1_axpy_2 +#define p1_axpy_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pm(6,5) \ + pa(5,0) \ + puq(0,a_,ax) +#undef p2_axpy_2 +#define p2_axpy_2(a_) \ + pl(a_,cx,5) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,5) \ + pa(5,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pm(6,4) \ + pa(4,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpy_2 +#define p4_axpy_2(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(6,4) \ + pa(4,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,5) \ + pa(5,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpy_2 +#define lpaxpy_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(6,5) \ + pa(5,1) +#undef dpaxpy_2 +#define dpaxpy_2(a_) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,4) \ + pa(4,2) \ + puq(0,a_,ax) \ + pm(6,5) \ + pa(5,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_2 +#define plaxpy_2 16 + +#undef p1_4_axpy_2c +#define p1_4_axpy_2c(a_) +#undef p1_2_axpy_2c +#define p1_2_axpy_2c(a_) \ + pld(a_,cx,4) \ + pld(a_,ax,0) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + pud(0,a_,ax) +#undef p1_axpy_2c +#define p1_axpy_2c(a_) \ + pl(a_,cx,4) \ + plq(a_,ax,0) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + puq(0,a_,ax) +#undef p2_axpy_2c +#define p2_axpy_2c(a_) \ + pl(a_,cx,4) \ + plq(a_,ax,0) \ + pl(SS(a_,RS4),cx,5) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,RS4),ax,1) \ + puq(0,a_,ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + puq(1,SS(a_,RS4),ax) +#undef p4_axpy_2c +#define p4_axpy_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pm(7,0) \ + pa(0,2) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + puq(1,SS(a_,RS4),ax) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pm(7,1) \ + pa(1,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(7,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) +#undef lpaxpy_2c +#define lpaxpy_2c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,4) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + pl(SS(a_,MM(1,RS4)),cx,5) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(4,2) \ + pm(6,4) \ + pa(4,0) \ + ps(CSHUF,2,2) \ + pl(SS(a_,MM(2,RS4)),cx,4) \ + pm(7,2) \ + pa(2,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pc(5,3) \ + pm(6,5) \ + pa(5,1) \ + ps(CSHUF,3,3) \ + pl(SS(a_,MM(3,RS4)),cx,5) \ + pm(7,3) \ + pa(3,1) +#undef dpaxpy_2c +#define dpaxpy_2c(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + puq(0,a_,ax) \ + pc(4,0) \ + pm(6,4) \ + pa(4,2) \ + ps(CSHUF,0,0) \ + puq(1,SS(a_,RS4),ax) \ + pm(7,0) \ + pa(0,2) \ + pc(5,1) \ + pm(6,5) \ + pa(5,3) \ + ps(CSHUF,1,1) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pm(7,1) \ + pa(1,3) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plaxpy_2c +#define plaxpy_2c 16 + +#undef p1_4_axpy_1c +#define p1_4_axpy_1c(a_) +#undef p1_2_axpy_1c +#define p1_2_axpy_1c(a_) \ + pld(a_,cx,2) \ + pc(2,0) \ + pld(a_,ax,1) \ + ps(CSHUF,0,0) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pud(1,a_,ax) +#undef p1_axpy_1c +#define p1_axpy_1c(a_) \ + pl(a_,cx,2) \ + pc(2,0) \ + plq(a_,ax,1) \ + ps(CSHUF,0,0) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + puq(1,a_,ax) +#undef p2_axpy_1c +#define p2_axpy_1c(a_) \ + plq(SS(a_,RS4),ax,3) \ + ps(CSHUF,0,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pc(4,0) \ + puq(1,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,1) \ + ps(CSHUF,0,0) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(6,4) \ + pa(4,3) \ + pm(7,0) \ + pa(0,3) \ + pc(2,0) \ + puq(3,SS(a_,RS4),ax) +#undef lpaxpy_1c +#define lpaxpy_1c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,1) \ + pc(2,0) +#undef dpaxpy_1c +#define dpaxpy_1c(a_) \ + plq(SS(a_,RS4),ax,3) \ + ps(CSHUF,0,0) \ + pl(SS(a_,RS4),cx,4) \ + pm(6,2) \ + pa(2,1) \ + pm(7,0) \ + pa(0,1) \ + pc(4,0) \ + puq(1,a_,ax) \ + ps(CSHUF,0,0) \ + pm(6,4) \ + pa(4,3) \ + pm(7,0) \ + pa(0,3) \ + puq(3,SS(a_,RS4),ax) +#undef plaxpy_1c +#define plaxpy_1c 8 + +#undef p1_4_copy_1 +#define p1_4_copy_1(a_) \ + pls(a_,cx,2) \ + pus(2,a_,ax) +#undef p1_2_copy_1 +#define p1_2_copy_1(a_) \ + pld(a_,cx,2) \ + pud(2,a_,ax) +#undef p1_copy_1 +#define p1_copy_1(a_) \ + pl(a_,cx,2) \ + puq(2,a_,ax) +#undef p2_copy_1 +#define p2_copy_1(a_) \ + pl(SS(a_,RS4),cx,4) \ + puq(2,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + puq(4,SS(a_,RS4),ax) +#undef lpcopy_1 +#define lpcopy_1(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,2) +#undef dpcopy_1 +#define dpcopy_1(a_) \ + pl(SS(a_,RS4),cx,4) \ + puq(2,a_,ax) \ + puq(4,SS(a_,RS4),ax) +#undef plcopy_1 +#define plcopy_1 8 + +#undef p1_4_copy_2 +#define p1_4_copy_2(a_) \ + pls(a_,ax,2) \ + pus(2,a_,cx) +#undef p1_2_copy_2 +#define p1_2_copy_2(a_) \ + pld(a_,ax,2) \ + pud(2,a_,cx) +#undef p1_copy_2 +#define p1_copy_2(a_) \ + plq(a_,ax,2) \ + pu(2,a_,cx) +#undef p2_copy_2 +#define p2_copy_2(a_) \ + plq(SS(a_,RS4),ax,4) \ + pu(2,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pu(4,SS(a_,RS4),cx) +#undef lpcopy_2 +#define lpcopy_2(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,2) +#undef dpcopy_2 +#define dpcopy_2(a_) \ + plq(SS(a_,RS4),ax,4) \ + pu(2,a_,cx) \ + pu(4,SS(a_,RS4),cx) +#undef plcopy_2 +#define plcopy_2 8 + +#undef p1_4_copy_3 +#define p1_4_copy_3(a_) \ + pls(a_,cx,2) \ + pus(2,a_,ax) +#undef p1_2_copy_3 +#define p1_2_copy_3(a_) \ + pld(a_,cx,2) \ + pud(2,a_,ax) +#undef p1_copy_3 +#define p1_copy_3(a_) \ + pl(a_,cx,2) \ + punt(2,a_,ax) +#undef p2_copy_3 +#define p2_copy_3(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) +#undef p4_copy_3 +#define p4_copy_3(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) +#undef p8_copy_3 +#define p8_copy_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pl(SS(a_,MM(6,RS4)),cx,6) \ + pl(SS(a_,MM(7,RS4)),cx,7) \ + punt(0,SS(a_,MM(0,RS4)),ax) \ + punt(1,SS(a_,MM(1,RS4)),ax) \ + punt(2,SS(a_,MM(2,RS4)),ax) \ + punt(3,SS(a_,MM(3,RS4)),ax) \ + punt(4,SS(a_,MM(4,RS4)),ax) \ + punt(5,SS(a_,MM(5,RS4)),ax) \ + punt(6,SS(a_,MM(6,RS4)),ax) \ + punt(7,SS(a_,MM(7,RS4)),ax) +#undef lpcopy_3 +#define lpcopy_3(a_) +#undef dpcopy_3 +#define dpcopy_3(a_) p8_copy_3(a_) +#undef plcopy_3 +#define plcopy_3 32 + +#undef p1_4_cpsc_3 +#define p1_4_cpsc_3(a_) \ + pls(a_,ax,0) \ + pmsr(6,0) \ + pus(0,a_,cx) +#undef p1_2_cpsc_3 +#define p1_2_cpsc_3(a_) \ + pld(a_,ax,0) \ + pm(6,0) \ + pud(0,a_,cx) +#undef p1_cpsc_3 +#define p1_cpsc_3(a_) \ + plq(a_,ax,0) \ + pm(6,0) \ + pu(0,a_,cx) +#undef p2_cpsc_3 +#define p2_cpsc_3(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pm(6,0) \ + pm(6,1) \ + pu(0,a_,cx) \ + pu(1,SS(a_,RS4),cx) +#undef p4_cpsc_3 +#define p4_cpsc_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + pu(0,a_,cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(6,3) \ + pu(1,SS(a_,RS4),cx) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(6,0) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(6,1) \ + pu(3,SS(a_,MM(3,RS4)),cx) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) +#undef lpcpsc_3 +#define lpcpsc_3(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pm(6,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(6,1) +#undef dpcpsc_3 +#define dpcpsc_3(a_) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(6,2) \ + pu(0,a_,cx) \ + pm(6,3) \ + pu(1,SS(a_,RS4),cx) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pu(3,SS(a_,MM(3,RS4)),cx) +#undef plcpsc_3 +#define plcpsc_3 16 + +#undef p1_4_cpsc_3c +#define p1_4_cpsc_3c(a_) +#undef p1_2_cpsc_3c +#define p1_2_cpsc_3c(a_) \ + pld(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pud(0,a_,cx) +#undef p1_cpsc_3c +#define p1_cpsc_3c(a_) \ + plq(a_,ax,0) \ + pc(0,1) \ + pm(6,0) \ + ps(CSHUF,1,1) \ + pm(7,1) \ + pa(1,0) \ + pu(0,a_,cx) +#undef p2_cpsc_3c +#define p2_cpsc_3c(a_) \ + plq(a_,ax,0) \ + plq(SS(a_,RS4),ax,1) \ + pc(0,2) \ + pm(6,0) \ + ps(CSHUF,2,2) \ + pm(7,2) \ + pa(2,0) \ + pu(0,a_,cx) \ + pc(1,3) \ + pm(6,1) \ + ps(CSHUF,3,3) \ + pm(7,3) \ + pa(3,1) \ + pu(1,SS(a_,RS4),cx) +#undef p4_cpsc_3c +#define p4_cpsc_3c(a_) \ + pu(0,a_,cx) \ + pc(2,4) \ + pm(6,2) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ + plq(SS(a_,MM(4,RS4)),ax,0) \ + pm(7,4) \ + pa(4,2) \ + pu(1,SS(a_,RS4),cx) \ + pc(3,4) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(5,RS4)),ax,1) \ + pm(7,4) \ + pa(4,3) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \ + plq(SS(a_,MM(6,RS4)),ax,2) \ + pm(7,4) \ + pa(4,0) \ + pu(3,SS(a_,MM(3,RS4)),cx) \ + pc(1,4) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(7,RS4)),ax,3) \ + pm(7,4) \ + pa(4,1) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) +#undef lpcpsc_3c +#define lpcpsc_3c(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ + plq(SS(a_,MM(0,RS4)),ax,0) \ + plq(SS(a_,MM(1,RS4)),ax,1) \ + pc(0,4) \ + pm(6,0) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(2,RS4)),ax,2) \ + pm(7,4) \ + pa(4,0) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pc(1,4) \ + pm(6,1) \ + ps(CSHUF,4,4) \ + plq(SS(a_,MM(3,RS4)),ax,3) \ + pm(7,4) \ + pa(4,1) +#undef dpcpsc_3c +#define dpcpsc_3c(a_) \ + pu(0,a_,cx) \ + pc(2,4) \ + pm(6,2) \ + ps(CSHUF,4,4) \ + pu(1,SS(a_,RS4),cx) \ + pm(7,4) \ + pa(4,2) \ + pc(3,4) \ + pm(6,3) \ + ps(CSHUF,4,4) \ + pu(2,SS(a_,MM(2,RS4)),cx) \ + pm(7,4) \ + pa(4,3) \ + pu(3,SS(a_,MM(3,RS4)),cx) +#undef plcpsc_3c +#define plcpsc_3c 16 + +#undef p1_4_cpsc_4 +#define p1_4_cpsc_4(a_) \ + pls(a_,cx,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_cpsc_4 +#define p1_2_cpsc_4(a_) \ + pld(a_,cx,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_cpsc_4 +#define p1_cpsc_4(a_) \ + pl(a_,cx,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_cpsc_4 +#define p2_cpsc_4(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_4 +#define p4_cpsc_4(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(4,RS4)),cx,0) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + pl(SS(a_,MM(5,RS4)),cx,1) \ + pm(6,0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \ + pl(SS(a_,MM(6,RS4)),cx,2) \ + pm(6,1) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef lpcpsc_4 +#define lpcpsc_4(a_) \ + f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pm(6,0) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pm(6,1) +#undef dpcpsc_4 +#define dpcpsc_4(a_) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,2) \ + puq(0,a_,ax) \ + pm(6,3) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef plcpsc_4 +#define plcpsc_4 16 + +#undef p1_4_cpsc_5 +#define p1_4_cpsc_5(a_) \ + pls(a_,cx,0) \ + pmsr(6,0) \ + pus(0,a_,ax) +#undef p1_2_cpsc_5 +#define p1_2_cpsc_5(a_) \ + pld(a_,cx,0) \ + pm(6,0) \ + pud(0,a_,ax) +#undef p1_cpsc_5 +#define p1_cpsc_5(a_) \ + pl(a_,cx,0) \ + pm(6,0) \ + puq(0,a_,ax) +#undef p2_cpsc_5 +#define p2_cpsc_5(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + pm(6,0) \ + pm(6,1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_5 +#define p4_cpsc_5(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_cpsc_5 +#define p8_cpsc_5(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + pl(SS(a_,MM(5,RS4)),cx,5) \ + pl(SS(a_,MM(6,RS4)),cx,7) \ + pm(6,0) \ + pm(6,1) \ + pm(6,2) \ + pm(6,3) \ + puq(0,a_,ax) \ + pl(SS(a_,MM(7,RS4)),cx,0) \ + pm(6,4) \ + pm(6,5) \ + pm(6,7) \ + pm(6,0) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + puq(5,SS(a_,MM(5,RS4)),ax) \ + puq(7,SS(a_,MM(6,RS4)),ax) \ + puq(0,SS(a_,MM(7,RS4)),ax) +#undef lpcpsc_5 +#define lpcpsc_5(a_) +#undef dpcpsc_5 +#define dpcpsc_5(a_) p8_cpsc_5(a_) +#undef plcpsc_5 +#define plcpsc_5 32 + +#undef cpsc_cdp +#define cpsc_cdp(a_) pc(a_,5) pm(6,a_) ps(CSHUF,5,5) pm(7,5) pa(5,a_) +#undef p1_4_cpsc_5c +#define p1_4_cpsc_5c(a_) +#undef p1_2_cpsc_5c +#define p1_2_cpsc_5c(a_) \ + pld(a_,cx,0) \ + cpsc_cdp(0) \ + pud(0,a_,ax) +#undef p1_cpsc_5c +#define p1_cpsc_5c(a_) \ + pl(a_,cx,0) \ + cpsc_cdp(0) \ + puq(0,a_,ax) +#undef p2_cpsc_5c +#define p2_cpsc_5c(a_) \ + pl(a_,cx,0) \ + pl(SS(a_,RS4),cx,1) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) +#undef p4_cpsc_5c +#define p4_cpsc_5c(a_) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + cpsc_cdp(2) \ + cpsc_cdp(3) \ + puq(0,a_,ax) \ + puq(1,SS(a_,RS4),ax) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + puq(3,SS(a_,MM(3,RS4)),ax) +#undef p8_cpsc_5c +#define p8_cpsc_5c(a_) \ + f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \ + pl(SS(a_,MM(0,RS4)),cx,0) \ + pl(SS(a_,MM(1,RS4)),cx,1) \ + pl(SS(a_,MM(2,RS4)),cx,2) \ + pl(SS(a_,MM(3,RS4)),cx,3) \ + pl(SS(a_,MM(4,RS4)),cx,4) \ + cpsc_cdp(0) \ + cpsc_cdp(1) \ + puq(0,a_,ax) \ + pl(SS(a_,MM(5,RS4)),cx,0) \ + cpsc_cdp(2) \ + cpsc_cdp(3) \ + puq(1,SS(a_,RS4),ax) \ + pl(SS(a_,MM(6,RS4)),cx,1) \ + cpsc_cdp(4) \ + cpsc_cdp(0) \ + puq(2,SS(a_,MM(2,RS4)),ax) \ + pl(SS(a_,MM(7,RS4)),cx,2) \ + cpsc_cdp(1) \ + cpsc_cdp(2) \ + puq(3,SS(a_,MM(3,RS4)),ax) \ + puq(4,SS(a_,MM(4,RS4)),ax) \ + puq(0,SS(a_,MM(5,RS4)),ax) \ + puq(1,SS(a_,MM(6,RS4)),ax) \ + puq(2,SS(a_,MM(7,RS4)),ax) +#undef lpcpsc_5c +#define lpcpsc_5c(a_) +#undef dpcpsc_5c +#define dpcpsc_5c(a_) p8_cpsc_5c(a_) +#undef plcpsc_5c +#define plcpsc_5c 32 + +#undef p1_4_cpsc_1 +#define p1_4_cpsc_1(a_) \ + pls(a_,ax,2) \ + pmsr(3,2) \ + pus(2,a_,cx) +#undef p1_2_cpsc_1 +#define p1_2_cpsc_1(a_) \ + pld(a_,ax,2) \ + pm(3,2) \ + pud(2,a_,cx) +#undef p1_cpsc_1 +#define p1_cpsc_1(a_) \ + plq(a_,ax,2) \ + pm(3,2) \ + pu(2,a_,cx) +#undef p2_cpsc_1 +#define p2_cpsc_1(a_) \ + plq(SS(a_,RS4),ax,4) \ +