From 96a32415ab43377cf1575bd3f4f2980f58028209 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Fri, 14 Aug 2015 11:51:42 +0800
Subject: add implementation for kaldi io (by ymz)

---
 .../tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h |  188 ++
 kaldi_io/src/tools/ATLAS/include/contrib/Make.ext  |   39 +
 .../src/tools/ATLAS/include/contrib/SSE3Dnow.h     |  709 +++++
 .../src/tools/ATLAS/include/contrib/camm_dpa.h     | 1626 +++++++++++
 .../src/tools/ATLAS/include/contrib/camm_pipe3.h   |  295 ++
 .../src/tools/ATLAS/include/contrib/camm_scale.h   |  215 ++
 .../src/tools/ATLAS/include/contrib/camm_strat1.h  | 2982 ++++++++++++++++++++
 .../src/tools/ATLAS/include/contrib/camm_tpipe.h   |  331 +++
 .../src/tools/ATLAS/include/contrib/camm_util.h    |  508 ++++
 9 files changed, 6893 insertions(+)
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/Make.ext
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h
 create mode 100644 kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h

(limited to 'kaldi_io/src/tools/ATLAS/include/contrib')

diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h
new file mode 100644
index 0000000..118d3de
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/ATL_gemv_ger_SSE.h
@@ -0,0 +1,188 @@
+#ifdef GER
+#undef NO_TRANSPOSE
+#define NO_TRANSPOSE
+#endif
+/* GER builds are forced onto the NO_TRANSPOSE code path (above). */
+/* Build checks: NDPM <= 4; SREAL/SCPLX need ATL_SSE1, DREAL/DCPLX need ATL_SSE2. */
+#if NDPM > 4
+#error Max NDPM is 4 
+#endif
+
+#if !defined(ATL_SSE1) && ( defined(SREAL) || defined(SCPLX) )
+#error This routine needs ATL_SSE1 defined
+#endif
+
+#if !defined(ATL_SSE2) && ( defined(DREAL) || defined(DCPLX) )
+#error This routine needs ATL_SSE2 defined
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>  /* memset() */
+#include "camm_util.h"
+/* camm_scale.h is only pulled in for non-GER builds that define BETAX or BETAXI0. */
+#ifndef GER
+#if defined(BETAX) || defined(BETAXI0)
+#include "camm_scale.h"
+#endif
+#endif
+/* camm_dpa.h is included once per NDP value from NDPM down to 1; EXTn is the suffix later used to call Mjoin(dp,EXTn). */
+#if NDPM >= 4
+#define EXT4 Mjoin(4dp,BLC)
+#undef NDP
+#define NDP 4
+#undef EXT
+#define EXT EXT4
+#include "camm_dpa.h"
+#endif
+
+#if NDPM >= 3
+#define EXT3 Mjoin(3dp,BLC)
+#undef NDP
+#define NDP 3
+#undef EXT
+#define EXT EXT3
+#include "camm_dpa.h"
+#endif
+
+#if NDPM >= 2
+#define EXT2 Mjoin(2dp,BLC)
+#undef NDP
+#define NDP 2
+#undef EXT
+#define EXT EXT2
+#include "camm_dpa.h"
+#endif
+
+#define EXT1 Mjoin(1dp,BLC)
+#undef NDP
+#define NDP 1
+#undef EXT
+#define EXT EXT1
+#include "camm_dpa.h"
+/* Last pass: NDP = NDPM again, with an extra trailing m joined onto the EXT name. */
+#undef NDP
+#define NDP NDPM
+#undef EXT
+#define EXT Mjoin(Mjoin(NDP,Mjoin(dp,BLC)),m)
+#include "camm_dpa.h"
+
+#ifdef GER
+#if defined(SCPLX) || defined(DCPLX)
+#ifdef Conj_
+#define IM 1c
+#else
+#define IM 1u
+#endif
+#else
+#define IM 1
+#endif
+
+/* GER build: name joined from ATL_, PREC, ger, IM, _a1_x1_yX (IM is 1c/1u for complex with/without Conj_, 1 for real). */
+#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),Mjoin(ger,IM)),_a1_x1_yX)
+
+#undef MY_FUNCTION
+#define MY_FUNCTION FN
+
+void 
+MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *c,
+   int cinc,const TYPE *b,int binc,
+   TYPE *a,int lda) {
+
+#else
+
+/* Non-GER build: name joined from ATL_, PREC, gemv, FEXT, _a1_x1_, BL, _y1; this variant also takes beta. */
+#define FN Mjoin(Mjoin(Mjoin(ATL_,PREC),gemv),Mjoin(FEXT,Mjoin(_a1_x1_,Mjoin(BL,_y1))))
+
+#undef MY_FUNCTION
+#define MY_FUNCTION FN
+
+void 
+MY_FUNCTION(int m,int n, const SCALAR alpha,const TYPE *a,
+   int lda,const TYPE *b,int binc,
+   const SCALAR beta,TYPE *c,int cinc) {
+
+#endif
+/* Shared body. len is passed to every kernel call; w*lda is the span of a that gets walked. alpha is never read here. */
+  int i,mm,nn;
+  const TYPE *ae;
+#ifdef NO_TRANSPOSE
+  int len=m,w=n;
+#define zz b
+#else
+  int len=n,w=m;
+#define zz c
+#endif
+/* zz names the vector pointer (b or c) that advances by zzinc each time a advances by lda. */
+#ifdef GER
+#define zzinc binc
+#else
+#define zzinc 1
+
+/* Non-GER only: zero c (NO_TRANSPOSE with BETA0) or pass it to SCALE with beta (BETAX/BETAXI0) before the kernel calls. */
+#if defined(NO_TRANSPOSE) && defined(BETA0)
+  memset(c,0,m*sizeof(*c));
+#endif
+
+#if defined(BETAX) || defined(BETAXI0)
+#if defined(SCPLX) || defined(DCPLX)
+  SCALE(beta,c,m);
+#endif
+#if defined(SREAL) || defined(DREAL)
+  SCALE(&beta,c,m);
+#endif
+#endif
+
+#endif
+/* ae marks the end of the w lda-sized steps through a; nn = STRIDE*lda is the second argument of every kernel call. */
+  ae=a+w*lda;
+  nn=STRIDE*lda;
+
+
+#if NDPM == 1
+  for (;a<ae;a+=lda,zz+=zzinc)
+    Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len);
+
+#else
+/* Main loop: while NDPM*nn elements still fit before ae, call the kernel named by EXT at STRIDE successive offsets, then skip the rest of that group. */
+  while (a+NDPM*nn<=ae) {
+    for (i=0;i<STRIDE;i++,a+=lda,zz+=zzinc) 
+      Mjoin(dp,EXT)(a,nn,b,c,STRIDE*zzinc,len);
+
+    a+=(NDPM-1)*nn;
+    zz+=(NDPM-1)*STRIDE*zzinc;
+  }
+/* Tail: mm counts the nn-sized steps left (one more when STRIDE > 1 and (ae-a)/lda is not a multiple of STRIDE) and selects the EXT1..EXT4 kernel. */
+  for (i=0;a<ae && i<STRIDE;i++,a+=lda,zz+=zzinc) {
+
+    mm=(ae-a)/nn;
+#if STRIDE > 1
+    if (((ae-a)/lda)%STRIDE)
+      mm++;
+#endif
+    
+    if (mm == 1)
+      Mjoin(dp,EXT1)(a,nn,b,c,STRIDE*zzinc,len);
+
+#if ( NDPM == 2 && STRIDE > 1 ) || NDPM > 2
+    else if (mm == 2)
+      Mjoin(dp,EXT2)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+#if ( NDPM == 3 && STRIDE > 1 ) || NDPM > 3
+    else if (mm == 3)
+      Mjoin(dp,EXT3)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+#if ( NDPM == 4 && STRIDE > 1 ) || NDPM > 4
+    else if (mm == 4)
+      Mjoin(dp,EXT4)(a,nn,b,c,STRIDE*zzinc,len);
+#endif
+
+
+  }
+
+#endif
+
+}
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext
new file mode 100644
index 0000000..f7f9a0a
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/Make.ext
@@ -0,0 +1,39 @@
+# Copies the contrib headers in $(files) out of $(topd)/kernel; every path here is absolute under /home/whaley.
+topd = /home/whaley/atlas3.8/AtlasBase
+incs = -def topd /home/whaley/atlas3.8/AtlasBase \
+       -def incd /home/whaley/atlas3.8/AtlasBase/Clint \
+       -def BASEdir /home/whaley/atlas3.8/AtlasBase/Antoine/ \
+       -def basd /home/whaley/atlas3.8/AtlasBase/Clint
+ext  = extract
+extF = $(ext) -langF -lnlen71 -Remtblank -llwarn2 -LAPACK1 $(incs)
+extC = $(ext) -langC -lnlen79 -Remtblank -llwarn2 $(incs)
+extM = $(ext) -langM -lnlen79 -llwarn2 $(incs)
+
+default: all
+force_build:
+basd = /home/whaley/atlas3.8/AtlasBase/Clint
+basdRCW = /home/whaley/atlas3.8/AtlasBase/Clint
+basdAPP = /home/whaley/atlas3.8/AtlasBase/Antoine
+incf = /home/whaley/atlas3.8/AtlasBase/gen.inc
+# Headers this directory provides; the all target depends on every one of them.
+files = ATL_gemv_ger_SSE.h SSE3Dnow.h camm_dpa.h camm_pipe3.h camm_scale.h \
+        camm_strat1.h camm_tpipe.h camm_util.h
+
+all : $(files)
+
+camm_strat1.h : $(topd)/kernel/CammMaguire/camm_strat1.h
+	cp $(topd)/kernel/CammMaguire/camm_strat1.h .
+camm_tpipe.h : $(topd)/kernel/CammMaguire/camm_tpipe.h
+	cp $(topd)/kernel/CammMaguire/camm_tpipe.h .
+camm_pipe3.h : $(topd)/kernel/CammMaguire/camm_pipe3.h
+	cp $(topd)/kernel/CammMaguire/camm_pipe3.h .
+ATL_gemv_ger_SSE.h : $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h
+	cp $(topd)/kernel/CammMaguire/ATL_gemv_ger_SSE.h .
+camm_util.h : $(topd)/kernel/CammMaguire/camm_util.h
+	cp $(topd)/kernel/CammMaguire/camm_util.h .
+camm_scale.h : $(topd)/kernel/CammMaguire/camm_scale.h
+	cp $(topd)/kernel/CammMaguire/camm_scale.h .
+camm_dpa.h : $(topd)/kernel/CammMaguire/camm_dpa.h
+	cp $(topd)/kernel/CammMaguire/camm_dpa.h .
+SSE3Dnow.h : $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h
+	cp $(topd)/kernel/PeterSoendergaard/SSE3Dnow.h .
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h
new file mode 100644
index 0000000..a783749
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/SSE3Dnow.h
@@ -0,0 +1,709 @@
+#if !defined(ATL_GAS_x8632) &&  !defined(ATL_GAS_x8664)
+   #error "This kernel requires gas x86 assembler!"
+#endif
+#ifndef Mstr   /* Added by RCW to make multiline macros work */
+   #define Mstr2(m) # m
+   #define Mstr(m) Mstr2(m)
+#endif
+/*  The meaning of the defined macros is as follows:
+ *  VECLEN:         The length of a single-precision vector register
+ *  vec_add:        Add two single precision vectors.
+ *  vec_mul:        Multiply two single precision vectors.
+ *  vec_mov:        Moves data around
+ *  vec_mov1:       Load one element in a vector and zero all other entries!
+ *  vec_splat:      Load one element replicated in all positions in the vector.
+ *  vec_load_apart: Load elements from different memory positions into a register.
+ *  vec_sum:        Sums a register.
+ *  vec_store_one:  Stores lowest element in vector to memory, no zero-extend!
+ * Meaning of suffixes is as follows:
+ *    mr means memory to register
+ *    rr means register to register 
+ *    rm means register to memory
+ *    a means that instruction needs aligned data
+ *    1 means that the instructions only operates on the lowest element of the
+ *      vector.
+ * 
+ * The _1 instructions work under one important assumption: That you never mix
+ * them with regular instructions, e.g. loading into a register with a normal
+ * mov, and then using add_rr_1 will not work under 3dnow! since it is in
+ * reality a normal add.  However, if using a mov_1 first, the upper part of
+ * the register will be zeroed, and it will therefore work. The _1 system is
+ * more robust under SSE, but other architectures might be implemented the
+ * same way as 3dnow!
+ *
+ * RCW: I added the following functionality for SSE only (note that vw may
+ *      be overwritten with intermediate results, but is not used as input,
+ *      and that all input arrays may be overwritten with intermediate results.
+ *      VL : vector length -1):
+ *   vec_red(vd, vw) : vd[0] = sum(vd[0:VL])
+ *   vec_red2(v1, v2, vw) : v1[0] = sum(v1[0:VL]); v1[1] = sum(v2[0:VL])
+ *   vec_red4(v0, v1, v2, v3, vw1, vw2) :
+ *      v0[0] = sum(v0[0:VL]); v0[1] = sum(v1[0:VL])
+ *      if type = double:
+ *         v2[0] = sum(v2[0:VL]); v2[1] = sum(v3[0:VL])
+ *      else 
+ *         v0[2] = sum(v2[0:VL]); v0[3] = sum(v3[0:VL])
+ *    vec_zero(vd) : vd[0:VL] = 0.0
+ */
+
+
+/* Things to try:
+ *    Non-temporal stores 
+ *    Sequences of instructions instead of movups
+ *
+ *
+ *
+ *
+ */
+
+
+
+#define gen_vec_rr(op,reg1,reg2) \
+        __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \
+                              :  /* nothing */ \
+                              : /* nothing */)
+
+
+#define w(p) p
+
+#define nop()             __asm__ __volatile__ ("nop")
+
+#define rep()             __asm__ __volatile__ ("rep")
+
+#define align()           __asm__ __volatile__ (".align 16")
+
+
+#ifdef x87double
+
+#define st0 %%st(0)
+#define st1 %%st(1)
+#define st2 %%st(2)
+#define st3 %%st(3)
+#define st4 %%st(4)
+#define st5 %%st(5)
+#define st6 %%st(6)
+#define st7 %%st(7)
+
+
+#define gen_stack_rt(op,reg) \
+        __asm__ __volatile__ (#op " " #reg \
+                              :  /* nothing */ \
+                              : /* nothing */)
+
+#define gen_stack_tr(op,reg) \
+        __asm__ __volatile__ (#op " %%st(0)," #reg \
+                              :  \
+                              : )
+
+
+#define gen_stack_rr(op,reg1,reg2) \
+        __asm__ __volatile__ (#op " " #reg1 ", " #reg2 \
+                              :  /* nothing */ \
+                              : /* nothing */)
+
+#define gen_stack_t(op)  \
+        __asm__ __volatile__ (#op \
+                              :  /* nothing */ \
+                              : /* nothing */)
+
+
+#define gen_stack_tm(op,mem) \
+        __asm__ __volatile__ (#op " %0" \
+                              : "=m" (((mem)[0])) \
+                              :  )
+
+#define gen_stack_mt(op,mem) \
+        __asm__ __volatile__ (#op " %0" \
+                              :  \
+                              : "m" (((mem)[0])))
+
+
+#define stack_mov_mt_push(mem)  gen_stack_mt(fldl,mem)
+
+#define stack_add_tr_pop(reg)   gen_stack_tr(faddp,reg)
+#define stack_add_mt(mem)       gen_stack_mt(faddl,mem)
+
+#define stack_mul_tr(reg)       gen_stack_tr(fmul,reg)
+#define stack_mul_tr_pop(reg)   gen_stack_tr(fmulp,reg)
+#define stack_mul_mt(mem)       gen_stack_mt(fmul,mem)
+
+#define stack_mov_tm_pop(mem)   gen_stack_tm(fstpl,mem)
+
+#define stack_zero_push()       gen_stack_t(fldz)
+
+#endif /* x87double */
+
+#ifdef SSE
+
+/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to
+ * load/store from misaligned addresses using movups at a cost of some cycles. Loading
+ * using mul/add must always be aligned. Alignment is 16 bytes.
+ * No muladd.
+ */
+
+
+
+#define gen_vec_mr(op,mem,reg) \
+        __asm__ __volatile__ (#op " %0, " #reg \
+                              :  /* nothing */ \
+                              : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3])))
+
+
+#define gen_vec_rm(op,reg,mem) \
+        __asm__ __volatile__ (#op " " #reg ", %0" \
+                              : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \
+                              :  /* nothing */ )                          
+
+
+
+
+#define VECLEN 4
+
+#define reg0 %%xmm0
+#define reg1 %%xmm1
+#define reg2 %%xmm2
+#define reg3 %%xmm3
+#define reg4 %%xmm4
+#define reg5 %%xmm5
+#define reg6 %%xmm6
+#define reg7 %%xmm7
+#ifdef ATL_GAS_x8664
+   #define reg8 %%xmm8
+   #define reg9 %%xmm9
+   #define reg10 %%xmm10
+   #define reg11 %%xmm11
+   #define reg12 %%xmm12
+   #define reg13 %%xmm13
+   #define reg14 %%xmm14
+   #define reg15 %%xmm15
+#endif
+
+#define vec_mov_mr(mem,reg)     gen_vec_mr(movups,mem,reg)
+#define vec_mov_rm(reg,mem)     gen_vec_rm(movups,reg,mem)
+#define vec_mov_mr_a(mem,reg)   gen_vec_mr(movaps,mem,reg)
+#define vec_mov_rm_a(reg,mem)   gen_vec_rm(movaps,reg,mem)
+#define vec_mov_rr(reg1,reg2)   gen_vec_rr(movaps,reg1,reg2)
+
+#define vec_add_mr_a(mem,reg)   gen_vec_mr(addps,mem,reg)
+#define vec_mul_mr_a(mem,reg)   gen_vec_mr(mulps,mem,reg)
+
+#define vec_add_rr(mem,reg)     gen_vec_rr(addps,mem,reg)
+#define vec_mul_rr(mem,reg)     gen_vec_rr(mulps,mem,reg)
+
+#define vec_mov_mr_1(mem,reg)   gen_vec_mr(movss,mem,reg)
+#define vec_mov_rm_1(reg,mem)   gen_vec_rm(movss,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movss,reg1,reg2)
+
+#define vec_add_mr_1(mem,reg)   gen_vec_mr(addss,mem,reg)
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addss,reg1,reg2)
+
+#define vec_mul_mr_1(mem,reg)   gen_vec_mr(mulss,mem,reg)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulss,reg1,reg2)
+
+#define vec_unpack_low(reg1,reg2)  gen_vec_rr(unpcklps,reg1,reg2)
+#define vec_unpack_high(reg1,reg2) gen_vec_rr(unpckhps,reg1,reg2)
+#define vec_shuffle(mode,reg1,reg2) vec_shuffle_wrap(mode,reg1,reg2)
+#define vec_shuffle_wrap(mode,reg1,reg2) \
+        __asm__ __volatile__ ("shufps " #mode ", " #reg1 ", " #reg2 \
+			      : /* nothing */\
+			      : /* nothing */) 
+
+/* Hack! */
+/* To use this instruction be sure that register 7 is not in use!!! */
+/* It must be possible to reduce this sequence to only four instructions.
+ * please tell me how! */
+#define vec_sum(reg) vec_sum_wrap(reg)
+#define vec_sum_wrap(reg) \
+        __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\
+    			      "addps " #reg ", %%xmm7\n"\
+    			      "movaps %%xmm7, " #reg "\n"\
+                              "shufps $1, " #reg ", %%xmm7\n"\
+    			      "addss %%xmm7, " #reg "\n"\
+			      : /* nothing */\
+			      : /* nothing */) 
+
+/* RCW: added to safely replace vec_sum (vec reduce), and use SSE3 when avail */
+#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::)
+#ifdef ATL_SSE3
+   #define vec_red(vr, vwrk) \
+     __asm__ __volatile__("haddps " Mstr(vr) ", " Mstr(vr) "\n"\
+                          "haddps " Mstr(vr) ", " Mstr(vr) "\n" ::)
+/*
+ * haddps v1 v0         # v0 = {v1cd, v1ab, v0cd, v0ab}
+ * haddps v0 v0         # v0 = {v1abcd, v0abcd, v1abcd, v0abcd}
+ */
+   #define vec_red2(v0, v1, vwork) \
+     __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\
+                          "haddps " Mstr(v0) ", " Mstr(v0) "\n" ::)
+/*
+ * haddps   v1, v0     # v0 = {v1cd,v1ab,v0cd,v0ab}
+ * haddps   v3, v2     # v2 = {v3cd,v3ab,v2cd,v2ab}
+ * haddps   v2, v0     # v0 = {v3abcd,v2abcd,v1abcd, v0abcd}
+ */
+   #define vec_red4(v0, v1, v2, v3, w0, w1) \
+     __asm__ __volatile__("haddps " Mstr(v1) ", " Mstr(v0) "\n"\
+                          "haddps " Mstr(v3) ", " Mstr(v2) "\n"\
+                          "haddps " Mstr(v2) ", " Mstr(v0) "\n" ::)
+#elif defined(ATL_SSE2)
+   #define vec_red(vr, vwrk) \
+      __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+                            "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\
+                            "pshufd $0xE5, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+                            "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\
+                            ::)
+#else
+   #define vec_red(vr, vwrk) \
+      __asm__ __volatile__ ("movhlps " Mstr(vr) ", " Mstr(vwrk) "\n"\
+                            "addps " Mstr(vwrk) ", " Mstr(vr) "\n"\
+                            "movaps " Mstr(vr) ", " Mstr(vwrk) "\n"\
+                            "shufps $0xE5, " Mstr(vr) ", " Mstr(vr) "\n"\
+                            "addss " Mstr(vwrk) ", " Mstr(vr) "\n"\
+                            ::)
+#endif
+#ifndef ATL_SSE3  /* codes that are the same for SSE2 and SSE1 */
+/*
+                                # v0 = {v0d,v0c,v0b,v0a}
+                                # v1 = {v1d,v1c,v1b,v1a}
+       movaps   v0, vw          # vw = {v0d,v0c,v0b,v0a}
+       unpcklps v1, v0          # v0 = {v1b,v0b,v1a,v0a}
+       unpckhps v1, vw          # vw = {v1d,v0d,v1c,v0c}
+       addps    vw, v0          # v0 = {v1bd,v0bd,v1ac,v0ac}
+       movhlps  v0, vw          # vw = {X   ,   X,v1bd,v0bd}
+       addps    vw, v0          # v0 = {X   ,   X,v1abcd,v0abcd}
+*/
+   #define vec_red2(v0, v1, vw) \
+     __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(vw) "\n"\
+                           "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\
+                           "unpckhps " Mstr(v1) ", " Mstr(vw) "\n"\
+                           "addps " Mstr(vw) ", " Mstr(v0) "\n"\
+                           "movhlps " Mstr(v0) ", " Mstr(vw) "\n"\
+                           "addps " Mstr(vw) ", " Mstr(v0) "\n"\
+                           ::)
+/* 
+ * movaps  v0, w0       # w0 = {v0d, v0c, v0b, v0a}
+ * unpcklps v1, v0      # v0 = {v1b, v0b, v1a, v0a}
+ * movaps  v2, w1       # w1 = {v2d, v2c, v2b, v2a}
+ * unpckhps v1, w0      # w0 = {v1d, v0d, v1c, v0c}
+ * unpcklps v3, v2      # v2 = {v3b, v2b, v3a, v2a}
+ * addps    w0, v0      # v0 = {v1bd, v0bd, v1ac, v0ac}
+ * unpckhps v3, w1      # w1 = {v3d, v2d, v3c, v2c}
+ * movaps   v0, w0      # w0 = {v1bd, v0bd, v1ac, v0ac}
+ * addps    w1, v2      # v2 = {v3bd, v2bd, v3ac, v2ac}
+ * shufps   $0x44,v2,v0 # v0 = {v3ac, v2ac, v1ac, v0ac}
+ * shufps   $0xEE,v2,w0 # w0 = {v3bd, v2bd, v1bd, v0bd}
+ * addps    w0, v0      # v0 = {v3abcd, v2abcd, v1abcd, v0abcd}
+ */
+   #define vec_red4(v0, v1, v2, v3, w0, w1) \
+     __asm__ __volatile__ ("movaps " Mstr(v0) ", " Mstr(w0) "\n"\
+                          "unpcklps " Mstr(v1) ", " Mstr(v0) "\n"\
+                          "movaps " Mstr(v2) ", " Mstr(w1) "\n"\
+                          "unpckhps " Mstr(v1) ", " Mstr(w0) "\n"\
+                          "unpcklps " Mstr(v3) ", " Mstr(v2) "\n"\
+                          "addps " Mstr(w0) ", " Mstr(v0) "\n"\
+                          "unpckhps " Mstr(v3) ", " Mstr(w1) "\n"\
+                          "movaps " Mstr(v0) ", " Mstr(w0) "\n"\
+                          "addps " Mstr(w1) ", " Mstr(v2) "\n"\
+                          "shufps $0x44, " Mstr(v2) ", " Mstr(v0) "\n"\
+                          "shufps $0xEE, " Mstr(v2) ", " Mstr(w0) "\n"\
+                          "addps " Mstr(w0) ", " Mstr(v0) "\n"\
+                           ::)
+#endif
+     
+#define vec_splat(mem,reg)      vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+        __asm__ __volatile__ ("movss %0, " #reg "\n"\
+			      "unpcklps " #reg ", " #reg "\n"\
+			      "movlhps " #reg ", " #reg "\n"\
+			      : /* nothing */ \
+                              : "m" ((mem)[0]))
+
+
+/* This instruction sequence appears courtesy of Camm Maguire. */
+#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1)
+#define vec_sum_full_wrap(reg0,reg1,reg2,reg3,regout,empty0,empty1) \
+      __asm__ __volatile__ ("movaps " #reg0 "," #empty0 "\n"\
+			    "unpcklps " #reg1 "," #reg0 "\n"\
+			    "movaps " #reg2 "," #empty1 "\n"\
+			    "unpckhps " #reg1 "," #empty0 "\n"\
+			    "unpcklps " #reg3 "," #reg2 "\n"\
+			    "addps  " #empty0 "," #reg0 "\n"\
+			    "unpckhps " #reg3 "," #empty1 "\n"\
+			    "movaps " #reg0 "," #regout "\n"\
+			    "addps  " #empty1 "," #reg2 "\n"\
+			    "shufps $0x44," #reg2 "," #reg0 "\n"\
+			    "shufps $0xee," #reg2 "," #regout "\n"\
+			    "addps  " #reg0 "," #regout "\n"\
+			    : /* nothing */  \
+			    : /* nothing */)			    
+
+
+
+typedef float vector[VECLEN];
+
+#endif  /* end ifdef SSE */
+
+
+#ifdef SSE2
+
+/* Peculiarities of SSE2: Alignment is good, but not mandatory. It is possible to
+ * load/store from misaligned addresses using movupd at a cost of some cycles. Loading
+ * using mul/add must always be aligned. Alignment is 16 bytes.
+ * No muladd.
+ */
+
+
+
+#define gen_vec_mr(op,mem,reg) \
+        __asm__ __volatile__ (#op " %0, " #reg \
+                              :  /* nothing */ \
+                              : "m" (((mem)[0])), "m" (((mem)[1])))
+
+
+#define gen_vec_rm(op,reg,mem) \
+        __asm__ __volatile__ (#op " " #reg ", %0" \
+                              : "=m" (((mem)[0])), "=m" (((mem)[1])) \
+                              :  /* nothing */ )                          
+
+
+
+
+#define VECLEN 2
+
+#define reg0 %%xmm0
+#define reg1 %%xmm1
+#define reg2 %%xmm2
+#define reg3 %%xmm3
+#define reg4 %%xmm4
+#define reg5 %%xmm5
+#define reg6 %%xmm6
+#define reg7 %%xmm7
+#ifdef ATL_GAS_x8664
+   #define reg8 %%xmm8
+   #define reg9 %%xmm9
+   #define reg10 %%xmm10
+   #define reg11 %%xmm11
+   #define reg12 %%xmm12
+   #define reg13 %%xmm13
+   #define reg14 %%xmm14
+   #define reg15 %%xmm15
+#endif
+
+
+#define vec_mov_mr(mem,reg)     gen_vec_mr(movupd,mem,reg)
+#define vec_mov_rm(reg,mem)     gen_vec_rm(movupd,reg,mem)
+#define vec_mov_mr_a(mem,reg)   gen_vec_mr(movapd,mem,reg)
+#define vec_mov_rm_a(reg,mem)   gen_vec_rm(movapd,reg,mem)
+#define vec_mov_rr(reg1,reg2)   gen_vec_rr(movapd,reg1,reg2)
+
+#define vec_add_mr_a(mem,reg)   gen_vec_mr(addpd,mem,reg)
+#define vec_mul_mr_a(mem,reg)   gen_vec_mr(mulpd,mem,reg)
+
+#define vec_add_rr(mem,reg)     gen_vec_rr(addpd,mem,reg)
+#define vec_mul_rr(mem,reg)     gen_vec_rr(mulpd,mem,reg)
+
+#define vec_mov_mr_1(mem,reg)   gen_vec_mr(movsd,mem,reg)
+#define vec_mov_rm_1(reg,mem)   gen_vec_rm(movsd,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movsd,reg1,reg2)
+
+#define vec_add_mr_1(mem,reg)   gen_vec_mr(addsd,mem,reg)
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(addsd,reg1,reg2)
+
+#define vec_mul_mr_1(mem,reg)   gen_vec_mr(mulsd,mem,reg)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(mulsd,reg1,reg2)
+
+#define vec_splat(mem,reg)      vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+        __asm__ __volatile__ ("movsd %0, " #reg "\n"\
+			      "unpcklpd " #reg ", " #reg \
+			      : /* nothing */ \
+                              : "m" ((mem)[0]))
+
+/* Hack! */
+/* To use this instruction be sure that register 7 is not in use!!! */
+#define vec_sum(reg) vec_sum_wrap(reg)
+#define vec_sum_wrap(reg) \
+        __asm__ __volatile__ ("movhlps " #reg ", %%xmm7\n"\
+    			      "addpd %%xmm7, " #reg "\n"\
+			      : /* nothing */\
+			      : /* nothing */) 
+/* 
+ * Added by RCW to improve performance and avoid xmm7 hack (replace vec_sum)
+ */
+#define vec_zero(vd) __asm__ __volatile__("xorps " Mstr(vd) ", " Mstr(vd) ::)
+#ifdef ATL_SSE3
+   #define vec_red(vr, vwrk) \
+      __asm__ __volatile__("haddpd " Mstr(vr) ", " Mstr(vr) "\n" ::)
+   #define vec_red2(v0, v1, vw) \
+      __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n" ::)
+   #define vec_red4(v0, v1, v2, v3, w0, w1) \
+      __asm__ __volatile__("haddpd " Mstr(v1) ", " Mstr(v0) "\n"\
+                           "haddpd " Mstr(v3) ", " Mstr(v2) "\n"\
+                           ::)
+#else
+   #define vec_red(vr, vwrk) \
+      __asm__ __volatile__ ("pshufd $0xEE, " Mstr(vr) ", " Mstr(vwrk) "\n"\
+                            "addsd " Mstr(vwrk) ", " Mstr(vr) "\n" ::)
+/*
+ * movapd v0, vw        # vw = {v0b, v0a}
+ * unpcklpd v1,v0      # v0 = {v1a, v0a}
+ * unpckhpd v1, vw     # vw = {v1b, v0b}
+ * addpd     vw, v0     # v0 = {v1ab,v0ab}
+ */
+   #define vec_red2(v0, v1, vw) \
+      __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(vw) "\n"\
+                           "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\
+                           "unpckhpd " Mstr(v1) ", " Mstr(vw) "\n"\
+                           "addpd " Mstr(vw) ", " Mstr(v0) "\n"\
+                           ::)
+/*
+ * movapd   v0, w0      # w0 = {v0b, v0a}
+ * movapd   v2, w1      # w1 = {v2b, v2a}
+ * unpcklpd v1, v0      # v0 = {v1a, v0a}
+ * unpcklpd v3, v2      # v2 = {v3a, v2a}
+ * unpckhpd v1, w0      # w0 = {v1b, v0b}
+ * unpckhpd v3, w1      # w1 = {v3b, v2b}
+ * addpd    w0, v0      # v0 = {v1ab, v0ab}
+ * addpd    w1, v2      # v2 = {v3ab, v2ab}
+ */
+   #define vec_red4(v0, v1, v2, v3, w0, w1) \
+      __asm__ __volatile__("movapd " Mstr(v0) ", " Mstr(w0) "\n"\
+                           "movapd " Mstr(v2) ", " Mstr(w1) "\n"\
+                           "unpcklpd " Mstr(v1) ", " Mstr(v0) "\n"\
+                           "unpcklpd " Mstr(v3) ", " Mstr(v2) "\n"\
+                           "unpckhpd " Mstr(v1) ", " Mstr(w0) "\n"\
+                           "unpckhpd " Mstr(v3) ", " Mstr(w1) "\n"\
+                           "addpd " Mstr(w0) ", " Mstr(v0) "\n"\
+                           "addpd " Mstr(w1) ", " Mstr(v2) "\n"\
+                           ::)
+#endif
+
+#define vec_sum_full(reg1,reg2,empty1) vec_sum_full_wrap(reg1,reg2,empty1)
+#define vec_sum_full_wrap(reg1,reg2,empty1) \
+        __asm__ __volatile__ ("movhlps " #reg2 ", " #empty1 "\n"\
+                              "movlhps " #reg2 ", " #empty1 "\n"\
+    			      "addpd " #empty1 ", " #reg1 "\n"\
+			      : /* nothing */\
+			      : /* nothing */) 
+
+     
+typedef double vector[VECLEN];
+
+#endif  /* end ifdef SSE2 */
+
+
+#ifdef THREEDNOW
+
+/* Peculiarities of 3DNOW. Alignment is not an issue,
+ * all alignments are legal, however alignment gives a speed increase.
+ * The vec_acc instruction can be used to sum two registers at once more efficiently
+ * than a series of vec_sum and vec_store_one
+ * No muladd.
+ */
+
+
+#define gen_vec_mr(op,mem,reg) \
+        __asm__ __volatile__ (#op " %0, " #reg \
+                              :  /* nothing */ \
+                              : "m" (((mem)[0])), "m" (((mem)[1])))
+
+#define gen_vec_rm(op,reg,mem) \
+        __asm__ __volatile__ (#op " " #reg ", %0" \
+                              : "=m" (((mem)[0])), "=m" (((mem)[1])) \
+			      :  /* nothing */ )                            
+
+
+
+
+#define VECLEN 2
+
+#define reg0 %%mm0
+#define reg1 %%mm1
+#define reg2 %%mm2
+#define reg3 %%mm3
+#define reg4 %%mm4
+#define reg5 %%mm5
+#define reg6 %%mm6
+#define reg7 %%mm7
+
+#define vec_add_mr(mem,reg)     gen_vec_mr(pfadd,mem,reg)
+#define vec_mul_mr(mem,reg)     gen_vec_mr(pfmul,mem,reg)
+#define vec_mov_mr(mem,reg)     gen_vec_mr(movq,mem,reg)
+#define vec_mov_rm(reg,mem)     gen_vec_rm(movq,reg,mem)
+#define vec_add_rr(reg1,reg2)   gen_vec_rr(pfadd,reg1,reg2)
+#define vec_mul_rr(reg1,reg2)   gen_vec_rr(pfmul,reg1,reg2)
+#define vec_acc_rr(reg1,reg2)   gen_vec_rr(pfacc,reg1,reg2)
+#define vec_mov_rr(reg1,reg2)   gen_vec_rr(movq,reg1,reg2)
+
+#define vec_sum(reg)            gen_vec_rr(pfacc,reg,reg)
+#define vec_sum_full(reg1,reg2)  gen_vec_rr(pfacc,reg1,reg2)
+
+#define vec_mov_mr_1(mem,reg)   gen_vec_mr(movd,mem,reg)
+#define vec_mov_rm_1(reg,mem)   gen_vec_rm(movd,reg,mem)
+#define vec_mov_rr_1(reg1,reg2) gen_vec_rr(movd,reg1,reg2)
+
+#define vec_add_rr_1(reg1,reg2) gen_vec_rr(pfadd,reg1,reg2)
+#define vec_mul_rr_1(reg1,reg2) gen_vec_rr(pfmul,reg1,reg2)
+
+
+#define vec_splat(mem,reg)      vec_splat_wrap(mem,reg)
+#define vec_splat_wrap(mem,reg) \
+        __asm__ __volatile__ ("movd %0, " #reg "\n"\
+			      "punpckldq " #reg ", " #reg \
+			      : /* nothing */ \
+                              : "m" ((mem)[0]))
+
+
+#define vec_load_apart(mem1,mem2,reg) vec_load_apart_wrap(mem1,mem2,reg)
+#define vec_load_apart_wrap(mem1,mem2,reg) \
+        __asm__ __volatile__ ("movd %0, " #reg "\n"\
+			      "punpckldq %1, " #reg \
+			      : /* nothing */ \
+                              : "m" ((mem1)[0]), "m" (((mem2)[0])))
+
+
+#define vec_zero(reg)           gen_vec_rr(pxor,reg,reg)     
+
+#define vec_enter()             __asm__ __volatile__ ("femms")
+#define vec_exit()              __asm__ __volatile__ ("femms")
+
+#define align()                 __asm__ __volatile__ (".align 16")
+
+
+typedef float vector[VECLEN];
+
+#endif
+
+
+
+
+
+#ifdef ALTIVEC
+
+#define VECLEN 4
+
+#define reg0 %%vr0
+#define reg1 %%vr1
+#define reg2 %%vr2
+#define reg3 %%vr3
+#define reg4 %%vr4
+#define reg5 %%vr5
+#define reg6 %%vr6
+#define reg7 %%vr7
+#define reg8 %%vr8
+#define reg9 %%vr9
+#define reg10 %%vr10
+#define reg11 %%vr11
+#define reg12 %%vr12
+#define reg13 %%vr13
+#define reg14 %%vr14
+#define reg15 %%vr15
+#define reg16 %%vr16
+#define reg17 %%vr17
+#define reg18 %%vr18
+#define reg19 %%vr19
+#define reg20 %%vr20
+#define reg21 %%vr21
+#define reg22 %%vr22
+#define reg23 %%vr23
+#define reg24 %%vr24
+#define reg25 %%vr25
+#define reg26 %%vr26
+#define reg27 %%vr27
+#define reg28 %%vr28
+#define reg29 %%vr29
+#define reg30 %%vr30
+#define reg31 %%vr31
+
+#define gen_vec_mr(op,mem,reg) \
+        __asm__ __volatile__ (#op " %0, " #reg \
+                              :  /* nothing */ \
+                              : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3])))
+
+
+#define gen_vec_rm(op,reg,mem) \
+        __asm__ __volatile__ (#op " " #reg ", %0" \
+                              : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \
+                              :  /* nothing */ )                          
+
+
+#define gen_alti3(op,reg1,reg2,regout) \
+        __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout \
+                              :  /* nothing */ \
+                              : /* nothing */)
+
+#define gen_alti_muladd(op,reg1,reg2,regout) \
+        __asm__ __volatile__ (#op " " #reg1 ", " #reg2 ", " #regout ", " #regout \
+                              :  /* nothing */ \
+                              : /* nothing */)
+
+/* ALTIVEC asm wrappers built on gen_vec_mr / gen_vec_rm / gen_alti3 (defined above). */
+/* NOTE(review): svx does not look like an AltiVec store mnemonic (stvx?), and vmaddfp presumably needs four operands while gen_alti3 emits three - confirm before use. */
+#define vec_mov_mr_a(mem,reg) gen_vec_mr(lvx,mem,reg)
+#define vec_mov_rm_a(reg,mem) gen_vec_rm(svx,reg,mem)
+#define vec_muladd(reg1,reg2,regout) gen_alti3(vmaddfp,reg1,reg2,regout)
+
+#define vec_zero(reg) gen_alti3(vxor,reg,reg,reg)
+
+
+typedef float vector[VECLEN];
+
+#endif
+
+
+#ifdef ALTIVEC_C
+
+/* These macros have been written by, or greatly inspired by,
+ *  Nicholas A. Coult . Thanks.
+ */
+
+/* assumes that last four registers are not in use! */
+#define transpose(x0,x1,x2,x3) \
+reg28 = vec_mergeh(x0,x2); \
+reg29 = vec_mergeh(x1,x3); \
+reg30 = vec_mergel(x0,x2); \
+reg31 = vec_mergel(x1,x3); \
+x0 = vec_mergeh(reg28,reg29); \
+x1 = vec_mergel(reg28,reg29); \
+x2 = vec_mergeh(reg30,reg31); \
+x3 = vec_mergel(reg30,reg31)
+
+#define vec_mov_rm(v, where) \
+low = vec_ld(0, (where)); \
+high = vec_ld(16, (where)); \
+p_vector = vec_lvsr(0, (int *)(where)); \
+mask  = vec_perm((vector unsigned char)(0), (vector unsigned char)(-1), p_vector); \
+v = vec_perm(v, v, p_vector); \
+low = vec_sel(low,  v, mask); \
+high = vec_sel(v, high, mask); \
+vec_st(low,  0, (where)); \
+vec_st(high, 16, (where))
+
+#define vec_mov_mr_a(mem,reg)  reg = vec_ld(0, mem) 
+
+#define vec_mov_mr(u,v) \
+p_vector = (vector unsigned char)vec_lvsl(0, (int*)(v)); \
+low = (vector unsigned char)vec_ld(0, (v)); \
+high = (vector unsigned char)vec_ld(16, (v)); \
+u=(vector float)vec_perm(low, high, p_vector)
+
+#define vec_muladd(reg1,reg2,regout) regout = vec_madd(reg1,reg2,regout)
+#define vec_add_rr(reg1,reg2)        reg2 = vec_add(reg1,reg2)
+
+#define vec_zero(reg)                reg = vec_xor(reg,reg)
+/* regout = reg0+reg1+reg2+reg3 after transposing reg0..reg3 in place; transpose() takes only those four, empty0/empty1 are scratch. */
+#define vec_sum_full(reg0,reg1,reg2,reg3,regout,empty0,empty1) \
+transpose(reg0,reg1,reg2,reg3); \
+empty0 = vec_add(reg0,reg1); \
+empty1 = vec_add(reg2,reg3); \
+regout = vec_add(empty0,empty1)
+
+
+#endif /* ALTIVEC_C */
+
+
+
+
+
+
+
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h
new file mode 100644
index 0000000..af9c6b1
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_dpa.h
@@ -0,0 +1,1626 @@
+#include <stdlib.h>
+#include <sys/time.h>
+#include <stdio.h>
+
+#include "camm_util.h"
+
+
+#if defined(ALIGN) 
+#if( defined(SCPLX) || defined(DCPLX))
+#error Cannot align complex routines
+#endif
+#if defined(SREAL) && ( NDPM != 1 ) && ( STRIDE % 4 != 0)
+#error Can only align SREAL with NDPM 1 or STRIDE % 4 = 0
+#endif
+#if defined(DREAL) && ( NDPM != 1 ) && ( STRIDE % 2 != 0)
+#error Can only align DREAL with NDPM 1 or STRIDE % 2 = 0
+#endif
+#endif
+
+/******************************************************************************
+ *  Single Precision Complex Macros
+ ******************************************************************************/  
+
+#ifdef SCPLX
+
+#ifdef NO_TRANSPOSE
+
+#if NDPM > 3 
+#error Max NDPM is 3 for SCPLX NO_TRANSPOSE
+#endif
+
+#undef plax
+#define plax
+
+#undef R1
+#define R1 2
+#undef R2
+#define R2 4
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#undef TREG
+#define TREG 1
+#undef SREG
+#define SREG 0
+#undef CREG
+#define CREG 0
+
+#ifdef GER
+#undef AREG
+#define AREG 0
+#undef targ
+#define targ(a_)        AREG
+#undef wb
+#define wb(a_,b_)       pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_)      pud(AREG,a_,b_)
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#else
+#undef AREG
+#define AREG TREG
+#undef targ
+#define targ(a_)        CREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef w
+#define w(a_)           pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_)        pud(CREG,a_ ## 0,si)
+#endif
+
+#undef src
+#define src(a_)         a_
+#undef mpx
+#define mpx(a_)         pls(0,si,a_) ps(0,a_,a_) pls(4,si,P(a_,1)) \
+                        ps(0,P(a_,1),P(a_,1)) sign(a_)
+#undef madd
+#define madd(a_,b_,c_)  pas(a_,b_,c_)
+#undef ulfa
+#define ulfa(a_)
+
+#else
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#undef TREG
+#define TREG 3
+#undef SREG
+#define SREG 2
+#undef CREG
+#define CREG 0
+#undef targ
+#define targ(a_)        a_
+#undef src
+#define src(a_)         0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef mpx
+#define mpx(a_)        px(a_)
+#ifdef BETA0
+#undef ulfa
+#define ulfa(a_)       phl(a_,0) pa(0,a_) pud(a_,0,si)
+#else
+#undef ulfa
+#define ulfa(a_)       pld(0,si,TREG) phl(a_,0) pa(0,a_) pa(TREG,a_) pud(a_,0,si)
+#endif
+#undef AREG
+#define AREG TREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+
+#undef plax
+#define plax       pc(CREG,1) ps(160,CREG,CREG) ps(245,1,1) sign(CREG)
+
+
+
+#endif
+
+#if defined(Conj_) && ! defined(GER) 
+#undef sign
+#define sign(a_)       pm(SREG,a_)
+#else		   
+#undef sign
+#define sign(a_)       pm(SREG,P(a_,1))
+#endif
+
+
+
+#undef plb
+#define plb(a_,b_)           pl(a_,b_,AREG)
+#undef plbd
+#define plbd(a_,b_)          px(AREG) pld(a_,b_,AREG)
+
+#undef dpr
+#define dpr(a_)              pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_)       pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dpi
+#define dpi(a_)              pm(P(src(a_),1),TREG) ps(177,TREG,TREG) pa(TREG,targ(a_))
+
+#ifndef GER
+
+
+#undef plaa
+#define plaa(a_)                pl(a_ ## 0,si,CREG) plax
+#undef wa
+#define wa(a_)                  w(a_)
+#undef dp
+#define dp(a_,b_,c_)            plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)     plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_)
+#undef ddp
+#define ddp(a_,b_,c_)           dp(a_,b_,c_)       
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_)    dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_)             px(CREG) pld(a_ ## 0,si,CREG) plax
+#undef wa1_2
+#define wa1_2(a_)               w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_)         plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_)  plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)       
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+
+#else
+
+#undef lqc     /* lqc(a_) expands to pl(a_0,si,TREG); pl is defined in camm_util.h */
+#define lqc(a_)              pl(a_ ## 0,si,TREG)
+#undef lqc1_2  /* must name the macro defined on the next line so re-inclusion starts clean */
+#define lqc1_2(a_)           px(TREG) pld(a_ ## 0,si,TREG)
+
+
+#undef plaa
+#define plaa(a_) 
+#undef wa
+#define wa(a_)
+#undef dp
+#define dp(a_,b_,c_)         lqc(a_) plb(a_ ## 0,b_) dpr(c_) \
+                             lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)  lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+                             lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_)        dp(a_,b_,c_)       
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_)
+#undef wa1_2
+#define wa1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_)         lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \
+                                lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_)  lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+                                lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)       
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#endif
+
+#endif
+
+/******************************************************************************
+ *  Single Precision Real Macros
+ ******************************************************************************/  
+
+#ifdef SREAL
+
+#ifdef NO_TRANSPOSE
+
+#undef mpx
+#define mpx(a_)        pls(0,si,a_) ps(0,a_,a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef TREG
+#define TREG 1
+#undef targ
+#define targ(a_)        0
+#undef src
+#define src(a_)         a_
+#undef ulfa
+#define ulfa(a_)
+
+#ifdef GER
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+#undef CREG
+#define CREG 2
+#undef AREG
+#define AREG 0
+#undef cp
+#define cp pc(CREG,TREG)
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_) pud(AREG,a_,b_)
+#undef wbs
+#define wbs(a_,b_) pus(AREG,a_,b_)
+#else
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+#undef w
+#define w(a_)           pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_)        pud(CREG,a_ ## 0,si)
+#undef w1_4
+#define w1_4(a_)        pus(CREG,a_ ## 0,si)
+#endif
+
+#else
+
+#undef mpx
+#define mpx(a_)        px(a_)
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#endif
+#undef TREG
+#define TREG 3
+#undef targ
+#define targ(a_)        a_
+#undef src
+#define src(a_)         0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+/* ulfa(a_), transposed SREAL case: chain of camm_util.h ops on register a_ ending in pus(a_,0,si); madd expands to nothing when BETA0 is defined. */
+#undef ulfa
+#define ulfa(a_)       phl(a_,0) pa(0,a_) pc(a_,0) ps(1,0,0) pa(0,a_) \
+                       madd(0,si,a_) pus(a_,0,si) 
+
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+#endif
+
+#if defined(ALIGN)
+#undef plb
+#define plb(a_,b_)           pla(a_,b_,AREG)
+#else
+#undef plb
+#define plb(a_,b_)           pl(a_,b_,AREG)
+#endif
+#undef plbd
+#define plbd(a_,b_)          px(AREG) pld(a_,b_,AREG)
+#undef plbs
+#define plbs(a_,b_)          pls(a_,b_,AREG)
+#undef dpr
+#define dpr(a_)              pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_)       pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprs
+#define dprs(a_)             pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+#undef dprps
+#define dprps(a_,b_,c_)      pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+
+#undef plaa
+#define plaa(a_)             pl(a_ ## 0,si,CREG) 
+#undef wa
+#define wa(a_)               w(a_)
+#undef dp
+#define dp(a_,b_,c_)         cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)  cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_)        dp(a_,b_,c_)       
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_)             px(CREG) pld(a_ ## 0,si,CREG) 
+#undef wa1_2
+#define wa1_2(a_)               w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_)         cp plbd(a_ ## 0,b_) dpr(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_)  cp plbd(a_ ## 0,b_) dprp(c_,d_,e_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)       
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#undef plaa1_4
+#define plaa1_4(a_)             pls(a_ ## 0,si,CREG) 
+#undef wa1_4
+#define wa1_4(a_)               w1_4(a_)
+#undef dp1_4
+#define dp1_4(a_,b_,c_)         cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_)
+#undef dpp1_4
+#define dpp1_4(a_,b_,c_,d_,e_)  cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_)
+#undef ddp1_4
+#define ddp1_4(a_,b_,c_)        dp1_4(a_,b_,c_)       
+#undef ddpp1_4
+#define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_)
+
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#endif
+
+/******************************************************************************
+ *  Double Precision Real Macros
+ ******************************************************************************/  
+
+#ifdef DREAL
+
+#ifdef ATL_SSE2
+
+#ifdef NO_TRANSPOSE
+
+#undef mpx
+#define mpx(a_)        pls(0,si,a_) ps(0,a_,a_)
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#undef TREG
+#define TREG 1
+#undef targ
+#define targ(a_)        0
+#undef src
+#define src(a_)         a_
+#undef ulfa
+#define ulfa(a_)
+
+#ifdef GER
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+#undef CREG
+#define CREG 2
+#undef AREG
+#define AREG 0
+#undef cp
+#define cp pc(CREG,TREG)
+#undef wb
+#define wb(a_,b_) pu(AREG,a_,b_)
+#undef wbd
+#define wbd(a_,b_) pus(AREG,a_,b_)
+#undef wbs
+/* #define wbs(a_,b_) pus(AREG,a_,b_) */
+#else
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+/* #define wbs(a_,b_) */
+#undef w
+#define w(a_)           pu(CREG,a_ ## 0,si)
+#undef w1_2
+#define w1_2(a_)        pus(CREG,a_ ## 0,si)
+#undef w1_4
+/* #define w1_4(a_)        pus(CREG,a_ ## 0,si) */
+#endif
+
+#else
+
+#undef mpx
+#define mpx(a_)        px(a_)
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) pas(a_,b_,c_)
+#endif
+#undef TREG
+#define TREG 3
+#undef targ
+#define targ(a_)        a_
+#undef src
+#define src(a_)         0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef w1_4
+#define w1_4(a_)
+/* ulfa(a_), transposed DREAL/SSE2 case: the phl/pa pair is commented out here; madd expands to nothing when BETA0 is defined. */
+#undef ulfa
+#define ulfa(a_)       /* phl(a_,0) pa(0,a_) */ pc(a_,0)  ps(1,0,0) pa(0,a_) \
+                       madd(0,si,a_) pus(a_,0,si) 
+
+#undef CREG
+#define CREG 0
+#undef AREG
+#define AREG TREG
+#undef cp
+#define cp
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+#endif
+
+#if defined(ALIGN)
+#undef plb
+#define plb(a_,b_)           pla(a_,b_,AREG)
+#else
+#undef plb
+#define plb(a_,b_)           pl(a_,b_,AREG)
+#endif
+#undef plbd
+#define plbd(a_,b_)          /* px(AREG)  */pls(a_,b_,AREG)
+#undef plbs
+/* #define plbs(a_,b_)          pls(a_,b_,AREG) */
+#undef dpr
+#define dpr(a_)              pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_)       pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprs
+#define dprs(a_)             pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+#undef dprps
+#define dprps(a_,b_,c_)      pf(b_,c_) pmsr(src(a_),TREG) pasr(TREG,targ(a_))
+
+#undef plaa
+#define plaa(a_)             pl(a_ ## 0,si,CREG) 
+#undef wa
+#define wa(a_)               w(a_)
+#undef dp
+#define dp(a_,b_,c_)         cp plb(a_ ## 0,b_) dpr(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)  cp plb(a_ ## 0,b_) dprp(c_,d_,e_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_)        dp(a_,b_,c_)       
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+#define plaa1_2(a_)             /* px(CREG)  */pls(a_ ## 0,si,CREG) 
+#undef wa1_2
+#define wa1_2(a_)               w1_2(a_)
+#undef dp1_2
+#define dp1_2(a_,b_,c_)         cp plbd(a_ ## 0,b_) dprs(c_) wbd(a_ ## 0,b_)
+#undef dpp1_2
+#define dpp1_2(a_,b_,c_,d_,e_)  cp plbd(a_ ## 0,b_) dprps(c_,d_,e_) wbd(a_ ## 0,b_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)       
+#undef ddpp1_2
+#define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_)
+
+#undef plaa1_4
+/* #define plaa1_4(a_)             pls(a_ ## 0,si,CREG)  */
+#undef wa1_4
+/* #define wa1_4(a_)               w1_4(a_) */
+#undef dp1_4
+/* #define dp1_4(a_,b_,c_)         cp plbs(a_ ## 0,b_) dprs(c_) wbs(a_ ## 0,b_) */
+#undef dpp1_4
+/* #define dpp1_4(a_,b_,c_,d_,e_)  cp plbs(a_ ## 0,b_) dprps(c_,d_,e_) wbs(a_ ## 0,b_) */
+#undef ddp1_4
+/* #define ddp1_4(a_,b_,c_)        dp1_4(a_,b_,c_)        */
+#undef ddpp1_4
+/* #define ddpp1_4(a_,b_,c_,d_,e_) dpp1_4(a_,b_,c_,d_,e_) */
+
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#else
+
+#ifdef NO_TRANSPOSE
+
+#undef t0
+#define t0(a_)         1
+#undef s0
+#define s0(a_)         a_
+#undef t8
+#define t8(a_)         2
+#undef s8
+#define s8(a_)         a_
+#undef w
+#define w(a_)          fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef w1_2
+#define w1_2(a_)       fp(a_ ## 0,si)
+#undef mpx
+#define mpx(a_)        fl(0,si) fc(M(a_,2))
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#undef ulfa
+#define ulfa(a_)       fc(0)
+
+#else
+
+#undef t0
+#define t0(a_)         a_
+#undef s0
+#define s0(a_)         1
+#undef t8
+#define t8(a_)         a_
+#undef s8
+#define s8(a_)         2
+#undef w
+#define w(a_)           
+#undef w1_2
+#define w1_2(a_)           
+#undef mpx
+#define mpx(a_)        fz
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#endif
+#undef ulfa
+#define ulfa(a_)       madd(0,si,a_) fp(0,si)
+
+#endif
+
+
+#ifndef GER
+
+#undef plaa1_2
+#define plaa1_2(a_)              fl(a_ ## 0,si) 
+#undef wa1_2
+#define wa1_2(a_)                w1_2(a_)
+#ifdef NO_TRANSPOSE
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_)         fl(a_ ## 0,b_) fm(M(s0(c_),1),0) fap(0,t0(c_)) 
+#undef dp1_2
+#define dp1_2(a_,b_,c_)          ddp1_2(a_,b_,c_)
+#else
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_)         fl(a_ ## 0,b_) fm(s0(c_),0) fap(0,M(t0(c_),1)) 
+#undef dp1_2
+#define dp1_2(a_,b_,c_)          fl(a_ ## 0,b_) fmp(0,s0(c_)) fap(0,M(t0(c_),2))
+#endif
+
+#else
+
+#undef plaa1_2
+#define plaa1_2(a_)              fl(a_ ## 0,si) 
+#undef wa1_2
+#define wa1_2(a_)
+#undef ddp1_2
+#define ddp1_2(a_,b_,c_)         fd(M(s0(c_),2)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) 
+#undef dp1_2
+#define dp1_2(a_,b_,c_)          fm(M(s0(c_),2),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) 
+
+#endif
+
+
+
+#undef plaa
+#define plaa(a_)                 fl(a_ ## 0,si) fl(a_ ## 8,si) fx1
+
+#ifndef GER
+
+
+#undef wa
+#define wa(a_)                   w(a_)
+
+
+#undef ddp
+#define ddp(a_,b_,c_)            fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \
+                                 fm(P(s8(c_),1),0) fx1 fap(0,P(t0(c_),1)) \
+                                 fap(0,t8(c_))
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_)     fl(a_ ## 0,b_) fm(s0(c_),0) fl(a_ ## 8,b_) \
+                                 fm(P(s8(c_),1),0)  pf(d_,e_) fx1 fap(0,P(t0(c_),1)) \
+                                 fap(0,t8(c_))
+
+/* #define ddp(a_,b_,c_)            fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */
+/*                                  fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) */
+/* #define ddpp(a_,b_,c_,d_,e_)     fd(M(s0(c_),1)) fma(a_ ## 0,b_) fap(0,t0(c_)) \ */
+/*                                   \ */
+/*                                  fd(M(s8(c_),1)) fma(a_ ## 8,b_) fap(0,t8(c_)) pf(d_,e_) */
+
+#ifdef NO_TRANSPOSE
+
+#undef dp
+#define dp(a_,b_,c_)             ddp(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)      ddpp(a_,b_,c_,d_,e_)
+
+#else
+
+#undef dp
+#define dp(a_,b_,c_)             fl(a_ ## 0,b_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \
+                                 fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2))
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)      fl(a_ ## 0,b_)  pf(d_ ,e_) fmp(0,s0(c_)) fl(a_ ## 8,b_) \
+                                 fmp(0,s8(c_)) fap(0,M(t0(c_),1)) fap(0,M(t8(c_),2))
+
+/* #define dp(a_,b_,c_)             fma(a_ ## 0,b_) fap(0,M(t0(c_),1))  \ */
+/*                                  fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) */
+/* #define dpp(a_,b_,c_,d_,e_)      fma(a_ ## 0,b_) fap(0,M(t0(c_),1))  \ */
+/*                                   \ */
+/* 			         fma(a_ ## 8,b_) fap(0,M(t8(c_),2)) pf(d_,e_) */
+
+#endif
+
+
+#else
+
+#undef wa
+#define wa(a_)
+#undef ddp
+#define ddp(a_,b_,c_)            fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+                                 fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_)
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_)     fd(M(s0(c_),1)) fm(t0(c_),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+                                 fd(M(s8(c_),1)) fm(t8(c_),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_)
+
+#undef dp
+#define dp(a_,b_,c_)             fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+                                 fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)      fm(M(s0(c_),1),0) faa(a_ ## 0,b_) fp(a_ ## 0,b_) \
+                                 fm(M(s8(c_),2),0) faa(a_ ## 8,b_) fp(a_ ## 8,b_) pf(d_,e_)
+
+#endif
+
+
+#undef R1
+#define R1 3
+#undef R2
+#define R2 4
+#undef R3
+#define R3 5
+#undef R4
+#define R4 6
+
+#endif
+
+#endif
+
+/******************************************************************************
+ *  Double Precision Complex Macros
+ ******************************************************************************/  
+
+#ifdef DCPLX
+
+#ifdef ATL_SSE2
+#ifdef NO_TRANSPOSE
+
+#if NDPM > 3 
+#error Max NDPM is 3 for DCPLX NO_TRANSPOSE
+#endif
+
+#undef plax
+#define plax
+
+#undef R1
+#define R1 2
+#undef R2
+#define R2 4
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#undef TREG
+#define TREG 1
+#undef SREG
+#define SREG 0
+#undef CREG
+#define CREG 0
+
+#ifdef GER
+#undef AREG
+#define AREG 0
+#undef targ
+#define targ(a_)        AREG
+#undef wb
+#define wb(a_,b_)       pu(AREG,a_,b_)
+#undef wbd
+/* #define wbd(a_,b_)      pud(AREG,a_,b_) */
+#undef w
+#define w(a_)
+#undef w1_2
+/* #define w1_2(a_) */
+#else
+#undef AREG
+#define AREG TREG
+#undef targ
+#define targ(a_)        CREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+/* #define wbd(a_,b_) */
+#undef w
+#define w(a_)           pu(CREG,a_ ## 0,si)
+#undef w1_2
+/* #define w1_2(a_)        pud(CREG,a_ ## 0,si) */
+#endif
+
+#undef src
+#define src(a_)         a_
+#undef mpx
+#define mpx(a_)         pls(0,si,a_) ps(0,a_,a_) pls(8,si,P(a_,1)) \
+                        ps(0,P(a_,1),P(a_,1)) sign(a_)
+#undef madd
+#define madd(a_,b_,c_)  pas(a_,b_,c_)
+#undef ulfa
+#define ulfa(a_)
+
+#else
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 5
+#undef R3
+#define R3 6
+#undef R4
+#define R4 7
+
+#undef TREG
+#define TREG 3
+#undef SREG
+#define SREG 2
+#undef CREG
+#define CREG 0
+#undef targ
+#define targ(a_)        a_
+#undef src
+#define src(a_)         0
+#undef w
+#define w(a_)
+#undef w1_2
+#define w1_2(a_)
+#undef mpx
+#define mpx(a_)        px(a_)
+#ifdef BETA0
+#undef ulfa
+#define ulfa(a_)       /* phl(a_,0) pa(0,a_)  */pu(a_,0,si)
+#else
+#undef ulfa
+#define ulfa(a_)       pl(0,si,TREG) /* phl(a_,0) pa(0,a_) */ pa(TREG,a_) pu(a_,0,si)
+#endif
+#undef AREG
+#define AREG TREG
+#undef wb
+#define wb(a_,b_)
+#undef wbd
+#define wbd(a_,b_)
+#undef wbs
+#define wbs(a_,b_)
+
+
+#undef plax
+#define plax       pc(CREG,1) ps(0,CREG,CREG) ps(3,1,1) sign(CREG)
+
+
+
+#endif
+
+#if defined(Conj_) && ! defined(GER) 
+#undef sign
+#define sign(a_)       pm(SREG,a_)
+#else		   
+#undef sign
+#define sign(a_)       pm(SREG,P(a_,1))
+#endif
+
+
+
+#undef plb
+#define plb(a_,b_)           pl(a_,b_,AREG)
+#undef plbd
+/* #define plbd(a_,b_)          px(AREG) pld(a_,b_,AREG) */
+
+#undef dpr
+#define dpr(a_)              pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dprp
+#define dprp(a_,b_,c_)       pf(b_,c_) pm(src(a_),TREG) pa(TREG,targ(a_))
+#undef dpi
+#define dpi(a_)              pm(P(src(a_),1),TREG) ps(1,TREG,TREG) pa(TREG,targ(a_))
+
+#ifndef GER
+
+#undef plaa
+#define plaa(a_)                pl(a_ ## 0,si,CREG) plax
+#undef wa
+#define wa(a_)                  w(a_)
+#undef dp
+#define dp(a_,b_,c_)            plb(a_ ## 0,b_) dpr(c_) plb(a_ ## 0,b_) dpi(c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)     plb(a_ ## 0,b_) dprp(c_,d_,e_) plb(a_ ## 0,b_) dpi(c_)
+#undef ddp
+#define ddp(a_,b_,c_)           dp(a_,b_,c_)       
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_)    dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+/* #define plaa1_2(a_)             px(CREG) pld(a_ ## 0,si,CREG) plax */
+#undef wa1_2
+/* #define wa1_2(a_)               w1_2(a_) */
+#undef dp1_2
+/* #define dp1_2(a_,b_,c_)         plbd(a_ ## 0,b_) dpr(c_) plbd(a_ ## 0,b_) dpi(c_) */
+#undef dpp1_2
+/* #define dpp1_2(a_,b_,c_,d_,e_)  plbd(a_ ## 0,b_) dprp(c_,d_,e_) plbd(a_ ## 0,b_) dpi(c_) */
+#undef ddp1_2
+/* #define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)        */
+#undef ddpp1_2
+/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */
+
+
+#else
+
+#undef lqc     /* lqc(a_) expands to pl(a_0,si,TREG); pl is defined in camm_util.h */
+#define lqc(a_)              pl(a_ ## 0,si,TREG)
+#undef lqc1_2  /* drop any lqc1_2 left over from an earlier inclusion; no replacement is defined in this branch */
+/* #define lqc1_2(a_)           px(TREG) pld(a_ ## 0,si,TREG) */
+
+
+#undef plaa
+#define plaa(a_) 
+#undef wa
+#define wa(a_)
+#undef dp
+#define dp(a_,b_,c_)         lqc(a_) plb(a_ ## 0,b_) dpr(c_) \
+                             lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)  lqc(a_) plb(a_ ## 0,b_) dpr(c_) pf(d_,e_) \
+                             lqc(a_) dpi(c_) wb(a_ ## 0,b_)
+#undef ddp
+#define ddp(a_,b_,c_)        dp(a_,b_,c_)       
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) dpp(a_,b_,c_,d_,e_)
+
+#undef plaa1_2
+/* #define plaa1_2(a_) */
+#undef wa1_2
+/* #define wa1_2(a_) */
+#undef dp1_2
+/* #define dp1_2(a_,b_,c_)         lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) \ */
+/*                                 lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */
+#undef dpp1_2
+/* #define dpp1_2(a_,b_,c_,d_,e_)  lqc1_2(a_) plbd(a_ ## 0,b_) dpr(c_) pf(d_,e_) \ */
+/*                                 lqc1_2(a_) dpi(c_) wbd(a_ ## 0,b_) */
+#undef ddp1_2
+/* #define ddp1_2(a_,b_,c_)        dp1_2(a_,b_,c_)        */
+#undef ddpp1_2
+/* #define ddpp1_2(a_,b_,c_,d_,e_) dpp1_2(a_,b_,c_,d_,e_) */
+
+#endif
+
+#else 
+
+#if NDPM > 2
+#error Max NDPM is 2 for DCPLX
+#endif
+
+#undef TREG
+#define TREG           2
+
+#ifdef NO_TRANSPOSE
+
+#undef w
+#define w(a_)          fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef plax
+#define plax           fx1
+#undef srr
+#define srr(a_)        a_
+#undef sri
+#define sri(a_)        a_
+#undef sir
+#define sir(a_)        a_
+#undef sii
+#define sii(a_)        a_
+#undef trr
+#define trr(a_)        P(TREG,1)
+#undef tri
+#define tri(a_)        M(TREG,1)
+#undef tir
+#define tir(a_)        TREG
+#undef tii
+#define tii(a_)        TREG
+#undef mpx
+#define mpx(a_)        fl(0,si) fl(8,si) fc(M(a_,2)) fc(M(a_,2)) 
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#undef ulfa
+#define ulfa(a_)       fc(0) fc(0)
+
+#else
+
+#undef srr
+#define srr(a_)       P(TREG,1)
+#undef sri
+#define sri(a_)       M(TREG,1)
+#undef sir
+#define sir(a_)       TREG
+#undef sii
+#define sii(a_)       TREG
+#undef trr
+#define trr(a_)       a_
+#undef tri
+#define tri(a_)       a_
+#undef tir
+#define tir(a_)       a_
+#undef tii
+#define tii(a_)       a_
+#undef w
+#define w(a_)           
+#undef plax
+#define plax  
+#undef mpx
+#define mpx(a_)        fz fz
+#ifdef BETA0
+#undef madd
+#define madd(a_,b_,c_)
+#else
+#undef madd
+#define madd(a_,b_,c_) faa(a_,b_)
+#endif
+#undef ulfa
+#define ulfa(a_)       madd(0,si,a_) fp(0,si) madd(8,si,a_) fp(8,si)
+
+#endif
+
+
+
+#ifdef Conj_
+#undef fapi
+#define fapi(a_,b_)   fsp(b_)
+#undef fspi
+#define fspi(a_,b_)   fap(a_,b_)
+#else
+#undef fapi
+#define fapi(a_,b_)   fap(a_,b_)
+#undef fspi
+#define fspi(a_,b_)   fsp(b_)
+#endif
+
+#ifndef GER
+
+
+#undef plaa
+#define plaa(a_)             fl(a_ ## 0,si) fl(a_ ## 8,si) plax
+#undef wa
+#define wa(a_)               w(a_)
+#undef ddp
+#define ddp(a_,b_,c_)        fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+                                                  fm(sri(c_),0) fap(0,tri(c_))\
+                             fl(a_ ## 8,b_) fd(0) fm(sir(c_),0) fspi(0,tir(c_)) \
+                                                  fm(sii(c_),0) fapi(0,tii(c_))
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+                                                  fm(sri(c_),0) fap(0,tri(c_))\
+                             fl(a_ ## 8,b_) fd(0) pf(d_,e_) fm(sir(c_),0) fspi(0,tir(c_))\
+                                                  fm(sii(c_),0) fapi(0,tii(c_))
+
+
+
+#ifdef NO_TRANSPOSE
+
+
+
+#undef dp
+#define dp(a_,b_,c_)         ddp(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)  ddpp(a_,b_,c_,d_,e_)
+
+
+
+#else
+
+#undef dp
+#define dp(a_,b_,c_)        fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+                                                 fm(sri(c_),0) fap(0,tri(c_))\
+                            fl(a_ ## 8,b_)       fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \
+                                                 fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))
+
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_) fl(a_ ## 0,b_) fd(0) fm(srr(c_),0) fap(0,trr(c_)) \
+                                                 pf(d_,e_) fm(sri(c_),0) fap(0,tri(c_))\
+                            fl(a_ ## 8,b_)       fm(0,sir(c_)) fmp(0,M(sir(c_),1)) \
+                                                 fspi(0,M(tir(c_),2)) fapi(0,M(tii(c_),2))
+
+
+#endif
+
+#else
+
+#undef plaa
+#define plaa(a_)            fl(a_ ## 0,si) fl(a_ ## 8,si) plax
+#undef wa
+#define wa(a_)
+
+#undef ddprr
+#define ddprr(a_,b_,c_)     fl(a_ ## 0,b_) \
+                                              fd(tri(c_))           fm(P(sri(c_),1),0)      fap(0,1) \
+                                              fd(M(trr(c_),1))      fm(srr(c_),0)           fspi(0,1) \
+                            fp(a_ ## 0,b_) 
+#undef ddpri
+#define ddpri(a_,b_,c_)     fl(a_ ## 8,b_) \
+                                              fd(tii(c_))           fm(P(sii(c_),1),0)      fap(0,1) \
+                                              fd(M(tir(c_),1))      fm(sir(c_),0)           fapi(0,1) \
+                            fp(a_ ## 8,b_) 
+#undef dpri
+#define dpri(a_,b_,c_)      fl(a_ ## 8,b_) \
+                                              fx(2)                 fm(sir(c_),0)           fap(0,2) \
+                                                                    fm(M(sii(c_),2),0)      fapi(0,1) \
+                            fp(a_ ## 8,b_)
+
+
+#undef ddpp
+#define ddpp(a_,b_,c_,d_,e_) ddprr(a_,b_,c_) pf(d_,e_) ddpri(a_,b_,c_)
+#undef ddp
+#define ddp(a_,b_,c_)        ddprr(a_,b_,c_)           ddpri(a_,b_,c_)
+#undef dpp
+#define dpp(a_,b_,c_,d_,e_)  ddprr(a_,b_,c_) pf(d_,e_) dpri(a_,b_,c_)
+#undef dp
+#define dp(a_,b_,c_)         ddprr(a_,b_,c_)           dpri(a_,b_,c_)
+
+#endif
+
+
+#undef R1
+#define R1 4
+#undef R2
+#define R2 6
+#undef R3
+#define R3 6
+#undef R4
+#define R4 6
+
+#endif
+
+#endif
+
+
+/******************************************************************************
+ *  General Macros
+ ******************************************************************************/  
+
+
+
+
+#undef bla1
+#define bla1(a_,b_)          plaa(a_) dpp(a_,ax,R1,b_,si) wa(a_) 
+#undef blb1
+#define blb1(a_,b_)          plaa(a_) dpp(a_,ax,R1,b_,ax) wa(a_)
+			     
+/* bla2: NDP == 2 form of bla (picked by Mjoin(bla,NDP)); issues pf(b_,si) and pf(b_,ax) explicitly around the ddp/dp pair. */
+#undef bla2
+#define bla2(a_,b_)          pf(b_,si) plaa(a_) ddp(a_,ax,R1)        pf(b_,ax) dp(a_,bx,R2) wa(a_)
+/* blb2: NDP == 2 form of blb; its single pf (b_ off bx) is issued inside ddpp. */
+#undef blb2
+#define blb2(a_,b_)                    plaa(a_) ddpp(a_,ax,R1,b_,bx)           dp(a_,bx,R2) wa(a_) 
+			     
+#undef bla3
+#define bla3(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,si) ddp(a_,bx,R2) \
+                             dpp(a_,cx,R3,b_,ax) wa(a_)
+#undef blb3
+#define blb3(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,bx) ddp(a_,bx,R2) \
+                             dpp(a_,cx,R3,b_,cx) wa(a_)
+			     
+#undef bla4
+#define bla4(a_,b_)          plaa(a_) ddpp(a_,ax,R1,b_,si) ddpp(a_,bx,R2,b_,ax) \
+                             ddp(a_,cx,R3) dpp(a_,dx,R4,b_,bx) wa(a_)
+#undef blb4
+#define blb4(a_,b_)          plaa(a_) ddp(a_,ax,R1)        ddpp(a_,bx,R2,b_,cx) \
+                             ddp(a_,cx,R3) dpp(a_,dx,R4,b_,dx) wa(a_)
+
+#undef bla
+#define bla(a_,b_)      Mjoin(bla,NDP)(a_,b_)
+#undef blb
+#define blb(a_,b_)      Mjoin(blb,NDP)(a_,b_)
+
+
+
+#undef bla11_2
+#define bla11_2(a_)    plaa1_2(a_) dp1_2(a_,ax,R1) wa1_2(a_) 
+#undef bla21_2
+#define bla21_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) dp1_2(a_,bx,R2) wa1_2(a_)
+#undef bla31_2
+#define bla31_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \
+                          dp1_2(a_,cx,R3) wa1_2(a_)
+#undef bla41_2
+#define bla41_2(a_)    plaa1_2(a_) ddp1_2(a_,ax,R1) ddp1_2(a_,bx,R2) \
+                          ddp1_2(a_,cx,R3) dp1_2(a_,dx,R4) wa1_2(a_)
+
+#undef bla1_2
+#define bla1_2(a_)     Mjoin(Mjoin(bla,NDP),1_2)(a_)
+
+
+
+#undef bla11_4
+#define bla11_4(a_)    plaa1_4(a_) dp1_4(a_,ax,R1) wa1_4(a_) 
+#undef bla21_4
+#define bla21_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) dp1_4(a_,bx,R2) wa1_4(a_)
+#undef bla31_4
+#define bla31_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \
+                          dp1_4(a_,cx,R3) wa1_4(a_)
+#undef bla41_4
+#define bla41_4(a_)    plaa1_4(a_) ddp1_4(a_,ax,R1) ddp1_4(a_,bx,R2) \
+                          ddp1_4(a_,cx,R3) dp1_4(a_,dx,R4) wa1_4(a_)
+
+#undef bla1_4
+#define bla1_4(a_)     Mjoin(Mjoin(bla,NDP),1_4)(a_)
+
+
+
+#undef inc1
+#define inc1(a_)        a(a_,si) a(a_,ax)
+#undef inc2
+#define inc2(a_)        inc1(a_) a(a_,bx)
+#undef inc3
+#define inc3(a_)        inc2(a_) a(a_,cx)
+#undef inc4
+#define inc4(a_)        inc3(a_) a(a_,dx)
+
+#undef inc
+#define inc(a_)         Mjoin(inc,NDP)(a_)
+
+/* PF1..PF8: offsets handed to bla/blb by bl4..bl64. With PREFETCH defined they are PREFETCH + 32*k (k = 0..7); otherwise 64 + 32*k. */
+#ifdef PREFETCH
+/* #include "camm_arith.h" */
+#undef S
+#define S(a_,b_) (a_) + (b_)
+#undef PF1
+#define PF1 PREFETCH
+#undef PF2
+#define PF2 S(PF1,32)
+#undef PF3
+#define PF3 S(PF1,64)
+#undef PF4
+#define PF4 S(PF1,96)
+#undef PF5
+#define PF5 S(PF1,128)
+#undef PF6
+#define PF6 S(PF1,160)
+#undef PF7
+#define PF7 S(PF1,192)
+#undef PF8
+#define PF8 S(PF1,224)
+#else
+#undef PF1
+#define PF1 64
+#undef PF2
+#define PF2 96
+#undef PF3
+#define PF3 128
+#undef PF4
+#define PF4 160
+#undef PF5
+#define PF5 192
+#undef PF6
+#define PF6 224
+#undef PF7
+#define PF7 256
+#undef PF8
+#define PF8 288
+#endif
+
+/* pf(a_,b_) forwards to f() with hint token t0 for NO_TRANSPOSE builds that are neither SREAL nor GER, and nta otherwise (presumably prefetcht0 vs prefetchnta; f is in camm_util.h). */
+#if defined(NO_TRANSPOSE) && !defined(SREAL) && !defined(GER)
+#undef pf
+#define pf(a_,b_)  f(t0,a_,b_)
+#else
+#undef pf
+#define pf(a_,b_)  f(nta,a_,b_)
+#endif
+
+#undef bl1      /* blN: straight-line body for N units; each one ends with inc(4*N) */
+#define bl1            bla1_4(0x0) inc(4)
+#undef bl2
+#define bl2            bla1_2(0x0) inc(8)
+#undef bl4
+#define bl4            bla(0x0,PF1) inc(16)
+#undef bl8      /* from bl8 on, bla and blb steps alternate and each pair shares one PFk argument */
+#define bl8            bla(0x0,PF1) blb(0x1,PF1) inc(32) 
+#undef bl16
+#define bl16           bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) inc(64)
+#undef bl32
+#define bl32           bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \
+                       bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) inc(128)
+#undef bl64
+#define bl64           bla(0x0,PF1) blb(0x1,PF1) bla(0x2,PF2) blb(0x3,PF2) \
+                       bla(0x4,PF3) blb(0x5,PF3) bla(0x6,PF4) blb(0x7,PF4) \
+                       bla(0x8,PF5) blb(0x9,PF5) bla(0xa,PF6) blb(0xb,PF6) \
+                       bla(0xc,PF7) blb(0xd,PF7) bla(0xe,PF8) blb(0xf,PF8) inc(256)
+
+/* #define in2           inc(8) */
+/* #define in4           inc(16) */
+/* #define in8           inc(32) */
+/* #define in16          inc(64) */
+/* in2..in16 are defined empty below; the inc()-based versions above are kept only as commented-out code. */
+#undef in2
+#define in2  
+#undef in4
+#define in4  
+#undef in8
+#define in8  
+#undef in16
+#define in16 
+
+#ifdef NO_TRANSPOSE     /* incf: ra(di,si) in NO_TRANSPOSE builds, empty otherwise */
+#undef incf
+#define incf           ra(di,si)
+#else
+#undef incf
+#define incf
+#endif
+
+#undef lf1      /* lfK: mpx(R1) .. mpx(RK) with incf between consecutive steps */
+#define lf1            mpx(R1)
+#undef lf2
+#define lf2            lf1 incf mpx(R2)
+#undef lf3
+#define lf3            lf2 incf mpx(R3)
+#undef lf4
+#define lf4            lf3 incf mpx(R4)
+
+#undef lf       /* lf: lfK with K = NDP */
+#define lf             Mjoin(lf,NDP)
+
+/* mpx, ulfa and ra are defined outside this chunk; NOTE(review): ra(di,si) presumably adds edi to esi - confirm. */
+#undef ulf1     /* ulfK: ulfa(R1) .. ulfa(RK), always separated by ra(di,si) (not by incf) */
+#define ulf1           ulfa(R1)
+#undef ulf2
+#define ulf2           ulf1 ra(di,si) ulfa(R2) 
+#undef ulf3
+#define ulf3           ulf2 ra(di,si) ulfa(R3) 
+#undef ulf4
+#define ulf4           ulf3 ra(di,si) ulfa(R4) 
+
+#undef ulf      /* ulf: ulfK with K = NDP */
+#define ulf            Mjoin(ulf,NDP)
+
+#undef lpba     /* lpba(r): asm text "movl %esi,%e" followed by r, i.e. copy esi into the named register */
+#define lpba(a_)      "movl %%esi,%%e" #a_ "\n\t"
+
+#undef lpb1     /* lpbK: copy esi into ax, bx, cx, dx in turn (first K), with ra(di,si) before every copy after the first */
+#define lpb1          lpba(ax)
+#undef lpb2
+#define lpb2          lpb1 ra(di,si) lpba(bx)
+#undef lpb3
+#define lpb3          lpb2 ra(di,si) lpba(cx)
+#undef lpb4
+#define lpb4          lpb3 ra(di,si) lpba(dx)
+
+#undef lpb      /* lpb: lpbK with K = NDP */
+#define lpb           Mjoin(lpb,NDP)
+
+#undef ipf1     /* ipfK(a_): pf(a_,reg) for si and for the first K of ax, bx, cx, dx */
+#define ipf1(a_)   pf(a_,si) pf(a_,ax)
+#undef ipf2
+#define ipf2(a_)   ipf1(a_)  pf(a_,bx) 
+#undef ipf3
+#define ipf3(a_)   ipf2(a_)  pf(a_,cx) 
+#undef ipf4
+#define ipf4(a_)   ipf3(a_)  pf(a_,dx) 
+
+#undef ipf      /* ipf(a_): ipfK with K = NDP */
+#define ipf(a_)     Mjoin(ipf,NDP)(a_)
+
+#ifdef LUNROLL  /* UNROLL: LUNROLL times 1 (SREAL), 2 (DREAL or SCPLX) or 4 (DCPLX); 16 when LUNROLL is not given */
+#undef UNROLL
+#ifdef SREAL
+#undef UNROLL
+#define UNROLL LUNROLL
+#elif defined(DREAL) || defined(SCPLX)
+#undef UNROLL
+#define UNROLL LUNROLL*2
+#elif defined(DCPLX)
+#undef UNROLL
+#define UNROLL LUNROLL*4
+#endif
+#else
+#undef UNROLL
+#define UNROLL 16
+#endif
+/* If LUNROLL is set but no precision macro is, UNROLL stays undefined, no branch below matches and the #error fires. */
+#undef UNROLL1_2        /* blUNROLL: the blN body matching UNROLL; UNROLL1_2: UNROLL/2, or the token stop when UNROLL is 1 */
+#if UNROLL == 64
+#undef blUNROLL
+#define blUNROLL bl64
+#undef UNROLL1_2
+#define UNROLL1_2 32
+#elif UNROLL == 32
+#undef blUNROLL
+#define blUNROLL bl32
+#undef UNROLL1_2
+#define UNROLL1_2 16
+#elif UNROLL == 16
+#undef blUNROLL
+#define blUNROLL bl16
+#undef UNROLL1_2
+#define UNROLL1_2 8
+#elif UNROLL == 8
+#undef blUNROLL
+#define blUNROLL bl8
+#undef UNROLL1_2
+#define UNROLL1_2 4
+#elif UNROLL == 4
+#undef blUNROLL
+#define blUNROLL bl4
+#undef UNROLL1_2
+#define UNROLL1_2 2
+#elif UNROLL == 2
+#undef blUNROLL
+#define blUNROLL bl2
+#undef UNROLL1_2
+#define UNROLL1_2 1
+#elif UNROLL == 1
+#undef blUNROLL
+#define blUNROLL bl1
+#undef UNROLL1_2
+#define UNROLL1_2 stop
+#endif
+#ifndef UNROLL1_2       /* reached when UNROLL is none of 1, 2, 4, 8, 16, 32, 64 */
+#error UNROLL must be set to power of 2 < 128
+#endif
+
+
+#ifdef GER      /* GER builds: a is writable and c is const; all other builds: a is const and c is writable */
+#undef aconst
+#define aconst
+#undef cconst
+#define cconst const
+#else
+#undef aconst
+#define aconst const
+#undef cconst
+#define cconst
+#endif
+
+#undef MY_FUNCTION
+#define MY_FUNCTION Mjoin(dp,EXT)
+/* dp + EXT: inline-asm kernel over a (leading dimension lda), b and c; len counts movm elements, stride applies to fixm. */
+static void
+MY_FUNCTION(aconst TYPE *a,int lda,
+	      const TYPE *b,
+	      cconst TYPE *c,int stride,int len) {
+    /* w (complex builds only): sign-pattern constant passed as asm operand %6 and loaded through pl(0,si,SREG). */
+#ifdef SCPLX
+#if defined(GER) && defined(Conj_)
+    const TYPE w1[2]={{-1.0,1.0},{-1.0,1.0}},*w=w1;
+#else
+    const TYPE w1[2]={{1.0,-1.0},{1.0,-1.0}},*w=w1;
+#endif
+#endif
+
+#if defined(DCPLX) && defined(ATL_SSE2)
+#if defined(GER) && defined(Conj_)
+    const TYPE w1[1]={{-1.0,1.0}},*w=w1;
+#else
+    const TYPE w1[1]={{1.0,-1.0}},*w=w1;
+#endif
+#endif
+    /* movm: the vector walked with len (c in NO_TRANSPOSE builds, b otherwise); fixm: the other vector, stepped by stride. */
+#ifdef NO_TRANSPOSE
+#undef movm
+#define movm c
+#undef fixm
+#define fixm b
+#else
+#undef movm
+#define movm b
+#undef fixm
+#define fixm c
+#endif    
+    NO_INLINE
+    unsigned u1=stride*sizeof(*fixm),u2=lda*sizeof(*a),u3=len*sizeof(*movm)/sizeof(float);
+    /* u1: stride of fixm in bytes; u2: lda in bytes; u3: length of movm counted in float-sized units. */
+    ASM (
+	 /* Operands: %0 movm, %1 fixm, %2 u1, %3 a, %4 u2, %5 u3, %6 w (complex builds only). */
+	 "pushl %%ebx\n\t"
+	 a(4,sp)
+	 /* ebx is saved by hand; NOTE(review): a(4,sp) presumably undoes the push's esp change - confirm in camm_util.h. */
+#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))
+	 "movl %6,%%esi\n\t"
+	 pl(0,si,SREG)
+#endif
+	 
+#ifdef NO_TRANSPOSE
+	 "movl %1,%%esi\n\t"  /* fixm */
+	 "movl %2,%%edi\n\t"  /* fixm2fixm */
+#endif
+
+	 lf
+
+	 "movl %3,%%esi\n\t"  /* a */
+	 "movl %4,%%edi\n\t"  /* a2a */
+
+	 lpb
+
+	 ipf(0)
+
+	 "movl %0,%%esi\n\t"  /* movm */
+	 "movl %5,%%edi\n\t"  /* len */
+
+#if defined(ALIGN)
+
+#if defined(SREAL)
+
+	 test(4,ax)
+	 je(Mjoin(a1,EXT))
+	 test(-1,di)
+	 je(Mjoin(a1,EXT))
+	 sub(1,di)
+	 bl1
+
+	 lab(Mjoin(a1,EXT))
+
+#endif
+
+#if defined(DREAL) || defined(SREAL)
+
+	 test(8,ax)
+	 je(Mjoin(as,EXT))
+	 test(-2,di)
+	 je(Mjoin(as,EXT))
+	 sub(2,di)
+	 bl2
+
+	 lab(Mjoin(as,EXT))
+
+#endif
+
+#endif
+	      
+
+	 ipf(32)
+	 /* Main loop: while edi still has bits at or above UNROLL, subtract UNROLL and run blUNROLL. */
+	 lab(Mjoin(loop,EXT))
+
+	 test(-UNROLL,di)
+	 je(Mjoin(UNROLL1_2,EXT))
+	 sub(UNROLL,di)
+
+	 blUNROLL
+	 
+	 jmp(Mjoin(loop,EXT))
+	 /* Remainder: one optional blN per power of two below UNROLL, selected by the matching bit of edi. */
+#if UNROLL > 32
+	 lab(Mjoin(32,EXT))
+	 test(32,di)
+	 je(Mjoin(16,EXT))
+	 bl32
+#endif	 
+
+#if UNROLL > 16
+	 lab(Mjoin(16,EXT))
+	 test(16,di)
+	 je(Mjoin(8,EXT))
+	 bl16
+#endif	 
+
+#if UNROLL > 8
+	 lab(Mjoin(8,EXT))
+	 test(8,di)
+	 je(Mjoin(4,EXT))
+	 bl8
+#endif	 
+
+#if UNROLL > 4
+	 lab(Mjoin(4,EXT))
+	 test(4,di)
+	 je(Mjoin(2,EXT))
+	 bl4
+#endif
+
+#if UNROLL > 2	 
+	 lab(Mjoin(2,EXT))
+#ifndef DCPLX
+	 test(2,di)
+	 je(Mjoin(1,EXT))
+	 bl2
+#endif
+#endif
+
+#if UNROLL > 1
+	 lab(Mjoin(1,EXT))
+#ifdef SREAL
+	 test(1,di)
+	 je(Mjoin(stop,EXT))
+	 bl1
+#endif
+#endif
+
+	 lab(Mjoin(stop,EXT))
+
+#ifndef NO_TRANSPOSE
+	 "movl %1,%%esi\n\t"  /* fixm */
+	 "movl %2,%%edi\n\t"  /* fixm2fixm */
+#endif
+
+	 ulf
+
+	 a(-4,sp)
+	 "popl %%ebx\n\t"
+
+	 /* "memory" clobber: the asm accesses the arrays behind a, b, c and w, but only the pointer variables are operands. */
+	 ::"m" (movm),"m" (fixm),"m" (u1),"m" (a),"m" (u2),"m" (u3)
+
+#if defined(SCPLX) || (defined(DCPLX) && defined(ATL_SSE2))
+	 ,"m" (w)
+#endif
+	 :"ax","bx","cx","dx","si","di","memory");
+
+
+}
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h
new file mode 100644
index 0000000..7fd1404
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_pipe3.h
@@ -0,0 +1,295 @@
+#include "camm_util.h"
+
+#ifndef N       /* N: strategy suffix appended to the p1_4_, p1_2_, p1_, lp, dp and pl macro names below */
+#error N must be defined in camm_pipe3.h
+#endif
+#ifndef KB      /* KB: block length; compared with pipe_len and used as MKB below */
+#error KB must be defined in camm_pipe3.h
+#endif
+
+#undef p1       /* p1, p2, p4: the strategy's p1_4_N, p1_2_N and p1_N macros respectively */
+#define p1(a_)         Mjoin(p1_4_,N)(a_)
+#undef p2
+#define p2(a_)         Mjoin(p1_2_,N)(a_)
+#undef p4
+#define p4(a_)         Mjoin(p1_,N)(a_)
+#undef load_pipe        /* load_pipe, drain_pipe, pipe_len: the strategy's lpN, dpN and plN */
+#define load_pipe(a_)  Mjoin(lp,N)(a_)
+#undef drain_pipe
+#define drain_pipe(a_) Mjoin(dp,N)(a_)
+#undef pipe_len
+#define pipe_len       Mjoin(pl,N)
+
+#undef p8       /* pK, K = 8..256: the strategy's own macro (p2_N .. p64_N) when pipe_len exceeds K/2, else two half-size steps */
+#if pipe_len > 4
+#define p8(a_)         Mjoin(p2_,N)(a_)
+#else
+#define p8(a_)         p4(a_)   p4(SS(a_,16))
+#endif
+
+#undef p16      /* in each fallback the second half is offset by 2*K through SS() */
+#if pipe_len > 8
+#define p16(a_)        Mjoin(p4_,N)(a_)
+#else
+#define p16(a_)        p8(a_)   p8(SS(a_,32))
+#endif
+
+#undef p32
+#if pipe_len > 16
+#define p32(a_)        Mjoin(p8_,N)(a_)
+#else
+#define p32(a_)        p16(a_)   p16(SS(a_,64))
+#endif
+
+#undef p64
+#if pipe_len > 32
+#define p64(a_)        Mjoin(p16_,N)(a_)
+#else
+#define p64(a_)        p32(a_)   p32(SS(a_,128))
+#endif
+
+#undef p128
+#if pipe_len > 64
+#define p128(a_)       Mjoin(p32_,N)(a_)
+#else
+#define p128(a_)       p64(a_)   p64(SS(a_,256))
+#endif
+
+#undef p256
+#if pipe_len > 128
+#define p256(a_)       Mjoin(p64_,N)(a_)
+#else
+#define p256(a_)       p128(a_)   p128(SS(a_,512))
+#endif
+
+#if KB < pipe_len       /* block shorter than the pipeline: pipe_len becomes 0 and load_pipe/drain_pipe expand to nothing */
+#undef pipe_len
+#define pipe_len 0
+#undef load_pipe
+#define load_pipe(a_)
+#undef drain_pipe
+#define drain_pipe(a_)
+#endif
+
+
+#undef MKB      /* MKB is KB unchanged; the per-precision scaling below is commented out */
+/* #ifdef SREAL */
+#define MKB KB
+/* #elif defined (DCPLX) */
+/* #define MKB ( KB * 4 ) */
+/* #else */
+/* #define MKB ( KB * 2 ) */
+/* #endif */
+
+#if MKB >= 512  /* the stage ladder below only covers sizes 256 down to 1 */
+#error MKB must be less than 512
+#endif
+
+#undef x0       /* xK: macro text accumulated so far; oK: the matching offset expression. Stage 0 is just load_pipe(0). */
+#undef o0
+#define x0 load_pipe(0)
+#define o0 0
+
+#undef MKBB     /* MKBB: MKB minus pipe_len; it is reset to MKB at the stage whose size equals pipe_len */
+#define MKBB ( MKB - pipe_len )
+
+#undef xx1      /* 256 stage: append p256 when MKBB >= 256 and advance the offset by 1024 */
+#undef oo1
+#if MKBB >= 256
+#define xx1 x0 p256(o0)
+#define oo1 SS(1024,o0)
+#else
+#define xx1 x0
+#define oo1 o0
+#endif
+
+#undef xx1a     /* when pipe_len == 256: append drain_pipe and advance the offset by another 1024 */
+#undef oo1a
+#if pipe_len == 256
+#define xx1a xx1 drain_pipe(oo1)
+#define oo1a SS(1024,oo1)
+#undef MKBB
+#define MKBB MKB
+#else
+#define xx1a xx1
+#define oo1a oo1
+#endif
+
+#undef x1       /* stages 128 .. 4: stage K appends pK when bit K of MKBB is set and advances the offset by 4*K */
+#undef o1
+#if ( MKBB / 128 ) % 2
+#define x1 xx1a p128(oo1a)
+#define o1 SS(512,oo1a)
+#else
+#define x1 xx1a
+#define o1 oo1a
+#endif
+
+#undef x1a      /* each "a" stage appends drain_pipe when pipe_len equals the stage size, then resets MKBB to MKB */
+#undef o1a
+#if pipe_len == 128
+#define x1a x1 drain_pipe(o1)
+#define o1a SS(512,o1)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x1a x1
+#define o1a o1
+#endif
+
+#undef x2       /* 64 stage */
+#undef o2
+#if ( MKBB / 64 ) % 2
+#define x2  x1a p64(o1a)
+#define o2 SS(256,o1a)
+#else
+#define x2 x1a
+#define o2 o1a
+#endif
+
+#undef x2a
+#undef o2a
+#if pipe_len == 64
+#define x2a x2 drain_pipe(o2)
+#define o2a SS(256,o2)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x2a x2
+#define o2a o2
+#endif
+
+#undef x3       /* 32 stage */
+#undef o3
+#if ( MKBB / 32 ) % 2
+#define x3  x2a p32(o2a)
+#define o3 SS(128,o2a)
+#else
+#define x3 x2a
+#define o3 o2a
+#endif
+
+#undef x3a
+#undef o3a
+#if pipe_len == 32
+#define x3a x3 drain_pipe(o3)
+#define o3a SS(128,o3)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x3a x3
+#define o3a o3
+#endif
+
+#undef x4       /* 16 stage */
+#undef o4
+#if ( MKBB / 16 ) % 2
+#define x4 x3a p16(o3a)
+#define o4 SS(64,o3a)
+#else
+#define x4 x3a
+#define o4 o3a
+#endif
+
+#undef x4a
+#undef o4a
+#if pipe_len == 16
+#define x4a x4 drain_pipe(o4)
+#define o4a SS(64,o4)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x4a x4
+#define o4a o4
+#endif
+
+#undef x5       /* 8 stage */
+#undef o5
+#if ( MKBB / 8 ) % 2
+#define x5  x4a p8(o4a)
+#define o5 SS(32,o4a)
+#else
+#define x5 x4a
+#define o5 o4a
+#endif
+
+#undef x5a
+#undef o5a
+#if pipe_len == 8
+#define x5a x5 drain_pipe(o5)
+#define o5a SS(32,o5)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x5a x5
+#define o5a o5
+#endif
+
+#undef x6       /* 4 stage */
+#undef o6
+#if ( MKBB / 4 ) % 2
+#define x6  x5a p4(o5a)
+#define o6 SS(16,o5a)
+#else
+#define x6 x5a
+#define o6 o5a
+#endif
+
+#undef x6a
+#undef o6a
+#if pipe_len == 4
+#define x6a x6 drain_pipe(o6)
+#define o6a SS(16,o6)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x6a x6
+#define o6a o6
+#endif
+
+#undef x7       /* 2 stage: append p2 when bit 2 of MKBB is set; MKBB (not MKB) so a still-pending pipe_len of 1 or 2 is excluded, as in the larger stages */
+#undef o7
+#if ( MKBB / 2 ) % 2
+#define x7  x6a p2(o6a)
+#define o7 SS(8,o6a)
+#else
+#define x7 x6a
+#define o7 o6a
+#endif
+
+#undef x7a      /* when pipe_len == 2: append drain_pipe at the current offset, advance by 8 and reset MKBB to MKB */
+#undef o7a
+#if pipe_len == 2
+#define x7a x7 drain_pipe(o7)
+#define o7a SS(8,o7)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x7a x7
+#define o7a o7
+#endif
+
+#undef x8       /* 1 stage: append p1 when MKBB is odd; MKBB (not MKB) so a still-pending pipe_len of 1 is excluded, as in the larger stages */
+#undef o8
+#if ( MKBB / 1 ) % 2
+#define x8 x7a p1(o7a)
+#define o8 SS(4,o7a)
+#else
+#define x8 x7a
+#define o8 o7a
+#endif
+
+#undef x8a      /* when pipe_len == 1: append drain_pipe at the current offset, advance by 4 and reset MKBB to MKB */
+#undef o8a
+#if pipe_len == 1
+#define x8a x8 drain_pipe(o8)
+#define o8a SS(4,o8)
+#undef MKBB
+#define MKBB MKB
+#else
+#define x8a x8
+#define o8a o8
+#endif
+
+#undef KB_block /* KB_block: the text accumulated by all stages above (x0 through x8a) */
+#define KB_block x8a
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h
new file mode 100644
index 0000000..35e9e59
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_scale.h
@@ -0,0 +1,215 @@
+#ifndef CAMM_SCALE_H
+#define CAMM_SCALE_H    /*+ To stop multiple inclusions. +*/
+
+#include "camm_util.h"
+
+#undef spf      /* spf(a_,b_): f() with the t0 hint; f is defined in camm_util.h (not shown), presumably a prefetch */
+#define spf(a_,b_)  f(t0,a_,b_)
+
+#ifdef SCPLX    /* SCPLX: lb = setup from ax, c/cp = one step at si offset a_ followed by 0 (cp also issues spf), c1_2 = half step, ub = empty */
+#ifdef BETAX    /* BETAX adds lbx/cxx/pcx, which use registers 1 and 3 and SSREG (register 2) */
+#undef SSREG
+#define SSREG      2
+#undef lbx
+#define lbx        pls(4,ax,1) ps(0,1,1) pm(SSREG,1)
+#undef cxx
+#define cxx        pm(1,3) ps(177,3,3) pa(3,2)
+#undef pcx
+#define pcx        pc(2,3)
+#else
+#undef lbx
+#define lbx
+#undef cxx
+#define cxx
+#undef pcx
+#define pcx
+#endif
+#undef lb
+#define lb         pls(0,ax,0) ps(0,0,0) lbx 
+#undef c        /* NOTE(review): helper semantics (pl/pu presumably load/store, pm multiply, pa add) come from camm_util.h - confirm */
+#define c(a_)      pl(a_ ## 0,si,2) pcx pm(0,2) cxx pu(2,a_ ## 0,si)
+#undef cp
+#define cp(a_,b_)  pl(a_ ## 0,si,2) pcx pm(0,2) spf(b_,si) cxx pu(2,a_ ## 0,si)
+#undef c1_2
+#define c1_2(a_)   px(2) pld(a_ ## 0,si,2) pcx pm(0,2) cxx pud(2,a_ ## 0,si)
+#undef ub
+#define ub
+#endif
+
+#ifdef SREAL    /* SREAL: same lb/c/cp/c1_2/ub set plus c1_4 (pls/pus variant); every step works on register 2 with pm(0,2) */
+#undef lb
+#define lb         pls(0,ax,0) ps(0,0,0)
+#undef c
+#define c(a_)      pl(a_ ## 0,si,2) pm(0,2) pu(2,a_ ## 0,si)
+#undef cp       /* cp: c with spf(b_,si) inserted between the pl and the pm */
+#define cp(a_,b_)  pl(a_ ## 0,si,2) spf(b_,si) pm(0,2) pu(2,a_ ## 0,si)
+#undef c1_2
+#define c1_2(a_)   px(2) pld(a_ ## 0,si,2) pm(0,2) pud(2,a_ ## 0,si)
+#undef c1_4
+#define c1_4(a_)   pls(a_ ## 0,si,2) pm(0,2) pus(2,a_ ## 0,si)
+#undef ub
+#define ub
+#endif
+
+#ifdef DREAL    /* DREAL: built from the fl/fm/fx1/fp/fc helpers (camm_util.h, not shown; presumably x87 stack ops - confirm) */
+#undef lb
+#define lb        fl(0,ax)
+#undef c        /* c: handles the two offsets a_ followed by 0 and a_ followed by 8 from si */
+#define c(a_)     fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) fm(2,0) fx1 \
+                  fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef cp       /* cp: c with spf(b_,si) inserted after the second fl */
+#define cp(a_,b_) fl(a_ ## 0,si) fm(1,0) fl(a_ ## 8,si) spf(b_,si) fm(2,0) fx1 \
+                  fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef c1_2
+#define c1_2(a_)  fl(a_ ## 0,si) fm(1,0) fp(a_ ## 0,si) 
+#undef ub       /* ub: a single fc(0), pairing with the single fl in lb */
+#define ub        fc(0)
+#endif
+
+#ifdef DCPLX    /* DCPLX: lb issues two fl (offsets 0 and 8 from ax) and ub two fc(0); there is no c1_2 or c1_4 */
+#undef lb
+#define lb        fl(0,ax) fl(8,ax)
+#undef c        /* c: handles the two offsets a_ followed by 0 and a_ followed by 8 from si using the f* helpers (camm_util.h, not shown) */
+#define c(a_)     fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \
+                  fm(2,0) fx(3) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) fsp(2) fx1 \
+                  fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef cp       /* cp: c with spf(b_,si) inserted after the first fx(3) */
+#define cp(a_,b_) fl(a_ ## 0,si) fl(a_ ## 8,si) fd(3) fm(2,0) fd(3) \
+                  fm(2,0) fx(3) spf(b_,si) fm(4,0) fx(2) fm(5,0) fap(0,2) fx(2) \
+                  fsp(2) fx1 fp(a_ ## 0,si) fp(a_ ## 8,si)
+#undef ub
+#define ub        fc(0) fc(0)
+#endif
+
+#undef sbl1     /* sblN: body for N units at si, built from c1_4 (1), c1_2 (2), cp (4) and further c/cp steps (8, 16) */
+#define sbl1       c1_4(0x0)
+#undef sbl2
+#define sbl2       c1_2(0x0)
+#undef sbl4
+#define sbl4       cp(0x0,0x40)
+#undef sbl8
+#define sbl8       sbl4 c(0x1)
+#undef sbl16
+#define sbl16      sbl8 cp(0x2,0x60) c(0x3)
+
+#undef sinc16   /* sincN: a() on si with the immediate 4*N (0x40 down to 0x4) */
+#define sinc16    a(0x40,si)
+#undef sinc8
+#define sinc8     a(0x20,si)
+#undef sinc4
+#define sinc4     a(0x10,si)
+#undef sinc2
+#define sinc2     a(0x8,si)
+#undef sinc1
+#define sinc1     a(0x4,si)
+
+#undef SCALE    /* SCALE: function name assembled from PREC, the token scale, BLC and FEXT, in that order */
+#define SCALE Mjoin(Mjoin(PREC,Mjoin(scale,BLC)),FEXT)
+
+#undef MY_FUNCTION
+#define MY_FUNCTION SCALE
+/* SCALE: walks len elements of c through esi and rewrites them in place with the c()/cp() steps; lb loads the factor from b. */
+static void
+MY_FUNCTION(const TYPE *b,TYPE *c,int len) {
+
+  /* z (BETAX complex builds only): sign-pattern constant passed as asm operand %3. */
+#if defined(BETAX) && defined(SCPLX)
+  const TYPE z1[2]={{1.0,-1.0},{1.0,-1.0}},*z=z1;
+#endif
+  NO_INLINE
+  /* len is doubled for every non-SREAL type and once more for DCPLX; presumably this converts it to 4-byte units - TODO confirm. */
+#ifndef SREAL
+  len+=len;
+#endif
+#ifdef DCPLX
+  len+=len;
+#endif
+
+
+  ASM(
+
+      "pushl %%ebx\n\t"
+      a(4,sp)
+
+      /* Operands: %0 c (into esi), %1 b (into eax), %2 len (into edi), %3 z (BETAX and SCPLX only, into ebx). */
+      "movl %0,%%esi\n\t"
+
+      spf(0x00,si)
+      spf(0x20,si)
+
+      "movl %1,%%eax\n\t"
+      "movl %2,%%edi\n\t"
+
+#if defined(BETAX) && defined(SCPLX)
+      "movl %3,%%ebx\n\t"
+      pl(0,bx,SSREG)
+#endif
+
+      lb
+      /* Main loop: 16 units per pass while edi has bits at or above 16; the 8/4/2/1 tails follow. */
+      lab(loop)
+
+      test(-16,di)
+      je(8)
+      sub(16,di)
+      align
+
+      sbl16
+      sinc16
+
+      jmp(loop)
+      align
+
+      lab(8)
+
+      test(8,di)
+      je(4)
+
+      sbl8
+      sinc8
+
+      lab(4)
+
+      test(4,di)
+      je(2)
+
+      sbl4
+      sinc4
+
+      lab(2)
+
+#ifndef DCPLX
+      test(2,di)
+      je(1)
+
+      sbl2
+      sinc2
+
+      lab(1)
+
+#ifdef SREAL
+      test(1,di)
+      je(stop)
+
+      sbl1
+      sinc1
+
+      lab(stop)
+#endif
+#endif
+
+      ub
+
+      a(-4,sp)
+      "popl %%ebx\n\t"
+
+      /* "memory" clobber: the asm reads *b (and z1) and rewrites *c, but only the pointer variables are operands. */
+      ::"m" (c),"m" (b), "m" (len)
+#if defined(BETAX) && defined(SCPLX)
+      ,"m" (z)
+#endif
+      : "si","ax","di","memory");
+
+
+}
+#endif /* CAMM_SCALE_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h
new file mode 100644
index 0000000..4a92006
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_strat1.h
@@ -0,0 +1,2982 @@
+#include "camm_util.h"
+
+#undef p1_4_swap_1      /* swap_1: data at offset a_ from ax goes to the same offset from cx and vice versa (pl* presumably load, pu* store; camm_util.h, not shown) */
+#define p1_4_swap_1(a_) \
+      pls(a_,ax,1) \
+      pls(a_,cx,0) \
+      pus(0,a_,ax) \
+      pus(1,a_,cx) 
+#undef p1_2_swap_1      /* pld/pud variant; registers 1 and 0 are first passed to px() */
+#define p1_2_swap_1(a_) \
+      px(1) \
+      pld(a_,ax,1) \
+      px(0) \
+      pld(a_,cx,0) \
+      pud(0,a_,ax) \
+      pud(1,a_,cx) 
+#undef p1_swap_1        /* full-register variant through registers 1 and 0 */
+#define p1_swap_1(a_) \
+      plq(a_,ax,1) \
+      pl(a_,cx,0) \
+      puq(0,a_,ax) \
+      pu(1,a_,cx) 
+#undef p2_swap_1        /* pipelined step: stores the pair fetched earlier (regs 0/1) while fetching the next two pairs into regs 3/2 and 1/0 */
+#define p2_swap_1(a_) \
+      plq(SS(a_,RS4),ax,3) \
+      pl(SS(a_,RS4),cx,2) \
+      puq(0,a_,ax) \
+      pu(1,a_,cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,0) \
+      puq(2,SS(a_,RS4),ax) \
+      pu(3,SS(a_,RS4),cx) 
+#undef lpswap_1 /* prologue: fetch the first pair into registers 1 and 0, each preceded by an f(nta,...) on the same base */
+#define lpswap_1(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,1) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,0) 
+#undef dpswap_1 /* epilogue: fetch one more pair into registers 3/2, then store both pending pairs */
+#define dpswap_1(a_) \
+      plq(SS(a_,RS4),ax,3) \
+      pl(SS(a_,RS4),cx,2) \
+      puq(0,a_,ax) \
+      pu(1,a_,cx) \
+      puq(2,SS(a_,RS4),ax) \
+      pu(3,SS(a_,RS4),cx) 
+#undef plswap_1 /* pl + strategy name: the value camm_pipe3.h picks up as pipe_len */
+#define plswap_1 8
+
+
+#undef p1_4_scal_3      /* scal_3: fetch from ax, apply register 6 with pm/pmsr (presumably multiply), store back to the same offset */
+#define p1_4_scal_3(a_) \
+      pls(a_,ax,0) \
+      pmsr(6,0) \
+      pus(0,a_,ax)
+#undef p1_2_scal_3
+#define p1_2_scal_3(a_) \
+      pld(a_,ax,0) \
+      pm(6,0) \
+      pud(0,a_,ax)
+#undef p1_scal_3
+#define p1_scal_3(a_) \
+      plq(a_,ax,0) \
+      pm(6,0) \
+      puq(0,a_,ax)
+#undef p2_scal_3        /* two independent registers (0 and 1), no pipelining */
+#define p2_scal_3(a_) \
+      plq(a_,ax,0) \
+      plq(SS(a_,RS4),ax,1) \
+      pm(6,0) \
+      pm(6,1) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_scal_3        /* pipelined step over registers 0..3: stores offsets 0..3*RS4 while fetching offsets 3*RS4..6*RS4 */
+#define p4_scal_3(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(6,2) \
+      puq(0,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      pm(6,3) \
+      puq(1,SS(a_,RS4),ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      pm(6,0) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      pm(6,1) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) 
+#undef lpscal_3 /* prologue: fetch registers 0..2 and apply pm(6,.) to 0 and 1 */
+#define lpscal_3(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pm(6,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pm(6,1) 
+#undef dpscal_3 /* epilogue: fetch register 3, finish pm(6,.) on 2 and 3, store all four registers */
+#define dpscal_3(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(6,2) \
+      puq(0,a_,ax) \
+      pm(6,3) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plscal_3 /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plscal_3 16
+
+#undef p1_4_scal_3c     /* scal_3c: the quarter-register step is intentionally empty */
+#define p1_4_scal_3c(a_) 
+#undef p1_2_scal_3c     /* per register r: copy (pc), pm(6,r), shuffle the copy with CSHUF, pm(7,copy), pa(copy,r), store; helpers from camm_util.h */
+#define p1_2_scal_3c(a_) \
+      pld(a_,ax,0) \
+      pc(0,1) \
+      pm(6,0) \
+      ps(CSHUF,1,1) \
+      pm(7,1) \
+      pa(1,0) \
+      pud(0,a_,ax)
+#undef p1_scal_3c
+#define p1_scal_3c(a_) \
+      plq(a_,ax,0) \
+      pc(0,1) \
+      pm(6,0) \
+      ps(CSHUF,1,1) \
+      pm(7,1) \
+      pa(1,0) \
+      puq(0,a_,ax)
+#undef p2_scal_3c       /* two registers (0 and 1) with copies in 2 and 3, no pipelining */
+#define p2_scal_3c(a_) \
+      plq(a_,ax,0) \
+      plq(SS(a_,RS4),ax,1) \
+      pc(0,2) \
+      pm(6,0) \
+      ps(CSHUF,2,2) \
+      pm(7,2) \
+      pa(2,0) \
+      puq(0,a_,ax) \
+      pc(1,3) \
+      pm(6,1) \
+      ps(CSHUF,3,3) \
+      pm(7,3) \
+      pa(3,1) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_scal_3c       /* pipelined step: data in registers 0..3, copies alternate between 4 and 5; starts by finishing register 1 left pending by the previous step */
+#define p4_scal_3c(a_) \
+      pm(7,5) \
+      pa(5,1) \
+      puq(0,a_,ax) \
+      ps(CSHUF,4,4) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      pc(3,5) \
+      pm(6,3) \
+      pm(7,4) \
+      pa(4,2) \
+      puq(1,SS(a_,RS4),ax) \
+      ps(CSHUF,5,5) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      pc(0,4) \
+      pm(6,0) \
+      pm(7,5) \
+      pa(5,3) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      ps(CSHUF,4,4) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      pc(1,5) \
+      pm(6,1) \
+      pm(7,4) \
+      pa(4,0) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      ps(CSHUF,5,5) \
+      plq(SS(a_,MM(7,RS4)),ax,3) \
+      pc(2,4) \
+      pm(6,2) 
+#undef lpscal_3c        /* prologue: fetches registers 0..3 and ends in the same register state that p4_scal_3c ends in */
+#define lpscal_3c(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pc(0,4) \
+      pm(6,0) \
+      ps(CSHUF,4,4) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pc(1,5) \
+      pm(6,1) \
+      pm(7,4) \
+      pa(4,0) \
+      ps(CSHUF,5,5) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pc(2,4) \
+      pm(6,2) 
+#undef dpscal_3c        /* epilogue: finishes registers 1..3 and stores all four without fetching more */
+#define dpscal_3c(a_) \
+      pm(7,5) \
+      pa(5,1) \
+      ps(CSHUF,4,4) \
+      puq(0,a_,ax) \
+      pm(7,4) \
+      pa(4,2) \
+      pc(3,5) \
+      pm(6,3) \
+      puq(1,SS(a_,RS4),ax) \
+      ps(CSHUF,5,5) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      pm(7,5) \
+      pa(5,3) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plscal_3c        /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plscal_3c 16
+
+#undef p1_4_scal_4      /* scal_4: same fetch / pm(6,r) / store pattern as a plain scale, but without cross-step pipelining (lpscal_4 is empty) */
+#define p1_4_scal_4(a_) \
+      pls(SS(a_,MM(0,RS4)),ax,0) \
+      pmsr(6,0) \
+      pus(0,a_,ax) 
+#undef p1_2_scal_4
+#define p1_2_scal_4(a_) \
+      pld(SS(a_,MM(0,RS4)),ax,0) \
+      pm(6,0) \
+      pud(0,a_,ax) 
+#undef p1_scal_4
+#define p1_scal_4(a_) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      pm(6,0) \
+      puq(0,a_,ax) 
+#undef p2_scal_4
+#define p2_scal_4(a_) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pm(6,0) \
+      pm(6,1) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_scal_4        /* four registers (0..3) fetched, processed and stored within the one step */
+#define p4_scal_4(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(6,0) \
+      pm(6,1) \
+      pm(6,2) \
+      pm(6,3) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax) 
+#undef p8_scal_4        /* eight offsets using registers 0..5 and 7 (6 holds the factor); register 0 is stored early and reused for offset 7*RS4 */
+#define p8_scal_4(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      plq(SS(a_,MM(4,RS4)),ax,4) \
+      plq(SS(a_,MM(5,RS4)),ax,5) \
+      plq(SS(a_,MM(6,RS4)),ax,7) \
+      pm(6,0) \
+      pm(6,1) \
+      pm(6,2) \
+      puq(0,a_,ax) \
+      pm(6,3) \
+      pm(6,4) \
+      pm(6,5) \
+      plq(SS(a_,MM(7,RS4)),ax,0) \
+      pm(6,7) \
+      pm(6,0) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      puq(4,SS(a_,MM(4,RS4)),ax) \
+      puq(5,SS(a_,MM(5,RS4)),ax) \
+      puq(7,SS(a_,MM(6,RS4)),ax) \
+      puq(0,SS(a_,MM(7,RS4)),ax) 
+#undef lpscal_4 /* prologue is empty ... */
+#define lpscal_4(a_) 
+#undef dpscal_4 /* ... and the epilogue is simply one p4_scal_4 step */
+#define dpscal_4(a_) p4_scal_4(a_)
+#undef plscal_4 /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plscal_4 16
+
+#undef p1_4_scal_4c     /* scal_4c: the quarter-register step is intentionally empty */
+#define p1_4_scal_4c(a_) 
+#undef p1_2_scal_4c     /* per register r: copy (pc), pm(6,r), shuffle the copy with CSHUF, pm(7,copy), pa(copy,r), store; helpers from camm_util.h */
+#define p1_2_scal_4c(a_) \
+      pld(a_,ax,0) \
+      pc(0,1) \
+      pm(6,0) \
+      ps(CSHUF,1,1) \
+      pm(7,1) \
+      pa(1,0) \
+      pud(0,a_,ax)
+#undef p1_scal_4c
+#define p1_scal_4c(a_) \
+      plq(a_,ax,0) \
+      pc(0,1) \
+      pm(6,0) \
+      ps(CSHUF,1,1) \
+      pm(7,1) \
+      pa(1,0) \
+      puq(0,a_,ax)
+#undef p2_scal_4c       /* two registers (0 and 1) with their copies in 4 and 5 */
+#define p2_scal_4c(a_) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pc(0,4) \
+      pc(1,5) \
+      pm(6,0) \
+      pm(6,1) \
+      ps(CSHUF,4,4) \
+      ps(CSHUF,5,5) \
+      pm(7,4) \
+      pa(4,0) \
+      pm(7,5) \
+      pa(5,1) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_scal_4c       /* four registers (0..3): registers 0/1 first, then 2/3, reusing 4 and 5 for the copies; all stores come last */
+#define p4_scal_4c(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pc(0,4) \
+      pc(1,5) \
+      pm(6,0) \
+      pm(6,1) \
+      ps(CSHUF,4,4) \
+      ps(CSHUF,5,5) \
+      pm(7,4) \
+      pa(4,0) \
+      pc(2,4) \
+      pm(7,5) \
+      pa(5,1) \
+      pc(3,5) \
+      pm(6,2) \
+      pm(6,3) \
+      ps(CSHUF,4,4) \
+      ps(CSHUF,5,5) \
+      pm(7,4) \
+      pa(4,2) \
+      pm(7,5) \
+      pa(5,3) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax) 
+#undef lpscal_4c        /* prologue is empty ... */
+#define lpscal_4c(a_) 
+#undef dpscal_4c        /* ... and the epilogue is simply one p4_scal_4c step */
+#define dpscal_4c(a_) p4_scal_4c(a_)
+#undef plscal_4c        /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plscal_4c 16
+
+#undef p1_4_scal_1      /* scal_1: fetch from ax, apply register 0 with pm/pmsr (presumably multiply), store back to the same offset */
+#define p1_4_scal_1(a_) \
+      pls(a_,ax,1) \
+      pmsr(0,1) \
+      pus(1,a_,ax) 
+#undef p1_2_scal_1
+#define p1_2_scal_1(a_) \
+      px(1) \
+      pld(a_,ax,1) \
+      pm(0,1) \
+      pud(1,a_,ax) 
+#undef p1_scal_1
+#define p1_scal_1(a_) \
+      plq(a_,ax,1) \
+      pm(0,1) \
+      puq(1,a_,ax) 
+#undef p2_scal_1
+#define p2_scal_1(a_) \
+      plq(a_,ax,1) \
+      plq(SS(a_,RS4),ax,2) \
+      pm(0,1) \
+      pm(0,2) \
+      puq(1,a_,ax) \
+      puq(2,SS(a_,RS4),ax)
+#undef p4_scal_1        /* pipelined step rotating registers 7, 3, 1, 2: stores offsets 0..3*RS4 while fetching offsets 2*RS4..5*RS4 */
+#define p4_scal_1(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pm(0,3) \
+      puq(7,a_,ax) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pm(0,1) \
+      puq(3,SS(a_,MM(1,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,7) \
+      pm(0,2) \
+      puq(1,SS(a_,MM(2,RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,3) \
+      pm(0,7) \
+      puq(2,SS(a_,MM(3,RS4)),ax) 
+#undef lpscal_1 /* prologue: fetch registers 7 and 3, apply pm(0,7) */
+#define lpscal_1(a_) \
+      plq(a_,ax,7) \
+      plq(SS(a_,MM(1,RS4)),ax,3) \
+      pm(0,7) 
+#undef dpscal_1 /* epilogue: fetch the last two registers (1 and 2), finish and store all four */
+#define dpscal_1(a_) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pm(0,3) \
+      puq(7,a_,ax) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pm(0,1) \
+      puq(3,SS(a_,MM(1,RS4)),ax) \
+      pm(0,2) \
+      puq(1,SS(a_,MM(2,RS4)),ax) \
+      puq(2,SS(a_,MM(3,RS4)),ax) 
+#undef plscal_1 /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plscal_1 RS4
+
+
+#undef p1_4_set_1       /* set_1: same fetch/store schedule as scal_1, but with pc/pcs(0,r) (presumably a register copy from 0) in place of the multiply */
+#define p1_4_set_1(a_) \
+      pls(a_,ax,1) \
+      pcs(0,1) \
+      pus(1,a_,ax) 
+#undef p1_2_set_1
+#define p1_2_set_1(a_) \
+      px(1) \
+      pld(a_,ax,1) \
+      pc(0,1) \
+      pud(1,a_,ax) 
+#undef p1_set_1
+#define p1_set_1(a_) \
+      plq(a_,ax,1) \
+      pc(0,1) \
+      puq(1,a_,ax) 
+#undef p2_set_1
+#define p2_set_1(a_) \
+      plq(a_,ax,1) \
+      plq(SS(a_,RS4),ax,2) \
+      pc(0,1) \
+      pc(0,2) \
+      puq(1,a_,ax) \
+      puq(2,SS(a_,RS4),ax)
+#undef p4_set_1         /* pipelined step rotating registers 7, 3, 1, 2 */
+#define p4_set_1(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pc(0,3) \
+      puq(7,a_,ax) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pc(0,1) \
+      puq(3,SS(a_,MM(1,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,7) \
+      pc(0,2) \
+      puq(1,SS(a_,MM(2,RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,3) \
+      pc(0,7) \
+      puq(2,SS(a_,MM(3,RS4)),ax) 
+#undef lpset_1  /* prologue: fetch registers 7 and 3, apply pc(0,7) */
+#define lpset_1(a_) \
+      plq(a_,ax,7) \
+      plq(SS(a_,MM(1,RS4)),ax,3) \
+      pc(0,7) 
+#undef dpset_1  /* epilogue: fetch registers 1 and 2, finish and store all four */
+#define dpset_1(a_) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pc(0,3) \
+      puq(7,a_,ax) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pc(0,1) \
+      puq(3,SS(a_,MM(1,RS4)),ax) \
+      pc(0,2) \
+      puq(1,SS(a_,MM(2,RS4)),ax) \
+      puq(2,SS(a_,MM(3,RS4)),ax) 
+#undef plset_1  /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plset_1 RS4
+
+
+#undef p1_4_set_2       /* set_2: write register 0 to ax-relative offsets without reading them first (pu* presumably stores; camm_util.h, not shown) */
+#define p1_4_set_2(a_) \
+      pus(0,a_,ax) 
+#undef p1_2_set_2
+#define p1_2_set_2(a_) \
+      pud(0,a_,ax) 
+#undef p1_set_2
+#define p1_set_2(a_) \
+      puq(0,a_,ax) 
+#undef p2_set_2
+#define p2_set_2(a_) \
+      puq(0,a_,ax) \
+      puq(0,SS(a_,RS4),ax)
+#undef p4_set_2         /* four stores with two f(nta,...) calls interleaved */
+#define p4_set_2(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      puq(0,a_,ax) \
+      puq(0,SS(a_,MM(1,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      puq(0,SS(a_,MM(2,RS4)),ax) \
+      puq(0,SS(a_,MM(3,RS4)),ax) 
+#undef lpset_2  /* prologue is empty */
+#define lpset_2(a_) 
+#undef dpset_2  /* epilogue: the four stores of p4_set_2 without the f() calls */
+#define dpset_2(a_) \
+      puq(0,a_,ax) \
+      puq(0,SS(a_,MM(1,RS4)),ax) \
+      puq(0,SS(a_,MM(2,RS4)),ax) \
+      puq(0,SS(a_,MM(3,RS4)),ax) 
+#undef plset_2  /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plset_2 RS4
+
+
+#undef p1_4_set_3       /* set_3: write register 0 to ax-relative offsets; no f() calls in any step */
+#define p1_4_set_3(a_) \
+      pus(0,a_,ax) 
+#undef p1_2_set_3
+#define p1_2_set_3(a_) \
+      pud(0,a_,ax) 
+#undef p1_set_3
+#define p1_set_3(a_) \
+      puq(0,SS(a_,MM(0,RS4)),ax) 
+#undef p2_set_3
+#define p2_set_3(a_) \
+      puq(0,SS(a_,MM(0,RS4)),ax) \
+      puq(0,SS(a_,MM(1,RS4)),ax) 
+#undef p4_set_3
+#define p4_set_3(a_) \
+      puq(0,SS(a_,MM(0,RS4)),ax) \
+      puq(0,SS(a_,MM(1,RS4)),ax) \
+      puq(0,SS(a_,MM(2,RS4)),ax) \
+      puq(0,SS(a_,MM(3,RS4)),ax) 
+#undef p8_set_3         /* eight consecutive register-sized stores (offsets 0 .. 7*RS4) */
+#define p8_set_3(a_) \
+      puq(0,SS(a_,MM(0,RS4)),ax) \
+      puq(0,SS(a_,MM(1,RS4)),ax) \
+      puq(0,SS(a_,MM(2,RS4)),ax) \
+      puq(0,SS(a_,MM(3,RS4)),ax) \
+      puq(0,SS(a_,MM(4,RS4)),ax) \
+      puq(0,SS(a_,MM(5,RS4)),ax) \
+      puq(0,SS(a_,MM(6,RS4)),ax) \
+      puq(0,SS(a_,MM(7,RS4)),ax) 
+#undef lpset_3  /* prologue is empty ... */
+#define lpset_3(a_) 
+#undef dpset_3  /* ... and the epilogue is one p8_set_3 step */
+#define dpset_3(a_) p8_set_3(a_)
+#undef plset_3  /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plset_3 32
+
+
+#undef p1_4_0x1_nrm2_1  /* 0x1_nrm2_1: fetch into r, pm(r,r), then pa(r,0) - presumably accumulates squares into register 0 (helpers in camm_util.h) */
+#define p1_4_0x1_nrm2_1(a_) \
+      pls(a_,ax,1) \
+      pmsr(1,1) \
+      pasr(1,0) 
+#undef p1_2_0x1_nrm2_1  /* pld variant; register 1 is first passed to px() */
+#define p1_2_0x1_nrm2_1(a_) \
+      px(1) \
+      pld(a_,ax,1) \
+      pm(1,1) \
+      pa(1,0) 
+#undef p1_0x1_nrm2_1    /* full-register variant */
+#define p1_0x1_nrm2_1(a_) \
+      plq(a_,ax,1) \
+      pm(1,1) \
+      pa(1,0) 
+#undef p2_0x1_nrm2_1    /* two-register step: pm(r,r) on registers 1 and 2, then pa(r,0) for both, matching the p1_, p4_ and dp variants of this family */
+#define p2_0x1_nrm2_1(a_) \
+      plq(a_,ax,1) \
+      plq(SS(a_,RS4),ax,2) \
+      pm(1,1) \
+      pm(2,2) \
+      pa(1,0) \
+      pa(2,0) 
+#undef p4_0x1_nrm2_1    /* pipelined step rotating registers 7, 3, 1, 2: each gets pm(r,r) and later pa(r,0) while the next fetch is issued */
+#define p4_0x1_nrm2_1(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pm(3,3) \
+      pa(7,0) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pm(1,1) \
+      pa(3,0) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,7) \
+      pm(2,2) \
+      pa(1,0) \
+      plq(SS(a_,MM(5,RS4)),ax,3) \
+      pm(7,7) \
+      pa(2,0) 
+#undef lp0x1_nrm2_1     /* prologue: fetch registers 7 and 3, apply pm(7,7) */
+#define lp0x1_nrm2_1(a_) \
+      plq(a_,ax,7) \
+      plq(SS(a_,MM(1,RS4)),ax,3) \
+      pm(7,7) 
+#undef dp0x1_nrm2_1     /* epilogue: fetch registers 1 and 2, finish all four and add each into register 0 */
+#define dp0x1_nrm2_1(a_) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pm(3,3) \
+      pa(7,0) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pm(1,1) \
+      pa(3,0) \
+      pm(2,2) \
+      pa(1,0) \
+      pa(2,0) 
+#undef pl0x1_nrm2_1     /* read by camm_pipe3.h as pipe_len for this strategy */
+#define pl0x1_nrm2_1 RS4
+
+
+#undef p1_4_nrm2_2      /* nrm2_2: fetch into 1, pan(4,1), update register 5 via pax(1,5), divide by it (pd) and fold into register 0; presumably a scaled sum of squares - NOTE(review): confirm helper semantics in camm_util.h */
+#define p1_4_nrm2_2(a_) \
+      pls(a_,ax,1) dbg(1) \
+      pan(4,1) dbg(1) \
+      pcs(5,6) dbg(6) \
+      pcs(5,7) dbg(7) \
+      paxs(1,5) dbg(5) \
+      prps(5,2) dbg(2) \
+      px(3) \
+      pcms(0,2,3) dbg(3) \
+      pan(3,7) dbg(7) \
+      pann(5,3) dbg(3) \
+      pasr(3,7) dbg(7) \
+      pcs(7,5) dbg(5) \
+      pdsr(5,6) dbg(6) \
+      pdsr(5,1) dbg(1) \
+      pmsr(6,6) dbg(6) \
+      pmsr(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pasr(1,0) dbg(0)
+#undef p1_2_nrm2_2      /* same sequence with the packed helpers; the data is fetched with px(1) followed by pld */
+#define p1_2_nrm2_2(a_) \
+      px(1) pld(a_,ax,1) dbg(1) \
+      pan(4,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pc(5,7) dbg(7) \
+      pax(1,5) dbg(5) \
+      prp(5,2) dbg(2) \
+      px(3) \
+      pcm(0,2,3)dbg(3) \
+      pan(3,7) dbg(7) \
+      pann(5,3) dbg(3) \
+      pa(3,7) dbg(7) \
+      pc(7,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0)
+#undef p1_nrm2_2        /* same sequence with a full-register plq fetch; dbg(r) follows most operations */
+#define p1_nrm2_2(a_) \
+      plq(a_,ax,1) dbg(1) \
+      pan(4,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pc(5,7) dbg(7) \
+      pax(1,5) dbg(5) \
+      prp(5,2) dbg(2) \
+      px(3) \
+      pcm(0,2,3)dbg(3) \
+      pan(3,7) dbg(7) \
+      pann(5,3) dbg(3) \
+      pa(3,7) dbg(7) \
+      pc(7,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0)
+#define p2_nrm2_2(a_) \
+      plq(SS(a_,RS4),ax,1) dbg(1) \
+      pan(4,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pc(5,7) dbg(7) \
+      pax(1,5) dbg(5) \
+      prp(5,2) dbg(2) \
+      px(3) \
+      pcm(0,2,3)dbg(3) \
+      pan(3,7) dbg(7) \
+      pann(5,3) dbg(3) \
+      pa(3,7) dbg(7) \
+      pc(7,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+      pan(4,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pc(5,7) dbg(7) \
+      pax(1,5) dbg(5) \
+      prp(5,2) dbg(2) \
+      px(3) \
+      pcm(0,2,3)dbg(3) \
+      pan(3,7) dbg(7) \
+      pann(5,3) dbg(3) \
+      pa(3,7) dbg(7) \
+      pc(7,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0) /* p2_nrm2_2 ends here: the per-register sequence run for offsets RS4 and 2*RS4 with one f(nta,...) between; NOTE(review): it is the only macro in this family without a preceding #undef */
+#undef lpnrm2_2 /* prologue: f(nta,...) then the same per-register sequence for offset 0 */
+#define lpnrm2_2(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+      pan(4,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pc(5,7) dbg(7) \
+      pax(1,5) dbg(5) \
+      prp(5,2) dbg(2) \
+      px(3) \
+      pcm(0,2,3)dbg(3) \
+      pan(3,7) dbg(7) \
+      pann(5,3) dbg(3) \
+      pa(3,7) dbg(7) \
+      pc(7,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0)
+#undef dpnrm2_2 /* epilogue: the same per-register sequence for offset RS4, without any f() call */
+#define dpnrm2_2(a_) \
+      plq(SS(a_,RS4),ax,1) dbg(1) \
+      pan(4,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pc(5,7) dbg(7) \
+      pax(1,5) dbg(5) \
+      prp(5,2) dbg(2) \
+      px(3) \
+      pcm(0,2,3)dbg(3) \
+      pan(3,7) dbg(7) \
+      pann(5,3) dbg(3) \
+      pa(3,7) dbg(7) \
+      pc(7,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0) 
+#undef plnrm2_2 /* read by camm_pipe3.h as pipe_len for this strategy */
+#define plnrm2_2 8
+
+
+#undef p1_4_nrm2_3
+#define p1_4_nrm2_3(a_) \
+      pls(a_,ax,1) dbg(1) \
+      pcs(5,6) dbg(6) \
+      pan(4,1) dbg(1) \
+      paxs(1,5) dbg(5) \
+      pdsr(5,6) dbg(6) \
+      pdsr(5,1) dbg(1) \
+      pmsr(6,6) dbg(6) \
+      pmsr(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pasr(1,0) dbg(0)
+#undef p1_2_nrm2_3
+#define p1_2_nrm2_3(a_) \
+      px(1) pld(a_,ax,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pan(4,1) dbg(1) \
+      pax(1,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0)
+#undef p1_nrm2_3
+#define p1_nrm2_3(a_) \
+      plq(a_,ax,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pan(4,1) dbg(1) \
+      pax(1,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0)
+#define p2_nrm2_3(a_) \
+      plq(SS(a_,RS4),ax,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pan(4,1) dbg(1) \
+      pax(1,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pan(4,1) dbg(1) \
+      pax(1,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0)
+#undef lpnrm2_3
+#define lpnrm2_3(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pan(4,1) dbg(1) \
+      pax(1,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0)
+#undef dpnrm2_3
+#define dpnrm2_3(a_) \
+      plq(SS(a_,RS4),ax,1) dbg(1) \
+      pc(5,6) dbg(6) \
+      pan(4,1) dbg(1) \
+      pax(1,5) dbg(5) \
+      pd(5,6) dbg(6) \
+      pd(5,1) dbg(1) \
+      pm(6,6) dbg(6) \
+      pm(1,1) dbg(1) \
+      pm(6,0) dbg(0) \
+      pa(1,0) dbg(0) 
+#undef plnrm2_3
+#define plnrm2_3 8
+
+#define block_nrm2_4(a_,b_) /* Shared body of the *_nrm2_4 macros. a_ and b_ are suffixes joined onto primitive names through Mjoin: (s,sr) gives pcs/paxs/pdsr/pmsr/pasr (matching the commented-out expansion of p1_4_nrm2_4 in this file), (,) gives pc/pax/pd/pm/pa. Every caller loads the input into reg 1 first. */ \
+      Mjoin(pc,a_)(5,6) dbg(6) \
+      pan(4,1) dbg(1) /* presumably ANDs the input with a mask held in reg 4 -- TODO confirm */ \
+      Mjoin(pax,a_)(1,5) dbg(5) \
+      Mjoin(pc,a_)(2,7) dbg(7) \
+      Mjoin(pd,b_)(5,7) dbg(7) \
+      Mjoin(pm,b_)(7,6) dbg(6) \
+      Mjoin(pm,b_)(7,1) dbg(1) \
+      Mjoin(pm,b_)(6,6) dbg(6) \
+      Mjoin(pm,b_)(6,0) dbg(0) \
+      Mjoin(pm,b_)(1,1) dbg(1) \
+      Mjoin(pa,b_)(1,0) dbg(0) /* final step targets reg 0, which looks like the running result -- TODO confirm */
+
+
+/*  #undef p1_4_nrm2_4 */
+/*  #define p1_4_nrm2_4(a_) \ */
+/*        pls(a_,ax,1) dbg(1) \ */
+/*        pcs(5,6) dbg(6) \ */
+/*        pan(4,1) dbg(1) \ */
+/*        paxs(1,5) dbg(5) \ */
+/*        pcs(2,7) dbg(7) \ */
+/*        pdsr(5,7) dbg(7) \ */
+/*        pmsr(7,6) dbg(6) \ */
+/*        pmsr(7,1) dbg(1) \ */
+/*        pmsr(6,6) dbg(6) \ */
+/*        pmsr(6,0) dbg(0) \ */
+/*        pmsr(1,1) dbg(1) \ */
+/*        pasr(1,0) dbg(0) */
+#undef p1_4_nrm2_4 /* nrm2_4 step for a pls-sized chunk: load from offset a_ off ax into reg 1, then run the shared body with the s/sr primitive variants */
+#define p1_4_nrm2_4(a_) \
+      pls(a_,ax,1) dbg(1) \
+      block_nrm2_4(s,sr)
+#undef p1_2_nrm2_4 /* nrm2_4 step for a pld-sized chunk; px(1) runs before the load, presumably to clear reg 1 first -- TODO confirm */
+#define p1_2_nrm2_4(a_) \
+      px(1) pld(a_,ax,1) dbg(1) \
+      block_nrm2_4(,)
+#undef p1_nrm2_4 /* nrm2_4 step for one plq-sized chunk at offset a_ off ax */
+#define p1_nrm2_4(a_) \
+      plq(a_,ax,1) dbg(1) \
+      block_nrm2_4(,)
+#define p2_nrm2_4(a_) /* Two nrm2_4 steps, on the chunks at a_+RS4 and a_+2*RS4 (assuming SS adds and MM multiplies offsets -- TODO confirm). NOTE(review): no #undef precedes this macro, unlike its siblings. */ \
+      plq(SS(a_,RS4),ax,1) dbg(1) \
+      block_nrm2_4(,) \
+      plq(SS(a_,MM(2,RS4)),ax,1) dbg(1) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) /* presumably a non-temporal prefetch ahead of the data -- TODO confirm */ \
+      block_nrm2_4(,)
+#undef lpnrm2_4 /* presumably the loop lead-in for nrm2_4: one step on the chunk at a_+0*RS4 plus one f(nta,...) call */
+#define lpnrm2_4(a_) \
+      plq(SS(a_,MM(0,RS4)),ax,1) dbg(1) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      block_nrm2_4(,)
+#undef dpnrm2_4 /* presumably the loop tail for nrm2_4: one step on the chunk at a_+RS4, with no f(nta,...) call */
+#define dpnrm2_4(a_) \
+      plq(SS(a_,RS4),ax,1) dbg(1) \
+      block_nrm2_4(,)
+#undef plnrm2_4 /* constant consumed outside this chunk; its meaning is not visible here -- TODO confirm */
+#define plnrm2_4 8
+
+
+#undef p1_4_1x1_1 /* 1x1_1 step on a pls-sized chunk: load from ax and bx at offset a_, multiply, add the product into reg 6 */
+#define p1_4_1x1_1(a_) \
+      pls(a_,ax,1) \
+      pls(a_,bx,0) \
+      pm(0,1) /* reg 1 *= reg 0, assuming the second operand is the destination -- TODO confirm */ \
+      pa(1,6) /* reg 6 += reg 1 (the product) */
+#undef p1_2_1x1_1 /* 1x1_1 step on a pld-sized chunk: same multiply-and-accumulate into reg 6 as p1_4_1x1_1 */
+#define p1_2_1x1_1(a_) \
+      pld(a_,ax,1) \
+      pld(a_,bx,0) \
+      pm(0,1) \
+      pa(1,6) /* reg 6 += reg 1 (the product) */
+#undef p1_1x1_1 /* 1x1_1 step on one plq-sized chunk: load from ax and bx at offset a_, multiply, add the product into reg 6 */
+#define p1_1x1_1(a_) \
+      plq(a_,ax,1) \
+      plq(a_,bx,0) \
+      pm(0,1) /* reg 1 *= reg 0, assuming the second operand is the destination as in the sibling macros */ \
+      pa(1,6) /* accumulate the product held in reg 1, as p1_4_/p1_2_/p2_1x1_1 do; adding reg 0 would accumulate the raw bx load */
+#undef p2_1x1_1 /* 1x1_1 step on two plq-sized chunks, at a_ and a_+RS4: both ax*bx products are added into reg 6 */
+#define p2_1x1_1(a_) \
+      plq(a_,ax,1) \
+      plq(a_,bx,0) \
+      plq(SS(a_,RS4),ax,2) \
+      plq(SS(a_,RS4),bx,3) \
+      pm(0,1) /* first product ends up in reg 1 */ \
+      pm(2,3) /* second product ends up in reg 3 */ \
+      pa(1,6) \
+      pa(3,6) 
+#undef p4_1x1_1 /* Pipelined four-chunk body: applies pm(0,r) to each chunk and stores it back to ax with puq. Regs 7 and 3 are consumed before being reloaded, so they must arrive pre-loaded (lp1x1_1 loads them). NOTE(review): p1_/p2_1x1_1 instead accumulate ax*bx into reg 6 and overwrite reg 0 -- confirm which behaviour this kernel is meant to have. */
+#define p4_1x1_1(a_) \
+      f(nta,SS(a_,MM(4,RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      plq(SS(a_,MM(2,RS4)),ax,1) /* NOTE(review): exact repeat of the previous load; looks redundant */ \
+      pm(0,3) \
+      puq(7,a_,ax) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pm(0,1) \
+      puq(3,SS(a_,RS4),ax) \
+      f(nta,SS(a_,MM(6,RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,7) /* reloads reg 7 for the next pass */ \
+      pm(0,2) \
+      puq(1,SS(a_,MM(2,RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,3) /* reloads reg 3 for the next pass */ \
+      pm(0,7) \
+      puq(2,SS(a_,MM(3,RS4)),ax) 
+#undef lp1x1_1 /* lead-in for p4_1x1_1: loads the chunks at a_ and a_+RS4 into regs 7 and 3 and applies pm(0,7) to the first */
+#define lp1x1_1(a_) \
+      plq(a_,ax,7) \
+      plq(SS(a_,RS4),ax,3) \
+      pm(0,7) 
+#undef dp1x1_1 /* tail matching p4_1x1_1: loads the last two chunks (a_+2*RS4, a_+3*RS4), finishes the pending pm(0,r) steps and stores all four chunks back to ax */
+#define dp1x1_1(a_) \
+      plq(SS(a_,MM(2,RS4)),ax,1) /* SS takes two arguments everywhere else in this file; a stray leading comma here would pass three */ \
+      pm(0,3) \
+      puq(7,a_,ax) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pm(0,1) \
+      puq(3,SS(a_,RS4),ax) \
+      pm(0,2) \
+      puq(1,SS(a_,MM(2,RS4)),ax) \
+      puq(2,SS(a_,MM(3,RS4)),ax) 
+#undef pl1x1_1 /* constant consumed outside this chunk, set to RS4; its meaning is not visible here -- TODO confirm */
+#define pl1x1_1 RS4
+
+
+#undef p1_4_0x1_asum_1 /* asum step on a pls-sized chunk: load into reg 1, apply pan(4,1), add into reg 0 with pasr */
+#define p1_4_0x1_asum_1(a_) \
+      pls(a_,ax,1) \
+      pan(4,1) /* presumably ANDs reg 1 with a sign-clearing mask held in reg 4 -- TODO confirm */ \
+      pasr(1,0) 
+#undef p1_2_0x1_asum_1 /* asum step on a pld-sized chunk: px(1), load into reg 1, apply pan(4,1), add into reg 0 */
+#define p1_2_0x1_asum_1(a_) \
+      px(1) /* presumably clears reg 1 before the partial load -- TODO confirm */ \
+      pld(a_,ax,1) \
+      pan(4,1) \
+      pa(1,0) 
+#undef p1_0x1_asum_1 /* asum step on one plq-sized chunk: load into reg 1, apply pan(4,1), add into reg 0 */
+#define p1_0x1_asum_1(a_) \
+      plq(a_,ax,1) \
+      pan(4,1) \
+      pa(1,0) 
+#undef p2_0x1_asum_1 /* asum step on two plq-sized chunks (a_ and a_+RS4): each is loaded, passed through pan(4,r) and added into reg 0 */
+#define p2_0x1_asum_1(a_) \
+      plq(a_,ax,1) \
+      plq(SS(a_,RS4),ax,2) \
+      pan(4,1) \
+      pan(4,2) \
+      pa(1,0) \
+      pa(2,0) 
+#undef p4_0x1_asum_1 /* Pipelined four-chunk asum body. Regs 7 and 3 are consumed before being reloaded, so they must arrive pre-loaded (lp0x1_asum_1 loads them and already applies pan to reg 7). Loads, pan(4,r) and the adds into reg 0 are staggered across regs 1, 2, 7, 3. */
+#define p4_0x1_asum_1(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pan(4,3) \
+      pa(7,0) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pan(4,1) \
+      pa(3,0) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,7) /* reloads reg 7 for the next pass */ \
+      pan(4,2) \
+      pa(1,0) \
+      plq(SS(a_,MM(5,RS4)),ax,3) /* reloads reg 3 for the next pass */ \
+      pan(4,7) \
+      pa(2,0) 
+#undef lp0x1_asum_1 /* lead-in for p4_0x1_asum_1: loads the chunks at a_ and a_+RS4 into regs 7 and 3 and applies pan(4,7) to the first */
+#define lp0x1_asum_1(a_) \
+      plq(a_,ax,7) \
+      plq(SS(a_,MM(1,RS4)),ax,3) \
+      pan(4,7) 
+#undef dp0x1_asum_1 /* tail matching p4_0x1_asum_1: loads the last two chunks into regs 1 and 2, then adds all four pending regs (7, 3, 1, 2) into reg 0 */
+#define dp0x1_asum_1(a_) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pan(4,3) \
+      pa(7,0) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pan(4,1) \
+      pa(3,0) \
+      pan(4,2) \
+      pa(1,0) \
+      pa(2,0) 
+#undef pl0x1_asum_1 /* constant consumed outside this chunk, set to RS4; its meaning is not visible here -- TODO confirm */
+#define pl0x1_asum_1 RS4
+
+
+#undef p1_4_sum_1 /* sum step on a pls-sized chunk: load into reg 1 and add into reg 0 with pasr */
+#define p1_4_sum_1(a_) \
+      pls(a_,ax,1) \
+      pasr(1,0) 
+#undef p1_2_sum_1 /* sum step on a pld-sized chunk: px(1), load into reg 1, add into reg 0 */
+#define p1_2_sum_1(a_) \
+      px(1) /* presumably clears reg 1 before the partial load -- TODO confirm */ \
+      pld(a_,ax,1) \
+      pa(1,0) 
+#undef p1_sum_1 /* sum step on one plq-sized chunk: load into reg 1 and add into reg 0 */
+#define p1_sum_1(a_) \
+      plq(a_,ax,1) \
+      pa(1,0) 
+#undef p2_sum_1 /* sum step on two plq-sized chunks (a_ and a_+RS4), both added into reg 0 */
+#define p2_sum_1(a_) \
+      plq(a_,ax,1) \
+      plq(SS(a_,RS4),ax,2) \
+      pa(1,0) \
+      pa(2,0) 
+#undef p4_sum_1 /* Pipelined four-chunk sum body. Regs 7 and 3 are added into reg 0 before being reloaded, so they must arrive pre-loaded (lpsum_1 loads them). */
+#define p4_sum_1(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pa(7,0) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pa(3,0) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,7) /* reloads reg 7 for the next pass */ \
+      pa(1,0) \
+      plq(SS(a_,MM(5,RS4)),ax,3) /* reloads reg 3 for the next pass */ \
+      pa(2,0) 
+#undef lpsum_1 /* lead-in for p4_sum_1: loads the chunks at a_ and a_+RS4 into regs 7 and 3 */
+#define lpsum_1(a_) \
+      plq(a_,ax,7) \
+      plq(SS(a_,MM(1,RS4)),ax,3) 
+#undef dpsum_1 /* tail matching p4_sum_1: loads the last two chunks into regs 1 and 2, then adds all four pending regs (7, 3, 1, 2) into reg 0 */
+#define dpsum_1(a_) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      pa(7,0) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pa(3,0) \
+      pa(1,0) \
+      pa(2,0) 
+#undef plsum_1 /* constant consumed outside this chunk, set to RS4; its meaning is not visible here -- TODO confirm */
+#define plsum_1 RS4
+
+
+#undef p1_4_dot_1
+#define p1_4_dot_1(a_) \
+      pls(a_,ax,1) \
+      pls(a_,cx,2) \
+      pmsr(2,1) \
+      pasr(1,0) 
+#undef p1_2_dot_1
+#define p1_2_dot_1(a_) \
+      px(1) \
+      pld(a_,ax,1) \
+      px(2) \
+      pld(a_,cx,2) \
+      pm(2,1) \
+      pa(1,0) 
+#undef p1_dot_1
+#define p1_dot_1(a_) \
+      plq(a_,ax,1) \
+      pl(a_,cx,2) \
+      pm(2,1) \
+      pa(1,0) 
+#undef p2_dot_1
+#define p2_dot_1(a_) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pl(SS(a_,MM(1,RS4)),cx,2) \
+      pm(4,3) \
+      pa(3,0) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,3) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,4) \
+      pm(2,1) \
+      pa(1,0) 
+#undef lpdot_1
+#define lpdot_1(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(a_,ax,3) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(a_,cx,4) 
+#undef dpdot_1
+#define dpdot_1(a_) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pl(SS(a_,MM(1,RS4)),cx,2) \
+      pm(4,3) \
+      pa(3,0) \
+      pm(2,1) \
+      pa(1,0)
+#undef pldot_1
+#define pldot_1 8
+
+#undef p1_4_dot_1c
+#define p1_4_dot_1c(a_)
+#undef p1_2_dot_1c
+#define p1_2_dot_1c(a_) \
+      px(1) \
+      pld(a_,ax,1) \
+      px(2) \
+      pld(a_,cx,2) \
+      pc(1,3) \
+      ps(HSHUF,1,1) \
+      ps(LSHUF,3,3) \
+      pm(7,1) \
+      pm(2,3) \
+      pa(3,0) \
+      pm(2,1) \
+      pa(1,6)
+#undef p1_dot_1c
+#define p1_dot_1c(a_) \
+      plq(a_,ax,1) \
+      pl(a_,cx,2) \
+      pc(1,3) \
+      ps(HSHUF,1,1) \
+      ps(LSHUF,3,3) \
+      pm(7,1) \
+      pm(2,3) \
+      pa(3,0) \
+      pm(2,1) \
+      pa(1,6)
+#undef p2_dot_1c
+#define p2_dot_1c(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pl(SS(a_,MM(1,RS4)),cx,2) \
+      pc(3,5) \
+      ps(HSHUF,3,3) \
+      ps(LSHUF,5,5) \
+      pm(7,3) \
+      pm(4,5) \
+      pa(5,0) \
+      pm(4,3) \
+      pa(3,6) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,4) \
+      plq(SS(a_,MM(2,RS4)),ax,3) \
+      pc(1,5) \
+      ps(HSHUF,1,1) \
+      ps(LSHUF,5,5) \
+      pm(7,1) \
+      pm(2,5) \
+      pa(5,0) \
+      pm(2,1) \
+      pa(1,6)
+#undef lpdot_1c
+#define lpdot_1c(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(a_,ax,3) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(a_,cx,4) 
+#undef dpdot_1c
+#define dpdot_1c(a_) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pl(SS(a_,MM(1,RS4)),cx,2) \
+      pc(3,5) \
+      ps(HSHUF,3,3) \
+      ps(LSHUF,5,5) \
+      pm(7,3) \
+      pm(4,5) \
+      pa(5,0) \
+      pm(4,3) \
+      pa(3,6) \
+      pc(1,5) \
+      ps(HSHUF,1,1) \
+      ps(LSHUF,5,5) \
+      pm(7,1) \
+      pm(2,5) \
+      pa(5,0) \
+      pm(2,1) \
+      pa(1,6)
+#undef pldot_1c
+#define pldot_1c 8
+
+#undef p1_4_dot_2c
+#define p1_4_dot_2c(a_)
+#undef p1_2_dot_2c
+#define p1_2_dot_2c(a_) \
+      px(1) \
+      pld(a_,ax,1) \
+      px(2) \
+      pld(a_,cx,2) \
+      pc(1,3) \
+      ps(CSHUF,1,1) \
+      pm(2,3) \
+      pa(3,0) \
+      pm(2,1) \
+      pa(1,6)
+#undef p1_dot_2c
+#define p1_dot_2c(a_) \
+      plq(a_,ax,1) \
+      pl(a_,cx,2) \
+      pc(1,3) \
+      ps(CSHUF,1,1) \
+      pm(2,3) \
+      pa(3,0) \
+      pm(2,1) \
+      pa(1,6)
+#undef p2_dot_2c
+#define p2_dot_2c(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pl(SS(a_,MM(1,RS4)),cx,2) \
+      pc(3,5) \
+      ps(CSHUF,3,3) \
+      pm(4,5) \
+      pa(5,0) \
+      pm(4,3) \
+      pa(3,6) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,4) \
+      plq(SS(a_,MM(2,RS4)),ax,3) \
+      pc(1,5) \
+      ps(CSHUF,1,1) \
+      pm(2,5) \
+      pa(5,0) \
+      pm(2,1) \
+      pa(1,6)
+#undef lpdot_2c
+#define lpdot_2c(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(a_,ax,3) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(a_,cx,4) 
+#undef dpdot_2c
+#define dpdot_2c(a_) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pl(SS(a_,MM(1,RS4)),cx,2) \
+      pc(3,5) \
+      ps(CSHUF,3,3) \
+      pm(4,5) \
+      pa(5,0) \
+      pm(4,3) \
+      pa(3,6) \
+      pc(1,5) \
+      ps(CSHUF,1,1) \
+      pm(2,5) \
+      pa(5,0) \
+      pm(2,1) \
+      pa(1,6)
+#undef pldot_2c
+#define pldot_2c 8
+
+#undef p1_4_axpby_3
+#define p1_4_axpby_3(a_) \
+      pls(a_,ax,0) \
+      pls(a_,cx,3) \
+      pmsr(5,0) \
+      pmsr(6,3) \
+      pasr(3,0) \
+      pus(0,a_,ax)
+#undef p1_2_axpby_3
+#define p1_2_axpby_3(a_) \
+      pld(a_,ax,0) \
+      pld(a_,cx,3) \
+      pm(5,0) \
+      pm(6,3) \
+      pa(3,0) \
+      pud(0,a_,ax)
+#undef p1_axpby_3
+#define p1_axpby_3(a_) \
+      plq(a_,ax,0) \
+      pl(a_,cx,3) \
+      pm(5,0) \
+      pm(6,3) \
+      pa(3,0) \
+      punt(0,a_,ax)
+#undef p2_axpby_3
+#define p2_axpby_3(a_) \
+      plq(a_,ax,0) \
+      pl(a_,cx,3) \
+      plq(SS(a_,RS4),ax,1) \
+      pm(5,0) \
+      pm(6,3) \
+      pa(3,0) \
+      pl(SS(a_,RS4),cx,3) \
+      punt(0,a_,ax) \
+      pm(5,1) \
+      pm(6,3) \
+      pa(3,1) \
+      punt(1,SS(a_,RS4),ax)
+#undef p4_axpby_3
+#define p4_axpby_3(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(5,2) \
+      pl(SS(a_,MM(3,RS4)),cx,7) \
+      pm(6,4) \
+      pa(4,2) \
+      punt(0,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(4,RS4)),cx,4) \
+      pm(5,3) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      pm(6,7) \
+      pa(7,3) \
+      punt(1,SS(a_,RS4),ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      pm(5,0) \
+      pl(SS(a_,MM(5,RS4)),cx,7) \
+      pm(6,4) \
+      pa(4,0) \
+      punt(2,SS(a_,MM(2,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+      pl(SS(a_,MM(6,RS4)),cx,4) \
+      pm(5,1) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      pm(6,7) \
+      pa(7,1) \
+      punt(3,SS(a_,MM(3,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) 
+#undef lpaxpby_3
+#define lpaxpby_3(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,4) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      pl(SS(a_,MM(1,RS4)),cx,7) \
+      pm(5,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pm(6,4) \
+      pa(4,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pm(5,1) \
+      pl(SS(a_,MM(2,RS4)),cx,4) \
+      pm(6,7) \
+      pa(7,1)
+#undef dpaxpby_3
+#define dpaxpby_3(a_) \
+      pl(SS(a_,MM(3,RS4)),cx,7) \
+      pm(5,2) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(6,4) \
+      pa(4,2) \
+      pm(5,3) \
+      punt(0,a_,ax) \
+      pm(6,7) \
+      pa(7,3) \
+      punt(1,SS(a_,RS4),ax) \
+      punt(2,SS(a_,MM(2,RS4)),ax) \
+      punt(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_3
+#define plaxpby_3 16
+
+#undef p1_4_axpby_3c
+#define p1_4_axpby_3c(a_) 
+#undef p1_2_axpby_3c
+#define p1_2_axpby_3c(a_) \
+      pld(a_,ax,0) \
+      pld(a_,cx,2) \
+      pc(0,3) \
+      pm(5,0) \
+      ps(CSHUF,3,3) \
+      pm(4,3) \
+      pa(3,0) \
+      pc(2,3) \
+      pm(6,2) \
+      pa(2,0) \
+      ps(CSHUF,3,3) \
+      pm(7,3) \
+      pa(3,0) \
+      pud(0,a_,ax)
+#undef p1_axpby_3c
+#define p1_axpby_3c(a_) \
+      plq(a_,ax,0) \
+      pl(a_,cx,2) \
+      pc(0,3) \
+      pm(5,0) \
+      ps(CSHUF,3,3) \
+      pm(4,3) \
+      pa(3,0) \
+      pc(2,3) \
+      pm(6,2) \
+      pa(2,0) \
+      ps(CSHUF,3,3) \
+      pm(7,3) \
+      pa(3,0) \
+      puq(0,a_,ax)
+#undef p2_axpby_3c
+#define p2_axpby_3c(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pl(SS(a_,MM(1,RS4)),cx,3) \
+      pc(1,2) \
+      pm(5,1) \
+      ps(CSHUF,2,2) \
+      pm(4,2) \
+      pa(2,1) \
+      pc(3,2) \
+      pm(6,3) \
+      pa(3,1) \
+      ps(CSHUF,2,2) \
+      pm(7,2) \
+      pa(2,1) \
+      puq(0,a_,ax) \
+      plq(SS(a_,MM(2,RS4)),ax,0) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pc(0,3) \
+      pm(5,0) \
+      ps(CSHUF,3,3) \
+      pm(4,3) \
+      pa(3,0) \
+      pc(2,3) \
+      pm(6,2) \
+      pa(2,0) \
+      ps(CSHUF,3,3) \
+      pm(7,3) \
+      pa(3,0) \
+      puq(1,SS(a_,RS4),ax) 
+#undef lpaxpby_3c
+#define lpaxpby_3c(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,2) \
+      pc(0,3) \
+      pm(5,0) \
+      ps(CSHUF,3,3) \
+      pm(4,3) \
+      pa(3,0) \
+      pc(2,3) \
+      pm(6,2) \
+      pa(2,0) \
+      ps(CSHUF,3,3) \
+      pm(7,3) \
+      pa(3,0) 
+#undef dpaxpby_3c
+#define dpaxpby_3c(a_) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pl(SS(a_,MM(1,RS4)),cx,3) \
+      pc(1,2) \
+      pm(5,1) \
+      ps(CSHUF,2,2) \
+      pm(4,2) \
+      pa(2,1) \
+      pc(3,2) \
+      pm(6,3) \
+      pa(3,1) \
+      ps(CSHUF,2,2) \
+      pm(7,2) \
+      pa(2,1) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) 
+#undef plaxpby_3c
+#define plaxpby_3c 8
+
+#undef p1_4_axpby_2
+#define p1_4_axpby_2(a_) \
+      pls(a_,cx,5) \
+      pls(a_,ax,0) \
+      pmsr(6,5) \
+      pasr(5,0) \
+      pus(0,a_,ax)
+#undef p1_2_axpby_2
+#define p1_2_axpby_2(a_) \
+      pld(a_,cx,5) \
+      pld(a_,ax,0) \
+      pm(6,5) \
+      pa(5,0) \
+      pud(0,a_,ax)
+#undef p1_axpby_2
+#define p1_axpby_2(a_) \
+      pl(a_,cx,5) \
+      plq(a_,ax,0) \
+      pm(6,5) \
+      pa(5,0) \
+      puq(0,a_,ax)
+#undef p2_axpby_2
+#define p2_axpby_2(a_) \
+      pl(a_,cx,5) \
+      plq(a_,ax,0) \
+      pl(SS(a_,RS4),cx,4) \
+      pm(6,5) \
+      pa(5,0) \
+      plq(SS(a_,RS4),ax,1) \
+      puq(0,a_,ax) \
+      pm(6,4) \
+      pa(4,1) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_axpby_2
+#define p4_axpby_2(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pl(SS(a_,MM(3,RS4)),cx,5) \
+      pm(6,4) \
+      pa(4,2) \
+      puq(0,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(4,RS4)),cx,4) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      pm(6,5) \
+      pa(5,3) \
+      puq(1,SS(a_,RS4),ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      pl(SS(a_,MM(5,RS4)),cx,5) \
+      pm(6,4) \
+      pa(4,0) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+      pl(SS(a_,MM(6,RS4)),cx,4) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      pm(6,5) \
+      pa(5,1) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) 
+#undef lpaxpby_2
+#define lpaxpby_2(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,4) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      pl(SS(a_,MM(1,RS4)),cx,5) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pm(6,4) \
+      pa(4,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pl(SS(a_,MM(2,RS4)),cx,4) \
+      pm(6,5) \
+      pa(5,1)
+#undef dpaxpby_2
+#define dpaxpby_2(a_) \
+      pl(SS(a_,MM(3,RS4)),cx,5) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(6,4) \
+      pa(4,2) \
+      puq(0,a_,ax) \
+      pm(6,5) \
+      pa(5,3) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_2
+#define plaxpby_2 16
+
+#undef p1_4_axpby_2c /* expands to nothing: this kernel defines no pls-sized step */
+#define p1_4_axpby_2c(a_) 
+#undef p1_2_axpby_2c /* axpby_2c step on a pld-sized chunk: ax += reg6*cx + reg7*shuffle(cx), stored back to ax (regs 6/7 are set up outside this chunk; CSHUF presumably swaps the complex halves -- TODO confirm) */
+#define p1_2_axpby_2c(a_) \
+      pld(a_,cx,5) \
+      pld(a_,ax,0) \
+      pc(5,1) /* keep an unscaled copy of the cx chunk in reg 1 for the shuffled term */ \
+      pm(6,5) \
+      pa(5,0) \
+      ps(CSHUF,1,1) \
+      pm(7,1) \
+      pa(1,0) \
+      pud(0,a_,ax)
+#undef p1_axpby_2c /* axpby_2c step on one full chunk: ax += reg6*cx + reg7*shuffle(cx), stored back to ax (regs 6/7 are set up outside this chunk; CSHUF presumably swaps the complex halves -- TODO confirm) */
+#define p1_axpby_2c(a_) \
+      pl(a_,cx,5) \
+      plq(a_,ax,0) \
+      pc(5,1) /* keep an unscaled copy of the cx chunk in reg 1 for the shuffled term */ \
+      pm(6,5) \
+      pa(5,0) \
+      ps(CSHUF,1,1) \
+      pm(7,1) \
+      pa(1,0) \
+      puq(0,a_,ax)
+#undef p2_axpby_2c /* axpby_2c step on two full chunks (a_ and a_+RS4): for each, ax += reg6*cx + reg7*shuffle(cx), stored back to ax (regs 6/7 are set up outside this chunk; CSHUF presumably swaps the complex halves -- TODO confirm) */
+#define p2_axpby_2c(a_) \
+      pl(a_,cx,5) \
+      plq(a_,ax,0) \
+      pl(SS(a_,RS4),cx,4) \
+      pc(5,2) /* the unscaled copy must land in reg 2, the register the shuffle/scale/add below consume; copying to reg 1 would leave reg 2 unset */ \
+      pm(6,5) \
+      pa(5,0) \
+      ps(CSHUF,2,2) \
+      pm(7,2) \
+      pa(2,0) \
+      plq(SS(a_,RS4),ax,1) \
+      puq(0,a_,ax) \
+      pc(4,3) /* same pattern for the second chunk, using reg 3 as the copy */ \
+      pm(6,4) \
+      pa(4,1) \
+      ps(CSHUF,3,3) \
+      pm(7,3) \
+      pa(3,1) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_axpby_2c
+#define p4_axpby_2c(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      puq(0,a_,ax) \
+      pc(4,0) \
+      pm(6,4) \
+      pa(4,2) \
+      ps(CSHUF,0,0) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(4,RS4)),cx,4) \
+      pm(7,0) \
+      pa(0,2) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      puq(1,SS(a_,RS4),ax) \
+      pc(5,1) \
+      pm(6,5) \
+      pa(5,3) \
+      ps(CSHUF,1,1) \
+      pl(SS(a_,MM(5,RS4)),cx,5) \
+      pm(7,1) \
+      pa(1,3) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      pc(4,2) \
+      pm(6,4) \
+      pa(4,0) \
+      ps(CSHUF,2,2) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+      pl(SS(a_,MM(6,RS4)),cx,4) \
+      pm(7,2) \
+      pa(2,0) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      pc(5,3) \
+      pm(6,5) \
+      pa(5,1) \
+      ps(CSHUF,3,3) \
+      pl(SS(a_,MM(7,RS4)),cx,5) \
+      pm(7,3) \
+      pa(3,1) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) 
+#undef lpaxpby_2c
+#define lpaxpby_2c(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,4) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      pl(SS(a_,MM(1,RS4)),cx,5) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pc(4,2) \
+      pm(6,4) \
+      pa(4,0) \
+      ps(CSHUF,2,2) \
+      pl(SS(a_,MM(2,RS4)),cx,4) \
+      pm(7,2) \
+      pa(2,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pc(5,3) \
+      pm(6,5) \
+      pa(5,1) \
+      ps(CSHUF,3,3) \
+      pl(SS(a_,MM(3,RS4)),cx,5) \
+      pm(7,3) \
+      pa(3,1)
+#undef dpaxpby_2c
+#define dpaxpby_2c(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      puq(0,a_,ax) \
+      pc(4,0) \
+      pm(6,4) \
+      pa(4,2) \
+      ps(CSHUF,0,0) \
+      puq(1,SS(a_,RS4),ax) \
+      pm(7,0) \
+      pa(0,2) \
+      pc(5,1) \
+      pm(6,5) \
+      pa(5,3) \
+      ps(CSHUF,1,1) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      pm(7,1) \
+      pa(1,3) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpby_2c
+#define plaxpby_2c 16
+
+#undef p1_4_axpby_1
+#define p1_4_axpby_1(a_) \
+      pls(a_,ax,1) \
+      pls(a_,cx,2) \
+      pmsr(5,1) \
+      pmsr(6,2) \
+      pasr(2,1) \
+      pus(1,a_,ax)
+#undef p1_2_axpby_1
+#define p1_2_axpby_1(a_) \
+      pld(a_,ax,1) \
+      pld(a_,cx,2) \
+      pm(5,1) \
+      pm(6,2) \
+      pa(2,1) \
+      pud(1,a_,ax)
+#undef p1_axpby_1
+#define p1_axpby_1(a_) \
+      plq(a_,ax,1) \
+      pl(a_,cx,2) \
+      pm(5,1) \
+      pm(6,2) \
+      pa(2,1) \
+      puq(1,a_,ax)
+#undef p2_axpby_1
+#define p2_axpby_1(a_) \
+      plq(SS(a_,RS4),ax,3) \
+      pl(SS(a_,RS4),cx,4) \
+      pm(5,1) \
+      pm(6,2) \
+      pa(2,1) \
+      puq(1,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pm(5,3) \
+      pm(6,4) \
+      pa(4,3) \
+      puq(3,SS(a_,RS4),ax)
+#undef lpaxpby_1
+#define lpaxpby_1(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,1) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,2) 
+#undef dpaxpby_1
+#define dpaxpby_1(a_) \
+      plq(SS(a_,RS4),ax,3) \
+      pl(SS(a_,RS4),cx,4) \
+      pm(5,1) \
+      pm(6,2) \
+      pa(2,1) \
+      puq(1,a_,ax) \
+      pm(5,3) \
+      pm(6,4) \
+      pa(4,3) \
+      puq(3,SS(a_,RS4),ax)
+#undef plaxpby_1
+#define plaxpby_1 8
+
+#undef p1_4_axpy_0
+#define p1_4_axpy_0(a_) \
+      pls(a_,cx,2) \
+      pls(a_,ax,1) \
+      pmsr(6,2) \
+      pasr(2,1) \
+      pus(1,a_,ax)
+#undef p1_2_axpy_0
+#define p1_2_axpy_0(a_) \
+      pld(a_,cx,2) \
+      pld(a_,ax,1) \
+      pm(6,2) \
+      pa(2,1) \
+      pud(1,a_,ax)
+#undef p1_axpy_0
+#define p1_axpy_0(a_) \
+      pl(a_,cx,2) \
+      plq(a_,ax,1) \
+      pm(6,2) \
+      pa(2,1) \
+      puq(1,a_,ax)
+#undef p2_axpy_0
+#define p2_axpy_0(a_) \
+      pl(SS(a_,RS4),cx,4) \
+      pm(6,2) \
+      pa(2,1) \
+      plq(SS(a_,RS4),ax,3) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      puq(1,a_,ax) \
+      pm(6,4) \
+      pa(4,3) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      puq(3,SS(a_,RS4),ax) 
+#undef lpaxpy_0
+#define lpaxpy_0(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,2) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,1) 
+#undef dpaxpy_0
+#define dpaxpy_0(a_) \
+      pl(SS(a_,RS4),cx,4) \
+      pm(6,2) \
+      pa(2,1) \
+      plq(SS(a_,RS4),ax,3) \
+      puq(1,a_,ax) \
+      pm(6,4) \
+      pa(4,3) \
+      puq(3,SS(a_,RS4),ax)
+#undef plaxpy_0
+#define plaxpy_0 8
+
+#undef p1_4_axpy_1
+#define p1_4_axpy_1(a_) \
+      pls(a_,cx,2) \
+      pls(a_,ax,1) \
+      pmsr(6,2) \
+      pasr(2,1) \
+      pus(1,a_,ax)
+#undef p1_2_axpy_1
+#define p1_2_axpy_1(a_) \
+      pld(a_,cx,2) \
+      pld(a_,ax,1) \
+      pm(6,2) \
+      pa(2,1) \
+      pud(1,a_,ax)
+#undef p1_axpy_1
+#define p1_axpy_1(a_) \
+      pl(a_,cx,2) \
+      pm(6,2) \
+      pam(a_,ax,2) \
+      puq(2,a_,ax)
+#undef p2_axpy_1
+#define p2_axpy_1(a_) \
+      pl(a_,cx,2) \
+      pm(6,2) \
+      pl(SS(a_,RS4),cx,4) \
+      pam(a_,ax,2) \
+      pm(6,4) \
+      puq(2,a_,ax) \
+      pam(SS(a_,RS4),ax,4) \
+      puq(4,SS(a_,RS4),ax) 
+#undef p4_axpy_1
+#define p4_axpy_1(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      pm(6,2) \
+      pam(SS(a_,MM(2,RS4)),ax,2) \
+      puq(0,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      pl(SS(a_,MM(4,RS4)),cx,0) \
+      pm(6,3) \
+      pam(SS(a_,MM(3,RS4)),ax,3) \
+      puq(1,SS(a_,RS4),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+      pl(SS(a_,MM(5,RS4)),cx,1) \
+      pm(6,0) \
+      pam(SS(a_,MM(4,RS4)),ax,0) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      pl(SS(a_,MM(6,RS4)),cx,2) \
+      pm(6,1) \
+      pam(SS(a_,MM(5,RS4)),ax,1) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef lpaxpy_1
+#define lpaxpy_1(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(a_,cx,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      pl(SS(a_,RS4),cx,1) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pm(6,0) \
+      pam(a_,ax,0) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      pm(6,1) \
+      pam(SS(a_,RS4),ax,1)
+#undef dpaxpy_1
+#define dpaxpy_1(a_) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      pm(6,2) \
+      pam(SS(a_,MM(2,RS4)),ax,2) \
+      puq(0,a_,ax) \
+      pm(6,3) \
+      pam(SS(a_,MM(3,RS4)),ax,3) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_1
+#define plaxpy_1 16
+
+#undef p1_4_axpy_2
+#define p1_4_axpy_2(a_) \
+      pls(a_,cx,5) \
+      pls(a_,ax,0) \
+      pmsr(6,5) \
+      pasr(5,0) \
+      pus(0,a_,ax)
+#undef p1_2_axpy_2
+#define p1_2_axpy_2(a_) \
+      pld(a_,cx,5) \
+      pld(a_,ax,0) \
+      pm(6,5) \
+      pa(5,0) \
+      pud(0,a_,ax)
+#undef p1_axpy_2
+#define p1_axpy_2(a_) \
+      pl(a_,cx,5) \
+      plq(a_,ax,0) \
+      pm(6,5) \
+      pa(5,0) \
+      puq(0,a_,ax)
+#undef p2_axpy_2
+#define p2_axpy_2(a_) \
+      pl(a_,cx,5) \
+      plq(a_,ax,0) \
+      pl(SS(a_,RS4),cx,4) \
+      pm(6,5) \
+      pa(5,0) \
+      plq(SS(a_,RS4),ax,1) \
+      puq(0,a_,ax) \
+      pm(6,4) \
+      pa(4,1) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_axpy_2
+#define p4_axpy_2(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pl(SS(a_,MM(3,RS4)),cx,5) \
+      pm(6,4) \
+      pa(4,2) \
+      puq(0,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(4,RS4)),cx,4) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      pm(6,5) \
+      pa(5,3) \
+      puq(1,SS(a_,RS4),ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      pl(SS(a_,MM(5,RS4)),cx,5) \
+      pm(6,4) \
+      pa(4,0) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+      pl(SS(a_,MM(6,RS4)),cx,4) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      pm(6,5) \
+      pa(5,1) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) 
+#undef lpaxpy_2
+#define lpaxpy_2(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,4) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      pl(SS(a_,MM(1,RS4)),cx,5) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pm(6,4) \
+      pa(4,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pl(SS(a_,MM(2,RS4)),cx,4) \
+      pm(6,5) \
+      pa(5,1)
+#undef dpaxpy_2
+#define dpaxpy_2(a_) \
+      pl(SS(a_,MM(3,RS4)),cx,5) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(6,4) \
+      pa(4,2) \
+      puq(0,a_,ax) \
+      pm(6,5) \
+      pa(5,3) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_2
+#define plaxpy_2 16
+
+#undef p1_4_axpy_2c /* axpy_2c step macros: each non-empty one loads at offset a_ through cx and ax, combines with regs 6 and 7 (only read in this family, so the caller must set them up) and stores through ax */
+#define p1_4_axpy_2c(a_) /* expands to nothing */
+#undef p1_2_axpy_2c /* pld/pud form of a single step (presumably a half-width load/store -- confirm in SSE3Dnow.h) */
+#define p1_2_axpy_2c(a_) \
+      pld(a_,cx,4) \
+      pld(a_,ax,0) \
+      pc(4,2) \
+      pm(6,4) \
+      pa(4,0) \
+      ps(CSHUF,2,2) \
+      pm(7,2) \
+      pa(2,0) \
+      pud(0,a_,ax)
+#undef p1_axpy_2c /* self-contained single step: pl from cx, plq from ax, puq back to ax */
+#define p1_axpy_2c(a_) \
+      pl(a_,cx,4) \
+      plq(a_,ax,0) \
+      pc(4,2) \
+      pm(6,4) \
+      pa(4,0) \
+      ps(CSHUF,2,2) \
+      pm(7,2) \
+      pa(2,0) \
+      puq(0,a_,ax)
+#undef p2_axpy_2c /* two self-contained steps at offsets a_ and SS(a_,RS4), using reg groups (4,2,0) and (5,3,1) */
+#define p2_axpy_2c(a_) \
+      pl(a_,cx,4) \
+      plq(a_,ax,0) \
+      pl(SS(a_,RS4),cx,5) \
+      pc(4,2) \
+      pm(6,4) \
+      pa(4,0) \
+      ps(CSHUF,2,2) \
+      pm(7,2) \
+      pa(2,0) \
+      plq(SS(a_,RS4),ax,1) \
+      puq(0,a_,ax) \
+      pc(5,3) \
+      pm(6,5) \
+      pa(5,1) \
+      ps(CSHUF,3,3) \
+      pm(7,3) \
+      pa(3,1) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_axpy_2c /* pipelined body: stores reg 0 and reads regs 4/5 before reloading them, so it relies on state left by lpaxpy_2c or a previous p4_axpy_2c; it also loads cx offsets MM(4..7,RS4) and ax offsets MM(4..6,RS4) for the next step */
+#define p4_axpy_2c(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      puq(0,a_,ax) \
+      pc(4,0) \
+      pm(6,4) \
+      pa(4,2) \
+      ps(CSHUF,0,0) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(4,RS4)),cx,4) \
+      pm(7,0) \
+      pa(0,2) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      puq(1,SS(a_,RS4),ax) \
+      pc(5,1) \
+      pm(6,5) \
+      pa(5,3) \
+      ps(CSHUF,1,1) \
+      pl(SS(a_,MM(5,RS4)),cx,5) \
+      pm(7,1) \
+      pa(1,3) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      pc(4,2) \
+      pm(6,4) \
+      pa(4,0) \
+      ps(CSHUF,2,2) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+      pl(SS(a_,MM(6,RS4)),cx,4) \
+      pm(7,2) \
+      pa(2,0) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      pc(5,3) \
+      pm(6,5) \
+      pa(5,1) \
+      ps(CSHUF,3,3) \
+      pl(SS(a_,MM(7,RS4)),cx,5) \
+      pm(7,3) \
+      pa(3,1) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) 
+#undef lpaxpy_2c /* prologue for the pipelined body: loads cx offsets MM(0..3,RS4) and ax offsets MM(0..2,RS4) and finishes the arithmetic for regs 0 and 1; contains no store */
+#define lpaxpy_2c(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,4) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      pl(SS(a_,MM(1,RS4)),cx,5) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pc(4,2) \
+      pm(6,4) \
+      pa(4,0) \
+      ps(CSHUF,2,2) \
+      pl(SS(a_,MM(2,RS4)),cx,4) \
+      pm(7,2) \
+      pa(2,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pc(5,3) \
+      pm(6,5) \
+      pa(5,1) \
+      ps(CSHUF,3,3) \
+      pl(SS(a_,MM(3,RS4)),cx,5) \
+      pm(7,3) \
+      pa(3,1)
+#undef dpaxpy_2c /* epilogue: loads the last ax unit (offset MM(3,RS4)), finishes regs 2 and 3 and stores all four results; no cx loads and no f(nta,...) calls */
+#define dpaxpy_2c(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      puq(0,a_,ax) \
+      pc(4,0) \
+      pm(6,4) \
+      pa(4,2) \
+      ps(CSHUF,0,0) \
+      puq(1,SS(a_,RS4),ax) \
+      pm(7,0) \
+      pa(0,2) \
+      pc(5,1) \
+      pm(6,5) \
+      pa(5,3) \
+      ps(CSHUF,1,1) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      pm(7,1) \
+      pa(1,3) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plaxpy_2c
+#define plaxpy_2c 16 /* pl<kernel> constant for axpy_2c; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_axpy_1c /* axpy_1c step macros: load at offset a_ through cx and ax, combine with regs 6 and 7 (only read in this family), store through ax */
+#define p1_4_axpy_1c(a_) /* expands to nothing */
+#undef p1_2_axpy_1c /* pld/pud form of a single step */
+#define p1_2_axpy_1c(a_) \
+      pld(a_,cx,2) \
+      pc(2,0) \
+      pld(a_,ax,1) \
+      ps(CSHUF,0,0) \
+      pm(6,2) \
+      pa(2,1) \
+      pm(7,0) \
+      pa(0,1) \
+      pud(1,a_,ax)
+#undef p1_axpy_1c /* self-contained single step: pl from cx, plq from ax, puq of reg 1 back to ax */
+#define p1_axpy_1c(a_) \
+      pl(a_,cx,2) \
+      pc(2,0) \
+      plq(a_,ax,1) \
+      ps(CSHUF,0,0) \
+      pm(6,2) \
+      pa(2,1) \
+      pm(7,0) \
+      pa(0,1) \
+      puq(1,a_,ax)
+#undef p2_axpy_1c /* pipelined body: reads regs 0, 1 and 2 before loading them, so it depends on lpaxpy_1c (or a previous p2_axpy_1c); stores offsets a_ and SS(a_,RS4) and reloads regs 1, 2, 0 from offset MM(2,RS4) */
+#define p2_axpy_1c(a_) \
+      plq(SS(a_,RS4),ax,3) \
+      ps(CSHUF,0,0) \
+      pl(SS(a_,RS4),cx,4) \
+      pm(6,2) \
+      pa(2,1) \
+      pm(7,0) \
+      pa(0,1) \
+      pc(4,0) \
+      puq(1,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,1) \
+      ps(CSHUF,0,0) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pm(6,4) \
+      pa(4,3) \
+      pm(7,0) \
+      pa(0,3) \
+      pc(2,0) \
+      puq(3,SS(a_,RS4),ax)
+#undef lpaxpy_1c /* prologue: loads offset MM(0,RS4) from cx into reg 2 and from ax into reg 1, then pc(2,0); no arithmetic and no store */
+#define lpaxpy_1c(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,2) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,1) \
+      pc(2,0) 
+#undef dpaxpy_1c /* epilogue: same arithmetic and stores as p2_axpy_1c, minus the loads and f(nta,...) calls for offset MM(2,RS4) */
+#define dpaxpy_1c(a_) \
+      plq(SS(a_,RS4),ax,3) \
+      ps(CSHUF,0,0) \
+      pl(SS(a_,RS4),cx,4) \
+      pm(6,2) \
+      pa(2,1) \
+      pm(7,0) \
+      pa(0,1) \
+      pc(4,0) \
+      puq(1,a_,ax) \
+      ps(CSHUF,0,0) \
+      pm(6,4) \
+      pa(4,3) \
+      pm(7,0) \
+      pa(0,3) \
+      puq(3,SS(a_,RS4),ax)
+#undef plaxpy_1c
+#define plaxpy_1c 8 /* pl<kernel> constant for axpy_1c; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_copy_1 /* copy_1 family: every step loads through cx and stores the same register through ax, with no arithmetic in between */
+#define p1_4_copy_1(a_) \
+      pls(a_,cx,2) \
+      pus(2,a_,ax)
+#undef p1_2_copy_1 /* pld/pud form */
+#define p1_2_copy_1(a_) \
+      pld(a_,cx,2) \
+      pud(2,a_,ax)
+#undef p1_copy_1 /* pl/puq form: plain load from cx, puq store to ax */
+#define p1_copy_1(a_) \
+      pl(a_,cx,2) \
+      puq(2,a_,ax)
+#undef p2_copy_1 /* pipelined body: stores reg 2 before loading it, so reg 2 must already hold offset a_ (loaded by lpcopy_1 or a previous p2_copy_1); reloads reg 2 from offset MM(2,RS4) */
+#define p2_copy_1(a_) \
+      pl(SS(a_,RS4),cx,4) \
+      puq(2,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      puq(4,SS(a_,RS4),ax)
+#undef lpcopy_1 /* prologue: loads offset MM(0,RS4) from cx into reg 2 */
+#define lpcopy_1(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,2) 
+#undef dpcopy_1 /* epilogue: loads the second unit and stores both, without reloading reg 2 */
+#define dpcopy_1(a_) \
+      pl(SS(a_,RS4),cx,4) \
+      puq(2,a_,ax) \
+      puq(4,SS(a_,RS4),ax)
+#undef plcopy_1
+#define plcopy_1 8 /* pl<kernel> constant for copy_1; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_copy_2 /* copy_2 family: opposite direction to copy_1 -- every step loads through ax and stores through cx */
+#define p1_4_copy_2(a_) \
+      pls(a_,ax,2) \
+      pus(2,a_,cx)
+#undef p1_2_copy_2 /* pld/pud form */
+#define p1_2_copy_2(a_) \
+      pld(a_,ax,2) \
+      pud(2,a_,cx)
+#undef p1_copy_2 /* plq load from ax, plain pu store to cx */
+#define p1_copy_2(a_) \
+      plq(a_,ax,2) \
+      pu(2,a_,cx)
+#undef p2_copy_2 /* pipelined body: stores reg 2 before loading it, so reg 2 must already hold offset a_ (from lpcopy_2 or a previous p2_copy_2); reloads reg 2 from offset MM(2,RS4) */
+#define p2_copy_2(a_) \
+      plq(SS(a_,RS4),ax,4) \
+      pu(2,a_,cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pu(4,SS(a_,RS4),cx)
+#undef lpcopy_2 /* prologue: loads offset MM(0,RS4) from ax into reg 2 */
+#define lpcopy_2(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,2) 
+#undef dpcopy_2 /* epilogue: loads the second unit and stores both, without reloading reg 2 */
+#define dpcopy_2(a_) \
+      plq(SS(a_,RS4),ax,4) \
+      pu(2,a_,cx) \
+      pu(4,SS(a_,RS4),cx)
+#undef plcopy_2
+#define plcopy_2 8 /* pl<kernel> constant for copy_2; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_copy_3 /* copy_3 family: loads through cx, stores through ax; the full-width steps store with punt instead of puq (presumably a non-temporal store -- confirm in SSE3Dnow.h) */
+#define p1_4_copy_3(a_) \
+      pls(a_,cx,2) \
+      pus(2,a_,ax)
+#undef p1_2_copy_3 /* pld/pud form */
+#define p1_2_copy_3(a_) \
+      pld(a_,cx,2) \
+      pud(2,a_,ax)
+#undef p1_copy_3 /* one unit: pl from cx, punt to ax */
+#define p1_copy_3(a_) \
+      pl(a_,cx,2) \
+      punt(2,a_,ax)
+#undef p2_copy_3 /* two units, all loads issued before the stores; self-contained */
+#define p2_copy_3(a_) \
+      pl(SS(a_,MM(0,RS4)),cx,0) \
+      pl(SS(a_,MM(1,RS4)),cx,1) \
+      punt(0,SS(a_,MM(0,RS4)),ax) \
+      punt(1,SS(a_,MM(1,RS4)),ax) 
+#undef p4_copy_3 /* four units through regs 0..3, loads first then stores; self-contained */
+#define p4_copy_3(a_) \
+      pl(SS(a_,MM(0,RS4)),cx,0) \
+      pl(SS(a_,MM(1,RS4)),cx,1) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      punt(0,SS(a_,MM(0,RS4)),ax) \
+      punt(1,SS(a_,MM(1,RS4)),ax) \
+      punt(2,SS(a_,MM(2,RS4)),ax) \
+      punt(3,SS(a_,MM(3,RS4)),ax) 
+#undef p8_copy_3 /* eight units through regs 0..7, preceded by one f(nta,...) on cx; self-contained (no state carried between invocations) */
+#define p8_copy_3(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,0) \
+      pl(SS(a_,MM(1,RS4)),cx,1) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      pl(SS(a_,MM(4,RS4)),cx,4) \
+      pl(SS(a_,MM(5,RS4)),cx,5) \
+      pl(SS(a_,MM(6,RS4)),cx,6) \
+      pl(SS(a_,MM(7,RS4)),cx,7) \
+      punt(0,SS(a_,MM(0,RS4)),ax) \
+      punt(1,SS(a_,MM(1,RS4)),ax) \
+      punt(2,SS(a_,MM(2,RS4)),ax) \
+      punt(3,SS(a_,MM(3,RS4)),ax) \
+      punt(4,SS(a_,MM(4,RS4)),ax) \
+      punt(5,SS(a_,MM(5,RS4)),ax) \
+      punt(6,SS(a_,MM(6,RS4)),ax) \
+      punt(7,SS(a_,MM(7,RS4)),ax) 
+#undef lpcopy_3 /* prologue is empty because p8_copy_3 carries no register state */
+#define lpcopy_3(a_) 
+#undef dpcopy_3 /* epilogue is simply one more p8_copy_3 */
+#define dpcopy_3(a_) p8_copy_3(a_)
+#undef plcopy_3
+#define plcopy_3 32 /* pl<kernel> constant for copy_3; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_cpsc_3 /* cpsc_3 family: load through ax, apply pm (pmsr in this step) with reg 6, store through cx; reg 6 is only read here, so the caller must preload it (presumably the scale factor) */
+#define p1_4_cpsc_3(a_) \
+      pls(a_,ax,0) \
+      pmsr(6,0) \
+      pus(0,a_,cx)
+#undef p1_2_cpsc_3 /* pld/pud form */
+#define p1_2_cpsc_3(a_) \
+      pld(a_,ax,0) \
+      pm(6,0) \
+      pud(0,a_,cx)
+#undef p1_cpsc_3 /* one unit: plq from ax, pm with reg 6, pu to cx */
+#define p1_cpsc_3(a_) \
+      plq(a_,ax,0) \
+      pm(6,0) \
+      pu(0,a_,cx)
+#undef p2_cpsc_3 /* two self-contained units through regs 0 and 1 */
+#define p2_cpsc_3(a_) \
+      plq(a_,ax,0) \
+      plq(SS(a_,RS4),ax,1) \
+      pm(6,0) \
+      pm(6,1) \
+      pu(0,a_,cx) \
+      pu(1,SS(a_,RS4),cx) 
+#undef p4_cpsc_3 /* pipelined body: stores regs 0/1 and scales reg 2 before loading them, so it relies on lpcpsc_3 or a previous p4_cpsc_3; reloads regs 0..2 from offsets MM(4..6,RS4) */
+#define p4_cpsc_3(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(6,2) \
+      pu(0,a_,cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      pm(6,3) \
+      pu(1,SS(a_,RS4),cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      pm(6,0) \
+      pu(2,SS(a_,MM(2,RS4)),cx) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      pm(6,1) \
+      pu(3,SS(a_,MM(3,RS4)),cx) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) 
+#undef lpcpsc_3 /* prologue: loads offsets MM(0..2,RS4) into regs 0..2 and applies pm(6,.) to regs 0 and 1; no store */
+#define lpcpsc_3(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pm(6,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pm(6,1) 
+#undef dpcpsc_3 /* epilogue: loads the fourth unit, finishes regs 2 and 3 and stores all four results; no reloads */
+#define dpcpsc_3(a_) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(6,2) \
+      pu(0,a_,cx) \
+      pm(6,3) \
+      pu(1,SS(a_,RS4),cx) \
+      pu(2,SS(a_,MM(2,RS4)),cx) \
+      pu(3,SS(a_,MM(3,RS4)),cx)
+#undef plcpsc_3
+#define plcpsc_3 16 /* pl<kernel> constant for cpsc_3; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_cpsc_3c /* cpsc_3c family: load through ax, run the pc / pm(6,.) / ps(CSHUF,..) / pm(7,.) / pa sequence (presumably a complex multiply by the value held in regs 6 and 7), store through cx */
+#define p1_4_cpsc_3c(a_) /* expands to nothing */
+#undef p1_2_cpsc_3c /* pld/pud form of a single step, reg 1 as scratch */
+#define p1_2_cpsc_3c(a_) \
+      pld(a_,ax,0) \
+      pc(0,1) \
+      pm(6,0) \
+      ps(CSHUF,1,1) \
+      pm(7,1) \
+      pa(1,0) \
+      pud(0,a_,cx)
+#undef p1_cpsc_3c /* one unit: plq from ax, pu to cx, reg 1 as scratch */
+#define p1_cpsc_3c(a_) \
+      plq(a_,ax,0) \
+      pc(0,1) \
+      pm(6,0) \
+      ps(CSHUF,1,1) \
+      pm(7,1) \
+      pa(1,0) \
+      pu(0,a_,cx)
+#undef p2_cpsc_3c /* two self-contained units in regs 0 and 1, with regs 2 and 3 as scratch */
+#define p2_cpsc_3c(a_) \
+      plq(a_,ax,0) \
+      plq(SS(a_,RS4),ax,1) \
+      pc(0,2) \
+      pm(6,0) \
+      ps(CSHUF,2,2) \
+      pm(7,2) \
+      pa(2,0) \
+      pu(0,a_,cx) \
+      pc(1,3) \
+      pm(6,1) \
+      ps(CSHUF,3,3) \
+      pm(7,3) \
+      pa(3,1) \
+      pu(1,SS(a_,RS4),cx) 
+#undef p4_cpsc_3c /* pipelined body: begins by storing reg 0 and processing regs 2/3 that it did not load, so it relies on lpcpsc_3c or a previous p4_cpsc_3c; reg 4 is the shared scratch; reloads regs 0..3 from offsets MM(4..7,RS4) */
+#define p4_cpsc_3c(a_) \
+      pu(0,a_,cx) \
+      pc(2,4) \
+      pm(6,2) \
+      ps(CSHUF,4,4) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,0) \
+      pm(7,4) \
+      pa(4,2) \
+      pu(1,SS(a_,RS4),cx) \
+      pc(3,4) \
+      pm(6,3) \
+      ps(CSHUF,4,4) \
+      plq(SS(a_,MM(5,RS4)),ax,1) \
+      pm(7,4) \
+      pa(4,3) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pu(2,SS(a_,MM(2,RS4)),cx) \
+      pc(0,4) \
+      pm(6,0) \
+      ps(CSHUF,4,4) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(6,RS4)),ax,2) \
+      pm(7,4) \
+      pa(4,0) \
+      pu(3,SS(a_,MM(3,RS4)),cx) \
+      pc(1,4) \
+      pm(6,1) \
+      ps(CSHUF,4,4) \
+      plq(SS(a_,MM(7,RS4)),ax,3) \
+      pm(7,4) \
+      pa(4,1) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) 
+#undef lpcpsc_3c /* prologue: loads offsets MM(0..3,RS4) into regs 0..3 and completes the arithmetic for regs 0 and 1; no store */
+#define lpcpsc_3c(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,0) \
+      plq(SS(a_,MM(1,RS4)),ax,1) \
+      pc(0,4) \
+      pm(6,0) \
+      ps(CSHUF,4,4) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pm(7,4) \
+      pa(4,0) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pc(1,4) \
+      pm(6,1) \
+      ps(CSHUF,4,4) \
+      plq(SS(a_,MM(3,RS4)),ax,3) \
+      pm(7,4) \
+      pa(4,1)
+#undef dpcpsc_3c /* epilogue: finishes regs 2 and 3 and stores all four results; performs no loads */
+#define dpcpsc_3c(a_) \
+      pu(0,a_,cx) \
+      pc(2,4) \
+      pm(6,2) \
+      ps(CSHUF,4,4) \
+      pu(1,SS(a_,RS4),cx) \
+      pm(7,4) \
+      pa(4,2) \
+      pc(3,4) \
+      pm(6,3) \
+      ps(CSHUF,4,4) \
+      pu(2,SS(a_,MM(2,RS4)),cx) \
+      pm(7,4) \
+      pa(4,3) \
+      pu(3,SS(a_,MM(3,RS4)),cx)
+#undef plcpsc_3c
+#define plcpsc_3c 16 /* pl<kernel> constant for cpsc_3c; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_cpsc_4 /* cpsc_4 family: same pm-with-reg-6 scheme as cpsc_3 but in the other direction -- loads through cx, stores through ax (puq for full units) */
+#define p1_4_cpsc_4(a_) \
+      pls(a_,cx,0) \
+      pmsr(6,0) \
+      pus(0,a_,ax)
+#undef p1_2_cpsc_4 /* pld/pud form */
+#define p1_2_cpsc_4(a_) \
+      pld(a_,cx,0) \
+      pm(6,0) \
+      pud(0,a_,ax)
+#undef p1_cpsc_4 /* one unit: pl from cx, pm with reg 6, puq to ax */
+#define p1_cpsc_4(a_) \
+      pl(a_,cx,0) \
+      pm(6,0) \
+      puq(0,a_,ax)
+#undef p2_cpsc_4 /* two self-contained units through regs 0 and 1 */
+#define p2_cpsc_4(a_) \
+      pl(a_,cx,0) \
+      pl(SS(a_,RS4),cx,1) \
+      pm(6,0) \
+      pm(6,1) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_cpsc_4 /* pipelined body: stores regs 0/1 and scales reg 2 before loading them, so it relies on lpcpsc_4 or a previous p4_cpsc_4; reloads regs 0..2 from offsets MM(4..6,RS4); only cx gets f(nta,...) calls here */
+#define p4_cpsc_4(a_) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      pm(6,2) \
+      puq(0,a_,ax) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(4,RS4)),cx,0) \
+      pm(6,3) \
+      puq(1,SS(a_,RS4),ax) \
+      pl(SS(a_,MM(5,RS4)),cx,1) \
+      pm(6,0) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+      pl(SS(a_,MM(6,RS4)),cx,2) \
+      pm(6,1) \
+      puq(3,SS(a_,MM(3,RS4)),ax) 
+#undef lpcpsc_4 /* prologue: loads offsets MM(0..2,RS4) into regs 0..2 and applies pm(6,.) to regs 0 and 1; no store */
+#define lpcpsc_4(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,0) \
+      pl(SS(a_,MM(1,RS4)),cx,1) \
+      pm(6,0) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pm(6,1) 
+#undef dpcpsc_4 /* epilogue: loads the fourth unit, finishes regs 2 and 3 and stores all four results; no reloads */
+#define dpcpsc_4(a_) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      pm(6,2) \
+      puq(0,a_,ax) \
+      pm(6,3) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax)
+#undef plcpsc_4
+#define plcpsc_4 16 /* pl<kernel> constant for cpsc_4; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_cpsc_5 /* cpsc_5 family: loads through cx, pm with reg 6, stores through ax; unlike cpsc_4 no step carries register state between invocations */
+#define p1_4_cpsc_5(a_) \
+      pls(a_,cx,0) \
+      pmsr(6,0) \
+      pus(0,a_,ax)
+#undef p1_2_cpsc_5 /* pld/pud form */
+#define p1_2_cpsc_5(a_) \
+      pld(a_,cx,0) \
+      pm(6,0) \
+      pud(0,a_,ax)
+#undef p1_cpsc_5 /* one unit */
+#define p1_cpsc_5(a_) \
+      pl(a_,cx,0) \
+      pm(6,0) \
+      puq(0,a_,ax)
+#undef p2_cpsc_5 /* two units through regs 0 and 1 */
+#define p2_cpsc_5(a_) \
+      pl(a_,cx,0) \
+      pl(SS(a_,RS4),cx,1) \
+      pm(6,0) \
+      pm(6,1) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_cpsc_5 /* four units through regs 0..3: all loads, then all pm, then all stores */
+#define p4_cpsc_5(a_) \
+      pl(SS(a_,MM(0,RS4)),cx,0) \
+      pl(SS(a_,MM(1,RS4)),cx,1) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      pm(6,0) \
+      pm(6,1) \
+      pm(6,2) \
+      pm(6,3) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax) 
+#undef p8_cpsc_5 /* eight units with only seven data registers: reg 6 is kept for the pm operand, so offset MM(6,RS4) goes into reg 7 and offset MM(7,RS4) reuses reg 0 once the first result has been stored */
+#define p8_cpsc_5(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,0) \
+      pl(SS(a_,MM(1,RS4)),cx,1) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      pl(SS(a_,MM(4,RS4)),cx,4) \
+      pl(SS(a_,MM(5,RS4)),cx,5) \
+      pl(SS(a_,MM(6,RS4)),cx,7) \
+      pm(6,0) \
+      pm(6,1) \
+      pm(6,2) \
+      pm(6,3) \
+      puq(0,a_,ax) \
+      pl(SS(a_,MM(7,RS4)),cx,0) \
+      pm(6,4) \
+      pm(6,5) \
+      pm(6,7) \
+      pm(6,0) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      puq(4,SS(a_,MM(4,RS4)),ax) \
+      puq(5,SS(a_,MM(5,RS4)),ax) \
+      puq(7,SS(a_,MM(6,RS4)),ax) \
+      puq(0,SS(a_,MM(7,RS4)),ax) 
+#undef lpcpsc_5 /* prologue is empty because p8_cpsc_5 carries no register state */
+#define lpcpsc_5(a_) 
+#undef dpcpsc_5 /* epilogue is simply one more p8_cpsc_5 */
+#define dpcpsc_5(a_) p8_cpsc_5(a_)
+#undef plcpsc_5
+#define plcpsc_5 32 /* pl<kernel> constant for cpsc_5; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef cpsc_cdp /* helper used by every cpsc_5c step: processes reg a_ in place with regs 6 and 7 as operands and reg 5 as scratch (presumably a complex multiply -- confirm the pc/pm/ps/pa primitives in SSE3Dnow.h) */
+#define cpsc_cdp(a_) pc(a_,5) pm(6,a_) ps(CSHUF,5,5) pm(7,5) pa(5,a_)
+#undef p1_4_cpsc_5c /* cpsc_5c family: load through cx, cpsc_cdp on the loaded register, store through ax */
+#define p1_4_cpsc_5c(a_) /* expands to nothing */
+#undef p1_2_cpsc_5c /* pld/pud form */
+#define p1_2_cpsc_5c(a_) \
+      pld(a_,cx,0) \
+      cpsc_cdp(0) \
+      pud(0,a_,ax)
+#undef p1_cpsc_5c /* one unit */
+#define p1_cpsc_5c(a_) \
+      pl(a_,cx,0) \
+      cpsc_cdp(0) \
+      puq(0,a_,ax)
+#undef p2_cpsc_5c /* two units through regs 0 and 1 */
+#define p2_cpsc_5c(a_) \
+      pl(a_,cx,0) \
+      pl(SS(a_,RS4),cx,1) \
+      cpsc_cdp(0) \
+      cpsc_cdp(1) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) 
+#undef p4_cpsc_5c /* four units through regs 0..3 */
+#define p4_cpsc_5c(a_) \
+      pl(SS(a_,MM(0,RS4)),cx,0) \
+      pl(SS(a_,MM(1,RS4)),cx,1) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      cpsc_cdp(0) \
+      cpsc_cdp(1) \
+      cpsc_cdp(2) \
+      cpsc_cdp(3) \
+      puq(0,a_,ax) \
+      puq(1,SS(a_,RS4),ax) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      puq(3,SS(a_,MM(3,RS4)),ax) 
+#undef p8_cpsc_5c /* eight units with five data registers (0..4): regs 5, 6, 7 are taken by cpsc_cdp, so regs 0, 1, 2 are reloaded with offsets MM(5..7,RS4) right after their first results are stored */
+#define p8_cpsc_5c(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      pl(SS(a_,MM(0,RS4)),cx,0) \
+      pl(SS(a_,MM(1,RS4)),cx,1) \
+      pl(SS(a_,MM(2,RS4)),cx,2) \
+      pl(SS(a_,MM(3,RS4)),cx,3) \
+      pl(SS(a_,MM(4,RS4)),cx,4) \
+      cpsc_cdp(0) \
+      cpsc_cdp(1) \
+      puq(0,a_,ax) \
+      pl(SS(a_,MM(5,RS4)),cx,0) \
+      cpsc_cdp(2) \
+      cpsc_cdp(3) \
+      puq(1,SS(a_,RS4),ax) \
+      pl(SS(a_,MM(6,RS4)),cx,1) \
+      cpsc_cdp(4) \
+      cpsc_cdp(0) \
+      puq(2,SS(a_,MM(2,RS4)),ax) \
+      pl(SS(a_,MM(7,RS4)),cx,2) \
+      cpsc_cdp(1) \
+      cpsc_cdp(2) \
+      puq(3,SS(a_,MM(3,RS4)),ax) \
+      puq(4,SS(a_,MM(4,RS4)),ax) \
+      puq(0,SS(a_,MM(5,RS4)),ax) \
+      puq(1,SS(a_,MM(6,RS4)),ax) \
+      puq(2,SS(a_,MM(7,RS4)),ax) 
+#undef lpcpsc_5c /* prologue is empty because p8_cpsc_5c carries no register state */
+#define lpcpsc_5c(a_) 
+#undef dpcpsc_5c /* epilogue is simply one more p8_cpsc_5c */
+#define dpcpsc_5c(a_) p8_cpsc_5c(a_)
+#undef plcpsc_5c
+#define plcpsc_5c 32 /* pl<kernel> constant for cpsc_5c; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_cpsc_1 /* cpsc_1 family: load through ax, pm (pmsr in this step) with reg 3, store through cx; reg 3 is only read here, so the caller must preload it */
+#define p1_4_cpsc_1(a_) \
+      pls(a_,ax,2) \
+      pmsr(3,2) \
+      pus(2,a_,cx)
+#undef p1_2_cpsc_1 /* pld/pud form */
+#define p1_2_cpsc_1(a_) \
+      pld(a_,ax,2) \
+      pm(3,2) \
+      pud(2,a_,cx)
+#undef p1_cpsc_1 /* one unit: plq from ax, pm with reg 3, pu to cx */
+#define p1_cpsc_1(a_) \
+      plq(a_,ax,2) \
+      pm(3,2) \
+      pu(2,a_,cx)
+#undef p2_cpsc_1 /* pipelined body: scales and stores reg 2 before loading it, so reg 2 must already hold offset a_ (from lpcpsc_1 or a previous p2_cpsc_1); reloads reg 2 from offset MM(2,RS4) */
+#define p2_cpsc_1(a_) \
+      plq(SS(a_,RS4),ax,4) \
+      pm(3,2) \
+      pu(2,a_,cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,2) \
+      pm(3,4) \
+      pu(4,SS(a_,RS4),cx)
+#undef lpcpsc_1 /* prologue: loads offset MM(0,RS4) from ax into reg 2 */
+#define lpcpsc_1(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,2) 
+#undef dpcpsc_1 /* epilogue: loads the second unit, scales and stores both, without reloading reg 2 */
+#define dpcpsc_1(a_) \
+      plq(SS(a_,RS4),ax,4) \
+      pm(3,2) \
+      pu(2,a_,cx) \
+      pm(3,4) \
+      pu(4,SS(a_,RS4),cx)
+#undef plcpsc_1
+#define plcpsc_1 8 /* pl<kernel> constant for cpsc_1; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_cpsc_2 /* cpsc_2 family: like cpsc_1 (load through ax, pm with reg 3, store through cx) but with a deeper four-unit pipelined body */
+#define p1_4_cpsc_2(a_) \
+      pls(a_,ax,2) \
+      pmsr(3,2) \
+      pus(2,a_,cx)
+#undef p1_2_cpsc_2 /* pld/pud form */
+#define p1_2_cpsc_2(a_) \
+      pld(a_,ax,2) \
+      pm(3,2) \
+      pud(2,a_,cx)
+#undef p1_cpsc_2 /* one unit */
+#define p1_cpsc_2(a_) \
+      plq(a_,ax,2) \
+      pm(3,2) \
+      pu(2,a_,cx)
+#undef p2_cpsc_2 /* two self-contained units through regs 2 and 4 (not pipelined, unlike p2_cpsc_1) */
+#define p2_cpsc_2(a_) \
+      plq(a_,ax,2) \
+      plq(SS(a_,RS4),ax,4) \
+      pm(3,2) \
+      pm(3,4) \
+      pu(2,a_,cx) \
+      pu(4,SS(a_,RS4),cx)
+#undef p4_cpsc_2 /* pipelined body: scales reg 6 and stores reg 4 before loading them, so it relies on lpcpsc_2 or a previous p4_cpsc_2; data rotates through regs 4, 6, 7, 2 and regs 4/6 are reloaded from offsets MM(4,RS4) and MM(5,RS4) */
+#define p4_cpsc_2(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,7) \
+      pm(3,6) \
+      pu(4,a_,cx) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pm(3,7) \
+      pu(6,SS(a_,RS4),cx) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),cx) \
+      f(nta,SS(a_,MM((SS(4,CL)),RS4)),ax) \
+      plq(SS(a_,MM(4,RS4)),ax,4) \
+      pm(3,2) \
+      pu(7,SS(a_,MM(2,RS4)),cx) \
+      plq(SS(a_,MM(5,RS4)),ax,6) \
+      pm(3,4) \
+      pu(2,SS(a_,MM(3,RS4)),cx) 
+#undef lpcpsc_2 /* prologue: loads offsets MM(0,RS4) and MM(1,RS4) into regs 4 and 6 and applies pm(3,.) to reg 4 only */
+#define lpcpsc_2(a_) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),cx) \
+      f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \
+      plq(SS(a_,MM(0,RS4)),ax,4) \
+      plq(SS(a_,MM(1,RS4)),ax,6) \
+      pm(3,4) 
+#undef dpcpsc_2 /* epilogue: still issues the two f(nta,...) calls and loads units 2 and 3, then finishes and stores all four; no reloads of regs 4/6 */
+#define dpcpsc_2(a_) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),cx) \
+      f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,7) \
+      pm(3,6) \
+      pu(4,a_,cx) \
+      plq(SS(a_,MM(3,RS4)),ax,2) \
+      pm(3,7) \
+      pu(6,SS(a_,RS4),cx) \
+      pm(3,2) \
+      pu(7,SS(a_,MM(2,RS4)),cx) \
+      pu(2,SS(a_,MM(3,RS4)),cx) 
+#undef plcpsc_2
+#define plcpsc_2 RS4 /* NOTE(review): the only pl<kernel> constant in this part of the file given as RS4 instead of a literal; RS4 is defined outside this chunk -- confirm this is intended */
+
+
+#undef p1_4_iamax_1 /* iamax_1 family: every step loads through ax only and contains no store macro, so results stay in registers; regs 0..6 are used and regs 1 and 2 are only read (presumably caller-supplied constants) -- primitive semantics (pan/pcm/pax/pann/pasr) are defined outside this chunk */
+#define p1_4_iamax_1(a_) \
+      px(4) \
+      pls(a_,ax,4) \
+      pan(2,4) \
+      pc(3,5) \
+      pcm(6,4,5) \
+      paxs(4,3) \
+      pan(5,6) \
+      pann(0,5) \
+      pasr(5,6) \
+      pasr(1,0) \
+      ps(57,0,0)
+#undef p1_2_iamax_1 /* pld form; NOTE(review): the pasr(1,0)/ps(57,0,0) pair appears twice here, while p1_2_iamax_1d has a single pasr(1,0) followed by ps(1,0,0) -- confirm which is intended */
+#define p1_2_iamax_1(a_) \
+      px(4) \
+      pld(a_,ax,4) \
+      pan(2,4) \
+      pc(3,5) \
+      pcm(6,4,5) \
+      pax(4,3) \
+      pan(5,6) \
+      pann(0,5) \
+      pa(5,6) \
+      pasr(1,0) \
+      ps(57,0,0)\
+      pasr(1,0) \
+      ps(57,0,0)
+#undef p1_iamax_1 /* one full unit loaded with plq; same sequence as lpiamax_1 and dpiamax_1 apart from the offset and the f(nta,...) call */
+#define p1_iamax_1(a_) \
+      plq(a_,ax,4) \
+      pan(2,4) \
+      pc(3,5) \
+      pcm(6,4,5) \
+      pax(4,3) \
+      pan(5,6) \
+      pann(0,5) \
+      pa(5,6) \
+      pa(1,0) 
+#define p2_iamax_1(a_) /* two units at offsets SS(a_,RS4) and SS(a_,MM(2,RS4)); NOTE(review): unlike its siblings this macro has no preceding #undef */ \
+      plq(SS(a_,RS4),ax,4) \
+      pan(2,4) \
+      pc(3,5) \
+      pcm(6,4,5) \
+      pax(4,3) \
+      pan(5,6) \
+      pann(0,5) \
+      pa(5,6) \
+      pa(1,0) \
+      f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,4) \
+      pan(2,4) \
+      pc(3,5) \
+      pcm(6,4,5) \
+      pax(4,3) \
+      pan(5,6) \
+      pann(0,5) \
+      pa(5,6) \
+      pa(1,0) 
+#undef lpiamax_1 /* prologue: processes the unit at offset a_ after one f(nta,...) call */
+#define lpiamax_1(a_) \
+      f(nta,SS(a_,MM(CL,RS4)),ax) \
+      plq(a_,ax,4) \
+      pan(2,4) \
+      pc(3,5) \
+      pcm(6,4,5) \
+      pax(4,3) \
+      pan(5,6) \
+      pann(0,5) \
+      pa(5,6) \
+      pa(1,0) 
+#undef dpiamax_1 /* epilogue: processes the unit at offset SS(a_,RS4) only */
+#define dpiamax_1(a_) \
+      plq(SS(a_,RS4),ax,4) \
+      pan(2,4) \
+      pc(3,5) \
+      pcm(6,4,5) \
+      pax(4,3) \
+      pan(5,6) \
+      pann(0,5) \
+      pa(5,6) \
+      pa(1,0) 
+#undef pliamax_1
+#define pliamax_1 8 /* pl<kernel> constant for iamax_1; consumer is outside this chunk -- TODO confirm meaning */
+
+#undef p1_4_iamax_1d /* iamax_1d family: same instruction sequences as iamax_1 with a dbg(reg) call interleaved around each operation (dbg is defined outside this chunk, presumably a register dump) */
+#define p1_4_iamax_1d(a_) /* expands to nothing, whereas p1_4_iamax_1 has a body */
+#undef p1_2_iamax_1d /* pld form; ends with a single pasr(1,0) and ps(1,0,0), which differs from p1_2_iamax_1 -- NOTE(review): confirm */
+#define p1_2_iamax_1d(a_) \
+      px(4) \
+      pld(a_,ax,4) \
+      dbg(2) \
+      pan(2,4) \
+      dbg(4) \
+      pc(3,5) \
+      dbg(5) \
+      pcm(6,4,5) \
+      dbg(5) \
+      pax(4,3) \
+      dbg(3) \
+      pan(5,6) \
+      dbg(6) \
+      pann(0,5) \
+      dbg(5) \
+      pa(5,6) \
+      dbg(6) \
+      pasr(1,0) \
+      dbg(0) \
+      ps(1,0,0)
+#undef p1_iamax_1d /* one full unit loaded with plq */
+#define p1_iamax_1d(a_) \
+      plq(a_,ax,4) \
+      dbg(2) \
+      pan(2,4) \
+      dbg(4) \
+      pc(3,5) \
+      dbg(5) \
+      pcm(6,4,5) \
+      dbg(5) \
+      pax(4,3) \
+      dbg(3) \
+      pan(5,6) \
+      dbg(6) \
+      pann(0,5) \
+      dbg(5) \
+      pa(5,6) \
+      dbg(6) \
+      pa(1,0) 
+#define p2_iamax_1d(a_) /* two units at offsets SS(a_,RS4) and SS(a_,MM(2,RS4)); NOTE(review): unlike its siblings this macro has no preceding #undef */ \
+      plq(SS(a_,RS4),ax,4) \
+      dbg(2) \
+      pan(2,4) \
+      dbg(4) \
+      pc(3,5) \
+      dbg(5) \
+      pcm(6,4,5) \
+      dbg(5) \
+      pax(4,3) \
+      dbg(3) \
+      pan(5,6) \
+      dbg(6) \
+      pann(0,5) \
+      dbg(5) \
+      pa(5,6) \
+      dbg(6) \
+      pa(1,0) \
+      dbg(0) \
+      f(nta,SS(a_,MM(SS(2,CL),RS4)),ax) \
+      plq(SS(a_,MM(2,RS4)),ax,4) \
+      dbg(2) \
+      pan(2,4) \
+      dbg(4) \
+      pc(3,5) \
+      dbg(5) \
+      pcm(6,4,5) \
+      dbg(5) \
+      pax(4,3) \
+      dbg(3) \
+      pan(5,6) \
+      dbg(6) \
+      pann(0,5) \
+      dbg(5) \
+      pa(5,6) \
+      dbg(6) \
+      pa(1,0) 
+#undef lpiamax_1d /* prologue: processes the unit at offset a_ after one f(nta,...) call */
+#define lpiamax_1d(a_) \
+      f(nta,SS(a_,MM(CL,RS4)),ax) \
+      plq(a_,ax,4) \
+      dbg(2) \
+      pan(2,4) \
+      dbg(4) \
+      pc(3,5) \
+      dbg(5) \
+      pcm(6,4,5) \
+      dbg(5) \
+      pax(4,3) \
+      dbg(3) \
+      pan(5,6) \
+      dbg(6) \
+      pann(0,5) \
+      dbg(5) \
+      pa(5,6) \
+      dbg(6) \
+      pa(1,0) 
+#undef dpiamax_1d /* epilogue: processes the unit at offset SS(a_,RS4) only */
+#define dpiamax_1d(a_) \
+      plq(SS(a_,RS4),ax,4) \
+      dbg(2) \
+      pan(2,4) \
+      dbg(4) \
+      pc(3,5) \
+      dbg(5) \
+      pcm(6,4,5) \
+      dbg(5) \
+      pax(4,3) \
+      dbg(3) \
+      pan(5,6) \
+      dbg(6) \
+      pann(0,5) \
+      dbg(5) \
+      pa(5,6) \
+      dbg(6) \
+      pa(1,0) 
+#undef pliamax_1d
+#define pliamax_1d 8 /* pl<kernel> constant for iamax_1d; consumer is outside this chunk -- TODO confirm meaning */
+
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h
new file mode 100644
index 0000000..03486cf
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_tpipe.h
@@ -0,0 +1,331 @@
+/***************************************
+					$Header: /cvsroot/math-atlas/AtlasBase/kernel/CammMaguire/camm_tpipe.h,v 1.2 2003/10/18 18:13:30 yycamm Exp $
+
+					
+***************************************/
+
+
+/* #ifndef CAMM_TPIPE_H */
+/* #define CAMM_TPIPE_H */    /*+ To stop multiple inclusions. +*/
+
+#ifndef BITS /* log2 of the main-loop block size: used below as shr/shl(BITS,LR) and in the ( 1 << BITS ) tests */
+#error BITS must be defined in camm_tpipe.h
+#endif
+#ifndef DIV /* gates the two smallest remainder steps below (DIV != 4 for KB=2, DIV == 1 for KB=1) */
+#error DIV must be defined in camm_tpipe.h
+#endif
+#ifndef INC /* INC(n) is invoked after each KB_block with n = 4*KB; presumably advances the data pointers by n bytes */
+#error INC(a_) must be defined in camm_tpipe.h
+#endif
+#ifndef LR /* name of the register holding the element count; it is pushed, turned into an end address, and popped again below */
+#error LR must be defined in camm_tpipe.h
+#endif
+
+#ifdef ALIGN /* optional peel: handle 1 and/or 2 leading elements depending on bits 2 and 3 of the address in ax (presumably to reach 16-byte alignment) */
+
+#if defined(SREAL)
+
+     test(4,ax) /* test() in camm_util.h emits testl $4,%eax */
+     je(a2) /* bit 2 clear: skip the one-element peel */
+      
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+
+     KB_block 
+     INC(4) 
+     sub(1,LR) /* one element consumed; NOTE(review): LR is not checked for being non-zero before this peel */
+
+     lab(a2)
+
+#endif
+
+#if defined(SREAL) || defined(DREAL)
+
+     test(8,ax) 
+     je(a4) /* bit 3 clear: skip the two-element peel */
+     test(-2,LR) /* LR & ~1 is zero when fewer than 2 elements remain (for a non-negative count) */
+     je(a4)
+
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(8) 
+     sub(2,LR) 
+     
+     lab(a4)
+
+#endif
+#endif
+
+/*      "movl %%edx,%%edi\n\t"  */
+     push(LR) /* keep the full count; it is popped after the main loop for the remainder tests */
+     shr(BITS,LR) 
+     shl(BITS,LR) /* shr then shl by BITS: presumably clears the low BITS bits, leaving a multiple of the block size */
+     m(4,LR) /* presumably scales the count by 4, matching INC(4*KB) per block -- m() is defined outside this chunk */
+     ra(ax,LR) /* presumably adds ax, so LR becomes the address at which the main loop stops (it is compared with ax there) */
+
+#if defined(ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) /* complex + ALIGN: choose the code path at run time from the address in ax */
+     test(12,ax)
+     je(loopa) /* bits 2 and 3 both clear: take the loopa copy further down, which uses pla/punt */
+#endif
+
+#if !defined(ALIGN) || defined(SCPLX) || defined(DCPLX) /* this copy of the loop uses the plain forms: the q-suffixed ops map to pl/pu/plx/pux */
+#undef plq
+#define plq(a_,b_,c_) pl(a_,b_,c_)
+#undef puq
+#define puq(a_,b_,c_)  pu(a_,b_,c_)
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plx(a_,b_,c_,d_,e_)
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_)  pux(a_,b_,c_,d_,e_)
+#else /* real kernels with ALIGN: the q-suffixed ops map to pla/punt/plax/puax (presumably aligned load and non-temporal store -- confirm in SSE3Dnow.h) */
+#undef plq
+#define plq(a_,b_,c_) pla(a_,b_,c_)
+#undef puq
+#define puq(a_,b_,c_)  punt(a_,b_,c_)
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_)
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_)  puax(a_,b_,c_,d_,e_)
+#endif
+
+     align /* emits .align 16 (see camm_util.h) before the loop label */
+     lab(loop) 
+     cmp(ax,LR) 
+     je(stop) /* leave the loop once ax equals the end address held in LR */
+
+#undef KB
+#define KB ( (1 << BITS) /* / DIV */ ) /* full block size for the main loop */
+#include "camm_pipe3.h"
+     KB_block  
+     INC(4*KB/**DIV*/) 
+
+     jmp(loop) 
+
+     lab(stop)
+     pop(LR) /* restore the count pushed before the loop; its low bits select the remainder steps that follow */
+
+#if ( 1 << BITS ) > 128 /* remainder ladder: one step per power of two below the block size; each tests one bit of LR and, if set, runs a KB_block of that size followed by INC(4*KB) */
+     test(128,LR) 
+     je(64) /* labels are named after the next smaller step */
+#undef KB
+#define KB ( 128 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(512) 
+     
+     lab(64)
+#endif
+
+#if ( 1 << BITS ) > 64
+     test(64,LR) 
+     je(32)
+#undef KB
+#define KB ( 64 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(256) 
+
+     lab(32)
+#endif
+
+#if ( 1 << BITS ) > 32
+     test(32,LR) 
+     je(16)
+#undef KB
+#define KB ( 32 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(128) 
+
+     lab(16)
+#endif
+
+#if ( 1 << BITS ) > 16
+     test(16,LR) 
+     je(8)
+#undef KB
+#define KB ( 16 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(64) 
+
+     lab(8)
+#endif
+
+#if ( 1 << BITS ) > 8
+     test(8,LR) 
+     je(4)
+#undef KB
+#define KB ( 8 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(32) 
+
+     lab(4)
+#endif
+
+#if ( 1 << BITS ) > 4
+     test(4,LR) 
+     je(2)
+#undef KB
+#define KB ( 4 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(16) 
+
+     lab(2)
+#endif
+
+#if DIV != 4 && ( 1 << BITS ) > 2 /* the two-element step is compiled out when DIV is 4 */
+     test(2,LR) 
+     je(1)
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(8) 
+
+     lab(1)
+#endif
+
+#if DIV == 1 && ( 1 << BITS ) > 1 /* the single-element step exists only when DIV is 1 */
+     test(1,LR) 
+     je(end)
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     lab(end) /* last step: no INC follows this KB_block */
+#endif
+
+#if defined (ALIGN) && ( defined(SCPLX) || defined(DCPLX) ) /* second copy of the loop and remainder ladder for complex kernels, entered via je(loopa) above; identical structure with a-suffixed labels and pla/punt-based q-ops */
+
+     jmp(tend) /* the first copy falls through to here: jump over the second copy */
+
+#undef plq
+#define plq(a_,b_,c_) pla(a_,b_,c_)
+#undef puq
+#define puq(a_,b_,c_) punt(a_,b_,c_)
+#undef plqx
+#define plqx(a_,b_,c_,d_,e_) plax(a_,b_,c_,d_,e_)
+#undef puqx
+#define puqx(a_,b_,c_,d_,e_)  puax(a_,b_,c_,d_,e_)
+
+     align
+     lab(loopa) 
+     cmp(ax,LR) 
+     je(stopa)
+
+#undef KB
+#define KB ( (1 << BITS) /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block  
+     INC(4*KB/**DIV*/) 
+
+     jmp(loopa) 
+
+     lab(stopa)
+     pop(LR) /* balances the single push(LR) issued before the run-time path selection */
+
+#if ( 1 << BITS ) > 128
+     test(128,LR) 
+     je(64a)
+#undef KB
+#define KB ( 128 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(512) 
+     
+     lab(64a)
+#endif
+
+#if ( 1 << BITS ) > 64
+     test(64,LR) 
+     je(32a)
+#undef KB
+#define KB ( 64 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(256) 
+
+     lab(32a)
+#endif
+
+#if ( 1 << BITS ) > 32
+     test(32,LR) 
+     je(16a)
+#undef KB
+#define KB ( 32 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(128) 
+
+     lab(16a)
+#endif
+
+#if ( 1 << BITS ) > 16
+     test(16,LR) 
+     je(8a)
+#undef KB
+#define KB ( 16 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(64) 
+
+     lab(8a)
+#endif
+
+#if ( 1 << BITS ) > 8
+     test(8,LR) 
+     je(4a)
+#undef KB
+#define KB ( 8 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(32) 
+
+     lab(4a)
+#endif
+
+#if ( 1 << BITS ) > 4
+     test(4,LR) 
+     je(2a)
+#undef KB
+#define KB ( 4 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(16) 
+
+     lab(2a)
+#endif
+
+#if DIV != 4 && ( 1 << BITS ) > 2
+     test(2,LR) 
+     je(1a)
+#undef KB
+#define KB ( 2 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     INC(8) 
+
+     lab(1a)
+#endif
+
+#if DIV == 1 && ( 1 << BITS ) > 1
+     test(1,LR) 
+     je(enda)
+#undef KB
+#define KB ( 1 /* / DIV */ )
+#include "camm_pipe3.h"
+     KB_block 
+     lab(enda)
+#endif
+
+     lab(tend) /* join point for both copies */
+
+#endif
+
+/* #endif */ /* CAMM_TPIPE_H */
diff --git a/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h
new file mode 100644
index 0000000..6b150d3
--- /dev/null
+++ b/kaldi_io/src/tools/ATLAS/include/contrib/camm_util.h
@@ -0,0 +1,508 @@
+#ifndef CAMM_UTIL_H
+#define CAMM_UTIL_H    /*+ To stop multiple inclusions. +*/
+
+typedef struct {  /* Complex: a pair of floats; used as TYPE when SCPLX is defined */
+  float r,i;      /* presumably real and imaginary parts, in that order */
+} Complex;
+/* Dcomplex: the same layout with doubles; used as TYPE when DCPLX is defined */
+typedef struct {
+  double r,i;
+} Dcomplex;
+
+#undef str  /* str(x): macro-expand x first, then stringify it (xstr applies the # operator) */
+#define str(a_) xstr(a_)
+#undef xstr
+#define xstr(a_) #a_
+/* val(x): macro-expand x and yield the result unchanged */
+#undef val
+#define val(a_) xval(a_)
+#undef xval
+#define xval(a_) a_
+/* Mjoin(a,b): macro-expand both arguments, then paste them; left alone when Mjoin already exists */
+#ifndef Mjoin
+#define Mjoin(a,b) mjoin(a,b)
+#ifdef mjoin
+   #undef mjoin
+#endif
+#define mjoin(a,b) a ## b
+#endif
+/* ASM expands to __asm__ __volatile__, the GNU spelling of a volatile inline-asm statement */
+#undef VOLATILE
+#define VOLATILE __volatile__
+#undef ASM
+#define ASM __asm__ VOLATILE
+
+#ifdef BETA0  /* BL = beta-variant tag; BL is left untouched when no BETA* macro is defined */
+#undef BL
+#define BL b0
+#endif
+#ifdef BETA1  /* the tests are independent: when several BETA* macros are set, the last one below wins */
+#undef BL
+#define BL b1
+#endif
+#ifdef BETAX
+#undef BL
+#define BL bX
+#endif
+#ifdef BETAXI0
+#undef BL
+#define BL bXi0
+#endif
+/* FEXT = variant tag; with Conj_ / without: Gc/Gu (NO_TRANSPOSE and GER), Nc/N (NO_TRANSPOSE only), C/T (otherwise) */
+#ifdef NO_TRANSPOSE
+#ifdef GER
+#ifdef Conj_
+#undef FEXT
+#define FEXT Gc
+#else
+#undef FEXT
+#define FEXT Gu
+#endif
+#else
+#ifdef Conj_
+#undef FEXT
+#define FEXT Nc
+#else
+#undef FEXT
+#define FEXT N
+#endif
+#endif
+#else
+#ifdef Conj_
+#undef FEXT
+#define FEXT C
+#else
+#undef FEXT
+#define FEXT T
+#endif
+#endif
+/* BLC = FEXT and BL pasted into one token after expansion, e.g. Nc and b0 give Ncb0 */
+#undef BLC
+#define BLC Mjoin(FEXT,BL)
+
+#ifdef __GNUC__  /* NO_INLINE: with GNU-compatible compilers it expands to a complete function definition */
+#undef NO_INLINE
+#define NO_INLINE  double sq(double x) {return x*x;}  /* NOTE(review): sq has external linkage; expanding this twice redefines it */
+#else
+#undef NO_INLINE
+#define NO_INLINE  /* expands to nothing for other compilers */
+#endif
+
+#undef lab  /* label/branch strings: every label is spelled <MY_FUNCTION>_<N>_<a_>, all three macro-expanded */
+#define lab(a_)     "\n" str(MY_FUNCTION)  "_" str(N) "_" str(a_) ":\n\t"
+#undef jmp  /* jmp/je/jge/jle/jl/jne: the named jump to a label built the same way as lab() */
+#define jmp(a_)     "jmp " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef je
+#define je(a_)      "je " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jge
+#define jge(a_)     "jge " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jle
+#define jle(a_)     "jle " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jl
+#define jl(a_)      "jl " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef jne
+#define jne(a_)     "jne " str(MY_FUNCTION) "_" str(N) "_" str(a_) "\n\t"
+#undef align  /* 16-byte alignment directive */
+#define align       ".align 16\n\t"
+#undef test  /* test/and/sub: 32-bit op with immediate a_ on register %e<b_>; "%%" is the extended-asm escape for "%" */
+#define test(a_,b_) "testl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef and  /* NOTE(review): `and` is an operator spelling in C++ and an <iso646.h> macro in C */
+#define and(a_,b_)  "andl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef sub
+#define sub(a_,b_)  "subl $" str(a_) ",%%e" str(b_) "\n\t"
+#undef SS  /* SS/MM expand to a bare sum/product with no parentheses around the arguments or the result */
+#define SS(a_,b_)   a_ + b_
+#undef MM  /* NOTE(review): presumably stringified into asm operand expressions -- confirm before parenthesising */
+#define MM(a_,b_)   a_ * b_
+#undef E4  /* E4(x): shift right by 2, then left by 2, i.e. clear the two low bits; a_ itself is not parenthesised */
+#define E4(a_)      (( a_ >> 2 ) << 2 )
+
+#undef TYPE  /* reset all per-precision settings; the #ifdef sections below re-create them */
+#undef SCALAR
+#undef PREC
+#undef CSHUF
+#undef LSHUF
+#undef HSHUF
+#undef ISHUF
+#undef RSHUF
+#undef SINGLE
+#undef REAL
+#undef DIV
+/* the four sections do not #undef again, so defining more than one of SCPLX/SREAL/DREAL/DCPLX redefines TYPE etc. */
+#ifdef SCPLX
+#define TYPE Complex
+#define SCALAR Complex *
+#define PREC c
+#define CSHUF 177  /* 0xB1 */
+#define LSHUF 160  /* 0xA0 */
+#define HSHUF 245  /* 0xF5 */
+#define ISHUF 13*17  /* == 221 (0xDD); the product is not parenthesised */
+#define RSHUF 8*17  /* == 136 (0x88); the product is not parenthesised */
+#define SINGLE
+#define DIV 2
+/* #ifdef Conj_ */
+/*   static const TYPE signd[2]={{-1.0,1.0},{-1.0,1.0}}; */
+/* #else */
+  static const TYPE signd[2]={{1.0,-1.0},{1.0,-1.0}};  /* file-scope static: one copy per including translation unit */
+/* #endif */
+#endif
+/* SREAL: float elements; SCALAR is passed by value (the complex sections use a pointer) */
+#ifdef SREAL
+#define TYPE float
+#define SCALAR float
+#define PREC s
+#define SINGLE
+#define REAL
+#define DIV 1
+#endif
+/* DREAL: double elements; SINGLE stays undefined */
+#ifdef DREAL
+#define TYPE double
+#define SCALAR double
+#define PREC d
+#define REAL
+#define DIV 2
+#endif
+/* DCPLX: Dcomplex elements; neither SINGLE nor REAL is defined */
+#ifdef DCPLX
+#define TYPE Dcomplex
+#define SCALAR Dcomplex *
+#define PREC z
+#define CSHUF 1
+#define LSHUF 0
+#define HSHUF 3
+#define ISHUF 3
+#define RSHUF 0
+#define DIV 4  /* DIV is 1/2/2/4 for s/c/d/z: presumably the element size in 4-byte units -- confirm */
+/* #ifdef Conj_ */
+/*   static const TYPE signd[1]={{-1.0,1.0}}; */
+/* #else */
+  static const TYPE signd[1]={{1.0,-1.0}};  /* file-scope static: one copy per including translation unit */
+/* #endif */
+#endif
+
+#undef M11  /* token-arithmetic tables: M<b><a> == a - b; this row is b = 1, a = 1..8 */
+#define M11 0
+#undef M12
+#define M12 1
+#undef M13
+#define M13 2
+#undef M14
+#define M14 3
+#undef M15
+#define M15 4
+#undef M16
+#define M16 5
+#undef M17
+#define M17 6
+#undef M18
+#define M18 7
+/* row b = 2, a = 3..8 (there is no M21 or M22 entry) */
+#undef M23
+#define M23 1
+#undef M24
+#define M24 2
+#undef M25
+#define M25 3
+#undef M26
+#define M26 4
+#undef M27
+#define M27 5
+#undef M28
+#define M28 6
+/* row b = 3, a = 3..8 */
+#undef M33
+#define M33 0
+#undef M34
+#define M34 1
+#undef M35
+#define M35 2
+#undef M36
+#define M36 3
+#undef M37
+#define M37 4
+#undef M38
+#define M38 5
+/* P1<a> == a + 1 for a = 0..6; no other P row is defined */
+#undef P10
+#define P10 1
+#undef P11
+#define P11 2
+#undef P12
+#define P12 3
+#undef P13
+#define P13 4
+#undef P14
+#define P14 5
+#undef P15
+#define P15 6
+#undef P16
+#define P16 7
+/* M(a_,b_): expand both arguments, then paste M<b_><a_> (note the swapped order), which names a_ - b_ */
+#undef XM
+#define XM(a_,b_)     M ## b_ ## a_
+#undef M
+#define M(a_,b_)      XM(a_,b_)
+/* P(a_,b_): expand both arguments, then paste P<b_><a_>; with b_ == 1 this names a_ + 1 */
+#undef XP
+#define XP(a_,b_)     P ## b_ ## a_
+#undef P
+#define P(a_,b_)      XP(a_,b_)
+
+#undef mex  /* mex(r): the string "%%e<r>"; r is an operand of ##, so it is pasted without prior macro expansion */
+#define mex(a_)       str(%%e ## a_)
+#undef msx  /* msx(i): the string "%%st(<i>)" */
+#define msx(a_)       "%%st(" str(a_) ")"
+/* integer helpers; their arguments are expanded before reaching mex(), so register aliases such as LR work */
+#undef cmp
+#define cmp(a_,b_)    "cmp " mex(a_) "," mex(b_) "\n\t"
+#undef icmpr  /* like cmp, but the second operand is the memory at (%e<b_>) */
+#define icmpr(a_,b_)    "cmp " mex(a_) ",(" mex(b_) ")\n\t"
+#undef f  /* f/pfx: prefetch<a_> of b_(%e<c_>) or b_(%e<c_>,%e<d_>,e_); c_ and d_ are stringified unexpanded */
+#define f(a_,b_,c_)   "prefetch" str(a_) " " str(b_) "(%%e" #c_ ")\n\t"
+#undef pfx
+#define pfx(a_,b_,c_,d_,e_)   "prefetch" str(a_) " " str(b_) "(%%e" #c_ ",%%e" #d_ "," str(e_) ")\n\t"
+#undef a  /* a/m: addl / imul of immediate a_ into register %e<b_> */
+#define a(a_,b_)      "addl $" str(a_) "," mex(b_) "\n\t"
+#undef m
+#define m(a_,b_)      "imul $" str(a_) "," mex(b_) "\n\t"
+#undef pop  /* pop/push: popl / pushl of register %e<a_> */
+#define pop(a_)       "popl %%e" str(a_) "\n\t"
+#undef push
+#define push(a_)      "pushl %%e" str(a_) "\n\t"
+#undef d  /* NOTE(review): x86 idiv has no immediate-operand form, so this string should not assemble -- confirm unused */
+#define d(a_,b_)      "idiv $" str(a_) "," mex(b_) "\n\t"
+#undef shl  /* shl/shr: shift register %e<b_> by immediate a_ */
+#define shl(a_,b_)    "shl $" str(a_) "," mex(b_) "\n\t"
+#undef shr
+#define shr(a_,b_)    "shr $" str(a_) "," mex(b_) "\n\t"
+#undef mm  /* mm: mov of immediate a_ into register %e<b_> */
+#define mm(a_,b_)     "mov $" str(a_) "," mex(b_) "\n\t"
+#undef ra  /* ra/rs: register-to-register addl / subl, %e<a_> into %e<b_> */
+#define ra(a_,b_)     "addl %%e" str(a_) "," mex(b_) "\n\t"
+#undef rs
+#define rs(a_,b_)     "subl %%e" str(a_) "," mex(b_) "\n\t"
+
+#undef fl  /* x87 helpers; fl/fp: fldl / fstpl on the memory operand a_(%e<b_>) */
+#define fl(a_,b_)     "fldl " str(a_) "(" mex(b_) ")\n\t"
+#undef fp
+#define fp(a_,b_)     "fstpl " str(a_) "(" mex(b_) ")\n\t"
+#undef fd  /* fd: fld of stack register st(a_) */
+#define fd(a_)        "fld " msx(a_) "\n\t"
+#undef fap  /* fap/fmp: faddp / fmulp with operands st(a_),st(b_) */
+#define fap(a_,b_)    "faddp " msx(a_) "," msx(b_) "\n\t"
+/* #define fsp(a_)       fx(a_) "fsubp %%st," msx(a_) "\n\t" */
+#undef fsp  /* fsp: fsubrp %st,st(a_); the commented-out line above is an earlier fxch + fsubp form */
+#define fsp(a_)       "fsubrp %%st," msx(a_) "\n\t"
+#undef fmp
+#define fmp(a_,b_)    "fmulp " msx(a_) "," msx(b_) "\n\t"
+#undef fa  /* fa/fm: non-popping fadd / fmul with operands st(a_),st(b_) */
+#define fa(a_,b_)     "fadd " msx(a_) "," msx(b_) "\n\t"
+#undef fm
+#define fm(a_,b_)     "fmul " msx(a_) "," msx(b_) "\n\t"
+#undef faa  /* faa/fma: faddl / fmull with the memory operand a_(%e<b_>) */
+#define faa(a_,b_)    "faddl " str(a_) "(" mex(b_) ")\n\t"
+#undef fma
+#define fma(a_,b_)    "fmull " str(a_) "(" mex(b_) ")\n\t"
+#undef fz  /* fz: fldz */
+#define fz            "fldz\n\t"
+#undef fx  /* fx/fx1: fxch with st(a_), or with no explicit operand */
+#define fx(a_)        "fxch " msx(a_) "\n\t"
+#undef fx1
+#define fx1           "fxch\n\t"
+#undef fc  /* fc: fstp into st(a_) */
+#define fc(a_)        "fstp " msx(a_) "\n\t"
+
+
+#ifndef ATHLON
+
+
+#if defined(DREAL) || defined(DCPLX)  /* SSESUF is the tail appended to each SSE mnemonic stem below */
+#undef SSESUF
+#define SSESUF "d "  /* double builds, e.g. "addp" SSESUF gives "addpd " */
+#undef RS4
+#define RS4 16
+#undef RS
+#define RS 4
+#else
+#undef SSESUF
+#define SSESUF "s "  /* single builds, e.g. "addp" SSESUF gives "addps " */
+#undef RS4
+#define RS4 16  /* RS4 and RS have the same values in both branches */
+#undef RS
+#define RS  4
+#endif
+
+#undef mxx  /* mxx(n): the string "%%xmm<n>"; n is an operand of ##, so it is pasted without prior macro expansion */
+#define mxx(a_)        str(%%xmm ## a_)
+#undef prp  /* NOTE(review): rcpp/rcps and movhlp/movlhp stems have no "d" form as far as I know -- confirm unused in double builds */
+#define prp(a_,b_)     "rcpp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef prps
+#define prps(a_,b_)    "rcps" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pann  /* register-register forms take (source, destination) in AT&T operand order */
+#define pann(a_,b_)    "andnp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef psqs
+#define psqs(a_,b_)    "sqrts" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef por
+#define por(a_,b_)     "orp"   SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pan
+#define pan(a_,b_)     "andp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pcm  /* pcm/pcms: packed / scalar compare with predicate immediate a_ */
+#define pcm(a_,b_,c_)  "cmpp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef pcms
+#define pcms(a_,b_,c_) "cmps" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef pax
+#define pax(a_,b_)     "maxp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef paxs
+#define paxs(a_,b_)    "maxs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pd
+#define pd(a_,b_)      "divp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pdsr
+#define pdsr(a_,b_)    "divs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pxx
+#define pxx(a_,b_)     "xorp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef px  /* px: xor a register with itself, i.e. zero it */
+#define px(a_)         "xorp" SSESUF mxx(a_) "," mxx(a_) "\n\t"
+#undef pm
+#define pm(a_,b_)      "mulp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pa
+#define pa(a_,b_)      "addp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pmm  /* pmm/pam: packed multiply / add with the memory operand a_(%e<b_>) into xmm<c_> */
+#define pmm(a_,b_,c_)  "mulp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pam
+#define pam(a_,b_,c_)  "addp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pl  /* pl/pla: load a_(%e<b_>) into xmm<c_> with movup (unaligned) / movap (aligned) */
+#define pl(a_,b_,c_)   "movup" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pla
+#define pla(a_,b_,c_)  "movap" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pu  /* pu/punt/pua/pud: store xmm<a_> to b_(%e<c_>) with movup / movntp / movap / movlp */
+#define pu(a_,b_,c_)   "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef punt
+#define punt(a_,b_,c_) "movntp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pua
+#define pua(a_,b_,c_)  "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pud
+#define pud(a_,b_,c_)  "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pudr
+#define pudr(a_,b_)    "movlp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pc  /* pc: register-to-register copy with movap */
+#define pc(a_,b_)      "movap" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef ps  /* ps: shufp with immediate a_ */
+#define ps(a_,b_,c_)   "shufp" SSESUF " $" str(a_) "," mxx(b_) "," mxx(c_) "\n\t"
+#undef phl
+#define phl(a_,b_)     "movhlp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pus  /* pus/pls/pld: scalar store, scalar load and movlp load on a memory operand */
+#define pus(a_,b_,c_)  "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pls
+#define pls(a_,b_,c_)  "movs" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pld
+#define pld(a_,b_,c_)  "movlp" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef plh
+#define plh(a_,b_)     "movlhp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pas  /* pas/pms: scalar add / multiply with a memory operand; pcs/pasr/pmsr are the register forms */
+#define pas(a_,b_,c_)  "adds" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pms
+#define pms(a_,b_,c_)  "muls" SSESUF str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pcs
+#define pcs(a_,b_)     "movs" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pasr
+#define pasr(a_,b_)    "adds" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pmsr
+#define pmsr(a_,b_)    "muls" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef pul  /* pul/puh: unpcklp / unpckhp */
+#define pul(a_,b_)     "unpcklp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+#undef puh
+#define puh(a_,b_)     "unpckhp" SSESUF mxx(a_) "," mxx(b_) "\n\t"
+/* ...x variants: same moves/adds with an indexed operand off(%e<base>,%e<index>,scale); the scale is stringified unexpanded */
+#undef plsx
+#define plsx(a_,b_,c_,d_,e_) \
+                       "movs" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plx
+#define plx(a_,b_,c_,d_,e_) \
+                       "movup" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plax
+#define plax(a_,b_,c_,d_,e_) \
+                       "movap" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pasx
+#define pasx(a_,b_,c_,d_,e_) \
+                       "adds" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pusx
+#define pusx(a_,b_,c_,d_,e_) \
+                       "movs" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pux
+#define pux(a_,b_,c_,d_,e_) \
+                       "movup" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef puax
+#define puax(a_,b_,c_,d_,e_) \
+                       "movap" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pudx
+#define pudx(a_,b_,c_,d_,e_) \
+                       "movlp" SSESUF mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+/* pldx: movlp load from an indexed operand */
+#undef pldx
+#define pldx(a_,b_,c_,d_,e_) \
+                       "movlp" SSESUF str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+
+#else
+
+#undef RS4  /* ATHLON build: RS4/RS are 8 and 2 here, versus 16 and 4 in the SSE branch */
+#define RS4 8
+#undef RS
+#define RS  2
+/* the same macro names as the SSE branch, re-targeted at %mm registers with MMX/3DNow! mnemonics */
+#undef mxx
+#define mxx(a_)       str(%%mm ## a_)
+#undef pul
+#define pul(a_,b_)    "punpckldq " mxx(a_) "," mxx(b_) "\n\t"
+#undef puh
+#define puh(a_,b_)    "punpckhdq " mxx(a_) "," mxx(b_) "\n\t"
+/* px: pxor of a register with itself, i.e. zero it */
+#undef px
+#define px(a_)        "pxor " mxx(a_) "," mxx(a_) "\n\t"
+#undef pm
+#define pm(a_,b_)     "pfmul " mxx(a_) "," mxx(b_) "\n\t"
+#undef pa
+#define pa(a_,b_)     "pfadd " mxx(a_) "," mxx(b_) "\n\t"
+#undef pac  /* pac (pfacc) has no counterpart in the SSE branch */
+#define pac(a_,b_)    "pfacc " mxx(a_) "," mxx(b_) "\n\t"
+#undef pmm
+#define pmm(a_,b_,c_) "pfmul " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pam
+#define pam(a_,b_,c_) "pfadd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pl  /* pl and pla expand to the same movq load in this branch */
+#define pl(a_,b_,c_)  "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pla
+#define pla(a_,b_,c_) "movq " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+#undef pu
+#define pu(a_,b_,c_)  "movq " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pc
+#define pc(a_,b_)     "movq " mxx(a_) "," mxx(b_) "\n\t"
+#undef ps  /* the shuffle immediate a_ is ignored here; pswapd is emitted regardless of its value */
+#define ps(a_,b_,c_)  "pswapd " mxx(b_) "," mxx(c_) "\n\t"
+#undef phl  /* phl/plh expand to the same strings as puh/pul above */
+#define phl(a_,b_)    "punpckhdq " mxx(a_) "," mxx(b_) "\n\t"
+#undef plh
+#define plh(a_,b_)    "punpckldq " mxx(a_) "," mxx(b_) "\n\t"
+#undef pus  /* pus/pls: movd store / load on a memory operand */
+#define pus(a_,b_,c_) "movd " mxx(a_) "," str(b_) "(" mex(c_) ")\n\t"
+#undef pls
+#define pls(a_,b_,c_) "movd " str(a_) "(" mex(b_) ")," mxx(c_) "\n\t"
+/* ...x variants: the same moves with an indexed operand off(%e<base>,%e<index>,scale); the scale is stringified unexpanded */
+#undef plsx
+#define plsx(a_,b_,c_,d_,e_) \
+                      "movd " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef plx
+#define plx(a_,b_,c_,d_,e_)  \
+                      "movq " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pasx  /* NOTE(review): addss is an SSE mnemonic but mxx() names an MMX register here -- confirm this is unused */
+#define pasx(a_,b_,c_,d_,e_) \
+                      "addss " str(a_) "(" mex(b_) "," mex(c_) "," #d_ ")," mxx(e_) "\n\t"
+#undef pusx
+#define pusx(a_,b_,c_,d_,e_) \
+                      "movd " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#undef pux
+#define pux(a_,b_,c_,d_,e_)  \
+                      "movq " mxx(a_) "," str(b_) "(" mex(c_) "," mex(d_) "," #e_ ")\n\t"
+#endif
+
+#endif /* CAMM_UTIL_H */
-- 
cgit v1.2.3-70-g09d2