summaryrefslogtreecommitdiff
path: root/kaldi_io/src/tools/ATLAS/include/atlas_prefetch.h
blob: 83ee2df31a458d536452e8f69a745195fb9953a4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#ifndef ATLAS_PREFETCH_H
#define ATLAS_PREFETCH_H
/*
 * The AltiVec prefetch model does not map well onto SSE-style prefetch
 * instructions, so AltiVec gets its own special prefetch commands.
 */
#if defined(ATL_AltiVec)
   #include "atlas_altivec.h"
#endif
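/*
 * Prefetch macros provided by this header:
 * ATL_pfl1R(mem)  : fetch location mem to L1, with intent to read *only*
 * ATL_pfl1W(mem)  : fetch location mem to L1, with intent to read/write
 * ATL_pfl1WO(mem) : fetch location mem to L1, with intent to write ONLY
 * ATL_pfl2R(mem), ATL_pfl2W(mem) : as the L1 variants, but fetching to L2
 */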

/*
 * Architecture dispatch: exactly one branch of this chain is selected, and
 * it may define
 *    ATL_pfl1R / ATL_pfl1W / ATL_pfl1WO  together with ATL_GOT_L1PREFETCH
 *    ATL_pfl2R / ATL_pfl2W / ATL_pfl2WO  together with ATL_GOT_L2PREFETCH
 * and optionally the cache line sizes ATL_L1LS / ATL_L2LS.
 * A branch that matches the architecture but not the compiler defines
 * nothing at all.
 */
#if defined(ATL_3DNow)
   /*
    * 3DNow!: L1 read uses "prefetch", L1 write uses "prefetchw".  L2 macros
    * are only provided when ATL_SSE1 is also defined (they use prefetcht1).
    */
   #ifdef __GNUC__
      #define ATL_pfl1R(mem) \
         __asm__ __volatile__ ("prefetch %0" : : "m" (*((char *)(mem))))
      #define ATL_pfl1W(mem) \
         __asm__ __volatile__ ("prefetchw %0" : : "m" (*((char *)(mem))))
      #define ATL_pfl1WO ATL_pfl1W
      #define ATL_GOT_L1PREFETCH
      #ifdef ATL_SSE1
         #define ATL_pfl2R(mem) \
            __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem))))
         #define ATL_pfl2W(mem) \
            __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem))))
         #define ATL_pfl2WO ATL_pfl2W
         #define ATL_GOT_L2PREFETCH
      #endif
   #endif
#elif defined(ATL_SSE1) || defined (ATL_SSE2) /* SSE prefetch is available */
   /*
    * SSE: read and write variants issue the same instruction at each level
    * (prefetchnta for L1, prefetcht1 for L2).
    */
   #ifdef __GNUC__
      #define ATL_pfl1R(mem) \
         __asm__ __volatile__ ("prefetchnta %0" : : "m" (*((char *)(mem))))
      #define ATL_pfl1W(mem) \
         __asm__ __volatile__ ("prefetchnta %0" : : "m" (*((char *)(mem))))
      #define ATL_pfl1WO ATL_pfl1W
      #define ATL_GOT_L1PREFETCH

      #define ATL_pfl2R(mem) \
         __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem))))
      #define ATL_pfl2W(mem) \
         __asm__ __volatile__ ("prefetcht1 %0" : : "m" (*((char *)(mem))))
      #define ATL_pfl2WO ATL_pfl2W
      #define ATL_GOT_L2PREFETCH
   #endif
#elif defined(__SUNPRO_C) && defined(__sparc) /* && __SUNPRO_CC > 0x600 */
   /*
    * Sun compiler on SPARC: use the intrinsics from <sun_prefetch.h>.  The
    * L1 and L2 macros map to the same intrinsic calls.
    */
   #include <sun_prefetch.h>
   #define ATL_pfl1R(mem) sparc_prefetch_read_many((void*)(mem))
   #define ATL_pfl1W(mem) sparc_prefetch_write_many((void*)(mem))
   #define ATL_GOT_L1PREFETCH
   #define ATL_pfl2R(mem) sparc_prefetch_read_many((void*)(mem))
   #define ATL_pfl2W(mem) sparc_prefetch_write_many((void*)(mem))
   #define ATL_GOT_L2PREFETCH
#elif defined(ATL_ARCH_21264)
   /*
    * Alpha 21264: prefetch is expressed as loads into $f31 (ldt for read,
    * lds for read/write) and wh64 for write-only.  L1 only.
    */
   #ifdef __GNUC__
      #define ATL_pfl1R(mem) \
         __asm__ __volatile__ ("ldt $f31, %0" : : "m" (*((char *)(mem))))
      #define ATL_pfl1W(mem) \
         __asm__ __volatile__ ("lds $f31, %0" : : "m" (*((char *)(mem))))
      #define ATL_pfl1WO(mem) \
         __asm__ __volatile__ ("wh64 %0" : : "m" (*((char *)(mem))))
      #define ATL_GOT_L1PREFETCH
   #elif defined(__DECC)
      #include "c_asm.h"
      #define ATL_pfl1R(mem) asm ("ldt %f31,(%a0) ;", mem)
      #define ATL_pfl1W(mem) asm ("lds %f31,(%a0) ;", mem)
      #define ATL_pfl1WO(mem) asm ("wh64 (%a0) ;", mem)
      #define ATL_GOT_L1PREFETCH
   #endif
/*
 * Note: SunUS5/10 seems to get no benefit from prefetch, so it is not
 * enabled there.
 */
#elif defined(ATL_ARCH_USIV) || defined(ATL_ARCH_SunUSIII) || \
      defined(ATL_ARCH_SunUSII) || defined(ATL_ARCH_SunUSI)
   /*
    * UltraSPARC with gcc: "prefetch addr,fcn" where the trailing number
    * selects the prefetch function (0 and 3 for reads, 2 for writes).
    */
   #ifdef __GNUC__
      #define ATL_pfl1R(mem) \
         __asm__ __volatile__ ("prefetch %0,0" : : "m" (*((char *)(mem))))
      #define ATL_pfl1W(mem) \
         __asm__ __volatile__ ("prefetch %0,2" : : "m" (*((char *)(mem))))
      #define ATL_GOT_L1PREFETCH
      #define ATL_pfl2R(mem) \
         __asm__ __volatile__ ("prefetch %0,3" : : "m" (*((char *)(mem))))
      #define ATL_pfl2W(mem) \
         __asm__ __volatile__ ("prefetch %0,2" : : "m" (*((char *)(mem))))
      #define ATL_GOT_L2PREFETCH
   #endif
/*
 * These instructions give a gigantic slowdown on POWER4, so they are not
 * enabled there; POWER4 just uses the gcc builtin from the last branch.
 *
 * NOTE(review): this condition used to test ATL_ARCH_PPCG5 twice.  The
 * redundant operand has been removed, which does not change which targets
 * match.  If a second, different architecture macro was intended, it still
 * needs to be added here -- confirm against the supported-arch list.
 */
#elif defined(ATL_ARCH_PPCG5) || defined(ATL_ARCH_POWER5)
   #if defined(__GNUC__) || defined(__IBM_GCC_ASM)
      #define ATL_pfl1R(mem) \
         __asm__ __volatile__ ("dcbt  0, %0, 0" : : "r" ((mem)))
      #define ATL_pfl1W(mem) \
         __asm__ __volatile__ ("dcbtst  0, %0" : : "r" ((mem)))
      #define ATL_pfST(mem) \
         __asm__ __volatile__ ("dcbt  0, %0, 1" : : "r" ((mem)))
      /*
       * ATL_pfl1STi rewrites its first argument in place (mem is both the
       * asm input and output), so mem must be a modifiable lvalue; str must
       * be a compile-time constant because it is passed with an "i"
       * constraint.
       */
      #define ATL_pfl1STi(mem, str) \
        __asm__ __volatile__ ("rlwinm %0, %0, 0, 0, 24\n\t" \
                              "ori %0, %0, 96+%2\n\t" \
                              "dcbt 0, %0, 8"  \
                              : "=r" (mem) \
                              : "0" (mem), "i" (str))

      #define ATL_GOT_L1PREFETCH
      #define ATL_L1LS 128
   #endif
#elif defined(ATL_ARCH_IA64Itan) || defined(ATL_ARCH_IA64Itan2)
/*
 * Have to use nt2, because the fpu ignores L1.
 * NOTE: icc is left to prefetch on its own; the intrinsic version is kept
 * (disabled by "&& 0") for reference only.
 */
   #if defined(__ECC) && 0
      #include "ia64intrin.h"
      #define ATL_pfl1R(mem) __lfetch(2, (mem))
      #define ATL_pfl1W(mem)  __lfetch_excl(2, (mem))
      #define ATL_GOT_L1PREFETCH
   #elif defined(__GNUC__) && !defined(__ECC)
      #define ATL_pfl1R(mem) \
         __asm__ ("    lfetch.nt2  [%0]": : "r"((void *)(mem)))
      #define ATL_pfl1W(mem) \
         __asm__ ("    lfetch.excl     [%0]": : "r"((void *)(mem)))
      #define ATL_GOT_L1PREFETCH
   #endif
#elif defined(ATL_ARCH_HPPA20) && defined(__GNUC__)
   /* HP PA-RISC 2.0: prefetch is expressed as loads targeting %r0 */
   #define ATL_pfl1R(mem) \
      __asm__ __volatile__ ("ldw %0, %%r0" : : "m" (*((char *)(mem))))
   #define ATL_pfl1W(mem) \
      __asm__ __volatile__ ("ldd %0, %%r0" : : "m" (*((char *)(mem))))
   #define ATL_GOT_L1PREFETCH
#elif defined(ATL_AltiVec) && !defined(ATL_pfl1R)
   /*
    * AltiVec: emulate the generic L1 macros with ATL_pfavR / ATL_pfavW
    * unless the user disabled that by defining ATL_NoFakePF.
    */
   #ifndef ATL_NoFakePF
      /* 33619968 is ATL_GetCtrl(0, 1, 2), or fetch 1 32-byte block */
      #define ATL_pfl1R(mem) ATL_pfavR(mem, 33619968, 3)
      #define ATL_pfl1W(mem) ATL_pfavW(mem, 33619968, 2)
      #define ATL_GOT_L1PREFETCH
   #endif
#elif defined(ATL_ARCH_MIPSICE9) && defined(__GNUC__)
   /* MIPS ICE9: "pref" with hint 6 for reads and hint 7 for writes */
   #define ATL_pfl1R(mem) \
      __asm__ __volatile__ ("pref 6,%0" : : "m" (*((char *)(mem))))
   #define ATL_pfl1W(mem) \
      __asm__ __volatile__ ("pref 7,%0" : : "m" (*((char *)(mem))))
   #define ATL_GOT_L1PREFETCH
   #define ATL_L1LS 32
   #define ATL_L2LS 64
#elif defined(__GNUC__)  /* last ditch, use gcc predefined func */
   /* second argument: 0 = read, 1 = write; third argument: locality 3 */
   #define ATL_pfl1R(mem) __builtin_prefetch(mem, 0, 3)
   #define ATL_pfl1W(mem) __builtin_prefetch(mem, 1, 3)
   #define ATL_GOT_L1PREFETCH
#endif
/*
 * Post-processing of whatever the architecture section defined.  After this
 * point all six macros ATL_pfl{1,2}{R,W,WO} are always defined, either as a
 * real prefetch or as an empty macro.
 *
 * Step 1: if a read/write prefetch exists but no dedicated write-only
 * variant was supplied, alias the write-only macro to the read/write one.
 * This is done for both cache levels, so ATL_pfl2WO exists whenever
 * ATL_pfl2W does (several architectures define only ATL_pfl2R/ATL_pfl2W).
 */
#if defined(ATL_pfl1W) && !defined(ATL_pfl1WO)
   #define ATL_pfl1WO ATL_pfl1W
#endif
#if defined(ATL_pfl2W) && !defined(ATL_pfl2WO)
   #define ATL_pfl2WO ATL_pfl2W
#endif

/*
 * Step 2: defining ATL_NOL1PREFETCH and/or ATL_NOL2PREFETCH before this
 * header is included discards the real prefetch macros for that level, so
 * that the empty fallbacks below take over.
 */
#ifdef ATL_NOL1PREFETCH
   #ifdef ATL_GOT_L1PREFETCH
      #undef ATL_pfl1R
      #undef ATL_pfl1W
      #undef ATL_pfl1WO
      #undef ATL_GOT_L1PREFETCH
   #endif
#endif
#ifdef ATL_NOL2PREFETCH
   #ifdef ATL_GOT_L2PREFETCH
      #undef ATL_pfl2R
      #undef ATL_pfl2W
      #undef ATL_pfl2WO
      #undef ATL_GOT_L2PREFETCH
   #endif
#endif

/*
 * Step 3: with no real prefetch for a level, define empty macros so that
 * the preprocessor removes the calls from the code.  The write-only macro
 * is included for L2 as well as L1, so code using ATL_pfl2WO still builds.
 */
#ifndef ATL_GOT_L1PREFETCH  /* dummy calls cpp takes out of code */
   #define ATL_pfl1R(mem)
   #define ATL_pfl1W(mem)
   #define ATL_pfl1WO(mem)
#endif
#ifndef ATL_GOT_L2PREFETCH  /* dummy calls cpp takes out of code */
   #define ATL_pfl2R(mem)
   #define ATL_pfl2W(mem)
   #define ATL_pfl2WO(mem)
#endif

/*
 * Cache line sizes for L1 and L2.  An architecture section may already have
 * set either value; otherwise L1 defaults to 64 and L2 defaults to whatever
 * L1 is.
 */
#if !defined(ATL_L1LS)
   #define ATL_L1LS 64
#endif
#if !defined(ATL_L2LS)
   #define ATL_L2LS ATL_L1LS
#endif

#endif