SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
fp16_utils.h
1 #ifndef _FP16_UTILS_H_
2 #define _FP16_UTILS_H_
3 
4 #include <x86intrin.h>
5 #include "fp16.h"
6 
7 //=------------- SW emulation of conversion instructions --------------=//
8 
9 
/**
 * Software emulation of the 8-lane FP32 -> FP16 conversion.
 *
 * Expands to a brace-enclosed initializer list whose eight entries are
 * fp16_ieee_from_fp32_value() applied to elements 0..7 of fp32x8_data, in
 * that order. Because the expansion is an initializer list and not an
 * expression, it can only be used where such a list is valid (for example
 * as the initializer of a declaration).
 *
 * @param fp32x8_data   Indexable operand holding at least eight values. It
 *                      is expanded eight times, so it must not have side
 *                      effects.
 * @param rounding_mode Accepted so the macro has the same shape as the other
 *                      _CVT_PS_PH_* variants; it is not used here.
 */
#define _SW_CVT_PS_PH_256(fp32x8_data, rounding_mode)                          \
    {                                                                          \
        fp16_ieee_from_fp32_value((fp32x8_data)[0]),                           \
        fp16_ieee_from_fp32_value((fp32x8_data)[1]),                           \
        fp16_ieee_from_fp32_value((fp32x8_data)[2]),                           \
        fp16_ieee_from_fp32_value((fp32x8_data)[3]),                           \
        fp16_ieee_from_fp32_value((fp32x8_data)[4]),                           \
        fp16_ieee_from_fp32_value((fp32x8_data)[5]),                           \
        fp16_ieee_from_fp32_value((fp32x8_data)[6]),                           \
        fp16_ieee_from_fp32_value((fp32x8_data)[7])                            \
    }
25 
/**
 * Software emulation of the 8-lane FP16 -> FP32 conversion.
 *
 * Expands to a brace-enclosed initializer list whose eight entries are
 * fp16_ieee_to_fp32_value() applied to elements 0..7 of fp16x8_data, in that
 * order. The expansion is an initializer list, not an expression.
 *
 * @param fp16x8_data Indexable operand holding at least eight values. It is
 *                    expanded eight times, so it must not have side effects.
 */
#define _SW_CVT_PH_PS_256(fp16x8_data)                                         \
    {                                                                          \
        fp16_ieee_to_fp32_value((fp16x8_data)[0]),                             \
        fp16_ieee_to_fp32_value((fp16x8_data)[1]),                             \
        fp16_ieee_to_fp32_value((fp16x8_data)[2]),                             \
        fp16_ieee_to_fp32_value((fp16x8_data)[3]),                             \
        fp16_ieee_to_fp32_value((fp16x8_data)[4]),                             \
        fp16_ieee_to_fp32_value((fp16x8_data)[5]),                             \
        fp16_ieee_to_fp32_value((fp16x8_data)[6]),                             \
        fp16_ieee_to_fp32_value((fp16x8_data)[7])                              \
    }
41 
/**
 * Software emulation of the 4-lane FP32 -> FP16 conversion.
 *
 * Expands to a brace-enclosed initializer list whose four entries are
 * fp16_ieee_from_fp32_value() applied to elements 0..3 of fp32x4_data, in
 * that order. The expansion is an initializer list, not an expression.
 *
 * @param fp32x4_data   Indexable operand holding at least four values. It is
 *                      expanded four times, so it must not have side effects.
 * @param rounding_mode Accepted so the macro has the same shape as the other
 *                      _CVT_PS_PH_* variants; it is not used here.
 */
#define _SW_CVT_PS_PH_128(fp32x4_data, rounding_mode)                          \
    {                                                                          \
        fp16_ieee_from_fp32_value((fp32x4_data)[0]),                           \
        fp16_ieee_from_fp32_value((fp32x4_data)[1]),                           \
        fp16_ieee_from_fp32_value((fp32x4_data)[2]),                           \
        fp16_ieee_from_fp32_value((fp32x4_data)[3])                            \
    }
53 
/**
 * Software emulation of the 4-lane FP16 -> FP32 conversion.
 *
 * Expands to a brace-enclosed initializer list whose four entries are
 * fp16_ieee_to_fp32_value() applied to elements 0..3 of fp16x4_data, in that
 * order. The expansion is an initializer list, not an expression.
 *
 * @param fp16x4_data Indexable operand holding at least four values. It is
 *                    expanded four times, so it must not have side effects.
 */
#define _SW_CVT_PH_PS_128(fp16x4_data)                                         \
    {                                                                          \
        fp16_ieee_to_fp32_value((fp16x4_data)[0]),                             \
        fp16_ieee_to_fp32_value((fp16x4_data)[1]),                             \
        fp16_ieee_to_fp32_value((fp16x4_data)[2]),                             \
        fp16_ieee_to_fp32_value((fp16x4_data)[3])                              \
    }
65 
//=---------------- Manual assembly implementations ------------------=//
//
// These are used if certain side-effects of adding -mf16c are unacceptable
// (for example, we want to enable F16C without AVX).
70 
/**
 * Emit a single `vcvtps2ph` instruction through inline assembly.
 *
 * The asm template uses AT&T operand order (immediate, source, destination):
 * operand 2 is imm8, operand 1 is `a`, and operand 0 is the result.
 *
 * @param a    128-bit source vector; passed to the instruction in an SSE
 *             register ("x" constraint). It is typed __m128i here even
 *             though the instruction reads it as packed FP32 data.
 * @param imm8 Rounding-control immediate. It is bound with the "i"
 *             constraint, so it has to be an integer constant by the time
 *             the asm statement is emitted, i.e. this function must be
 *             inlined into a call site that passes a constant.
 * @return The 128-bit vector written by the instruction.
 */
static inline __m128i __smaug_vcvtps2ph(__m128i a, int imm8) {
    /*
     * The destination may be a register or memory ("xm") and is declared
     * read-write ("+") on a zero-initialized variable.
     * NOTE(review): presumably so that bytes the instruction does not write
     * in the memory-destination form stay zero -- confirm.
     */
    __m128i res = (__m128i){ 0 };
    __asm__ volatile("vcvtps2ph %2, %1, %0" : "+xm"(res) : "x"(a), "i"(imm8) :);
    return res;
}
84 
/**
 * Emit a single `vcvtph2ps` instruction through inline assembly.
 *
 * The asm template uses AT&T operand order (source, destination): operand 1
 * is `a` and operand 0 is the result.
 *
 * @param a 128-bit source vector; may be supplied from an SSE register or
 *          from memory ("xm" constraint).
 * @return The 128-bit vector written by the instruction. It is typed
 *         __m128i here even though the instruction produces packed FP32
 *         data.
 */
static inline __m128i __smaug_vcvtph2ps(__m128i a) {
    /* The destination is a zero-initialized, read-write SSE register. */
    __m128i res = (__m128i){ 0 };
    __asm__ volatile("vcvtph2ps %1, %0" : "+x"(res) : "xm"(a) :);
    return res;
}
98 
/** 32-byte vector of floats (eight lanes), declared via vector_size. */
typedef float __smaug256 __attribute__((__vector_size__(32)));

/**
 * Emit a single `vcvtps2ph` instruction on a 256-bit source through inline
 * assembly.
 *
 * The asm template uses AT&T operand order (immediate, source, destination):
 * operand 2 is imm8, operand 1 is `a`, and operand 0 is the result.
 *
 * @param a    32-byte float vector; passed to the instruction in a vector
 *             register ("x" constraint).
 * @param imm8 Rounding-control immediate. It is bound with the "i"
 *             constraint, so it has to be an integer constant by the time
 *             the asm statement is emitted, i.e. this function must be
 *             inlined into a call site that passes a constant.
 * @return The 128-bit vector written by the instruction.
 */
static inline __m128i __smaug_vcvtps2ph256(__smaug256 a, int imm8) {
    /*
     * The destination may be a register or memory ("xm") and is declared
     * read-write ("+") on a zero-initialized variable.
     */
    __m128i res = (__m128i){ 0 };
    __asm__ volatile("vcvtps2ph %2, %1, %0" : "+xm"(res) : "x"(a), "i"(imm8) :);
    return res;
}
113 
122 static inline __smaug256 __smaug_vcvtph2ps256(__m128i a) {
123  __smaug256 res = (__smaug256){ 0 };
124  __asm__ volatile("vcvtph2ps %1, %0" : "+x"(res) : "xm"(a) :);
125  return res;
126 }
127 
128 //=----------------- 128-bit conversion instructions -----------------=//
129 
/*
 * Pick the implementation behind the 4-lane conversion macros
 * _CVT_PS_PH_128 (FP32 -> FP16) and _CVT_PH_PS_128 (FP16 -> FP32):
 *
 *   1. __F16C__ defined: the compiler intrinsics _mm_cvtps_ph and
 *      _mm_cvtph_ps.
 *   2. __USE_F16C_ANYWAYS__ defined: the inline-asm wrappers
 *      __smaug_vcvtps2ph and __smaug_vcvtph2ps.
 *   3. Otherwise: the _SW_CVT_*_128 software emulation. Note that those
 *      macros expand to a brace-enclosed initializer list rather than an
 *      expression, and that rounding_mode is ignored on this path.
 */
#ifdef __F16C__

// Use built-in compiler intrinsics.
#define _CVT_PS_PH_128(p4_fp32_data, rounding_mode) \
    _mm_cvtps_ph(p4_fp32_data, rounding_mode)
#define _CVT_PH_PS_128(p4_fp16_data) _mm_cvtph_ps(p4_fp16_data)

#elif defined(__USE_F16C_ANYWAYS__)

// We can't use the compiler intrinsics, so use our own manual asm.
#define _CVT_PS_PH_128(p4_fp32_data, rounding_mode) \
    __smaug_vcvtps2ph(p4_fp32_data, rounding_mode)
#define _CVT_PH_PS_128(p4_fp16_data) __smaug_vcvtph2ps(p4_fp16_data)

#else

// Tell trace-mode builds that they are getting the software path.
#ifdef TRACE_MODE
# warning "No F16C: LLVM-Tracer cannot emit IR FP convert instructions!"
#endif

// Fall back to the SW emulations.
#define _CVT_PS_PH_128(p4_fp32_data, rounding_mode) \
    _SW_CVT_PS_PH_128(p4_fp32_data, rounding_mode)
#define _CVT_PH_PS_128(p4_fp16_data) _SW_CVT_PH_PS_128(p4_fp16_data)

#endif  // __F16C__
171 
172 //=----------------- 256-bit conversion instructions -----------------=//
173 
/*
 * Pick the implementation behind the 8-lane conversion macros
 * _CVT_PS_PH_256 (FP32 -> FP16) and _CVT_PH_PS_256 (FP16 -> FP32):
 *
 *   1. GEM5 defined: always the _SW_CVT_*_256 software emulation.
 *   2. __F16C__ defined: the compiler intrinsics _mm256_cvtps_ph and
 *      _mm256_cvtph_ps.
 *   3. __USE_F16C_ANYWAYS__ defined: the inline-asm wrappers
 *      __smaug_vcvtps2ph256 and __smaug_vcvtph2ps256.
 *   4. Otherwise: the _SW_CVT_*_256 software emulation.
 *
 * The software macros expand to a brace-enclosed initializer list rather
 * than an expression, and ignore rounding_mode.
 */

// gem5 doesn't support the 256-bit iforms, due to lack of support for YMM
// registers, so fall back to the SW emulation.
#if defined(GEM5)

#define _CVT_PS_PH_256(p8_fp32_data, rounding_mode) \
    _SW_CVT_PS_PH_256(p8_fp32_data, rounding_mode)
#define _CVT_PH_PS_256(p8_fp16_data) _SW_CVT_PH_PS_256(p8_fp16_data)

#elif defined(__F16C__)

// Use built-in compiler intrinsics.
#define _CVT_PS_PH_256(p8_fp32_data, rounding_mode) \
    _mm256_cvtps_ph(p8_fp32_data, rounding_mode)
#define _CVT_PH_PS_256(p8_fp16_data) _mm256_cvtph_ps(p8_fp16_data)

#elif defined(__USE_F16C_ANYWAYS__)

// Use the manual inline-asm wrappers.
#define _CVT_PS_PH_256(p8_fp32_data, rounding_mode) \
    __smaug_vcvtps2ph256(p8_fp32_data, rounding_mode)
#define _CVT_PH_PS_256(p8_fp16_data) __smaug_vcvtph2ps256(p8_fp16_data)

#else

// Tell trace-mode builds that they are getting the software path.
#ifdef TRACE_MODE
# warning "No F16C: LLVM-Tracer cannot emit IR FP convert instructions!"
#endif

// No F16C in HW; fall back to SW implementations.
#define _CVT_PS_PH_256(p8_fp32_data, rounding_mode) \
    _SW_CVT_PS_PH_256(p8_fp32_data, rounding_mode)
#define _CVT_PH_PS_256(p8_fp16_data) _SW_CVT_PH_PS_256(p8_fp16_data)

#endif
206 
211 //=----------------- Scalar conversion instructions -----------------=//
212 
/**
 * Scalar FP32 -> FP16 conversion.
 *
 * Expands to a single call of fp16_ieee_from_fp32_value(val).
 *
 * @param val           Value to convert; evaluated exactly once.
 * @param rounding_mode Accepted so the macro has the same shape as the
 *                      vector _CVT_PS_PH_* macros; it is not used.
 */
#define _CVT_SS_SH(val, rounding_mode) fp16_ieee_from_fp32_value(val)
223 
/**
 * Scalar FP16 -> FP32 conversion.
 *
 * Expands to a single call of fp16_ieee_to_fp32_value(val).
 *
 * @param val Value to convert; evaluated exactly once.
 */
#define _CVT_SH_SS(val) fp16_ieee_to_fp32_value(val)
226 
227 //=----------------- Miscellaneous vector instructions -----------------=//
228 
/**
 * Wrapper around _mm_shuffle_pd that accepts operands of any 128-bit vector
 * type by casting both to __m128d first.
 *
 * @param a    First source vector; cast to __m128d.
 * @param b    Second source vector; cast to __m128d.
 * @param imm8 Shuffle-control immediate, forwarded to _mm_shuffle_pd.
 */
#define _SHUFFLE_PD(a, b, imm8) \
    _mm_shuffle_pd((__m128d)(a), (__m128d)(b), (imm8))
231 
232 #endif