/*
 * Software path, 8 lanes, fp32 -> fp16: expands to a comma-separated list of
 * eight fp16_ieee_from_fp32_value() calls, one per element [0]..[7] of
 * fp32x8_data.
 *
 * - fp32x8_data is expanded eight times, so it must be free of side effects.
 * - rounding_mode is accepted but does not appear in the visible expansion.
 * - NOTE(review): the tokens that wrap this list (the opening line after the
 *   #define and the closing line after element [7]) are not visible in this
 *   excerpt; presumably a vector initializer -- confirm against the full file.
 */
14 #define _SW_CVT_PS_PH_256(fp32x8_data, rounding_mode) \
16 fp16_ieee_from_fp32_value((fp32x8_data)[0]), \
17 fp16_ieee_from_fp32_value((fp32x8_data)[1]), \
18 fp16_ieee_from_fp32_value((fp32x8_data)[2]), \
19 fp16_ieee_from_fp32_value((fp32x8_data)[3]), \
20 fp16_ieee_from_fp32_value((fp32x8_data)[4]), \
21 fp16_ieee_from_fp32_value((fp32x8_data)[5]), \
22 fp16_ieee_from_fp32_value((fp32x8_data)[6]), \
23 fp16_ieee_from_fp32_value((fp32x8_data)[7]) \
/*
 * Software path, 8 lanes, fp16 -> fp32: expands to a comma-separated list of
 * eight fp16_ieee_to_fp32_value() calls, one per element [0]..[7] of
 * fp16x8_data.
 *
 * - fp16x8_data is expanded eight times, so it must be free of side effects.
 * - NOTE(review): the tokens that wrap this list are not visible in this
 *   excerpt; presumably a vector initializer -- confirm against the full file.
 */
30 #define _SW_CVT_PH_PS_256(fp16x8_data) \
32 fp16_ieee_to_fp32_value((fp16x8_data)[0]), \
33 fp16_ieee_to_fp32_value((fp16x8_data)[1]), \
34 fp16_ieee_to_fp32_value((fp16x8_data)[2]), \
35 fp16_ieee_to_fp32_value((fp16x8_data)[3]), \
36 fp16_ieee_to_fp32_value((fp16x8_data)[4]), \
37 fp16_ieee_to_fp32_value((fp16x8_data)[5]), \
38 fp16_ieee_to_fp32_value((fp16x8_data)[6]), \
39 fp16_ieee_to_fp32_value((fp16x8_data)[7]) \
/*
 * Software path, 4 lanes, fp32 -> fp16: expands to a comma-separated list of
 * four fp16_ieee_from_fp32_value() calls, one per element [0]..[3] of
 * fp32x4_data.
 *
 * - fp32x4_data is expanded four times, so it must be free of side effects.
 * - rounding_mode is accepted but does not appear in the visible expansion.
 * - NOTE(review): the tokens that wrap this list are not visible in this
 *   excerpt -- confirm against the full file.
 */
46 #define _SW_CVT_PS_PH_128(fp32x4_data, rounding_mode) \
48 fp16_ieee_from_fp32_value((fp32x4_data)[0]), \
49 fp16_ieee_from_fp32_value((fp32x4_data)[1]), \
50 fp16_ieee_from_fp32_value((fp32x4_data)[2]), \
51 fp16_ieee_from_fp32_value((fp32x4_data)[3]) \
/*
 * Software path, 4 lanes, fp16 -> fp32: expands to a comma-separated list of
 * four fp16_ieee_to_fp32_value() calls, one per element [0]..[3] of
 * fp16x4_data.
 *
 * - fp16x4_data is expanded four times, so it must be free of side effects.
 * - NOTE(review): the tokens that wrap this list are not visible in this
 *   excerpt -- confirm against the full file.
 */
58 #define _SW_CVT_PH_PS_128(fp16x4_data) \
60 fp16_ieee_to_fp32_value((fp16x4_data)[0]), \
61 fp16_ieee_to_fp32_value((fp16x4_data)[1]), \
62 fp16_ieee_to_fp32_value((fp16x4_data)[2]), \
63 fp16_ieee_to_fp32_value((fp16x4_data)[3]) \
/*
 * Issues the vcvtps2ph instruction directly through inline assembly instead
 * of going through a compiler intrinsic.
 *
 * a     source operand, passed in an SSE register ("x"); note it is typed
 *       __m128i here.
 * imm8  emitted as the instruction's immediate operand ("i").
 *
 * NOTE(review): the "i" constraint needs a compile-time constant. imm8 is an
 * ordinary function parameter, so this presumably only assembles when the call
 * is inlined with a constant argument -- confirm at the intended -O levels.
 * NOTE(review): the rest of this function (return statement and closing
 * brace) is not visible in this excerpt.
 */
79 static inline __m128i __smaug_vcvtps2ph(__m128i a,
int imm8) {
/* Zero-initialized because res is a read-write ("+") operand below. */
80 __m128i res = (__m128i){ 0 };
/*
 * Operand map: %0 = res (register or memory), %1 = a, %2 = imm8.
 * NOTE(review): res is declared "+", i.e. read and written; a write-only
 * "=" constraint may be sufficient -- confirm before changing.
 */
81 __asm__
volatile(
"vcvtps2ph %2, %1, %0" :
"+xm"(res) :
"x"(a),
"i"(imm8) :);
/*
 * Issues the vcvtph2ps instruction directly through inline assembly instead
 * of going through a compiler intrinsic.
 *
 * a  source operand; may be placed in an SSE register or in memory ("xm").
 *
 * The result variable is declared as __m128i, matching the declared return
 * type.
 * NOTE(review): the rest of this function (return statement and closing
 * brace) is not visible in this excerpt.
 */
93 static inline __m128i __smaug_vcvtph2ps(__m128i a) {
/* Zero-initialized because res is a read-write ("+") operand below. */
94 __m128i res = (__m128i){ 0 };
/* Operand map: %0 = res (SSE register), %1 = a. */
95 __asm__
volatile(
"vcvtph2ps %1, %0" :
"+x"(res) :
"xm"(a) :);
/*
 * 32-byte vector of float, declared with the GCC/Clang vector_size extension.
 * Used by the 256-bit inline-asm conversion helpers as their wide operand type.
 */
107 typedef float __smaug256 __attribute__((__vector_size__(32)));
/*
 * 256-bit-source variant of the vcvtps2ph inline-asm helper: takes a
 * __smaug256 vector and produces an __m128i.
 *
 * a     source vector, passed in a vector register ("x").
 * imm8  emitted as the instruction's immediate operand ("i").
 *
 * NOTE(review): the "i" constraint needs a compile-time constant. imm8 is an
 * ordinary function parameter, so this presumably only assembles when the call
 * is inlined with a constant argument -- confirm at the intended -O levels.
 * NOTE(review): the rest of this function (return statement and closing
 * brace) is not visible in this excerpt.
 */
108 static inline __m128i __smaug_vcvtps2ph256(__smaug256 a,
int imm8) {
/* Zero-initialized because res is a read-write ("+") operand below. */
109 __m128i res = (__m128i){ 0 };
/* Operand map: %0 = res (register or memory), %1 = a, %2 = imm8. */
110 __asm__
volatile(
"vcvtps2ph %2, %1, %0" :
"+xm"(res) :
"x"(a),
"i"(imm8) :);
/*
 * 256-bit-result variant of the vcvtph2ps inline-asm helper: takes an
 * __m128i and produces a __smaug256 vector.
 *
 * a  source operand; may be placed in an SSE register or in memory ("xm").
 *
 * NOTE(review): the rest of this function (return statement and closing
 * brace) is not visible in this excerpt.
 */
122 static inline __smaug256 __smaug_vcvtph2ps256(__m128i a) {
/* Zero-initialized because res is a read-write ("+") operand below. */
123 __smaug256 res = (__smaug256){ 0 };
/* Operand map: %0 = res (vector register), %1 = a. */
124 __asm__
volatile(
"vcvtph2ps %1, %0" :
"+x"(res) :
"xm"(a) :);
/*
 * 128-bit conversion macros, intrinsic flavour: forward directly to
 * _mm_cvtps_ph / _mm_cvtph_ps.
 * NOTE(review): the conditional that opens this branch is not visible in this
 * excerpt; presumably `#if defined(__F16C__)`, by analogy with the 256-bit
 * section -- confirm.
 */
148 #define _CVT_PS_PH_128(p4_fp32_data, rounding_mode) \
149 _mm_cvtps_ph(p4_fp32_data, rounding_mode)
150 #define _CVT_PH_PS_128(p4_fp16_data) _mm_cvtph_ps(p4_fp16_data)
152 #elif defined(__USE_F16C_ANYWAYS__)
/*
 * 128-bit conversion macros, inline-asm flavour: selected when
 * __USE_F16C_ANYWAYS__ is defined and the preceding branch was not taken.
 * Forwards to the __smaug_vcvtps2ph / __smaug_vcvtph2ps helpers.
 */
155 #define _CVT_PS_PH_128(p4_fp32_data, rounding_mode) \
156 __smaug_vcvtps2ph(p4_fp32_data, rounding_mode)
157 #define _CVT_PH_PS_128(p4_fp16_data) __smaug_vcvtph2ps(p4_fp16_data)
/*
 * 128-bit conversion macros, software flavour: emits a build-time warning and
 * forwards to the _SW_CVT_*_128 macros.
 * NOTE(review): the directive that opens this branch (presumably `#else`) and
 * the closing `#endif` are not visible in this excerpt.
 */
162 # warning "No F16C: LLVM-Tracer cannot emit IR FP convert instructions!"
166 #define _CVT_PS_PH_128(p4_fp32_data, rounding_mode) \
167 _SW_CVT_PS_PH_128(p4_fp32_data, rounding_mode)
168 #define _CVT_PH_PS_128(p4_fp16_data) _SW_CVT_PH_PS_128(p4_fp16_data)
/*
 * 256-bit conversion macros, first branch: forwards to the _SW_CVT_*_256
 * software macros.
 * NOTE(review): the condition guarding this branch is not visible in this
 * excerpt, so it is unclear from here when the software path is forced ahead
 * of the `__F16C__` check that follows -- confirm against the full file.
 */
178 #define _CVT_PS_PH_256(p8_fp32_data, rounding_mode) \
179 _SW_CVT_PS_PH_256(p8_fp32_data, rounding_mode)
180 #define _CVT_PH_PS_256(p8_fp16_data) _SW_CVT_PH_PS_256(p8_fp16_data)
182 #elif defined(__F16C__)
/*
 * 256-bit conversion macros, intrinsic flavour: selected when __F16C__ is
 * defined and the preceding branch was not taken. Forwards directly to
 * _mm256_cvtps_ph / _mm256_cvtph_ps.
 */
184 #define _CVT_PS_PH_256(p8_fp32_data, rounding_mode) \
185 _mm256_cvtps_ph(p8_fp32_data, rounding_mode)
186 #define _CVT_PH_PS_256(p8_fp16_data) _mm256_cvtph_ps(p8_fp16_data)
188 #elif defined(__USE_F16C_ANYWAYS__)
/*
 * 256-bit conversion macros, inline-asm flavour: selected when
 * __USE_F16C_ANYWAYS__ is defined and no earlier branch was taken.
 * Forwards to the __smaug_vcvtps2ph256 / __smaug_vcvtph2ps256 helpers.
 */
190 #define _CVT_PS_PH_256(p8_fp32_data, rounding_mode) \
191 __smaug_vcvtps2ph256(p8_fp32_data, rounding_mode)
192 #define _CVT_PH_PS_256(p8_fp16_data) __smaug_vcvtph2ps256(p8_fp16_data)
/*
 * 256-bit conversion macros, software flavour: emits a build-time warning and
 * forwards to the _SW_CVT_*_256 macros.
 * NOTE(review): the directive that opens this branch (presumably `#else`) and
 * the closing `#endif` are not visible in this excerpt.
 */
197 # warning "No F16C: LLVM-Tracer cannot emit IR FP convert instructions!"
201 #define _CVT_PS_PH_256(p8_fp32_data, rounding_mode) \
202 _SW_CVT_PS_PH_256(p8_fp32_data, rounding_mode)
203 #define _CVT_PH_PS_256(p8_fp16_data) _SW_CVT_PH_PS_256(p8_fp16_data)
/*
 * Scalar conversions, both forwarding to the fp16_ieee_* helper functions.
 * _CVT_SS_SH accepts rounding_mode for signature parity with the vector
 * macros but discards it: only val reaches fp16_ieee_from_fp32_value().
 * NOTE(review): any conditional surrounding these two definitions is not
 * visible in this excerpt.
 */
222 #define _CVT_SS_SH(val, rounding_mode) fp16_ieee_from_fp32_value(val)
225 #define _CVT_SH_SS(val) fp16_ieee_to_fp32_value(val)
/*
 * Wrapper around _mm_shuffle_pd that casts both vector arguments to __m128d
 * first, so callers can pass other vector types of the same size. imm8 is
 * forwarded unchanged as the shuffle control.
 */
230 #define _SHUFFLE_PD(a, b, imm8) _mm_shuffle_pd((__m128d)(a), (__m128d)(b), imm8)