ETISS 0.8.0
Extendable Translating Instruction Set Simulator (version 0.8.0)
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
avx512vlbf16intrin.h
Go to the documentation of this file.
1/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
/* Refuse direct inclusion: unless __IMMINTRIN_H is already defined, stop the
 * build with a message telling the user to include <immintrin.h> instead. */
9#ifndef __IMMINTRIN_H
10#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
11#endif
12
/* Include guard for this header. */
13#ifndef __AVX512VLBF16INTRIN_H
14#define __AVX512VLBF16INTRIN_H
15
/* 16-byte vector of short elements with 16-byte alignment. The intrinsics
 * below use it as the 128-bit packed BF16 operand/result type. */
16typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
17
/* Attribute sets shared by the intrinsics in this file: always inline, no
 * debug info, target features "avx512vl, avx512bf16", and a minimum vector
 * width of 128 bits (ATTRS128) or 256 bits (ATTRS256). */
18#define __DEFAULT_FN_ATTRS128 \
19 __attribute__((__always_inline__, __nodebug__, \
20 __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
21#define __DEFAULT_FN_ATTRS256 \
22 __attribute__((__always_inline__, __nodebug__, \
23 __target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
24
/* Convert Two Packed Single Data to One Packed BF16 Data.
 *
 * Passes __A and __B, viewed as __v4sf, to __builtin_ia32_cvtne2ps2bf16_128
 * and returns the builtin's result as a __m128bh.
 * NOTE(review): which source fills which half of the result is decided by
 * the builtin and is not visible here -- confirm before relying on it.
 */
37static __inline__ __m128bh __DEFAULT_FN_ATTRS128
38_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
39 return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
40 (__v4sf) __B);
41}
42
/* Convert Two Packed Single Data to One Packed BF16 Data (variant taking a
 * fallback vector __W and a mask __U).
 *
 * Computes _mm_cvtne2ps_pbh(__A, __B) and hands it, together with __W, to
 * __builtin_ia32_selectw_128 under the 8-bit mask __U.
 * NOTE(review): presumably a set bit in __U takes the converted 16-bit
 * element and a clear bit keeps the matching element of __W -- confirm
 * against the builtin's definition.
 */
60static __inline__ __m128bh __DEFAULT_FN_ATTRS128
61_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
62 return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
63 (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
64 (__v8hi)__W);
65}
66
/* Convert Two Packed Single Data to One Packed BF16 Data (variant whose
 * fallback is an all-zero vector).
 *
 * Computes _mm_cvtne2ps_pbh(__A, __B) and hands it, together with the zero
 * vector from _mm_setzero_si128(), to __builtin_ia32_selectw_128 under the
 * 8-bit mask __U.
 * NOTE(review): presumably elements whose mask bit is clear come out as
 * zero -- confirm against the builtin's definition.
 */
82static __inline__ __m128bh __DEFAULT_FN_ATTRS128
83_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
84 return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
85 (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
86 (__v8hi)_mm_setzero_si128());
87}
88
/* Convert Two Packed Single Data to One Packed BF16 Data (256-bit form).
 *
 * Passes __A and __B, viewed as __v8sf, to __builtin_ia32_cvtne2ps2bf16_256
 * and returns the builtin's result as a __m256bh.
 * NOTE(review): the placement of each source within the result is decided
 * by the builtin and is not visible here -- confirm before relying on it.
 */
101static __inline__ __m256bh __DEFAULT_FN_ATTRS256
102_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
103 return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
104 (__v8sf) __B);
105}
106
/* Convert Two Packed Single Data to One Packed BF16 Data (256-bit variant
 * taking a fallback vector __W and a mask __U).
 *
 * Computes _mm256_cvtne2ps_pbh(__A, __B) and hands it, together with __W, to
 * __builtin_ia32_selectw_256 under the 16-bit mask __U.
 * NOTE(review): presumably a set bit in __U takes the converted 16-bit
 * element and a clear bit keeps the matching element of __W -- confirm
 * against the builtin's definition.
 */
124static __inline__ __m256bh __DEFAULT_FN_ATTRS256
125_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
126 return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
127 (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
128 (__v16hi)__W);
129}
130
/* Convert Two Packed Single Data to One Packed BF16 Data (256-bit variant
 * whose fallback is an all-zero vector).
 *
 * Computes _mm256_cvtne2ps_pbh(__A, __B) and hands it, together with the
 * zero vector from _mm256_setzero_si256(), to __builtin_ia32_selectw_256
 * under the 16-bit mask __U.
 * NOTE(review): presumably elements whose mask bit is clear come out as
 * zero -- confirm against the builtin's definition.
 */
146static __inline__ __m256bh __DEFAULT_FN_ATTRS256
147_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
148 return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
149 (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
150 (__v16hi)_mm256_setzero_si256());
151}
152
/* Convert Packed Single Data to Packed BF16 Data.
 *
 * Calls the masked builtin __builtin_ia32_cvtneps2bf16_128_mask with __A
 * viewed as __v4sf, a pass-through vector of unspecified content
 * (_mm_undefined_si128()), and a mask with every bit set ((__mmask8)-1).
 * NOTE(review): with an all-ones mask the pass-through operand is presumably
 * never observed, which is why its content can be left unspecified -- confirm.
 */
163static __inline__ __m128bh __DEFAULT_FN_ATTRS128
164_mm_cvtneps_pbh(__m128 __A) {
165 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
166 (__v8hi)_mm_undefined_si128(),
167 (__mmask8)-1);
168}
169
/* Convert Packed Single Data to Packed BF16 Data (variant taking a fallback
 * vector __W and a mask __U).
 *
 * Calls __builtin_ia32_cvtneps2bf16_128_mask with __A viewed as __v4sf, __W
 * viewed as __v8hi as the pass-through operand, and __U as the mask.
 * NOTE(review): presumably elements whose mask bit is clear are taken from
 * __W -- confirm against the builtin's definition.
 */
185static __inline__ __m128bh __DEFAULT_FN_ATTRS128
186_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
187 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
188 (__v8hi)__W,
189 (__mmask8)__U);
190}
191
205static __inline__ __m128bh __DEFAULT_FN_ATTRS128
207 return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
208 (__v8hi)_mm_setzero_si128(),
209 (__mmask8)__U);
210}
211
221static __inline__ __m128bh __DEFAULT_FN_ATTRS256
223 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
224 (__v8hi)_mm_undefined_si128(),
225 (__mmask8)-1);
226}
227
/* Convert Packed Single Data to Packed BF16 Data (256-bit source, variant
 * taking a fallback vector __W and a mask __U).
 *
 * Calls __builtin_ia32_cvtneps2bf16_256_mask with __A viewed as __v8sf, __W
 * viewed as __v8hi as the pass-through operand, and __U as the mask. The
 * result is a 128-bit __m128bh.
 * NOTE(review): presumably elements whose mask bit is clear are taken from
 * __W -- confirm against the builtin's definition.
 */
242static __inline__ __m128bh __DEFAULT_FN_ATTRS256
243_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
244 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
245 (__v8hi)__W,
246 (__mmask8)__U);
247}
248
261static __inline__ __m128bh __DEFAULT_FN_ATTRS256
263 return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
264 (__v8hi)_mm_setzero_si128(),
265 (__mmask8)__U);
266}
267
/* Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
 *
 * Passes the accumulator __D (as __v4sf) and the two BF16 vectors __A and
 * __B (each reinterpreted as __v4si) to __builtin_ia32_dpbf16ps_128 and
 * returns the builtin's result as a __m128.
 */
282static __inline__ __m128 __DEFAULT_FN_ATTRS128
283_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
284 return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
285 (__v4si)__A,
286 (__v4si)__B);
287}
288
/* Dot Product of BF16 Pairs Accumulated into Packed Single Precision
 * (variant that falls back to the incoming accumulator under mask __U).
 *
 * Computes _mm_dpbf16_ps(__D, __A, __B) and hands it, together with the
 * original __D, to __builtin_ia32_selectps_128 under the mask __U.
 * NOTE(review): presumably a set bit in __U takes the updated element and a
 * clear bit keeps the matching element of __D -- confirm against the
 * builtin's definition.
 */
306static __inline__ __m128 __DEFAULT_FN_ATTRS128
307_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
308 return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
309 (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
310 (__v4sf)__D);
311}
312
/* Dot Product of BF16 Pairs Accumulated into Packed Single Precision
 * (variant whose fallback is an all-zero vector).
 *
 * Computes _mm_dpbf16_ps(__D, __A, __B) and hands it to
 * __builtin_ia32_selectps_128 under the mask __U. The alternative operand is
 * the integer zero vector from _mm_setzero_si128() cast to __v4sf.
 * NOTE(review): presumably elements whose mask bit is clear come out as
 * zero -- confirm against the builtin's definition.
 */
330static __inline__ __m128 __DEFAULT_FN_ATTRS128
331_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
332 return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
333 (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
334 (__v4sf)_mm_setzero_si128());
335}
336
/* Dot Product of BF16 Pairs Accumulated into Packed Single Precision
 * (256-bit form).
 *
 * Passes the accumulator __D (as __v8sf) and the two BF16 vectors __A and
 * __B (each reinterpreted as __v8si) to __builtin_ia32_dpbf16ps_256 and
 * returns the builtin's result as a __m256.
 */
351static __inline__ __m256 __DEFAULT_FN_ATTRS256
352_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
353 return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
354 (__v8si)__A,
355 (__v8si)__B);
356}
357
/* Dot Product of BF16 Pairs Accumulated into Packed Single Precision
 * (256-bit variant that falls back to the incoming accumulator under
 * mask __U).
 *
 * Computes _mm256_dpbf16_ps(__D, __A, __B) and hands it, together with the
 * original __D, to __builtin_ia32_selectps_256 under the 8-bit mask __U.
 * NOTE(review): presumably a set bit in __U takes the updated element and a
 * clear bit keeps the matching element of __D -- confirm against the
 * builtin's definition.
 */
375static __inline__ __m256 __DEFAULT_FN_ATTRS256
376_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
377 return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
378 (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
379 (__v8sf)__D);
380}
381
/* Dot Product of BF16 Pairs Accumulated into Packed Single Precision
 * (256-bit variant whose fallback is an all-zero vector).
 *
 * Computes _mm256_dpbf16_ps(__D, __A, __B) and hands it to
 * __builtin_ia32_selectps_256 under the 8-bit mask __U. The alternative
 * operand is the integer zero vector from _mm256_setzero_si256() cast to
 * __v8sf.
 * NOTE(review): presumably elements whose mask bit is clear come out as
 * zero -- confirm against the builtin's definition.
 */
399static __inline__ __m256 __DEFAULT_FN_ATTRS256
400_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
401 return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
402 (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
403 (__v8sf)_mm256_setzero_si256());
404}
405
417 __v4sf __V = {__A, 0, 0, 0};
418 __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
419 (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
420 return __R[0];
421}
422
/* Convert Packed BF16 Data to Packed float Data.
 *
 * Expands __A to a 256-bit integer vector with _mm256_cvtepi16_epi32, shifts
 * every 32-bit element left by 16 so the original 16 bits sit in the upper
 * half of the element, and reinterprets the result as __m256 via
 * _mm256_castsi256_ps.
 * NOTE(review): the opening of the return expression was missing from the
 * listing and has been rebuilt from the surviving argument list -- confirm
 * against the upstream header.
 */
430static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
431 return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
432 (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
433}
434
445static __inline__ __m256 __DEFAULT_FN_ATTRS256
448 (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
449}
450
/* Convert Packed BF16 Data to Packed float Data using merging mask.
 *
 * Expands __A to a 256-bit integer vector with _mm256_cvtepi16_epi32, then
 * calls _mm256_mask_slli_epi32 with __S (reinterpreted as __m256i) as the
 * fallback operand, __U as the mask, and a shift count of 16; the result is
 * reinterpreted as __m256 via _mm256_castsi256_ps.
 * NOTE(review): the opening of the return expression was missing from the
 * listing and has been rebuilt from the surviving four-argument tail --
 * confirm against the upstream header.
 */
464static __inline__ __m256 __DEFAULT_FN_ATTRS256
465_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
466 return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
467 (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
468 16));
469}
470
/* Remove the two attribute macros defined near the top of this header so
 * they are not left defined after it has been processed. */
471#undef __DEFAULT_FN_ATTRS128
472#undef __DEFAULT_FN_ATTRS256
473
/* Closes the __AVX512VLBF16INTRIN_H include guard. */
474#endif
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi32(__m128i __V)
Definition avx2intrin.h:356
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32(__m256i __a, int __count)
Definition avx2intrin.h:515
unsigned short __bfloat16
unsigned char __mmask8
unsigned short __mmask16
static __inline__ __m128bh __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A)
Convert Packed Single Data to Packed BF16 Data.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B)
Convert Two Packed Single Data to One Packed BF16 Data.
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B)
Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B)
Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256 _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A)
Convert Packed Single Data to Packed BF16 Data.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A)
Convert Packed Single Data to Packed BF16 Data.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B)
Convert Two Packed Single Data to One Packed BF16 Data.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B)
Convert Two Packed Single Data to One Packed BF16 Data.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256 _mm256_cvtneps_pbh(__m256 __A)
Convert Packed Single Data to Packed BF16 Data.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B)
Convert Two Packed Single Data to One Packed BF16 Data.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A)
Convert Packed Single Data to Packed BF16 Data.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A)
Convert Packed BF16 Data to Packed float Data.
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B)
Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_cvtne2ps_pbh(__m128 __A, __m128 __B)
Convert Two Packed Single Data to One Packed BF16 Data.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A)
Convert Packed BF16 Data to Packed float Data using merging mask.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A)
Convert Packed BF16 Data to Packed float Data using zeroing mask.
#define __DEFAULT_FN_ATTRS256
#define __DEFAULT_FN_ATTRS128
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B)
Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B)
Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
short __m128bh __attribute__((__vector_size__(16), __aligned__(16)))
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_cvtneps_pbh(__m128 __A)
Convert Packed Single Data to Packed BF16 Data.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B)
Convert Two Packed Single Data to One Packed BF16 Data.
static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A)
Convert One Single float Data to One BF16 Data.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B)
Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a)
Casts a 256-bit integer vector into a 256-bit floating-point vector of [8 x float].
Definition avxintrin.h:4367
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
Definition avxintrin.h:4281
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition emmintrin.h:3587
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition emmintrin.h:3977
static __inline__ unsigned int unsigned char __D
Definition ia32intrin.h:283