#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.  Define
   NO_WARN_X86_INTRINSICS to acknowledge that this is a best-effort
   implementation built on PowerISA VMX/VSX equivalents.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)

/* Define a four-element permute mask from four 2-bit selectors.  */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
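/* Illustrative example (not part of the original header): _MM_SHUFFLE
   packs four 2-bit element selectors, most significant first, so
   _MM_SHUFFLE (3, 2, 1, 0) == (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4,
   the identity selector for _mm_shuffle_ps and _mm_shuffle_pi16 below.  */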
#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to the C++
   and C11 standards with respect to the definition of bool.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
				 (defined(__STDC_VERSION__) && \
				  __STDC_VERSION__ >= 201112L))
#undef bool
#endif

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
				       __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return ((__m128) vec_ld (0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld (0, __P));
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp;
  __m128 result;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
  return result;
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
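/* Illustrative example (not part of the original header): _mm_set_ps
   lists elements from the highest index down and _mm_setr_ps from the
   lowest up, so _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f) and
   _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f) produce the same vector, with
   element [0] == 0.0f and element [3] == 3.0f.  */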
/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  vec_st ((__v4sf)__A, 0, (__v4sf*)__P);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __tmp;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, permute_vector);

  _mm_store_ps (__P, __tmp);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat ((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}
/* Set the lower SPFP value of A from the lower value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Store the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}
/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* Splat the lower floats so the full-vector operation cannot raise
     spurious exceptions from the upper elements.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a + b;
  /* Merge the lower float result with the upper floats from __A.  */
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a - b;
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a * b;
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a / b;
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  c = vec_sqrt (a);
  return (vec_sel (__A, c, mask));
}
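/* Note on the pattern above: a scalar (_ss) operation must not raise
   exceptions for the three untouched upper elements, so on VSX targets
   the lower element is splatted across the whole vector, the operation
   runs on identical lanes, and only element 0 of the result is merged
   back into __A by vec_sel with the {-1, 0, 0, 0} mask.  */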
/* Perform the respective operation on the four SPFP values in A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

/* Approximate reciprocal of the four SPFP values in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

/* Approximate reciprocal square root of the four SPFP values in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte ((__v4sf)__A));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  c = _mm_rcp_ps (a);
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  c = vec_rsqrte (a);
  return (vec_sel (__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_min (a, b);
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_max (a, b);
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, m);
}
/* Perform logical bit-wise operations on 128-bit values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_and ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_andc ((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_or ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_xor ((__v4sf)__A, (__v4sf)__B));
}
/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpeq ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  __v4sf temp = (__v4sf) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
  return ((__m128) vec_nor (temp, temp));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  return ((__m128) vec_and (c, d));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  return ((__m128) vec_or (c, d));
}
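/* Note on the ordered/unordered tests above: a float is a NaN exactly
   when its biased exponent field is all ones and its mantissa is
   non-zero, so viewing |x| as an unsigned int, |x| > 0x7f800000
   identifies NaNs; e.g. the quiet NaN 0x7fc00000 compares greater,
   while infinity (exactly 0x7f800000) does not.  */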
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* Splat the lower floats so the compare cannot raise spurious
     exceptions from the upper elements, then merge element 0 of the
     result back into __A.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq (a, b);
  c = vec_nor (c, c);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  c = vec_and (c, d);
  /* Merge the lower result with the upper floats from __A.  */
  return ((__m128) vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  c = vec_or (c, d);
  return ((__m128) vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}
/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  int res;
#ifdef _ARCH_PWR8
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctiw  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
	"=r" (res),
	"=f" (dtmp)
      : );
#else
  res = __builtin_rint(__A[0]);
#endif
  return (res);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  long long res;
#if defined (_ARCH_PWR8) && defined (__powerpc64__)
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctid  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
	"=r" (res),
	"=f" (dtmp)
      : );
#else
  res = __builtin_llrint(__A[0]);
#endif
  return (res);
}
/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Load one cache line of data from address P to a location closer to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Current PowerPC ignores the software prefetch hints.  */
  __builtin_prefetch (__P);
}
/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf temp, rounded;
  __vector unsigned long long result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  rounded = vec_rint (temp);
  result = (__vector unsigned long long) vec_cts (rounded, 0);

  return (__m64) ((__vector long long) result)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}
/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 32-bit integer and return.  */
  return temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 64-bit integer and return.  */
  return temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element.  */
  float temp = __A[0];
  /* Truncate to 64-bit integer and return.  */
  return temp;
}
/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf temp;
  __vector unsigned long long result;

  /* Splat two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  result = (__vector unsigned long long) vec_cts (temp, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}
/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A.
   Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}
/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  __vector signed int vm1;
  __vector float vf1;

  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
  vf1 = (__vector float) vec_ctf (vm1, 0);

  return ((__m128) (__vector unsigned long long)
      { ((__vector unsigned long long)vf1) [0],
	((__vector unsigned long long)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}
/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
  vi4 = vec_vupklsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  const __vector unsigned short zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  vi4 = (__vector unsigned int) vec_mergel (vs8, zero);
#else
  vi4 = (__vector unsigned int) vec_mergel (zero, vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}
/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __vector signed char vc16;
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
  vs8 = vec_vupkhsb (vc16);
  vi4 = vec_vupkhsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64 __A)
{
  const __vector unsigned char zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned char vc16;
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
  vi4 = (__vector unsigned int) vec_mergeh (vs8,
					    (__vector unsigned short) zero);
#else
  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
					    vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}
/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
{
  __vector signed int vi4;
  __vector float vf4;

  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
  vf4 = (__vector float) vec_ctf (vi4, 0);
  return (__m128) vf4;
}
/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int temp;
  __vector unsigned long long result;

  rounded = vec_rint (__A);
  temp = vec_cts (rounded, 0);
  result = (__vector unsigned long long) vec_pack (temp, temp);

  return (__m64) ((__vector long long) result)[0];
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int tmp_i;
  static const __vector signed int zero = {0, 0, 0, 0};
  __vector signed short tmp_s;
  __vector signed char res_v;

  rounded = vec_rint (__A);
  tmp_i = vec_cts (rounded, 0);
  tmp_s = vec_pack (tmp_i, zero);
  res_v = vec_pack (tmp_s, tmp_s);
  return (__m64) ((__vector long long) res_v)[0];
}
/* Selects four specific SPFP values from A and B based on MASK.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __vector unsigned int t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
}
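/* Illustrative example (not part of the original header): with
   __mask == _MM_SHUFFLE (3, 2, 1, 0) the four selectors are 0, 1, 2, 3,
   so t picks bytes 0..7 from __A and (because of the +0x10101010
   offset into the second vec_perm operand) bytes 8..15 of __B, i.e.
   the result is { __A[0], __A[1], __B[2], __B[3] }.  */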
/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
}
/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats (*__P);
  __a [1] = __p [1];

  return (__m128)__a;
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[1];
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergel ((__vector unsigned long long)__B,
			      (__vector unsigned long long)__A);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
			      (__vector unsigned long long)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats (*__P);
  __a [0] = __p [0];

  return (__m128)__a;
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[0];
}
/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  __vector unsigned long long result;
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
    };

  result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
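/* Note on vec_vbpermq above: each byte of perm_mask selects one bit of
   the source by its big-endian bit index, and 0x80 selects a constant
   zero; 0x00, 0x20, 0x40 and 0x60 are the bit indexes of the four float
   sign bits, so the gathered 4-bit field matches the x86 MOVMSKPS
   layout.  */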
/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}
/* Extracts one of the four words of A.  The selector N must be immediate.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  unsigned int shiftr = __N & 3;
#ifdef __BIG_ENDIAN__
  shiftr = 3 - shiftr;
#endif

  return ((__A >> (shiftr * 16)) & 0xffff);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
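/* Illustrative example (not part of the original header): for a __m64
   holding the shorts { 0x1111, 0x2222, 0x3333, 0x4444 } (element 0
   first in memory), _mm_extract_pi16 (a, 2) on little-endian shifts
   right by 32 and masks with 0xffff, returning 0x3333.  */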
/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  const int shiftl = (__N & 3) * 16;
  const __m64 shiftD = (const __m64) __D << shiftl;
  const __m64 mask = 0xffffUL << shiftl;
  __m64 result = (__A & (~mask)) | (shiftD & mask);

  return (result);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short a, b, r;
  __vector __bool short c;

  a = (__vector signed short) vec_splats (__A);
  b = (__vector signed short) vec_splats (__B);
  c = (__vector __bool short) vec_cmpgt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  res.as_short[0] =
      (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
  res.as_short[1] =
      (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
  res.as_short[2] =
      (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
  res.as_short[3] =
      (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}
/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector __bool char c;

  a = (__vector unsigned char) vec_splats (__A);
  b = (__vector unsigned char) vec_splats (__B);
  c = (__vector __bool char) vec_cmpgt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
	((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
	    m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}
/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short a, b, r;
  __vector __bool short c;

  a = (__vector signed short) vec_splats (__A);
  b = (__vector signed short) vec_splats (__B);
  c = (__vector __bool short) vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  res.as_short[0] =
      (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
  res.as_short[1] =
      (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
  res.as_short[2] =
      (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
  res.as_short[3] =
      (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}
/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector __bool char c;

  a = (__vector unsigned char) vec_splats (__A);
  b = (__vector unsigned char) vec_splats (__B);
  c = (__vector __bool char) vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
	((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
	    m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}
/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  unsigned long long p =
#ifdef __LITTLE_ENDIAN__
      0x0008101820283038UL; // bit permute control for sign bits
#else
      0x3830282018100800UL; // bit permute control for sign bits
#endif
  return __builtin_bpermd (p, __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}
/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b;
  __vector unsigned short c;
  __vector unsigned int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  a = (__vector unsigned short) vec_splats (__A);
  b = (__vector unsigned short) vec_splats (__B);

  w0 = vec_vmuleuh (a, b);
  w1 = vec_vmulouh (a, b);
  c = (__vector unsigned short) vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long element_selector_10 = __N & 0x03;
  unsigned long element_selector_32 = (__N >> 2) & 0x03;
  unsigned long element_selector_54 = (__N >> 4) & 0x03;
  unsigned long element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union t;
  __vector unsigned long long a, p, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#else
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
  p = vec_splats (t.as_m64);
  a = vec_splats (__A);
  r = vec_perm (a, a, (__vector unsigned char)p);
  return (__m64) ((__vector long long) r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 hibit = 0x8080808080808080UL;
  __m64 mask, tmp;
  __m64 *p = (__m64*)__P;

  tmp = *p;
  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
  tmp = (tmp & (~mask)) | (__A & mask);
  *p = tmp;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char) vec_splats (__A);
  b = (__vector unsigned char) vec_splats (__B);
  c = vec_avg (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short) vec_splats (__A);
  b = (__vector unsigned short) vec_splats (__B);
  c = vec_avg (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}
/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b;
  __vector unsigned char vmin, vmax, vabsdiff;
  __vector signed int vsum;
  const __vector unsigned int zero =
    { 0, 0, 0, 0 };
  __m64_union result = {0};

  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with integer result.  */
  vsum = vec_sums (vsum, (__vector signed int) zero);
  /* The sum is in the rightmost word-int.  */
  result.as_short[0] = vsum[3];
  return result.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ ("	dcbtstt	0,%0" : : "b" (__P) : "memory");
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ ("	dcbtstt	0,%0" : : "b" (__P) : "memory");
  _mm_store_ps (__P, __A);
}
/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a light weight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

/* Pause, to hint that a thread is busy-waiting in a spin loop.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
#if _ARCH_PWR8
  /* On POWER8 and later, drop the thread's Program Priority Register
     (PPR) to "very low" around a light-weight sync, then restore it:
     save the PPR, set very-low priority via "or 31,31,31", isync,
     lwsync, isync, and finally restore the original PPR.  */
  unsigned long __PPR;

  __asm__ volatile ("	mfppr	%0;"
		    "   or 31,31,31;"
		    "   isync;"
		    "   lwsync;"
		    "   isync;"
		    "   mtppr	%0;"
		    : "=r" (__PPR)
		    :
		    : "memory");
#else
  /* Older processors lack Program Priority controls; fall back to a
     heavy-weight sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}
/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = vec_vmrghw (__r0, __r1);				\
  __v4sf __t1 = vec_vmrghw (__r2, __r3);				\
  __v4sf __t2 = vec_vmrglw (__r0, __r1);				\
  __v4sf __t3 = vec_vmrglw (__r2, __r3);				\
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
} while (0)
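/* Usage sketch (illustrative): given four row vectors __r0..__r3,
   _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3) transposes the 4x4
   matrix in place, so afterwards __r1[2] holds what was __r2[1].  */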
#else
#include_next <xmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* _XMMINTRIN_H_INCLUDED */