#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to simplify porting x86_64 code that makes
   explicit use of Intel intrinsics to powerpc64le.  Define
   NO_WARN_X86_INTRINSICS to acknowledge this and suppress the error.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)

/* Define four-value permute mask.  */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
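/* Illustrative example (values chosen for the example):
   _MM_SHUFFLE (3, 2, 1, 1) expands to
   (3 << 6) | (2 << 4) | (1 << 2) | 1 == 0xE5, packing the four 2-bit
   element selectors with the selector for the highest result element
   first.  */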
#include <altivec.h>

/* Avoid collisions between different inline function variants.  */
#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
    (defined(__STDC_VERSION__) && \
     __STDC_VERSION__ >= 201112L))
#define __STRICT_ANSI_FORBIDS_INLINE_ASM__ 1
#endif

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
                                       __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return ((__m128) vec_ld (0, (__v4sf*)__P));
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (vec_vsx_ld (0, __P));
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp;
  __m128 result;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = vec_ld (0, (__v4sf *) __P);
  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
  return result;
}
/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}
/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}
/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
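/* Illustrative example (values chosen for the example):
   _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f) and
   _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f) both produce
   { 1.0f, 2.0f, 3.0f, 4.0f }: _mm_set_ps takes element 3 first,
   _mm_setr_ps takes element 0 first.  */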
/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  vec_st ((__v4sf)__A, 0, (__v4sf*)__P);
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __tmp;
  static const __vector unsigned char permute_vector =
    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
      0x17, 0x10, 0x11, 0x12, 0x13 };

  __tmp = (__m128) vec_perm (__A, __A, permute_vector);

  _mm_store_ps (__P, __tmp);
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = vec_splat ((__v4sf)__A, 0);
  _mm_store_ps (__P, __va);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}
/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};

  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}
/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results, so to ensure we don't generate spurious exceptions from
     the upper elements we splat the lower float before the operation,
     then merge the low result back into __A.  */
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a + b;
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a - b;
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a * b;
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
#ifdef _ARCH_PWR7
  __m128 a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  b = vec_splat (__B, 0);
  c = a / b;
  return (vec_sel (__A, c, mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}
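/* Illustrative example (values chosen for the example):
   _mm_add_ss (_mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f), _mm_set1_ps (10.0f))
   yields { 11.0f, 2.0f, 3.0f, 4.0f }: only element 0 is added; the
   upper three elements pass through from the first operand.  */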
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* Splat the source, take the square root of the whole vector, then
     merge element 0 of the result back into __A.  */
  a = vec_splat (__A, 0);
  c = vec_sqrt (a);
  return (vec_sel (__A, c, mask));
}
/* Perform the respective operation on the four SPFP values in A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (vec_sqrt ((__v4sf)__A));
}

/* Approximate reciprocal of the four SPFP values in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (vec_re ((__v4sf)__A));
}

/* Approximate reciprocal square root of the four SPFP values in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (vec_rsqrte (__A));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* Splat the source, operate on the whole vector, then merge element
     0 of the result back into __A.  */
  a = vec_splat (__A, 0);
  c = _mm_rcp_ps (a);
  return (vec_sel (__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  __m128 a, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat (__A, 0);
  c = vec_rsqrte (a);
  return (vec_sel (__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  /* Splat the scalars, compute the minimum across the splatted
     vectors, then merge the low result back into __A.  */
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_min (a, b);
  return (vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  __v4sf a, b, c;
  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
  a = vec_splat ((__v4sf)__A, 0);
  b = vec_splat ((__v4sf)__B, 0);
  c = vec_max (a, b);
  return (vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
  return vec_sel (__B, __A, m);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
  return vec_sel (__B, __A, m);
}
/* Perform logical bit-wise operations on 128-bit values.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_and ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_andc ((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_or ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_xor ((__v4sf)__A, (__v4sf)__B));
}
/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpeq ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  __v4sf temp = (__v4sf) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
  return ((__m128) vec_nor (temp, temp));
}

/* The "not" comparisons are implemented with the inverse predicate.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpge ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmple ((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return ((__m128) vec_cmplt ((__v4sf)__A, (__v4sf)__B));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  return ((__m128) vec_and (c, d));
}
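/* Note: a float is a NaN exactly when the bits of its absolute value,
   viewed as an unsigned int, exceed 0x7f800000 (all-ones exponent with
   a nonzero mantissa).  That is why unsigned compares against
   float_exp_mask detect (un)ordered operands.  */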
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  return ((__m128) vec_or (c, d));
}
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise
   a mask of zeros.  The upper three SPFP values are passed through
   from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  /* Splat the scalars, compare the splatted vectors, then merge the
     low result back into __A.  */
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpeq (a, b);
  c = vec_nor (c, c);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpge (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmpgt (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmple (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };
  __v4sf a, b, c;
  a = vec_splat ((__v4sf) __A, 0);
  b = vec_splat ((__v4sf) __B, 0);
  c = (__v4sf) vec_cmplt (a, b);
  return ((__m128) vec_sel ((__v4sf)__A, c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
  c = vec_and (c, d);
  /* Then we merge the lower float result with the original upper
     float elements from __A.  */
  return ((__m128) vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  __vector unsigned int a, b;
  __vector unsigned int c, d;
  static const __vector unsigned int float_exp_mask =
    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
  static const __vector unsigned int mask =
    { 0xffffffff, 0, 0, 0 };

  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
  c = vec_or (c, d);
  return ((__m128) vec_sel ((__v4sf)__A, (__v4sf)c, mask));
}
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}
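/* Note: when either operand is a NaN, the C relational operators above
   yield 0 (and != yields 1).  The ucomi variants below use the same
   sequences; unlike x86 hardware, they are not distinguished from the
   comi variants by their exception behavior here.  */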
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] == __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] < __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return (__A[0] <= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return (__A[0] > __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return (__A[0] >= __B[0]);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return (__A[0] != __B[0]);
}
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}
/* Convert the lower SPFP value to a 32-bit integer according to the
   current rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  int res;
#if defined(_ARCH_PWR8) && !defined(__STRICT_ANSI_FORBIDS_INLINE_ASM__)
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctiw  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
        "=r" (res),
        "=f" (dtmp)
      : );
#else
  res = __builtin_rint(__A[0]);
#endif
  return res;
}
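/* Note: both the inline-asm path and __builtin_rint round according to
   the current floating-point rounding mode, matching the x86 cvtss2si
   semantics; truncating variants are provided separately below as
   _mm_cvttss_si32 and friends.  */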
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode.  Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  long long res;
#if defined(_ARCH_PWR8) && defined(__powerpc64__) && \
    !defined(__STRICT_ANSI_FORBIDS_INLINE_ASM__)
  double dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctid  %2,%2;\n"
      "mfvsrd  %1,%x2;\n"
      : "+wa" (__A),
        "=r" (res),
        "=f" (dtmp)
      : );
#else
  res = __builtin_llrint(__A[0]);
#endif
  return res;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return _mm_cvtss_si64 ((__v4sf) __A);
}
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  /* Current PowerPC ignores the cache hints.  */
  __builtin_prefetch (__P);
}
/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  __v4sf temp, rounded;
  __vector unsigned long long result;

  /* Splat the two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  rounded = vec_rint(temp);
  result = (__vector unsigned long long) vec_cts (rounded, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}
/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  /* Extract the lower float element; the cast truncates.  */
  float temp = __A[0];
  return temp;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

/* Truncate the lower SPFP value to a 64-bit integer.  Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  /* Extract the lower float element; the cast truncates.  */
  float temp = __A[0];
  return temp;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  /* Extract the lower float element; the cast truncates.  */
  float temp = __A[0];
  return temp;
}
/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  __v4sf temp;
  __vector unsigned long long result;

  /* Splat the two lower SPFP values to both halves.  */
  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
  result = (__vector unsigned long long) vec_cts (temp, 0);

  return (__m64) ((__vector long long) result)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}
/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A.
   Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  float temp = __B;
  __A[0] = temp;

  return __A;
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return _mm_cvtsi64_ss (__A, __B);
}
/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower SPFP values in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  __vector signed int vm1;
  __vector float vf1;

  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
  vf1 = (__vector float) vec_ctf (vm1, 0);

  return ((__m128) (__vector unsigned long long)
    { ((__vector unsigned long long)vf1) [0],
      ((__vector unsigned long long)__A) [1]});
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}
/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
  vi4 = vec_vupklsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}
/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  const __vector unsigned short zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  vi4 = (__vector unsigned int) vec_mergel (vs8, zero);
#else
  vi4 = (__vector unsigned int) vec_mergel (zero, vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}
/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __vector signed char vc16;
  __vector signed short vs8;
  __vector signed int vi4;
  __vector float vf1;

  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
  vs8 = vec_vupkhsb (vc16);
  vi4 = vec_vupkhsh (vs8);
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}
/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps (__m64 __A)
{
  const __vector unsigned char zero =
    { 0, 0, 0, 0, 0, 0, 0, 0 };
  __vector unsigned char vc16;
  __vector unsigned short vs8;
  __vector unsigned int vi4;
  __vector float vf1;

  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
#ifdef __LITTLE_ENDIAN__
  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
  vi4 = (__vector unsigned int) vec_mergeh (vs8,
                                            (__vector unsigned short) zero);
#else
  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
                                            vs8);
#endif
  vf1 = (__vector float) vec_ctf (vi4, 0);

  return (__m128) vf1;
}
/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
{
  __vector signed int vi4;
  __vector float vf4;

  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
  vf4 = (__vector float) vec_ctf (vi4, 0);
  return (__m128) vf4;
}
/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int temp;
  __vector unsigned long long result;

  rounded = vec_rint(__A);
  temp = vec_cts (rounded, 0);
  result = (__vector unsigned long long) vec_pack (temp, temp);

  return (__m64) ((__vector long long) result)[0];
}
/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8 (__m128 __A)
{
  __v4sf rounded;
  __vector signed int tmp_i;
  static const __vector signed int zero = {0, 0, 0, 0};
  __vector signed short tmp_s;
  __vector signed char res_v;

  rounded = vec_rint(__A);
  tmp_i = vec_cts (rounded, 0);
  tmp_s = vec_pack (tmp_i, zero);
  res_v = vec_pack (tmp_s, tmp_s);
  return (__m64) ((__vector long long) res_v)[0];
}
/* Selects four specific SPFP values from A and B based on MASK.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  unsigned long element_selector_10 = __mask & 0x03;
  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
    };
  __vector unsigned int t;

  t[0] = permute_selectors[element_selector_10];
  t[1] = permute_selectors[element_selector_32];
  t[2] = permute_selectors[element_selector_54] + 0x10101010;
  t[3] = permute_selectors[element_selector_76] + 0x10101010;
  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
}
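/* Illustrative example (values chosen for the example):
   _mm_shuffle_ps (a, b, _MM_SHUFFLE (1, 0, 3, 2)) returns
   { a[2], a[3], b[0], b[1] }: the low two result elements select from
   A, while the high two (their selectors offset by 0x10101010) select
   from B.  */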
/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
}
/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);

  __a [1] = __p [1];
  return (__m128)__a;
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[1];
}
/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergel ((__vector unsigned long long)__B,
                              (__vector unsigned long long)__A);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
                              (__vector unsigned long long)__B);
}
/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);

  __a [0] = __p [0];
  return (__m128)__a;
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __vector unsigned long long __a = (__vector unsigned long long) __A;

  *__P = __a[0];
}
#ifdef _ARCH_PWR8
/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  __vector unsigned long long result;
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
    };

  result = ((__vector unsigned long long)
            vec_vbpermq ((__vector unsigned char) __A,
                         (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}
/* Extracts one of the four words of A.  The selector N must be immediate.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  unsigned int shiftr = __N & 3;
#ifdef __BIG_ENDIAN__
  shiftr = 3 - shiftr;
#endif

  return ((__A >> (shiftr * 16)) & 0xffff);
}
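/* Illustrative example (values chosen for the example): with
   __A = 0x4444333322221111UL on little-endian, __N == 2 gives
   shiftr == 2, so (__A >> 32) & 0xffff == 0x3333, the third word.  */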
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  const int shiftl = (__N & 3) * 16;
  const __m64 shiftD = (const __m64) __D << shiftl;
  const __m64 mask = 0xffffUL << shiftl;
  __m64 result = (__A & (~mask)) | (shiftD & mask);

  return (result);
}
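/* Illustrative example (values chosen for the example): __N == 1 gives
   shiftl == 16 and mask == 0xffff0000UL, so only bits 16..31 of the
   result come from __D; all other bits come from __A.  */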
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short a, b, r;
  __vector __bool short c;

  a = (__vector signed short)vec_splats (__A);
  b = (__vector signed short)vec_splats (__B);
  c = (__vector __bool short)vec_cmpgt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  res.as_short[0] =
      (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
  res.as_short[1] =
      (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
  res.as_short[2] =
      (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
  res.as_short[3] =
      (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}
/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector __bool char c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = (__vector __bool char)vec_cmpgt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
        ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
            m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}
/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short a, b, r;
  __vector __bool short c;

  a = (__vector signed short)vec_splats (__A);
  b = (__vector signed short)vec_splats (__B);
  c = (__vector __bool short)vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  res.as_short[0] =
      (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
  res.as_short[1] =
      (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
  res.as_short[2] =
      (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
  res.as_short[3] =
      (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}
/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, r;
  __vector __bool char c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = (__vector __bool char)vec_cmplt (a, b);
  r = vec_sel (b, a, c);
  return (__m64) ((__vector long long) r)[0];
#else
  __m64_union m1, m2, res;
  long i;

  m1.as_m64 = __A;
  m2.as_m64 = __B;

  for (i = 0; i < 8; i++)
    res.as_char[i] =
        ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
            m1.as_char[i] : m2.as_char[i];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}
/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  unsigned long long p =
#ifdef __LITTLE_ENDIAN__
    0x0008101820283038UL; /* permute control for sign bits */
#else
    0x3830282018100800UL; /* permute control for sign bits */
#endif
  return __builtin_bpermd (p, __A);
}
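/* Note: __builtin_bpermd gathers one bit of __A per byte of p; each
   byte of p holds a bit index (counted from the most significant bit),
   and indices 0x00, 0x08, ..., 0x38 are the sign bits of the eight
   bytes, so the result packs the eight sign bits into an int.  */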
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}
/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b;
  __vector unsigned short c;
  __vector unsigned int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
    };

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);

  w0 = vec_vmuleuh (a, b);
  w1 = vec_vmulouh (a, b);
  c = (__vector unsigned short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}
/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long element_selector_10 = __N & 0x03;
  unsigned long element_selector_32 = (__N >> 2) & 0x03;
  unsigned long element_selector_54 = (__N >> 4) & 0x03;
  unsigned long element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union t;
  __vector unsigned long long a, p, r;

#ifdef __LITTLE_ENDIAN__
  t.as_short[0] = permute_selectors[element_selector_10];
  t.as_short[1] = permute_selectors[element_selector_32];
  t.as_short[2] = permute_selectors[element_selector_54];
  t.as_short[3] = permute_selectors[element_selector_76];
#else
  t.as_short[3] = permute_selectors[element_selector_10];
  t.as_short[2] = permute_selectors[element_selector_32];
  t.as_short[1] = permute_selectors[element_selector_54];
  t.as_short[0] = permute_selectors[element_selector_76];
#endif
  p = vec_splats (t.as_m64);
  a = vec_splats (__A);
  r = vec_perm (a, a, (__vector unsigned char)p);
  return (__m64) ((__vector long long) r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 hibit = 0x8080808080808080UL;
  __m64 mask, tmp;
  __m64 *p = (__m64*)__P;

  tmp = *p;
  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
  tmp = (tmp & (~mask)) | (__A & mask);
  *p = tmp;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__A);
  b = (__vector unsigned char)vec_splats (__B);
  c = vec_avg (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__A);
  b = (__vector unsigned short)vec_splats (__B);
  c = vec_avg (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}
/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char a, b;
  __vector unsigned char vmin, vmax, vabsdiff;
  __vector signed int vsum;
  const __vector unsigned int zero =
    { 0, 0, 0, 0 };
  __m64_union result = {0};

  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
  vmin = vec_min (a, b);
  vmax = vec_max (a, b);
  vabsdiff = vec_sub (vmax, vmin);
  /* Sum four groups of bytes into integers.  */
  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
  /* Sum across four integers with integer result.  */
  vsum = vec_sums (vsum, (__vector signed int) zero);
  /* The sum is in the rightmost 32 bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  result.as_short[0] = vsum[3];
  return result.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}
/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ ("dcbtstt 0,%0" : : "b" (__P) : "memory");
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient.  */
  __asm__ ("dcbtstt 0,%0" : : "b" (__P) : "memory");
  _mm_store_ps (__P, __A);
}
/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a lightweight sync (release barrier).  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
#if _ARCH_PWR8
  /* On POWER8 and later, drop the thread's Program Priority to "very
     low" (or 31,31,31), execute a release barrier at that priority,
     then restore the original priority from the PPR.  */
  unsigned long __PPR;

  __asm__ volatile ("mfppr %0;"
                    " or 31,31,31;"
                    " isync;"
                    " lwsync;"
                    " isync;"
                    " mtppr %0;"
                    : "=r" (__PPR)
                    :
                    : "memory");
#else
  /* Older processors lack Program Priority controls; depend on a
     heavy-weight sync instead.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}
/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = vec_vmrghw (__r0, __r1);				\
  __v4sf __t1 = vec_vmrghw (__r2, __r3);				\
  __v4sf __t2 = vec_vmrglw (__r0, __r1);				\
  __v4sf __t3 = vec_vmrglw (__r2, __r3);				\
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,		\
			       (__vector long long)__t1);		\
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,		\
			       (__vector long long)__t3);		\
} while (0)
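/* Illustrative example (values chosen for the example): with
   r0 = {0,1,2,3}, r1 = {4,5,6,7}, r2 = {8,9,10,11}, r3 = {12,13,14,15},
   _MM_TRANSPOSE4_PS (r0, r1, r2, r3) leaves r0 = {0,4,8,12},
   r1 = {1,5,9,13}, r2 = {2,6,10,14}, r3 = {3,7,11,15}.  */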
#else
#include_next <xmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* _XMMINTRIN_H_INCLUDED */