13 #ifndef NO_WARN_X86_INTRINSICS
32 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
38 #if defined(__linux__) && defined(__ppc64__)
43 #include <xmmintrin.h>
/* Element-view typedefs: alternate element-type views of the 128-bit
   VSX registers used by the intrinsics below.  Naming follows the x86
   header convention __v<count><kind> (qi=byte, hi=halfword, si=word,
   di=doubleword, df=double float; trailing 'u' = unsigned).  */
typedef __vector double __v2df;              /* 2 x 64-bit double */
typedef __vector long long __v2di;           /* 2 x signed 64-bit */
typedef __vector unsigned long long __v2du;  /* 2 x unsigned 64-bit */
typedef __vector int __v4si;                 /* 4 x signed 32-bit */
typedef __vector unsigned int __v4su;        /* 4 x unsigned 32-bit */
typedef __vector short __v8hi;               /* 8 x signed 16-bit */
typedef __vector unsigned short __v8hu;      /* 8 x unsigned 16-bit */
typedef __vector signed char __v16qi;        /* 16 x signed 8-bit */
typedef __vector unsigned char __v16qu;      /* 16 x unsigned 8-bit */
/* The SSE2 user-visible 128-bit types.  __may_alias__ lets these alias
   objects of any other type, matching the x86 header semantics.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned variants: identical layout but 1-byte alignment, used by
   the unaligned load/store intrinsics (_mm_loadu_* / _mm_storeu_*).  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
66 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
69 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
72 return __extension__ (__m128d){ __F, 0.0 };
76 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
79 return __extension__ (__m128d){ __F, __F };
82 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
89 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
92 return __extension__ (__m128d){ __X, __W };
96 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
99 return __extension__ (__m128d){ __W, __X };
103 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
111 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
118 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
121 __v2df result = (__v2df) __A;
122 result [0] = ((__v2df) __B)[0];
123 return (__m128d) result;
127 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
130 return ((__m128d)
vec_ld(0, (__v16qu*)__P));
134 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
137 return (vec_vsx_ld(0, __P));
141 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
154 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
165 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
169 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
172 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
176 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 *(__m128d_u *)__P = __A;
183 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
186 *__P = ((__v2df)__A)[0];
189 extern __inline
double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
192 return ((__v2df)__A)[0];
195 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205 *__P = ((__v2df)__A)[1];
209 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
230 _mm_cvtsi128_si64 (__m128i __A)
232 return ((__v2di)__A)[0];
236 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
237 _mm_cvtsi128_si64x (__m128i __A)
239 return ((__v2di)__A)[0];
242 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 return (__m128d) ((__v2df)__A + (__v2df)__B);
251 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
254 __A[0] = __A[0] + __B[0];
258 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
261 return (__m128d) ((__v2df)__A - (__v2df)__B);
264 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
267 __A[0] = __A[0] - __B[0];
271 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
274 return (__m128d) ((__v2df)__A * (__v2df)__B);
277 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
280 __A[0] = __A[0] * __B[0];
284 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
287 return (__m128d) ((__v2df)__A / (__v2df)__B);
290 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
293 __A[0] = __A[0] / __B[0];
297 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
300 return (vec_sqrt (__A));
304 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
312 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
318 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
328 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
334 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
344 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
347 return ((__m128d)
vec_cmpeq ((__v2df) __A, (__v2df) __B));
350 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
353 return ((__m128d)
vec_cmplt ((__v2df) __A, (__v2df) __B));
356 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
359 return ((__m128d)
vec_cmple ((__v2df) __A, (__v2df) __B));
362 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
365 return ((__m128d)
vec_cmpgt ((__v2df) __A, (__v2df) __B));
368 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
371 return ((__m128d)
vec_cmpge ((__v2df) __A,(__v2df) __B));
374 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
377 __v2df temp = (__v2df)
vec_cmpeq ((__v2df) __A, (__v2df)__B);
378 return ((__m128d)
vec_nor (temp, temp));
381 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
384 return ((__m128d)
vec_cmpge ((__v2df) __A, (__v2df) __B));
387 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
390 return ((__m128d)
vec_cmpgt ((__v2df) __A, (__v2df) __B));
393 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
396 return ((__m128d)
vec_cmple ((__v2df) __A, (__v2df) __B));
399 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
402 return ((__m128d)
vec_cmplt ((__v2df) __A, (__v2df) __B));
405 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
416 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
417 a = (__v2du)
vec_abs ((__v2df)__A);
426 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
432 c = (__v2du)
vec_cmpeq ((__v2df)__A, (__v2df)__A);
433 d = (__v2du)
vec_cmpeq ((__v2df)__B, (__v2df)__B);
437 return ((__m128d)vec_orc(
c,
d));
441 c = (__v2du)
vec_cmpeq ((__v2df)__A, (__v2df)__A);
442 d = (__v2du)
vec_cmpeq ((__v2df)__B, (__v2df)__B);
450 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
466 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
476 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
496 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
506 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
517 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
528 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
539 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
550 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
561 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
566 return (__m128d)
_mm_setr_pd (r[0], ((__v2df)__A)[1]);
569 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
584 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587 return (__A[0] == __B[0]);
590 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593 return (__A[0] < __B[0]);
596 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599 return (__A[0] <= __B[0]);
602 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605 return (__A[0] > __B[0]);
608 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611 return (__A[0] >= __B[0]);
614 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
617 return (__A[0] != __B[0]);
620 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623 return (__A[0] == __B[0]);
626 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629 return (__A[0] < __B[0]);
632 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635 return (__A[0] <= __B[0]);
638 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641 return (__A[0] > __B[0]);
644 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
647 return (__A[0] >= __B[0]);
650 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
653 return (__A[0] != __B[0]);
657 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
660 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
663 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
669 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
672 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
675 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
676 _mm_set_epi16 (
short __q7,
short __q6,
short __q5,
short __q4,
677 short __q3,
short __q2,
short __q1,
short __q0)
679 return __extension__ (__m128i)(__v8hi){
680 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
683 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
684 _mm_set_epi8 (
char __q15,
char __q14,
char __q13,
char __q12,
685 char __q11,
char __q10,
char __q09,
char __q08,
686 char __q07,
char __q06,
char __q05,
char __q04,
687 char __q03,
char __q02,
char __q01,
char __q00)
689 return __extension__ (__m128i)(__v16qi){
690 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
691 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
696 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
702 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
708 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
714 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
717 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
720 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
723 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
724 __A, __A, __A, __A, __A, __A, __A, __A);
729 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
735 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
741 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
743 short __q4,
short __q5,
short __q6,
short __q7)
745 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
748 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
749 _mm_setr_epi8 (
char __q00,
char __q01,
char __q02,
char __q03,
750 char __q04,
char __q05,
char __q06,
char __q07,
751 char __q08,
char __q09,
char __q10,
char __q11,
752 char __q12,
char __q13,
char __q14,
char __q15)
754 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
755 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
759 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
765 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
768 return (__m128i) (vec_vsx_ld(0, (
signed int const *)__P));
771 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
777 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
780 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
783 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
789 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
792 *(
long long *)__P = ((__v2di)__B)[0];
795 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
798 return (__m64) ((__v2di)__B)[0];
801 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
807 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
814 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
822 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
825 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
829 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
837 return (__m128d)
vec_ctf (val, 0);
841 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
844 return ((__m128)
vec_ctf((__v4si)__A, 0));
847 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
850 __v2df rounded = vec_rint (__A);
864 temp = vec_mergeo (temp, temp);
865 result = (__v4si) vec_vpkudum ((__vector
long long) temp,
866 (__vector
long long) vzero);
869 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
870 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
871 result = (__v4si)
vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
874 return (__m128i) result;
877 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
882 return (__m64) result[0];
885 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
890 const __v4si vzero = { 0, 0, 0, 0 };
899 temp = vec_mergeo (temp, temp);
900 result = (__v4sf) vec_vpkudum ((__vector
long long) temp,
901 (__vector
long long) vzero);
904 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
905 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
906 result = (__v4sf)
vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
909 return ((__m128)result);
912 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
917 const __v4si vzero = { 0, 0, 0, 0 };
928 temp = vec_mergeo (temp, temp);
929 result = (__v4si) vec_vpkudum ((__vector
long long) temp,
930 (__vector
long long) vzero);
933 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
934 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
935 result = (__v4si)
vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
939 return ((__m128i) result);
942 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
947 return (__m64) result[0];
950 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
953 return ((__v4si)__A)[0];
957 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
966 result =
vec_ctf ((__vector
signed long long) tmp2, 0);
967 return (__m128d)result;
971 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 rounded = vec_rint((__v4sf) __A);
979 return (__m128i) result;
982 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
987 result =
vec_cts ((__v4sf) __A, 0);
988 return (__m128i) result;
991 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
996 return (__m128d) vec_doubleh ((__v4sf)__A);
1000 __v4sf a = (__v4sf)__A;
1003 #ifdef __LITTLE_ENDIAN__
1008 temp = __builtin_vsx_xxsldwi (a, a, 3);
1009 temp = __builtin_vsx_xxsldwi (a, temp, 2);
1022 return (__m128d) result;
1026 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1029 __v2df rounded = vec_rint((__v2df) __A);
1030 int result = ((__v2df)rounded)[0];
1035 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036 _mm_cvtsd_si64 (__m128d __A)
1038 __v2df rounded = vec_rint ((__v2df) __A );
1039 long long result = ((__v2df) rounded)[0];
1045 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046 _mm_cvtsd_si64x (__m128d __A)
1048 return _mm_cvtsd_si64 ((__v2df)__A);
1051 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1054 int result = ((__v2df)__A)[0];
1060 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1061 _mm_cvttsd_si64 (__m128d __A)
1063 long long result = ((__v2df)__A)[0];
1069 extern __inline
long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1070 _mm_cvttsd_si64x (__m128d __A)
1072 return _mm_cvttsd_si64 (__A);
1075 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1078 __v4sf result = (__v4sf)__A;
1080 #ifdef __LITTLE_ENDIAN__
1083 __v2df temp_b =
vec_splat((__v2df)__B, 0);
1086 result = __builtin_vsx_xxsldwi (result, result, 3);
1094 result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1096 result [0] = ((__v2df)__B)[0];
1098 return (__m128) result;
1101 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104 __v2df result = (__v2df)__A;
1107 return (__m128d)result;
1111 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1112 _mm_cvtsi64_sd (__m128d __A,
long long __B)
1114 __v2df result = (__v2df)__A;
1117 return (__m128d)result;
1121 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122 _mm_cvtsi64x_sd (__m128d __A,
long long __B)
1124 return _mm_cvtsi64_sd (__A, __B);
1127 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1130 #ifdef __LITTLE_ENDIAN__
1132 __v4sf temp =
vec_splat ((__v4sf)__B, 0);
1140 return (__m128d)
vec_mergel (res, (__v2df)__A);
1142 __v2df res = (__v2df)__A;
1143 res [0] = ((__v4sf)__B) [0];
1144 return (__m128d) res;
1148 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1151 __vector
double result;
1152 const int litmsk = __mask & 0x3;
1157 else if (litmsk == 1)
1158 result = vec_xxpermdi (__B, __A, 2);
1159 else if (litmsk == 2)
1160 result = vec_xxpermdi (__B, __A, 1);
1162 else if (litmsk == 1)
1163 result = vec_xxpermdi (__A, __B, 2);
1164 else if (litmsk == 2)
1165 result = vec_xxpermdi (__A, __B, 1);
1173 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1176 return (__m128d)
vec_mergel ((__v2df)__A, (__v2df)__B);
1179 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1182 return (__m128d)
vec_mergeh ((__v2df)__A, (__v2df)__B);
1185 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188 __v2df result = (__v2df)__A;
1190 return (__m128d)result;
1193 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1196 __v2df result = (__v2df)__A;
1198 return (__m128d)result;
1205 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208 __vector
unsigned long long result;
1209 static const __vector
unsigned int perm_mask =
1211 #ifdef __LITTLE_ENDIAN__
1212 0x80800040, 0x80808080, 0x80808080, 0x80808080
1214 0x80808080, 0x80808080, 0x80808080, 0x80804000
1218 result = ((__vector
unsigned long long)
1219 vec_vbpermq ((__vector
unsigned char) __A,
1220 (__vector
unsigned char) perm_mask));
1222 #ifdef __LITTLE_ENDIAN__
1230 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233 return (__m128i)
vec_packs ((__v8hi) __A, (__v8hi)__B);
1236 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239 return (__m128i)
vec_packs ((__v4si)__A, (__v4si)__B);
1242 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245 return (__m128i)
vec_packsu ((__v8hi) __A, (__v8hi)__B);
1248 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251 return (__m128i)
vec_mergel ((__v16qu)__A, (__v16qu)__B);
1254 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257 return (__m128i)
vec_mergel ((__v8hu)__A, (__v8hu)__B);
1260 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263 return (__m128i)
vec_mergel ((__v4su)__A, (__v4su)__B);
1266 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 return (__m128i)
vec_mergel ((__vector
long long) __A,
1270 (__vector
long long) __B);
1273 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1276 return (__m128i)
vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1279 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1282 return (__m128i)
vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1285 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1288 return (__m128i)
vec_mergeh ((__v4si)__A, (__v4si)__B);
1291 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1294 return (__m128i)
vec_mergeh ((__vector
long long) __A,
1295 (__vector
long long) __B);
1298 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1301 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1304 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1310 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1313 return (__m128i) ((__v4su)__A + (__v4su)__B);
1316 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1319 return (__m128i) ((__v2du)__A + (__v2du)__B);
1322 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1325 return (__m128i)
vec_adds ((__v16qi)__A, (__v16qi)__B);
1328 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1331 return (__m128i)
vec_adds ((__v8hi)__A, (__v8hi)__B);
1334 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1337 return (__m128i)
vec_adds ((__v16qu)__A, (__v16qu)__B);
1340 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1343 return (__m128i)
vec_adds ((__v8hu)__A, (__v8hu)__B);
1346 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1349 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1352 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1358 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1361 return (__m128i) ((__v4su)__A - (__v4su)__B);
1364 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1367 return (__m128i) ((__v2du)__A - (__v2du)__B);
1370 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1373 return (__m128i)
vec_subs ((__v16qi)__A, (__v16qi)__B);
1376 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 return (__m128i)
vec_subs ((__v8hi)__A, (__v8hi)__B);
1382 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385 return (__m128i)
vec_subs ((__v16qu)__A, (__v16qu)__B);
1388 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1391 return (__m128i)
vec_subs ((__v8hu)__A, (__v8hu)__B);
1394 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1397 __vector
signed int zero = {0, 0, 0, 0};
1399 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1402 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 __vector
signed int w0, w1;
1407 __vector
unsigned char xform1 = {
1408 #ifdef __LITTLE_ENDIAN__
1409 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1410 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1412 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1413 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1417 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1418 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1419 return (__m128i)
vec_perm (w0, w1, xform1);
1422 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1425 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1428 extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1431 unsigned int a = __A;
1432 unsigned int b = __B;
1434 return ((__m64)a * (__m64)
b);
1437 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1443 #ifdef __LITTLE_ENDIAN__
1448 :
"v" (__A),
"v" (__B)
1455 :
"v" (__A),
"v" (__B)
1458 return (__m128i) result;
1460 return (__m128i)
vec_mule ((__v4su)__A, (__v4su)__B);
1464 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1468 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1470 if (__B >= 0 && __B < 16)
1472 if (__builtin_constant_p(__B))
1477 result =
vec_sl ((__v8hi) __A, lshift);
1480 return (__m128i) result;
1483 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1487 __v4si result = { 0, 0, 0, 0 };
1489 if (__B >= 0 && __B < 32)
1491 if (__builtin_constant_p(__B) && __B < 16)
1496 result =
vec_sl ((__v4si) __A, lshift);
1499 return (__m128i) result;
1503 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1507 __v2di result = { 0, 0 };
1509 if (__B >= 0 && __B < 64)
1511 if (__builtin_constant_p(__B) && __B < 16)
1514 lshift = (__v2du)
vec_splats ((
unsigned int) __B);
1516 result =
vec_sl ((__v2di) __A, lshift);
1519 return (__m128i) result;
1523 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1526 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1531 if (__builtin_constant_p(__B))
1536 result =
vec_sra ((__v8hi) __A, rshift);
1538 return (__m128i) result;
1541 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1544 __v4su rshift = { 31, 31, 31, 31 };
1549 if (__builtin_constant_p(__B))
1554 rshift = (__v4su)
vec_splats((
unsigned int)__B);
1559 result =
vec_sra ((__v4si) __A, rshift);
1561 return (__m128i) result;
1564 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1568 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1571 result =
vec_sld ((__v16qu) __A, zeros, __N);
1575 return (__m128i) result;
1578 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1585 #ifdef __LITTLE_ENDIAN__
1586 if (__builtin_constant_p(__N))
1590 result =
vec_sld (zeros, (__v16qu) __A, (16 - __N));
1594 __v16qu shift =
vec_splats((
unsigned char)(__N*8));
1595 #ifdef __LITTLE_ENDIAN__
1596 result =
vec_sro ((__v16qu)__A, shift);
1598 result =
vec_slo ((__v16qu)__A, shift);
1604 return (__m128i) result;
1607 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1613 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1617 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1620 #ifdef __LITTLE_ENDIAN__
1621 result =
vec_sld ((__v16qu) __A, zeros, _imm5);
1623 result =
vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1628 return (__m128i) result;
1631 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1636 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1640 if (__builtin_constant_p(__B))
1645 result =
vec_sr ((__v8hi) __A, rshift);
1648 return (__m128i) result;
1651 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1655 __v4si result = { 0, 0, 0, 0 };
1659 if (__builtin_constant_p(__B))
1664 rshift = (__v4su)
vec_splats((
unsigned int)__B);
1669 result =
vec_sr ((__v4si) __A, rshift);
1672 return (__m128i) result;
1676 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1680 __v2di result = { 0, 0 };
1684 if (__builtin_constant_p(__B))
1689 rshift = (__v2du)
vec_splats((
unsigned long long)__B);
1692 rshift = (__v2du)
vec_splats ((
unsigned int) __B);
1694 result =
vec_sr ((__v2di) __A, rshift);
1697 return (__m128i) result;
1701 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1705 __vector __bool
short shmask;
1706 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1709 #ifdef __LITTLE_ENDIAN__
1715 result =
vec_sl ((__v8hu) __A, lshift);
1716 result =
vec_sel ((__v8hu) shmask, result, shmask);
1718 return (__m128i) result;
1721 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1725 __vector __bool
int shmask;
1726 const __v4su shmax = { 32, 32, 32, 32 };
1728 #ifdef __LITTLE_ENDIAN__
1734 result =
vec_sl ((__v4su) __A, lshift);
1735 result =
vec_sel ((__v4su) shmask, result, shmask);
1737 return (__m128i) result;
1741 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1745 __vector __bool
long long shmask;
1746 const __v2du shmax = { 64, 64 };
1751 result =
vec_sl ((__v2du) __A, lshift);
1752 result = (__v2du)
vec_sel ((__v2df) shmask, (__v2df)result, shmask);
1754 return (__m128i) result;
1758 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1761 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1765 #ifdef __LITTLE_ENDIAN__
1770 rshift =
vec_min (rshift, rshmax);
1771 result =
vec_sra ((__v8hi) __A, rshift);
1773 return (__m128i) result;
1776 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1779 const __v4su rshmax = { 31, 31, 31, 31 };
1783 #ifdef __LITTLE_ENDIAN__
1788 rshift =
vec_min (rshift, rshmax);
1789 result =
vec_sra ((__v4si) __A, rshift);
1791 return (__m128i) result;
1794 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1798 __vector __bool
short shmask;
1799 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1802 #ifdef __LITTLE_ENDIAN__
1808 result =
vec_sr ((__v8hu) __A, rshift);
1809 result =
vec_sel ((__v8hu) shmask, result, shmask);
1811 return (__m128i) result;
1814 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1818 __vector __bool
int shmask;
1819 const __v4su shmax = { 32, 32, 32, 32 };
1822 #ifdef __LITTLE_ENDIAN__
1828 result =
vec_sr ((__v4su) __A, rshift);
1829 result =
vec_sel ((__v4su) shmask, result, shmask);
1831 return (__m128i) result;
1835 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1839 __vector __bool
long long shmask;
1840 const __v2du shmax = { 64, 64 };
1845 result =
vec_sr ((__v2du) __A, rshift);
1846 result = (__v2du)
vec_sel ((__v2df) shmask, (__v2df)result, shmask);
1848 return (__m128i) result;
1852 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1855 return (
vec_and ((__v2df) __A, (__v2df) __B));
1858 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1861 return (
vec_andc ((__v2df) __B, (__v2df) __A));
1864 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1867 return (
vec_or ((__v2df) __A, (__v2df) __B));
1870 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1873 return (
vec_xor ((__v2df) __A, (__v2df) __B));
1876 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1879 return (__m128i)
vec_and ((__v2di) __A, (__v2di) __B);
1882 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1885 return (__m128i)
vec_andc ((__v2di) __B, (__v2di) __A);
1888 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1891 return (__m128i)
vec_or ((__v2di) __A, (__v2di) __B);
1894 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1897 return (__m128i)
vec_xor ((__v2di) __A, (__v2di) __B);
1900 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1903 return (__m128i)
vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1906 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1909 return (__m128i)
vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1912 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1915 return (__m128i)
vec_cmpeq ((__v4si) __A, (__v4si)__B);
1918 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1921 return (__m128i)
vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1924 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1927 return (__m128i)
vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1930 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1933 return (__m128i)
vec_cmplt ((__v4si) __A, (__v4si)__B);
1936 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1939 return (__m128i)
vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1942 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1945 return (__m128i)
vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1948 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1951 return (__m128i)
vec_cmpgt ((__v4si) __A, (__v4si)__B);
1954 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1957 return (
unsigned short) ((__v8hi)__A)[__N & 7];
1960 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1963 __v8hi result = (__v8hi)__A;
1965 result [(__N & 7)] =
__D;
1967 return (__m128i) result;
1970 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1973 return (__m128i)
vec_max ((__v8hi)__A, (__v8hi)__B);
1976 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1979 return (__m128i)
vec_max ((__v16qu) __A, (__v16qu)__B);
1982 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1985 return (__m128i)
vec_min ((__v8hi) __A, (__v8hi)__B);
1988 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
1991 return (__m128i)
vec_min ((__v16qu) __A, (__v16qu)__B);
1999 extern __inline
int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2002 __vector
unsigned long long result;
2003 static const __vector
unsigned char perm_mask =
2005 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
2006 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
2009 result = ((__vector
unsigned long long)
2010 vec_vbpermq ((__vector
unsigned char) __A,
2011 (__vector
unsigned char) perm_mask));
2013 #ifdef __LITTLE_ENDIAN__
2021 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2026 #ifdef __LITTLE_ENDIAN__
2027 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
2028 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
2030 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
2031 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
2035 w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2036 w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2037 return (__m128i)
vec_perm (w0, w1, xform1);
2040 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2043 unsigned long element_selector_98 = __mask & 0x03;
2044 unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2045 unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2046 unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2047 static const unsigned short permute_selectors[4] =
2049 #ifdef __LITTLE_ENDIAN__
2050 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2052 0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2056 #ifdef __LITTLE_ENDIAN__
2057 { 0x1716151413121110UL, 0UL};
2059 { 0x1011121314151617UL, 0UL};
2064 t.as_short[0] = permute_selectors[element_selector_98];
2065 t.as_short[1] = permute_selectors[element_selector_BA];
2066 t.as_short[2] = permute_selectors[element_selector_DC];
2067 t.as_short[3] = permute_selectors[element_selector_FE];
2068 pmask[1] = t.as_m64;
2070 r =
vec_perm (a, a, (__vector
unsigned char)pmask);
2074 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2077 unsigned long element_selector_10 = __mask & 0x03;
2078 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2079 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2080 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2081 static const unsigned short permute_selectors[4] =
2083 #ifdef __LITTLE_ENDIAN__
2084 0x0100, 0x0302, 0x0504, 0x0706
2086 0x0001, 0x0203, 0x0405, 0x0607
2090 #ifdef __LITTLE_ENDIAN__
2091 { 0UL, 0x1f1e1d1c1b1a1918UL};
2093 { 0UL, 0x18191a1b1c1d1e1fUL};
2097 t.as_short[0] = permute_selectors[element_selector_10];
2098 t.as_short[1] = permute_selectors[element_selector_32];
2099 t.as_short[2] = permute_selectors[element_selector_54];
2100 t.as_short[3] = permute_selectors[element_selector_76];
2101 pmask[0] = t.as_m64;
2103 r =
vec_perm (a, a, (__vector
unsigned char)pmask);
2107 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2110 unsigned long element_selector_10 = __mask & 0x03;
2111 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2112 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2113 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2114 static const unsigned int permute_selectors[4] =
2116 #ifdef __LITTLE_ENDIAN__
2117 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2119 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2124 t[0] = permute_selectors[element_selector_10];
2125 t[1] = permute_selectors[element_selector_32];
2126 t[2] = permute_selectors[element_selector_54] + 0x10101010;
2127 t[3] = permute_selectors[element_selector_76] + 0x10101010;
2128 return (__m128i)
vec_perm ((__v4si) __A, (__v4si)__A, (__vector
unsigned char)t);
2131 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2134 __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2136 __m128i_u *p = (__m128i_u*)
__C;
2139 mask = (__v16qu)
vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2140 tmp =
vec_sel (tmp, (__v16qu)__A, mask);
2144 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2147 return (__m128i)
vec_avg ((__v16qu)__A, (__v16qu)__B);
2150 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2153 return (__m128i)
vec_avg ((__v8hu)__A, (__v8hu)__B);
2157 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2161 __v16qu vmin, vmax, vabsdiff;
2163 const __v4su zero = { 0, 0, 0, 0 };
2170 vabsdiff =
vec_sub (vmax, vmin);
2172 vsum = (__vector
signed int)
vec_sum4s (vabsdiff, zero);
2174 result = vec_sum2s (vsum, (__vector
signed int) zero);
2176 #ifdef __LITTLE_ENDIAN__
2177 result =
vec_sld (result, result, 4);
2179 result =
vec_sld (result, result, 6);
2182 return (__m128i) result;
2185 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2186 _mm_stream_si32 (
int *__A,
int __B)
2198 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2199 _mm_stream_si64 (
long long int *__A,
long long int __B)
2211 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2224 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2234 *(__m128d*)__A = __B;
2237 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2249 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2253 __atomic_thread_fence (__ATOMIC_RELEASE);
2256 extern __inline
void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2260 __atomic_thread_fence (__ATOMIC_SEQ_CST);
2263 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2269 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2270 _mm_cvtsi64_si128 (
long long __A)
2272 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2276 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2277 _mm_cvtsi64x_si128 (
long long __A)
2279 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2284 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2287 return (__m128) __A;
2290 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2293 return (__m128i) __A;
2296 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2299 return (__m128d) __A;
2302 extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2305 return (__m128i) __A;
2308 extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2311 return (__m128) __A;
2314 extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
2317 return (__m128d) __A;
2321 #include_next <emmintrin.h>
__device__ __2f16 float c
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
static __inline__ vector signed char __ATTRS_o_ai vec_sro(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_ld(int __a, const vector signed char *__b)
#define vec_ctf(__a, __b)
static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b, vector signed char *__c)
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
static __inline__ vector signed int __ATTRS_o_ai vec_sld(vector signed int, vector signed int, unsigned const int __c)
static __inline__ vector short __ATTRS_o_ai vec_unpackl(vector signed char __a)
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a)
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_slo(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
static __inline__ vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a, vector short __b)
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
static __inline__ vector short __ATTRS_o_ai vec_unpackh(vector signed char __a)
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a)
static __inline__ vector signed char __ATTRS_o_ai vec_abs(vector signed char __a)
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
#define _mm_slli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
#define _mm_shuffle_pd(a, b, i)
Constructs a 128-bit floating-point vector of [2 x double] from two 128-bit vector parameters of [2 x...
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value,...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
#define _mm_insert_epi16(a, b, imm)
Constructs a 128-bit integer vector by first making a copy of the 128-bit integer vector parameter,...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
#define _mm_extract_epi16(a, imm)
Extracts 16 bits from a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
#define _mm_shufflelo_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four lower 16-bit elements of a 128-bit integer vect...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers,...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
#define _mm_bsrli_si128(a, imm)
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-poi...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements o...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it i...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the spe...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer val...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating point vector of [2 x double] to a 128-bit aligned memory location.
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to b...
#define _mm_shufflehi_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four upper 16-bit elements of a 128-bit integer vect...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value fro...
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-preci...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair o...
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-poin...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand,...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vecto...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-b...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each...
#define _mm_bslli_si128(a, imm)
#define _mm_srli_si128(a, imm)
Right-shifts the 128-bit integer vector operand by the specified number of bytes.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the...
#define _mm_shuffle_epi32(a, imm)
Constructs a 128-bit integer vector by shuffling four 32-bit elements of a 128-bit integer vector par...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] in...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral ...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
static __inline__ unsigned int unsigned char __D
static __inline__ unsigned char int __C