ETISS 0.8.0
Extendable Translating Instruction Set Simulator (version 0.8.0)
emmintrin.h
1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11  User Guide and Reference, version 9.0. */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is intended to help port code that uses Intel
15  intrinsics explicitly from x86_64 to powerpc64/powerpc64le.
16 
17  Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
18  types, the PowerPC VMX/VSX ISA is a good match for vector float SIMD
19  operations. However, scalar float operations in vector (XMM) registers
20  require the POWER8 VSX ISA (2.07) level. There are differences in data
21  format and placement of float scalars in the vector register, which
22  require extra steps to match SSE2 scalar float semantics on POWER.
23 
24  It should be noted that X86_64's MXCSR and PowerISA's FPSCR/VSCR
25  registers differ substantially. It is recommended to use the portable
26  <fenv.h> interface instead of accessing the MXCSR directly.
27 
28  Most SSE2 scalar float intrinsic operations can be performed more
29  efficiently as C language float scalar operations or optimized to
30  use vector SIMD operations. We recommend this for new applications.
31 */
32 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
34 
35 #ifndef EMMINTRIN_H_
36 #define EMMINTRIN_H_
37 
38 #if defined(__linux__) && defined(__ppc64__)
39 
40 #include <altivec.h>
41 
42 /* We need definitions from the SSE header files. */
43 #include <xmmintrin.h>
44 
45 /* SSE2 */
46 typedef __vector double __v2df;
47 typedef __vector long long __v2di;
48 typedef __vector unsigned long long __v2du;
49 typedef __vector int __v4si;
50 typedef __vector unsigned int __v4su;
51 typedef __vector short __v8hi;
52 typedef __vector unsigned short __v8hu;
53 typedef __vector signed char __v16qi;
54 typedef __vector unsigned char __v16qu;
55 
56 /* The Intel API is flexible enough that we must allow aliasing with other
57  vector types, and their scalar components. */
58 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
59 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
60 
61 /* Unaligned versions of the same types. */
62 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
63 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
64 
65 /* Define a permute mask built from two element selectors. */
66 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
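/* Usage sketch: the mask packs (x << 1) | y, where y selects the
   element of the first operand for result element [0] and x selects
   the element of the second operand for result element [1]. For
   example (values are illustrative):
     __m128d a = _mm_set_pd (1.0, 2.0);   // a = {2.0, 1.0}
     __m128d b = _mm_set_pd (3.0, 4.0);   // b = {4.0, 3.0}
     __m128d r = _mm_shuffle_pd (a, b, _MM_SHUFFLE2 (1, 0));
     // r = {a[0], b[1]} = {2.0, 3.0}
*/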
67 
68 /* Create a vector with element 0 as F and the rest zero. */
69 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_set_sd (double __F)
71 {
72  return __extension__ (__m128d){ __F, 0.0 };
73 }
74 
75 /* Create a vector with both elements equal to F. */
76 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
77 _mm_set1_pd (double __F)
78 {
79  return __extension__ (__m128d){ __F, __F };
80 }
81 
82 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_set_pd1 (double __F)
84 {
85  return _mm_set1_pd (__F);
86 }
87 
88 /* Create a vector with the lower value X and upper value W. */
89 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90 _mm_set_pd (double __W, double __X)
91 {
92  return __extension__ (__m128d){ __X, __W };
93 }
94 
95 /* Create a vector with the lower value W and upper value X. */
96 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_setr_pd (double __W, double __X)
98 {
99  return __extension__ (__m128d){ __W, __X };
100 }
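/* Usage sketch: _mm_set_pd lists arguments high element first, while
   _mm_setr_pd lists them in element (memory) order, so
   _mm_set_pd (1.0, 2.0) and _mm_setr_pd (2.0, 1.0) both yield the
   vector {2.0, 1.0}, with element [0] == 2.0.  */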
101 
102 /* Create an undefined vector. */
103 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104 _mm_undefined_pd (void)
105 {
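 /* Intentional self-reference: this produces an undefined value
    without generating a load. Some compilers warn about it under
    -Wuninitialized. */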
106  __m128d __Y = __Y;
107  return __Y;
108 }
109 
110 /* Create a vector of zeros. */
111 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
112 _mm_setzero_pd (void)
113 {
114  return (__m128d) vec_splats (0);
115 }
116 
117 /* Sets the low DPFP value of A from the low value of B. */
118 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119 _mm_move_sd (__m128d __A, __m128d __B)
120 {
121  __v2df result = (__v2df) __A;
122  result [0] = ((__v2df) __B)[0];
123  return (__m128d) result;
124 }
125 
126 /* Load two DPFP values from P. The address must be 16-byte aligned. */
127 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128 _mm_load_pd (double const *__P)
129 {
130  return ((__m128d)vec_ld(0, (__v16qu*)__P));
131 }
132 
133 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
134 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135 _mm_loadu_pd (double const *__P)
136 {
137  return (vec_vsx_ld(0, __P));
138 }
139 
140 /* Create a vector with both elements equal to *P. */
141 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_load1_pd (double const *__P)
143 {
144  return (vec_splats (*__P));
145 }
146 
147 /* Create a vector with element 0 as *P and the rest zero. */
148 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149 _mm_load_sd (double const *__P)
150 {
151  return _mm_set_sd (*__P);
152 }
153 
154 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_load_pd1 (double const *__P)
156 {
157  return _mm_load1_pd (__P);
158 }
159 
160 /* Load two DPFP values in reverse order. The address must be aligned. */
161 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162 _mm_loadr_pd (double const *__P)
163 {
164  __v2df __tmp = _mm_load_pd (__P);
165  return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
166 }
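/* Usage sketch: given a 16-byte aligned double p[2] = {1.0, 2.0},
   _mm_loadr_pd (p) returns {2.0, 1.0}; element [0] holds p[1].  */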
167 
168 /* Store two DPFP values. The address must be 16-byte aligned. */
169 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
170 _mm_store_pd (double *__P, __m128d __A)
171 {
172  vec_st((__v16qu)__A, 0, (__v16qu*)__P);
173 }
174 
175 /* Store two DPFP values. The address need not be 16-byte aligned. */
176 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177 _mm_storeu_pd (double *__P, __m128d __A)
178 {
179  *(__m128d_u *)__P = __A;
180 }
181 
182 /* Stores the lower DPFP value. */
183 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184 _mm_store_sd (double *__P, __m128d __A)
185 {
186  *__P = ((__v2df)__A)[0];
187 }
188 
189 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190 _mm_cvtsd_f64 (__m128d __A)
191 {
192  return ((__v2df)__A)[0];
193 }
194 
195 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196 _mm_storel_pd (double *__P, __m128d __A)
197 {
198  _mm_store_sd (__P, __A);
199 }
200 
201 /* Stores the upper DPFP value. */
202 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
203 _mm_storeh_pd (double *__P, __m128d __A)
204 {
205  *__P = ((__v2df)__A)[1];
206 }
207 /* Store the lower DPFP value across two words.
208  The address must be 16-byte aligned. */
209 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
210 _mm_store1_pd (double *__P, __m128d __A)
211 {
212  _mm_store_pd (__P, vec_splat (__A, 0));
213 }
214 
215 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 _mm_store_pd1 (double *__P, __m128d __A)
217 {
218  _mm_store1_pd (__P, __A);
219 }
220 
221 /* Store two DPFP values in reverse order. The address must be aligned. */
222 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
223 _mm_storer_pd (double *__P, __m128d __A)
224 {
225  _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
226 }
227 
228 /* Intel intrinsic. */
229 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
230 _mm_cvtsi128_si64 (__m128i __A)
231 {
232  return ((__v2di)__A)[0];
233 }
234 
235 /* Microsoft intrinsic. */
236 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
237 _mm_cvtsi128_si64x (__m128i __A)
238 {
239  return ((__v2di)__A)[0];
240 }
241 
242 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243 _mm_add_pd (__m128d __A, __m128d __B)
244 {
245  return (__m128d) ((__v2df)__A + (__v2df)__B);
246 }
247 
248 /* Add the lower double-precision (64-bit) floating-point element in
249  a and b, store the result in the lower element of dst, and copy
250  the upper element from a to the upper element of dst. */
251 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
252 _mm_add_sd (__m128d __A, __m128d __B)
253 {
254  __A[0] = __A[0] + __B[0];
255  return (__A);
256 }
257 
258 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259 _mm_sub_pd (__m128d __A, __m128d __B)
260 {
261  return (__m128d) ((__v2df)__A - (__v2df)__B);
262 }
263 
264 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_sub_sd (__m128d __A, __m128d __B)
266 {
267  __A[0] = __A[0] - __B[0];
268  return (__A);
269 }
270 
271 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
272 _mm_mul_pd (__m128d __A, __m128d __B)
273 {
274  return (__m128d) ((__v2df)__A * (__v2df)__B);
275 }
276 
277 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278 _mm_mul_sd (__m128d __A, __m128d __B)
279 {
280  __A[0] = __A[0] * __B[0];
281  return (__A);
282 }
283 
284 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
285 _mm_div_pd (__m128d __A, __m128d __B)
286 {
287  return (__m128d) ((__v2df)__A / (__v2df)__B);
288 }
289 
290 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291 _mm_div_sd (__m128d __A, __m128d __B)
292 {
293  __A[0] = __A[0] / __B[0];
294  return (__A);
295 }
296 
297 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
298 _mm_sqrt_pd (__m128d __A)
299 {
300  return (vec_sqrt (__A));
301 }
302 
303 /* Return pair {sqrt (B[0]), A[1]}. */
304 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
305 _mm_sqrt_sd (__m128d __A, __m128d __B)
306 {
307  __v2df c;
308  c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
309  return (__m128d) _mm_setr_pd (c[0], __A[1]);
310 }
311 
312 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
313 _mm_min_pd (__m128d __A, __m128d __B)
314 {
315  return (vec_min (__A, __B));
316 }
317 
318 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319 _mm_min_sd (__m128d __A, __m128d __B)
320 {
321  __v2df a, b, c;
322  a = vec_splats (__A[0]);
323  b = vec_splats (__B[0]);
324  c = vec_min (a, b);
325  return (__m128d) _mm_setr_pd (c[0], __A[1]);
326 }
327 
328 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329 _mm_max_pd (__m128d __A, __m128d __B)
330 {
331  return (vec_max (__A, __B));
332 }
333 
334 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335 _mm_max_sd (__m128d __A, __m128d __B)
336 {
337  __v2df a, b, c;
338  a = vec_splats (__A[0]);
339  b = vec_splats (__B[0]);
340  c = vec_max (a, b);
341  return (__m128d) _mm_setr_pd (c[0], __A[1]);
342 }
343 
344 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345 _mm_cmpeq_pd (__m128d __A, __m128d __B)
346 {
347  return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
348 }
349 
350 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351 _mm_cmplt_pd (__m128d __A, __m128d __B)
352 {
353  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
354 }
355 
356 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357 _mm_cmple_pd (__m128d __A, __m128d __B)
358 {
359  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
360 }
361 
362 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363 _mm_cmpgt_pd (__m128d __A, __m128d __B)
364 {
365  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
366 }
367 
368 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 _mm_cmpge_pd (__m128d __A, __m128d __B)
370 {
371  return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
372 }
373 
374 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375 _mm_cmpneq_pd (__m128d __A, __m128d __B)
376 {
377  __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
378  return ((__m128d)vec_nor (temp, temp));
379 }
380 
381 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
382 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
383 {
384  return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
385 }
386 
387 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388 _mm_cmpnle_pd (__m128d __A, __m128d __B)
389 {
390  return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
391 }
392 
393 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394 _mm_cmpngt_pd (__m128d __A, __m128d __B)
395 {
396  return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
397 }
398 
399 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 _mm_cmpnge_pd (__m128d __A, __m128d __B)
401 {
402  return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
403 }
404 
405 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406 _mm_cmpord_pd (__m128d __A, __m128d __B)
407 {
408 #if _ARCH_PWR8
409  __v2du c, d;
410  /* Comparing a value against itself returns false (0's) if it is NAN. */
411  c = (__v2du)vec_cmpeq (__A, __A);
412  d = (__v2du)vec_cmpeq (__B, __B);
413 #else
414  __v2du a, b;
415  __v2du c, d;
416  const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
417  a = (__v2du)vec_abs ((__v2df)__A);
418  b = (__v2du)vec_abs ((__v2df)__B);
419  c = (__v2du)vec_cmpgt (double_exp_mask, a);
420  d = (__v2du)vec_cmpgt (double_exp_mask, b);
421 #endif
422  /* A != NAN and B != NAN. */
423  return ((__m128d)vec_and(c, d));
424 }
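/* Usage sketch: each result lane is all '1's when both corresponding
   inputs are ordered (neither is NAN) and all '0's otherwise; e.g.
   comparing {1.0, NAN} with {2.0, 3.0} yields {~0, 0}, since only the
   low lane has two non-NAN operands.  */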
425 
426 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
427 _mm_cmpunord_pd (__m128d __A, __m128d __B)
428 {
429 #if _ARCH_PWR8
430  __v2du c, d;
431  /* Comparing a value against itself returns false (0's) if it is NAN. */
432  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
433  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
434  /* A == NAN OR B == NAN converts to:
435  NOT(A != NAN) OR NOT(B != NAN). */
436  c = vec_nor (c, c);
437  return ((__m128d)vec_orc(c, d));
438 #else
439  __v2du c, d;
440  /* Comparing a value against itself returns false (0's) if it is NAN. */
441  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
442  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
443  /* Invert the results so '1's indicate a NAN. */
444  c = vec_nor (c, c);
445  d = vec_nor (d, d);
446  return ((__m128d)vec_or(c, d));
447 #endif
448 }
449 
450 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
451 _mm_cmpeq_sd(__m128d __A, __m128d __B)
452 {
453  __v2df a, b, c;
454  /* PowerISA VSX does not allow partial (for just the lower double)
455  results. So to ensure we don't generate spurious exceptions
456  (from the upper double values) we splat the lower double
457  before we do the operation. */
458  a = vec_splats (__A[0]);
459  b = vec_splats (__B[0]);
460  c = (__v2df) vec_cmpeq(a, b);
461  /* Then we merge the lower double result with the original upper
462  double from __A. */
463  return (__m128d) _mm_setr_pd (c[0], __A[1]);
464 }
465 
466 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
467 _mm_cmplt_sd (__m128d __A, __m128d __B)
468 {
469  __v2df a, b, c;
470  a = vec_splats (__A[0]);
471  b = vec_splats (__B[0]);
472  c = (__v2df) vec_cmplt(a, b);
473  return (__m128d) _mm_setr_pd (c[0], __A[1]);
474 }
475 
476 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
477 _mm_cmple_sd (__m128d __A, __m128d __B)
478 {
479  __v2df a, b, c;
480  a = vec_splats (__A[0]);
481  b = vec_splats (__B[0]);
482  c = (__v2df) vec_cmple(a, b);
483  return (__m128d) _mm_setr_pd (c[0], __A[1]);
484 }
485 
486 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
487 _mm_cmpgt_sd (__m128d __A, __m128d __B)
488 {
489  __v2df a, b, c;
490  a = vec_splats (__A[0]);
491  b = vec_splats (__B[0]);
492  c = (__v2df) vec_cmpgt(a, b);
493  return (__m128d) _mm_setr_pd (c[0], __A[1]);
494 }
495 
496 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
497 _mm_cmpge_sd (__m128d __A, __m128d __B)
498 {
499  __v2df a, b, c;
500  a = vec_splats (__A[0]);
501  b = vec_splats (__B[0]);
502  c = (__v2df) vec_cmpge(a, b);
503  return (__m128d) _mm_setr_pd (c[0], __A[1]);
504 }
505 
506 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507 _mm_cmpneq_sd (__m128d __A, __m128d __B)
508 {
509  __v2df a, b, c;
510  a = vec_splats (__A[0]);
511  b = vec_splats (__B[0]);
512  c = (__v2df) vec_cmpeq(a, b);
513  c = vec_nor (c, c);
514  return (__m128d) _mm_setr_pd (c[0], __A[1]);
515 }
516 
517 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
518 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
519 {
520  __v2df a, b, c;
521  a = vec_splats (__A[0]);
522  b = vec_splats (__B[0]);
523  /* Not less than is just greater than or equal. */
524  c = (__v2df) vec_cmpge(a, b);
525  return (__m128d) _mm_setr_pd (c[0], __A[1]);
526 }
527 
528 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529 _mm_cmpnle_sd (__m128d __A, __m128d __B)
530 {
531  __v2df a, b, c;
532  a = vec_splats (__A[0]);
533  b = vec_splats (__B[0]);
534  /* Not less than or equal is just greater than. */
535  c = (__v2df) vec_cmpgt(a, b);
536  return (__m128d) _mm_setr_pd (c[0], __A[1]);
537 }
538 
539 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
540 _mm_cmpngt_sd (__m128d __A, __m128d __B)
541 {
542  __v2df a, b, c;
543  a = vec_splats (__A[0]);
544  b = vec_splats (__B[0]);
545  /* Not greater than is just less than or equal. */
546  c = (__v2df) vec_cmple(a, b);
547  return (__m128d) _mm_setr_pd (c[0], __A[1]);
548 }
549 
550 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551 _mm_cmpnge_sd (__m128d __A, __m128d __B)
552 {
553  __v2df a, b, c;
554  a = vec_splats (__A[0]);
555  b = vec_splats (__B[0]);
556  /* Not greater than or equal is just less than. */
557  c = (__v2df) vec_cmplt(a, b);
558  return (__m128d) _mm_setr_pd (c[0], __A[1]);
559 }
560 
561 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562 _mm_cmpord_sd (__m128d __A, __m128d __B)
563 {
564  __v2df r;
565  r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
566  return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
567 }
568 
569 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
570 _mm_cmpunord_sd (__m128d __A, __m128d __B)
571 {
572  __v2df r;
573  r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
574  return (__m128d) _mm_setr_pd (r[0], __A[1]);
575 }
576 
577 /* FIXME
578  The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
579  exactly the same because GCC for PowerPC only generates unordered
580  compares (scalar and vector).
581  Technically _mm_comieq_sd et al. should be using the ordered
582  compare and signal for QNaNs. The _mm_ucomieq_sd et al. should
583  be OK. */
584 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585 _mm_comieq_sd (__m128d __A, __m128d __B)
586 {
587  return (__A[0] == __B[0]);
588 }
589 
590 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591 _mm_comilt_sd (__m128d __A, __m128d __B)
592 {
593  return (__A[0] < __B[0]);
594 }
595 
596 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
597 _mm_comile_sd (__m128d __A, __m128d __B)
598 {
599  return (__A[0] <= __B[0]);
600 }
601 
602 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
603 _mm_comigt_sd (__m128d __A, __m128d __B)
604 {
605  return (__A[0] > __B[0]);
606 }
607 
608 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
609 _mm_comige_sd (__m128d __A, __m128d __B)
610 {
611  return (__A[0] >= __B[0]);
612 }
613 
614 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
615 _mm_comineq_sd (__m128d __A, __m128d __B)
616 {
617  return (__A[0] != __B[0]);
618 }
619 
620 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
621 _mm_ucomieq_sd (__m128d __A, __m128d __B)
622 {
623  return (__A[0] == __B[0]);
624 }
625 
626 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627 _mm_ucomilt_sd (__m128d __A, __m128d __B)
628 {
629  return (__A[0] < __B[0]);
630 }
631 
632 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
633 _mm_ucomile_sd (__m128d __A, __m128d __B)
634 {
635  return (__A[0] <= __B[0]);
636 }
637 
638 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
639 _mm_ucomigt_sd (__m128d __A, __m128d __B)
640 {
641  return (__A[0] > __B[0]);
642 }
643 
644 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
645 _mm_ucomige_sd (__m128d __A, __m128d __B)
646 {
647  return (__A[0] >= __B[0]);
648 }
649 
650 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
651 _mm_ucomineq_sd (__m128d __A, __m128d __B)
652 {
653  return (__A[0] != __B[0]);
654 }
655 
656 /* Create a vector of Qi, where i is the element number. */
657 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
658 _mm_set_epi64x (long long __q1, long long __q0)
659 {
660  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
661 }
662 
663 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664 _mm_set_epi64 (__m64 __q1, __m64 __q0)
665 {
666  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
667 }
668 
669 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
670 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
671 {
672  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
673 }
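/* Usage sketch: arguments are listed high element first, so
   _mm_set_epi32 (3, 2, 1, 0) yields {0, 1, 2, 3} in element order,
   with element [0] == 0.  */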
674 
675 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
676 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
677  short __q3, short __q2, short __q1, short __q0)
678 {
679  return __extension__ (__m128i)(__v8hi){
680  __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
681 }
682 
683 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
684 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
685  char __q11, char __q10, char __q09, char __q08,
686  char __q07, char __q06, char __q05, char __q04,
687  char __q03, char __q02, char __q01, char __q00)
688 {
689  return __extension__ (__m128i)(__v16qi){
690  __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
691  __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
692  };
693 }
694 
695 /* Set all of the elements of the vector to A. */
696 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697 _mm_set1_epi64x (long long __A)
698 {
699  return _mm_set_epi64x (__A, __A);
700 }
701 
702 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703 _mm_set1_epi64 (__m64 __A)
704 {
705  return _mm_set_epi64 (__A, __A);
706 }
707 
708 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709 _mm_set1_epi32 (int __A)
710 {
711  return _mm_set_epi32 (__A, __A, __A, __A);
712 }
713 
714 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715 _mm_set1_epi16 (short __A)
716 {
717  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
718 }
719 
720 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
721 _mm_set1_epi8 (char __A)
722 {
723  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
724  __A, __A, __A, __A, __A, __A, __A, __A);
725 }
726 
727 /* Create a vector of Qi, where i is the element number.
728  The parameter order is reversed from the _mm_set_epi* functions. */
729 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
730 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
731 {
732  return _mm_set_epi64 (__q1, __q0);
733 }
734 
735 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
737 {
738  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
739 }
740 
741 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
742 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
743  short __q4, short __q5, short __q6, short __q7)
744 {
745  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
746 }
747 
748 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
749 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
750  char __q04, char __q05, char __q06, char __q07,
751  char __q08, char __q09, char __q10, char __q11,
752  char __q12, char __q13, char __q14, char __q15)
753 {
754  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
755  __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
756 }
757 
758 /* Load 128 bits of integer data. The address must be 16-byte aligned. */
759 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
760 _mm_load_si128 (__m128i const *__P)
761 {
762  return *__P;
763 }
764 
765 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
766 _mm_loadu_si128 (__m128i_u const *__P)
767 {
768  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
769 }
770 
771 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772 _mm_loadl_epi64 (__m128i_u const *__P)
773 {
774  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
775 }
776 
777 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778 _mm_store_si128 (__m128i *__P, __m128i __B)
779 {
780  vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
781 }
782 
783 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
784 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
785 {
786  *__P = __B;
787 }
788 
789 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
790 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
791 {
792  *(long long *)__P = ((__v2di)__B)[0];
793 }
794 
795 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796 _mm_movepi64_pi64 (__m128i_u __B)
797 {
798  return (__m64) ((__v2di)__B)[0];
799 }
800 
801 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
802 _mm_movpi64_epi64 (__m64 __A)
803 {
804  return _mm_set_epi64 ((__m64)0LL, __A);
805 }
806 
807 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
808 _mm_move_epi64 (__m128i __A)
809 {
810  return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
811 }
812 
813 /* Create an undefined vector. */
814 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815 _mm_undefined_si128 (void)
816 {
817  __m128i __Y = __Y;
818  return __Y;
819 }
820 
821 /* Create a vector of zeros. */
822 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
823 _mm_setzero_si128 (void)
824 {
825  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
826 }
827 
828 #ifdef _ARCH_PWR8
829 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
830 _mm_cvtepi32_pd (__m128i __A)
831 {
832  __v2di val;
833  /* For LE we need Vector Unpack Low Signed Word, which
834  vec_unpackh generates. */
835  val = (__v2di)vec_unpackh ((__v4si)__A);
836 
837  return (__m128d)vec_ctf (val, 0);
838 }
839 #endif
840 
841 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _mm_cvtepi32_ps (__m128i __A)
843 {
844  return ((__m128)vec_ctf((__v4si)__A, 0));
845 }
846 
847 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
848 _mm_cvtpd_epi32 (__m128d __A)
849 {
850  __v2df rounded = vec_rint (__A);
851  __v4si result, temp;
852  const __v4si vzero =
853  { 0, 0, 0, 0 };
854 
855  /* VSX Vector truncate Double-Precision to integer and Convert to
856  Signed Integer Word format with Saturate. */
857  __asm__(
858  "xvcvdpsxws %x0,%x1"
859  : "=wa" (temp)
860  : "wa" (rounded)
861  : );
862 
863 #ifdef _ARCH_PWR8
864  temp = vec_mergeo (temp, temp);
865  result = (__v4si) vec_vpkudum ((__vector long long) temp,
866  (__vector long long) vzero);
867 #else
868  {
869  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
870  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
871  result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
872  }
873 #endif
874  return (__m128i) result;
875 }
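/* Usage sketch: the two doubles are rounded (vec_rint honors the
   current rounding mode, round-to-nearest-even by default) and packed
   into the low two words; the high two words are zeroed. E.g.
   _mm_cvtpd_epi32 (_mm_set_pd (2.5, 1.5)) returns {2, 2, 0, 0}, since
   both 1.5 and 2.5 round to the even value 2.  */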
876 
877 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
878 _mm_cvtpd_pi32 (__m128d __A)
879 {
880  __m128i result = _mm_cvtpd_epi32(__A);
881 
882  return (__m64) result[0];
883 }
884 
885 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
886 _mm_cvtpd_ps (__m128d __A)
887 {
888  __v4sf result;
889  __v4si temp;
890  const __v4si vzero = { 0, 0, 0, 0 };
891 
892  __asm__(
893  "xvcvdpsp %x0,%x1"
894  : "=wa" (temp)
895  : "wa" (__A)
896  : );
897 
898 #ifdef _ARCH_PWR8
899  temp = vec_mergeo (temp, temp);
900  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
901  (__vector long long) vzero);
902 #else
903  {
904  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
905  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
906  result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
907  }
908 #endif
909  return ((__m128)result);
910 }
911 
912 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
913 _mm_cvttpd_epi32 (__m128d __A)
914 {
915  __v4si result;
916  __v4si temp;
917  const __v4si vzero = { 0, 0, 0, 0 };
918 
919  /* VSX Vector truncate Double-Precision to integer and Convert to
920  Signed Integer Word format with Saturate. */
921  __asm__(
922  "xvcvdpsxws %x0,%x1"
923  : "=wa" (temp)
924  : "wa" (__A)
925  : );
926 
927 #ifdef _ARCH_PWR8
928  temp = vec_mergeo (temp, temp);
929  result = (__v4si) vec_vpkudum ((__vector long long) temp,
930  (__vector long long) vzero);
931 #else
932  {
933  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
934  0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
935  result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
936  }
937 #endif
938 
939  return ((__m128i) result);
940 }
941 
942 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
943 _mm_cvttpd_pi32 (__m128d __A)
944 {
945  __m128i result = _mm_cvttpd_epi32 (__A);
946 
947  return (__m64) result[0];
948 }
949 
950 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951 _mm_cvtsi128_si32 (__m128i __A)
952 {
953  return ((__v4si)__A)[0];
954 }
955 
956 #ifdef _ARCH_PWR8
957 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _mm_cvtpi32_pd (__m64 __A)
959 {
960  __v4si temp;
961  __v2di tmp2;
962  __v2df result;
963 
964  temp = (__v4si)vec_splats (__A);
965  tmp2 = (__v2di)vec_unpackl (temp);
966  result = vec_ctf ((__vector signed long long) tmp2, 0);
967  return (__m128d)result;
968 }
969 #endif
970 
971 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
972 _mm_cvtps_epi32 (__m128 __A)
973 {
974  __v4sf rounded;
975  __v4si result;
976 
977  rounded = vec_rint((__v4sf) __A);
978  result = vec_cts (rounded, 0);
979  return (__m128i) result;
980 }
981 
982 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_cvttps_epi32 (__m128 __A)
984 {
985  __v4si result;
986 
987  result = vec_cts ((__v4sf) __A, 0);
988  return (__m128i) result;
989 }
990 
991 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvtps_pd (__m128 __A)
993 {
994  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
995 #ifdef vec_doubleh
996  return (__m128d) vec_doubleh ((__v4sf)__A);
997 #else
998  /* Otherwise the compiler is older, so we need to generate the
999  equivalent code. */
1000  __v4sf a = (__v4sf)__A;
1001  __v4sf temp;
1002  __v2df result;
1003 #ifdef __LITTLE_ENDIAN__
1004  /* The input float values are in elements {[0], [1]} but the convert
1005  instruction needs them in elements {[1], [3]}, so we use two
1006  shift left double vector word immediates to get the elements
1007  lined up. */
1008  temp = __builtin_vsx_xxsldwi (a, a, 3);
1009  temp = __builtin_vsx_xxsldwi (a, temp, 2);
1010 #else
1011  /* The input float values are in elements {[0], [1]} but the convert
1012  instruction needs them in elements {[0], [2]}, so we merge the
1013  vector with itself (vec_vmrghw) to get the elements
1014  lined up. */
1015  temp = vec_vmrghw (a, a);
1016 #endif
1017  __asm__(
1018  " xvcvspdp %x0,%x1"
1019  : "=wa" (result)
1020  : "wa" (temp)
1021  : );
1022  return (__m128d) result;
1023 #endif
1024 }
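/* Usage sketch: only the low two float elements are converted, so
   {1.5f, 2.5f, 3.5f, 4.5f} becomes the double vector {1.5, 2.5}.  */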
1025 
1026 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1027 _mm_cvtsd_si32 (__m128d __A)
1028 {
1029  __v2df rounded = vec_rint((__v2df) __A);
1030  int result = ((__v2df)rounded)[0];
1031 
1032  return result;
1033 }
1034 /* Intel intrinsic. */
1035 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036 _mm_cvtsd_si64 (__m128d __A)
1037 {
1038  __v2df rounded = vec_rint ((__v2df) __A );
1039  long long result = ((__v2df) rounded)[0];
1040 
1041  return result;
1042 }
1043 
1044 /* Microsoft intrinsic. */
1045 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1046 _mm_cvtsd_si64x (__m128d __A)
1047 {
1048  return _mm_cvtsd_si64 ((__v2df)__A);
1049 }
1050 
1051 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1052 _mm_cvttsd_si32 (__m128d __A)
1053 {
1054  int result = ((__v2df)__A)[0];
1055 
1056  return result;
1057 }
1058 
1059 /* Intel intrinsic. */
1060 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1061 _mm_cvttsd_si64 (__m128d __A)
1062 {
1063  long long result = ((__v2df)__A)[0];
1064 
1065  return result;
1066 }
1067 
1068 /* Microsoft intrinsic. */
1069 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1070 _mm_cvttsd_si64x (__m128d __A)
1071 {
1072  return _mm_cvttsd_si64 (__A);
1073 }
1074 
1075 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1076 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1077 {
1078  __v4sf result = (__v4sf)__A;
1079 
1080 #ifdef __LITTLE_ENDIAN__
1081  __v4sf temp_s;
1082  /* Copy double element[0] to element [1] for conversion. */
1083  __v2df temp_b = vec_splat((__v2df)__B, 0);
1084 
1085  /* Pre-rotate __A left 3 (logically right 1) elements. */
1086  result = __builtin_vsx_xxsldwi (result, result, 3);
1087  /* Convert double to single float scalar in a vector. */
1088  __asm__(
1089  "xscvdpsp %x0,%x1"
1090  : "=wa" (temp_s)
1091  : "wa" (temp_b)
1092  : );
1093  /* Shift the resulting scalar into vector element [0]. */
1094  result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1095 #else
1096  result [0] = ((__v2df)__B)[0];
1097 #endif
1098  return (__m128) result;
1099 }
1100 
1101 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102 _mm_cvtsi32_sd (__m128d __A, int __B)
1103 {
1104  __v2df result = (__v2df)__A;
1105  double db = __B;
1106  result [0] = db;
1107  return (__m128d)result;
1108 }
1109 
1110 /* Intel intrinsic. */
1111 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1112 _mm_cvtsi64_sd (__m128d __A, long long __B)
1113 {
1114  __v2df result = (__v2df)__A;
1115  double db = __B;
1116  result [0] = db;
1117  return (__m128d)result;
1118 }
1119 
1120 /* Microsoft intrinsic. */
1121 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1122 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1123 {
1124  return _mm_cvtsi64_sd (__A, __B);
1125 }
1126 
1127 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128 _mm_cvtss_sd (__m128d __A, __m128 __B)
1129 {
1130 #ifdef __LITTLE_ENDIAN__
1131  /* Use splat to move element [0] into position for the convert. */
1132  __v4sf temp = vec_splat ((__v4sf)__B, 0);
1133  __v2df res;
1134  /* Convert single float scalar to double in a vector. */
1135  __asm__(
1136  "xscvspdp %x0,%x1"
1137  : "=wa" (res)
1138  : "wa" (temp)
1139  : );
1140  return (__m128d) vec_mergel (res, (__v2df)__A);
1141 #else
1142  __v2df res = (__v2df)__A;
1143  res [0] = ((__v4sf)__B) [0];
1144  return (__m128d) res;
1145 #endif
1146 }
1147 
1148 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1149 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1150 {
1151  __vector double result;
1152  const int litmsk = __mask & 0x3;
1153 
1154  if (litmsk == 0)
1155  result = vec_mergeh (__A, __B);
1156 #if __GNUC__ < 6
1157  else if (litmsk == 1)
1158  result = vec_xxpermdi (__B, __A, 2);
1159  else if (litmsk == 2)
1160  result = vec_xxpermdi (__B, __A, 1);
1161 #else
1162  else if (litmsk == 1)
1163  result = vec_xxpermdi (__A, __B, 2);
1164  else if (litmsk == 2)
1165  result = vec_xxpermdi (__A, __B, 1);
1166 #endif
1167  else
1168  result = vec_mergel (__A, __B);
1169 
1170  return result;
1171 }
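/* Usage sketch: only the low two mask bits matter; bit 0 picks the
   element of __A placed in result [0] and bit 1 picks the element of
   __B placed in result [1]:
     litmsk 0 -> {__A[0], __B[0]}     litmsk 1 -> {__A[1], __B[0]}
     litmsk 2 -> {__A[0], __B[1]}     litmsk 3 -> {__A[1], __B[1]}  */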
1172 
1173 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1174 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1175 {
1176  return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1177 }
1178 
1179 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1181 {
1182  return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1183 }
1184 
1185 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1186 _mm_loadh_pd (__m128d __A, double const *__B)
1187 {
1188  __v2df result = (__v2df)__A;
1189  result [1] = *__B;
1190  return (__m128d)result;
1191 }
1192 
1193 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1194 _mm_loadl_pd (__m128d __A, double const *__B)
1195 {
1196  __v2df result = (__v2df)__A;
1197  result [0] = *__B;
1198  return (__m128d)result;
1199 }
1200 
1201 #ifdef _ARCH_PWR8
1202 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1203 
1204 /* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1205 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206 _mm_movemask_pd (__m128d __A)
1207 {
1208  __vector unsigned long long result;
1209  static const __vector unsigned int perm_mask =
1210  {
1211 #ifdef __LITTLE_ENDIAN__
1212  0x80800040, 0x80808080, 0x80808080, 0x80808080
1213 #else
1214  0x80808080, 0x80808080, 0x80808080, 0x80804000
1215 #endif
1216  };
1217 
1218  result = ((__vector unsigned long long)
1219  vec_vbpermq ((__vector unsigned char) __A,
1220  (__vector unsigned char) perm_mask));
1221 
1222 #ifdef __LITTLE_ENDIAN__
1223  return result[1];
1224 #else
1225  return result[0];
1226 #endif
1227 }
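/* Usage sketch: bit 0 of the result is the sign bit of element [0]
   and bit 1 that of element [1], so _mm_movemask_pd (_mm_set_pd (1.0,
   -2.0)) returns 1, because only the low element is negative.  */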
1228 #endif /* _ARCH_PWR8 */
1229 
1230 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1231 _mm_packs_epi16 (__m128i __A, __m128i __B)
1232 {
1233  return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1234 }
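/* Usage sketch: vec_packs narrows with signed saturation, matching
   SSE2: a 16-bit lane holding 300 packs to 127, and one holding -300
   packs to -128. __A supplies result bytes 0-7 and __B bytes 8-15.  */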
1235 
1236 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237 _mm_packs_epi32 (__m128i __A, __m128i __B)
1238 {
1239  return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1240 }
1241 
1242 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1243 _mm_packus_epi16 (__m128i __A, __m128i __B)
1244 {
1245  return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1246 }
1247 
1248 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1249 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1250 {
1251  return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1252 }
1253 
1254 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1256 {
1257  return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1258 }
1259 
1260 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1261 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1262 {
1263  return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1264 }
1265 
1266 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1267 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1268 {
1269  return (__m128i) vec_mergel ((__vector long long) __A,
1270  (__vector long long) __B);
1271 }
1272 
1273 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1275 {
1276  return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1277 }
1278 
1279 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1281 {
1282  return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1283 }
1284 
1285 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1287 {
1288  return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1289 }
1290 
1291 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1293 {
1294  return (__m128i) vec_mergeh ((__vector long long) __A,
1295  (__vector long long) __B);
1296 }
1297 
1298 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299 _mm_add_epi8 (__m128i __A, __m128i __B)
1300 {
1301  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1302 }
1303 
1304 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305 _mm_add_epi16 (__m128i __A, __m128i __B)
1306 {
1307  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1308 }
1309 
1310 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311 _mm_add_epi32 (__m128i __A, __m128i __B)
1312 {
1313  return (__m128i) ((__v4su)__A + (__v4su)__B);
1314 }
1315 
1316 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm_add_epi64 (__m128i __A, __m128i __B)
1318 {
1319  return (__m128i) ((__v2du)__A + (__v2du)__B);
1320 }
1321 
1322 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323 _mm_adds_epi8 (__m128i __A, __m128i __B)
1324 {
1325  return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1326 }
1327 
1328 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329 _mm_adds_epi16 (__m128i __A, __m128i __B)
1330 {
1331  return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1332 }
1333 
1334 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335 _mm_adds_epu8 (__m128i __A, __m128i __B)
1336 {
1337  return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1338 }
1339 
1340 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm_adds_epu16 (__m128i __A, __m128i __B)
1342 {
1343  return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1344 }
1345 
1346 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347 _mm_sub_epi8 (__m128i __A, __m128i __B)
1348 {
1349  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1350 }
1351 
1352 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353 _mm_sub_epi16 (__m128i __A, __m128i __B)
1354 {
1355  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1356 }
1357 
1358 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359 _mm_sub_epi32 (__m128i __A, __m128i __B)
1360 {
1361  return (__m128i) ((__v4su)__A - (__v4su)__B);
1362 }
1363 
1364 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365 _mm_sub_epi64 (__m128i __A, __m128i __B)
1366 {
1367  return (__m128i) ((__v2du)__A - (__v2du)__B);
1368 }
1369 
1370 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371 _mm_subs_epi8 (__m128i __A, __m128i __B)
1372 {
1373  return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1374 }
1375 
1376 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377 _mm_subs_epi16 (__m128i __A, __m128i __B)
1378 {
1379  return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1380 }
1381 
1382 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383 _mm_subs_epu8 (__m128i __A, __m128i __B)
1384 {
1385  return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1386 }
1387 
1388 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1389 _mm_subs_epu16 (__m128i __A, __m128i __B)
1390 {
1391  return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1392 }
1393 
1394 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1395 _mm_madd_epi16 (__m128i __A, __m128i __B)
1396 {
1397  __vector signed int zero = {0, 0, 0, 0};
1398 
1399  return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1400 }
1401 
1402 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1404 {
1405  __vector signed int w0, w1;
1406 
1407  __vector unsigned char xform1 = {
1408 #ifdef __LITTLE_ENDIAN__
1409  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1410  0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1411 #else
1412  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1413  0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1414 #endif
1415  };
1416 
1417  w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1418  w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1419  return (__m128i) vec_perm (w0, w1, xform1);
1420 }
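/* Usage sketch: each lane receives the high 16 bits of the 32-bit
   signed product; e.g. 0x4000 * 0x4000 == 0x10000000, so multiplying
   two vectors of 0x4000 lanes yields 0x1000 in every lane. The
   even/odd multiplies form full 32-bit products and the permute
   gathers their high halves back into lane order.  */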
1421 
1422 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1423 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1424 {
1425  return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1426 }
1427 
1428 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429 _mm_mul_su32 (__m64 __A, __m64 __B)
1430 {
1431  unsigned int a = __A;
1432  unsigned int b = __B;
1433 
1434  return ((__m64)a * (__m64)b);
1435 }
1436 
1437 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1438 _mm_mul_epu32 (__m128i __A, __m128i __B)
1439 {
1440 #if __GNUC__ < 8
1441  __v2du result;
1442 
1443 #ifdef __LITTLE_ENDIAN__
1444  /* VMX Vector Multiply Odd Unsigned Word. */
1445  __asm__(
1446  "vmulouw %0,%1,%2"
1447  : "=v" (result)
1448  : "v" (__A), "v" (__B)
1449  : );
1450 #else
1451  /* VMX Vector Multiply Even Unsigned Word. */
1452  __asm__(
1453  "vmuleuw %0,%1,%2"
1454  : "=v" (result)
1455  : "v" (__A), "v" (__B)
1456  : );
1457 #endif
1458  return (__m128i) result;
1459 #else
1460  return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1461 #endif
1462 }
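/* Usage sketch: like x86 PMULUDQ, only the even-numbered 32-bit lanes
   (elements 0 and 2) participate; with __A = {2, 9, 3, 9} and
   __B = {4, 9, 5, 9} the result is the two 64-bit products {8, 15}.  */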
1463 
1464 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1465 _mm_slli_epi16 (__m128i __A, int __B)
1466 {
1467  __v8hu lshift;
1468  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1469 
1470  if (__B >= 0 && __B < 16)
1471  {
1472  if (__builtin_constant_p(__B))
1473  lshift = (__v8hu) vec_splat_s16(__B);
1474  else
1475  lshift = vec_splats ((unsigned short) __B);
1476 
1477  result = vec_sl ((__v8hi) __A, lshift);
1478  }
1479 
1480  return (__m128i) result;
1481 }
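/* Usage sketch: shift counts outside 0-15 return all zeros, matching
   SSE2 semantics; hence the zero-initialized result and the range
   check above. E.g. _mm_slli_epi16 (x, 20) is zero for any x.  */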
1482 
1483 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1484 _mm_slli_epi32 (__m128i __A, int __B)
1485 {
1486  __v4su lshift;
1487  __v4si result = { 0, 0, 0, 0 };
1488 
1489  if (__B >= 0 && __B < 32)
1490  {
1491  if (__builtin_constant_p(__B) && __B < 16)
1492  lshift = (__v4su) vec_splat_s32(__B);
1493  else
1494  lshift = vec_splats ((unsigned int) __B);
1495 
1496  result = vec_sl ((__v4si) __A, lshift);
1497  }
1498 
1499  return (__m128i) result;
1500 }
1501 
1502 #ifdef _ARCH_PWR8
1503 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1504 _mm_slli_epi64 (__m128i __A, int __B)
1505 {
1506  __v2du lshift;
1507  __v2di result = { 0, 0 };
1508 
1509  if (__B >= 0 && __B < 64)
1510  {
1511  if (__builtin_constant_p(__B) && __B < 16)
1512  lshift = (__v2du) vec_splat_s32(__B);
1513  else
1514  lshift = (__v2du) vec_splats ((unsigned int) __B);
1515 
1516  result = vec_sl ((__v2di) __A, lshift);
1517  }
1518 
1519  return (__m128i) result;
1520 }
1521 #endif
1522 
1523 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1524 _mm_srai_epi16 (__m128i __A, int __B)
1525 {
1526  __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1527  __v8hi result;
1528 
1529  if (__B < 16)
1530  {
1531  if (__builtin_constant_p(__B))
1532  rshift = (__v8hu) vec_splat_s16(__B);
1533  else
1534  rshift = vec_splats ((unsigned short) __B);
1535  }
1536  result = vec_sra ((__v8hi) __A, rshift);
1537 
1538  return (__m128i) result;
1539 }
1540 
1541 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1542 _mm_srai_epi32 (__m128i __A, int __B)
1543 {
1544  __v4su rshift = { 31, 31, 31, 31 };
1545  __v4si result;
1546 
1547  if (__B < 32)
1548  {
1549  if (__builtin_constant_p(__B))
1550  {
1551  if (__B < 16)
1552  rshift = (__v4su) vec_splat_s32(__B);
1553  else
1554  rshift = (__v4su) vec_splats((unsigned int)__B);
1555  }
1556  else
1557  rshift = vec_splats ((unsigned int) __B);
1558  }
1559  result = vec_sra ((__v4si) __A, rshift);
1560 
1561  return (__m128i) result;
1562 }
1563 
1564 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1565 _mm_bslli_si128 (__m128i __A, const int __N)
1566 {
1567  __v16qu result;
1568  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1569 
1570  if (__N < 16)
1571  result = vec_sld ((__v16qu) __A, zeros, __N);
1572  else
1573  result = zeros;
1574 
1575  return (__m128i) result;
1576 }
1577 
1578 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1579 _mm_bsrli_si128 (__m128i __A, const int __N)
1580 {
1581  __v16qu result;
1582  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1583 
1584  if (__N < 16)
1585 #ifdef __LITTLE_ENDIAN__
1586  if (__builtin_constant_p(__N))
1587  /* We would like to use Vector Shift Left Double by Octet
1588  Immediate here, to use the immediate form and avoid
1589  loading the __N * 8 value into a separate VR. */
1590  result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1591  else
1592 #endif
1593  {
1594  __v16qu shift = vec_splats((unsigned char)(__N*8));
1595 #ifdef __LITTLE_ENDIAN__
1596  result = vec_sro ((__v16qu)__A, shift);
1597 #else
1598  result = vec_slo ((__v16qu)__A, shift);
1599 #endif
1600  }
1601  else
1602  result = zeros;
1603 
1604  return (__m128i) result;
1605 }
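/* Usage sketch: the shift is by whole bytes across the full 128 bits;
   _mm_bsrli_si128 (x, 8) moves the high quadword of x into the low
   quadword and zero-fills the rest. Counts of 16 or more yield zero.  */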
1606 
1607 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1608 _mm_srli_si128 (__m128i __A, const int __N)
1609 {
1610  return _mm_bsrli_si128 (__A, __N);
1611 }
1612 
1613 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1614 _mm_slli_si128 (__m128i __A, const int _imm5)
1615 {
1616  __v16qu result;
1617  const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1618 
1619  if (_imm5 < 16)
1620 #ifdef __LITTLE_ENDIAN__
1621  result = vec_sld ((__v16qu) __A, zeros, _imm5);
1622 #else
1623  result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1624 #endif
1625  else
1626  result = zeros;
1627 
1628  return (__m128i) result;
1629 }
1630 
1631 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1633 _mm_srli_epi16 (__m128i __A, int __B)
1634 {
1635  __v8hu rshift;
1636  __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1637 
1638  if (__B < 16)
1639  {
1640  if (__builtin_constant_p(__B))
1641  rshift = (__v8hu) vec_splat_s16(__B);
1642  else
1643  rshift = vec_splats ((unsigned short) __B);
1644 
1645  result = vec_sr ((__v8hi) __A, rshift);
1646  }
1647 
1648  return (__m128i) result;
1649 }
1650 
1651 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1652 _mm_srli_epi32 (__m128i __A, int __B)
1653 {
1654  __v4su rshift;
1655  __v4si result = { 0, 0, 0, 0 };
1656 
1657  if (__B < 32)
1658  {
1659  if (__builtin_constant_p(__B))
1660  {
1661  if (__B < 16)
1662  rshift = (__v4su) vec_splat_s32(__B);
1663  else
1664  rshift = (__v4su) vec_splats((unsigned int)__B);
1665  }
1666  else
1667  rshift = vec_splats ((unsigned int) __B);
1668 
1669  result = vec_sr ((__v4si) __A, rshift);
1670  }
1671 
1672  return (__m128i) result;
1673 }
1674 
1675 #ifdef _ARCH_PWR8
1676 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1677 _mm_srli_epi64 (__m128i __A, int __B)
1678 {
1679  __v2du rshift;
1680  __v2di result = { 0, 0 };
1681 
1682  if (__B < 64)
1683  {
1684  if (__builtin_constant_p(__B))
1685  {
1686  if (__B < 16)
1687  rshift = (__v2du) vec_splat_s32(__B);
1688  else
1689  rshift = (__v2du) vec_splats((unsigned long long)__B);
1690  }
1691  else
1692  rshift = (__v2du) vec_splats ((unsigned int) __B);
1693 
1694  result = vec_sr ((__v2di) __A, rshift);
1695  }
1696 
1697  return (__m128i) result;
1698 }
1699 #endif
1700 
1701 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702 _mm_sll_epi16 (__m128i __A, __m128i __B)
1703 {
1704  __v8hu lshift;
1705  __vector __bool short shmask;
1706  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1707  __v8hu result;
1708 
1709 #ifdef __LITTLE_ENDIAN__
1710  lshift = vec_splat ((__v8hu) __B, 0);
1711 #else
1712  lshift = vec_splat ((__v8hu) __B, 3);
1713 #endif
1714  shmask = vec_cmple (lshift, shmax);
1715  result = vec_sl ((__v8hu) __A, lshift);
1716  result = vec_sel ((__v8hu) shmask, result, shmask);
1717 
1718  return (__m128i) result;
1719 }
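
/* Editor's note, not part of the original header: SSE2 defines a shift
   whose count is >= the element width to return zero, whereas VMX
   vec_sl interprets the count modulo the element width.  The pattern
   above bridges the gap: shmask is all-ones in lanes with a valid
   count and all-zero otherwise, so vec_sel either keeps the shifted
   lane or substitutes shmask itself, which is exactly zero in the
   out-of-range case.  The same idiom recurs in the epi32/epi64 and
   srl variants below. */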
1720 
1721 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1722 _mm_sll_epi32 (__m128i __A, __m128i __B)
1723 {
1724  __v4su lshift;
1725  __vector __bool int shmask;
1726  const __v4su shmax = { 32, 32, 32, 32 };
1727  __v4su result;
1728 #ifdef __LITTLE_ENDIAN__
1729  lshift = vec_splat ((__v4su) __B, 0);
1730 #else
1731  lshift = vec_splat ((__v4su) __B, 1);
1732 #endif
1733  shmask = vec_cmplt (lshift, shmax);
1734  result = vec_sl ((__v4su) __A, lshift);
1735  result = vec_sel ((__v4su) shmask, result, shmask);
1736 
1737  return (__m128i) result;
1738 }
1739 
1740 #ifdef _ARCH_PWR8
1741 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1742 _mm_sll_epi64 (__m128i __A, __m128i __B)
1743 {
1744  __v2du lshift;
1745  __vector __bool long long shmask;
1746  const __v2du shmax = { 64, 64 };
1747  __v2du result;
1748 
1749  lshift = vec_splat ((__v2du) __B, 0);
1750  shmask = vec_cmplt (lshift, shmax);
1751  result = vec_sl ((__v2du) __A, lshift);
1752  result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
1753 
1754  return (__m128i) result;
1755 }
1756 #endif
1757 
1758 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1759 _mm_sra_epi16 (__m128i __A, __m128i __B)
1760 {
1761  const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1762  __v8hu rshift;
1763  __v8hi result;
1764 
1765 #ifdef __LITTLE_ENDIAN__
1766  rshift = vec_splat ((__v8hu)__B, 0);
1767 #else
1768  rshift = vec_splat ((__v8hu)__B, 3);
1769 #endif
1770  rshift = vec_min (rshift, rshmax);
1771  result = vec_sra ((__v8hi) __A, rshift);
1772 
1773  return (__m128i) result;
1774 }
1775 
1776 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1777 _mm_sra_epi32 (__m128i __A, __m128i __B)
1778 {
1779  const __v4su rshmax = { 31, 31, 31, 31 };
1780  __v4su rshift;
1781  __v4si result;
1782 
1783 #ifdef __LITTLE_ENDIAN__
1784  rshift = vec_splat ((__v4su)__B, 0);
1785 #else
1786  rshift = vec_splat ((__v4su)__B, 1);
1787 #endif
1788  rshift = vec_min (rshift, rshmax);
1789  result = vec_sra ((__v4si) __A, rshift);
1790 
1791  return (__m128i) result;
1792 }
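
/* Editor's note: a small sketch, not part of the original header.  For
   arithmetic right shifts SSE2 specifies that an over-wide count
   replicates the sign bit, which is the same as shifting by
   width - 1, so the vec_min clamp above (to 15 or 31) is exact:

     __m128i v = _mm_set1_epi32 (-8);
     __m128i r = _mm_sra_epi32 (v, _mm_cvtsi32_si128 (100));
     // every element of r is -1 (0xFFFFFFFF), as for a shift by 31
*/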
1793 
1794 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1795 _mm_srl_epi16 (__m128i __A, __m128i __B)
1796 {
1797  __v8hu rshift;
1798  __vector __bool short shmask;
1799  const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1800  __v8hu result;
1801 
1802 #ifdef __LITTLE_ENDIAN__
1803  rshift = vec_splat ((__v8hu) __B, 0);
1804 #else
1805  rshift = vec_splat ((__v8hu) __B, 3);
1806 #endif
1807  shmask = vec_cmple (rshift, shmax);
1808  result = vec_sr ((__v8hu) __A, rshift);
1809  result = vec_sel ((__v8hu) shmask, result, shmask);
1810 
1811  return (__m128i) result;
1812 }
1813 
1814 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1815 _mm_srl_epi32 (__m128i __A, __m128i __B)
1816 {
1817  __v4su rshift;
1818  __vector __bool int shmask;
1819  const __v4su shmax = { 32, 32, 32, 32 };
1820  __v4su result;
1821 
1822 #ifdef __LITTLE_ENDIAN__
1823  rshift = vec_splat ((__v4su) __B, 0);
1824 #else
1825  rshift = vec_splat ((__v4su) __B, 1);
1826 #endif
1827  shmask = vec_cmplt (rshift, shmax);
1828  result = vec_sr ((__v4su) __A, rshift);
1829  result = vec_sel ((__v4su) shmask, result, shmask);
1830 
1831  return (__m128i) result;
1832 }
1833 
1834 #ifdef _ARCH_PWR8
1835 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1836 _mm_srl_epi64 (__m128i __A, __m128i __B)
1837 {
1838  __v2du rshift;
1839  __vector __bool long long shmask;
1840  const __v2du shmax = { 64, 64 };
1841  __v2du result;
1842 
1843  rshift = vec_splat ((__v2du) __B, 0);
1844  shmask = vec_cmplt (rshift, shmax);
1845  result = vec_sr ((__v2du) __A, rshift);
1846  result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
1847 
1848  return (__m128i) result;
1849 }
1850 #endif
1851 
1852 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1853 _mm_and_pd (__m128d __A, __m128d __B)
1854 {
1855  return (vec_and ((__v2df) __A, (__v2df) __B));
1856 }
1857 
1858 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1859 _mm_andnot_pd (__m128d __A, __m128d __B)
1860 {
1861  return (vec_andc ((__v2df) __B, (__v2df) __A));
1862 }
1863 
1864 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1865 _mm_or_pd (__m128d __A, __m128d __B)
1866 {
1867  return (vec_or ((__v2df) __A, (__v2df) __B));
1868 }
1869 
1870 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1871 _mm_xor_pd (__m128d __A, __m128d __B)
1872 {
1873  return (vec_xor ((__v2df) __A, (__v2df) __B));
1874 }
1875 
1876 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1877 _mm_and_si128 (__m128i __A, __m128i __B)
1878 {
1879  return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
1880 }
1881 
1882 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1883 _mm_andnot_si128 (__m128i __A, __m128i __B)
1884 {
1885  return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
1886 }
1887 
1888 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1889 _mm_or_si128 (__m128i __A, __m128i __B)
1890 {
1891  return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
1892 }
1893 
1894 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1895 _mm_xor_si128 (__m128i __A, __m128i __B)
1896 {
1897  return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
1898 }
1899 
1900 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1901 _mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1902 {
1903  return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1904 }
1905 
1906 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1907 _mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1908 {
1909  return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1910 }
1911 
1912 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1913 _mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1914 {
1915  return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
1916 }
1917 
1918 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1919 _mm_cmplt_epi8 (__m128i __A, __m128i __B)
1920 {
1921  return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1922 }
1923 
1924 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1925 _mm_cmplt_epi16 (__m128i __A, __m128i __B)
1926 {
1927  return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1928 }
1929 
1930 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1931 _mm_cmplt_epi32 (__m128i __A, __m128i __B)
1932 {
1933  return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
1934 }
1935 
1936 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1937 _mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1938 {
1939  return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1940 }
1941 
1942 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1943 _mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1944 {
1945  return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1946 }
1947 
1948 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1949 _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1950 {
1951  return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
1952 }
1953 
1954 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1955 _mm_extract_epi16 (__m128i const __A, int const __N)
1956 {
1957  return (unsigned short) ((__v8hi)__A)[__N & 7];
1958 }
1959 
1960 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1961 _mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1962 {
1963  __v8hi result = (__v8hi)__A;
1964 
1965  result [(__N & 7)] = __D;
1966 
1967  return (__m128i) result;
1968 }
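
/* Editor's note: a usage sketch, not part of the original header.  The
   selector is masked with 7, so out-of-range indices wrap rather than
   trap:

     __m128i v  = _mm_set_epi16 (7, 6, 5, 4, 3, 2, 1, 0);
     int e2  = _mm_extract_epi16 (v, 2);       // 2
     int e10 = _mm_extract_epi16 (v, 10);      // also 2: 10 & 7 == 2
     __m128i w  = _mm_insert_epi16 (v, 99, 5); // element 5 becomes 99
*/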
1969 
1970 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1971 _mm_max_epi16 (__m128i __A, __m128i __B)
1972 {
1973  return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
1974 }
1975 
1976 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1977 _mm_max_epu8 (__m128i __A, __m128i __B)
1978 {
1979  return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
1980 }
1981 
1982 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1983 _mm_min_epi16 (__m128i __A, __m128i __B)
1984 {
1985  return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
1986 }
1987 
1988 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1989 _mm_min_epu8 (__m128i __A, __m128i __B)
1990 {
1991  return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
1992 }
1993 
1994 
1995 #ifdef _ARCH_PWR8
1996 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1997 
1998 /* Creates a 16-bit mask from the most significant bit of each 8-bit element. */
1999 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2000 _mm_movemask_epi8 (__m128i __A)
2001 {
2002  __vector unsigned long long result;
2003  static const __vector unsigned char perm_mask =
2004  {
2005  0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
2006  0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
2007  };
2008 
2009  result = ((__vector unsigned long long)
2010  vec_vbpermq ((__vector unsigned char) __A,
2011  (__vector unsigned char) perm_mask));
2012 
2013 #ifdef __LITTLE_ENDIAN__
2014  return result[1];
2015 #else
2016  return result[0];
2017 #endif
2018 }
2019 #endif /* _ARCH_PWR8 */
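
/* Editor's note: a sketch, not part of the original header.
   vec_vbpermq gathers one bit from the source per selector byte; the
   selectors above (0x78, 0x70, ..., 0x00) are the bit offsets of each
   byte's most significant bit, so the gathered 16-bit value is the
   SSE2 byte-sign mask, e.g.:

     __m128i v = _mm_set_epi8 (-1, 0, 0, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0, 0, -1);
     int m = _mm_movemask_epi8 (v);   // 0x8001
*/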
2020 
2021 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2022 _mm_mulhi_epu16 (__m128i __A, __m128i __B)
2023 {
2024  __v4su w0, w1;
2025  __v16qu xform1 = {
2026 #ifdef __LITTLE_ENDIAN__
2027  0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
2028  0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
2029 #else
2030  0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
2031  0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
2032 #endif
2033  };
2034 
2035  w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2036  w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2037  return (__m128i) vec_perm (w0, w1, xform1);
2038 }
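
/* Editor's note: a sketch, not part of the original header.  The even
   and odd widening multiplies produce full 32-bit products; the
   xform1 permute then gathers only the high 16 bits of each product
   back into SSE2 lane order:

     __m128i a = _mm_set1_epi16 ((short) 0xFFFF);  // 65535 unsigned
     __m128i r = _mm_mulhi_epu16 (a, a);
     // 65535 * 65535 = 0xFFFE0001, so every lane of r is 0xFFFE
*/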
2039 
2040 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2041 _mm_shufflehi_epi16 (__m128i __A, const int __mask)
2042 {
2043  unsigned long element_selector_98 = __mask & 0x03;
2044  unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2045  unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2046  unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2047  static const unsigned short permute_selectors[4] =
2048  {
2049 #ifdef __LITTLE_ENDIAN__
2050  0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2051 #else
2052  0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2053 #endif
2054  };
2055  __v2du pmask =
2056 #ifdef __LITTLE_ENDIAN__
2057  { 0x1716151413121110UL, 0UL};
2058 #else
2059  { 0x1011121314151617UL, 0UL};
2060 #endif
2061  __m64_union t;
2062  __v2du a, r;
2063 
2064  t.as_short[0] = permute_selectors[element_selector_98];
2065  t.as_short[1] = permute_selectors[element_selector_BA];
2066  t.as_short[2] = permute_selectors[element_selector_DC];
2067  t.as_short[3] = permute_selectors[element_selector_FE];
2068  pmask[1] = t.as_m64;
2069  a = (__v2du)__A;
2070  r = vec_perm (a, a, (__vector unsigned char)pmask);
2071  return (__m128i) r;
2072 }
2073 
2074 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2075 _mm_shufflelo_epi16 (__m128i __A, const int __mask)
2076 {
2077  unsigned long element_selector_10 = __mask & 0x03;
2078  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2079  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2080  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2081  static const unsigned short permute_selectors[4] =
2082  {
2083 #ifdef __LITTLE_ENDIAN__
2084  0x0100, 0x0302, 0x0504, 0x0706
2085 #else
2086  0x0001, 0x0203, 0x0405, 0x0607
2087 #endif
2088  };
2089  __v2du pmask =
2090 #ifdef __LITTLE_ENDIAN__
2091  { 0UL, 0x1f1e1d1c1b1a1918UL};
2092 #else
2093  { 0UL, 0x18191a1b1c1d1e1fUL};
2094 #endif
2095  __m64_union t;
2096  __v2du a, r;
2097  t.as_short[0] = permute_selectors[element_selector_10];
2098  t.as_short[1] = permute_selectors[element_selector_32];
2099  t.as_short[2] = permute_selectors[element_selector_54];
2100  t.as_short[3] = permute_selectors[element_selector_76];
2101  pmask[0] = t.as_m64;
2102  a = (__v2du)__A;
2103  r = vec_perm (a, a, (__vector unsigned char)pmask);
2104  return (__m128i) r;
2105 }
2106 
2107 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2108 _mm_shuffle_epi32 (__m128i __A, const int __mask)
2109 {
2110  unsigned long element_selector_10 = __mask & 0x03;
2111  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2112  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2113  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2114  static const unsigned int permute_selectors[4] =
2115  {
2116 #ifdef __LITTLE_ENDIAN__
2117  0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2118 #else
2119  0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2120 #endif
2121  };
2122  __v4su t;
2123 
2124  t[0] = permute_selectors[element_selector_10];
2125  t[1] = permute_selectors[element_selector_32];
2126  t[2] = permute_selectors[element_selector_54] + 0x10101010;
2127  t[3] = permute_selectors[element_selector_76] + 0x10101010;
2128  return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
2129 }
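
/* Editor's note: a sketch, not part of the original header.  vec_perm
   byte indices 0-15 select from its first operand and 16-31 from its
   second; both operands are __A here, so the + 0x10101010 on the
   upper selector words only keeps those indices in the 16-31 range
   while still reading __A.  Example (assuming _MM_SHUFFLE from
   xmmintrin.h):

     __m128i v = _mm_set_epi32 (3, 2, 1, 0);
     __m128i r = _mm_shuffle_epi32 (v, _MM_SHUFFLE (0, 1, 2, 3));
     // elements 0..3 of r are 3, 2, 1, 0: the order of v is reversed
*/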
2130 
2131 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2132 _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
2133 {
2134  __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2135  __v16qu mask, tmp;
2136  __m128i_u *p = (__m128i_u*)__C;
2137 
2138  tmp = (__v16qu)_mm_loadu_si128(p);
2139  mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2140  tmp = vec_sel (tmp, (__v16qu)__A, mask);
2141  _mm_storeu_si128 (p, (__m128i)tmp);
2142 }
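
/* Editor's note, not part of the original header: an unsigned byte
   compares greater than 0x7f exactly when its most significant bit is
   set, which is how the mask bits are extracted above.  Note the
   emulation is a 16-byte read-modify-write rather than a per-byte
   store, so unlike hardware MASKMOVDQU it also rewrites (with their
   old values) bytes whose mask bit is clear.  Sketch:

     char buf[16] = { 0 };
     __m128i data = _mm_set1_epi8 (0x55);
     __m128i mask = _mm_set_epi8 (-1, 0, 0, 0, 0, 0, 0, 0,
                                  0, 0, 0, 0, 0, 0, 0, -1);
     _mm_maskmoveu_si128 (data, mask, buf); // buf[0], buf[15] = 0x55
*/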
2143 
2144 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2145 _mm_avg_epu8 (__m128i __A, __m128i __B)
2146 {
2147  return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
2148 }
2149 
2150 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2151 _mm_avg_epu16 (__m128i __A, __m128i __B)
2152 {
2153  return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
2154 }
2155 
2156 
2157 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2158 _mm_sad_epu8 (__m128i __A, __m128i __B)
2159 {
2160  __v16qu a, b;
2161  __v16qu vmin, vmax, vabsdiff;
2162  __v4si vsum;
2163  const __v4su zero = { 0, 0, 0, 0 };
2164  __v4si result;
2165 
2166  a = (__v16qu) __A;
2167  b = (__v16qu) __B;
2168  vmin = vec_min (a, b);
2169  vmax = vec_max (a, b);
2170  vabsdiff = vec_sub (vmax, vmin);
2171  /* Sum four groups of bytes into integers. */
2172  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
2173  /* Sum across four integers with two integer results. */
2174  result = vec_sum2s (vsum, (__vector signed int) zero);
2175  /* Rotate the sums into the correct position. */
2176 #ifdef __LITTLE_ENDIAN__
2177  result = vec_sld (result, result, 4);
2178 #else
2179  result = vec_sld (result, result, 6);
2180 #endif
2181 
2182  return (__m128i) result;
2183 }
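
/* Editor's note: a sketch, not part of the original header.  The
   absolute byte difference is formed branch-free as
   max(a,b) - min(a,b); vec_sum4s folds each group of four differences
   into a word, vec_sum2s leaves the two 8-byte totals in elements 1
   and 3, and the final vec_sld rotates them into the low 16 bits of
   each 64-bit half, as SSE2 requires:

     __m128i a = _mm_set1_epi8 (1);
     __m128i r = _mm_sad_epu8 (a, _mm_setzero_si128 ());
     // each 64-bit half of r holds 8 (the sum of eight 1s)
*/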
2184 
2185 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2186 _mm_stream_si32 (int *__A, int __B)
2187 {
2188  /* Use the data cache block touch for store transient. */
2189  __asm__ (
2190  "dcbtstt 0,%0"
2191  :
2192  : "b" (__A)
2193  : "memory"
2194  );
2195  *__A = __B;
2196 }
2197 
2198 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2199 _mm_stream_si64 (long long int *__A, long long int __B)
2200 {
2201  /* Use the data cache block touch for store transient. */
2202  __asm__ (
2203  " dcbtstt 0,%0"
2204  :
2205  : "b" (__A)
2206  : "memory"
2207  );
2208  *__A = __B;
2209 }
2210 
2211 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2212 _mm_stream_si128 (__m128i *__A, __m128i __B)
2213 {
2214  /* Use the data cache block touch for store transient. */
2215  __asm__ (
2216  "dcbtstt 0,%0"
2217  :
2218  : "b" (__A)
2219  : "memory"
2220  );
2221  *__A = __B;
2222 }
2223 
2224 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2225 _mm_stream_pd (double *__A, __m128d __B)
2226 {
2227  /* Use the data cache block touch for store transient. */
2228  __asm__ (
2229  "dcbtstt 0,%0"
2230  :
2231  : "b" (__A)
2232  : "memory"
2233  );
2234  *(__m128d*)__A = __B;
2235 }
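
/* Editor's note, not part of the original header: PowerISA has no true
   non-temporal store, so the stream intrinsics above pair an ordinary
   store with dcbtstt, the "data cache block touch for store,
   transient" hint, which marks the line as unlikely to be reused and
   thus approximates MOVNT* semantics without bypassing the cache. */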
2236 
2237 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2238 _mm_clflush (void const *__A)
2239 {
2240  /* Use the data cache block flush. */
2241  __asm__ (
2242  "dcbf 0,%0"
2243  :
2244  : "b" (__A)
2245  : "memory"
2246  );
2247 }
2248 
2249 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2250 _mm_lfence (void)
2251 {
2252  /* Use light weight sync for load to load ordering. */
2253  __atomic_thread_fence (__ATOMIC_RELEASE);
2254 }
2255 
2256 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2257 _mm_mfence (void)
2258 {
2259  /* Use heavy weight sync for any to any ordering. */
2260  __atomic_thread_fence (__ATOMIC_SEQ_CST);
2261 }
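
/* Editor's note, not part of the original header: on POWER the
   portable fences above typically lower to lwsync for
   __ATOMIC_RELEASE and hwsync for __ATOMIC_SEQ_CST.  lwsync does not
   order earlier stores against later loads, which is why _mm_mfence,
   whose x86 semantics include store->load ordering, needs the
   heavyweight sync. */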
2262 
2263 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2264 _mm_cvtsi32_si128 (int __A)
2265 {
2266  return _mm_set_epi32 (0, 0, 0, __A);
2267 }
2268 
2269 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2270 _mm_cvtsi64_si128 (long long __A)
2271 {
2272  return __extension__ (__m128i)(__v2di){ __A, 0LL };
2273 }
2274 
2275 /* Microsoft intrinsic. */
2276 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2277 _mm_cvtsi64x_si128 (long long __A)
2278 {
2279  return __extension__ (__m128i)(__v2di){ __A, 0LL };
2280 }
2281 
2282 /* Casts between various SP, DP, INT vector types. Note that these do no
2283  conversion of values, they just change the type. */
2284 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2285 _mm_castpd_ps(__m128d __A)
2286 {
2287  return (__m128) __A;
2288 }
2289 
2290 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2291 _mm_castpd_si128(__m128d __A)
2292 {
2293  return (__m128i) __A;
2294 }
2295 
2296 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2297 _mm_castps_pd(__m128 __A)
2298 {
2299  return (__m128d) __A;
2300 }
2301 
2302 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2303 _mm_castps_si128(__m128 __A)
2304 {
2305  return (__m128i) __A;
2306 }
2307 
2308 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2309 _mm_castsi128_ps(__m128i __A)
2310 {
2311  return (__m128) __A;
2312 }
2313 
2314 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2315 _mm_castsi128_pd(__m128i __A)
2316 {
2317  return (__m128d) __A;
2318 }
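
/* Editor's note: a sketch, not part of the original header, showing
   that the casts above reinterpret bits without converting values:

     __m128d d = _mm_set1_pd (1.0);
     __m128i i = _mm_castpd_si128 (d);
     // each 64-bit lane of i is 0x3FF0000000000000, the IEEE-754
     // bit pattern of 1.0, not the integer 1
*/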
2319 
2320 #else
2321 #include_next <emmintrin.h>
2322 #endif /* defined(__linux__) && defined(__ppc64__) */
2323 
2324 #endif /* EMMINTRIN_H_ */
__device__ __2f16 b
__device__ int
__device__ __2f16 float c
static __inline__ vector unsigned char __ATTRS_o_ai vec_sr(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:9543
static __inline__ vector bool char __ATTRS_o_ai vec_cmpeq(vector signed char __a, vector signed char __b)
Definition: altivec.h:1625
static __inline__ vector signed char __ATTRS_o_ai vec_sra(vector signed char __a, vector unsigned char __b)
Definition: altivec.h:9633
static __inline__ vector int __ATTRS_o_ai vec_vmrghw(vector int __a, vector int __b)
Definition: altivec.h:4769
static __inline__ vector signed char __ATTRS_o_ai vec_sro(vector signed char __a, vector signed char __b)
Definition: altivec.h:10073
static __inline__ vector signed char __ATTRS_o_ai vec_ld(int __a, const vector signed char *__b)
Definition: altivec.h:3504
#define vec_ctf(__a, __b)
Definition: altivec.h:2950
static __inline__ vector short __ATTRS_o_ai vec_mule(vector signed char __a, vector signed char __b)
Definition: altivec.h:5696
static __inline__ vector signed char __ATTRS_o_ai vec_splats(signed char __a)
Definition: altivec.h:13710
static __inline__ void __ATTRS_o_ai vec_st(vector signed char __a, int __b, vector signed char *__c)
Definition: altivec.h:10278
static __inline__ vector signed char __ATTRS_o_ai vec_andc(vector signed char __a, vector signed char __b)
Definition: altivec.h:1163
static __inline__ vector signed int __ATTRS_o_ai vec_sld(vector signed int, vector signed int, unsigned const int __c)
Definition: altivec.h:8309
static __inline__ vector short __ATTRS_o_ai vec_unpackl(vector signed char __a)
Definition: altivec.h:11807
static __inline__ vector int __ATTRS_o_ai vec_sum4s(vector signed char __a, vector int __b)
Definition: altivec.h:11531
static __inline__ vector signed char __ATTRS_o_ai vec_and(vector signed char __a, vector signed char __b)
Definition: altivec.h:810
static __inline__ vector signed char __ATTRS_o_ai vec_avg(vector signed char __a, vector signed char __b)
Definition: altivec.h:1514
static __inline__ vector signed char __ATTRS_o_ai vec_mergel(vector signed char __a, vector signed char __b)
Definition: altivec.h:4804
static __inline__ vector signed char __ATTRS_o_ai vec_subs(vector signed char __a, vector signed char __b)
Definition: altivec.h:11232
static __inline__ vector int __ATTRS_o_ai vec_splat_s32(signed char __a)
Definition: altivec.h:9503
static __inline__ vector signed char __ATTRS_o_ai vec_adds(vector signed char __a, vector signed char __b)
Definition: altivec.h:560
static __inline__ vector signed char __ATTRS_o_ai vec_perm(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7320
static __inline__ vector signed char __ATTRS_o_ai vec_sel(vector signed char __a, vector signed char __b, vector unsigned char __c)
Definition: altivec.h:7834
static __inline__ vector signed char __ATTRS_o_ai vec_mergeh(vector signed char __a, vector signed char __b)
Definition: altivec.h:4534
static __inline__ vector bool char __ATTRS_o_ai vec_cmplt(vector signed char __a, vector signed char __b)
Definition: altivec.h:2196
static __inline__ vector signed char __ATTRS_o_ai vec_max(vector signed char __a, vector signed char __b)
Definition: altivec.h:4281
static __inline__ vector signed char __ATTRS_o_ai vec_slo(vector signed char __a, vector signed char __b)
Definition: altivec.h:9034
static __inline__ vector signed char __ATTRS_o_ai vec_nor(vector signed char __a, vector signed char __b)
Definition: altivec.h:6098
static __inline__ vector bool char __ATTRS_o_ai vec_cmpge(vector signed char __a, vector signed char __b)
Definition: altivec.h:2024
static __inline__ vector unsigned char __ATTRS_o_ai vec_packsu(vector short __a, vector short __b)
Definition: altivec.h:7202
static __inline__ vector signed char __ATTRS_o_ai vec_min(vector signed char __a, vector signed char __b)
Definition: altivec.h:5185
#define vec_cts
Definition: altivec.h:2981
static __inline__ vector signed char __ATTRS_o_ai vec_splat(vector signed char __a, unsigned const int __b)
Definition: altivec.h:9240
static __inline__ vector signed char __ATTRS_o_ai vec_or(vector signed char __a, vector signed char __b)
Definition: altivec.h:6234
static __inline__ vector short __ATTRS_o_ai vec_unpackh(vector signed char __a)
Definition: altivec.h:11668
static __inline__ vector unsigned char __ATTRS_o_ai vec_sl(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:8088
static __inline__ vector short __ATTRS_o_ai vec_splat_s16(signed char __a)
Definition: altivec.h:9487
static __inline__ vector signed char __ATTRS_o_ai vec_abs(vector signed char __a)
Definition: altivec.h:115
static __inline__ vector unsigned char __ATTRS_o_ai vec_xor(vector unsigned char __a, vector unsigned char __b)
Definition: altivec.h:12223
static __inline__ vector bool char __ATTRS_o_ai vec_cmpgt(vector signed char __a, vector signed char __b)
Definition: altivec.h:1964
static __inline__ vector bool char __ATTRS_o_ai vec_cmple(vector signed char __a, vector signed char __b)
Definition: altivec.h:2140
static __inline__ vector signed char __ATTRS_o_ai vec_packs(vector short __a, vector short __b)
Definition: altivec.h:7073
static __inline__ vector signed char __ATTRS_o_ai vec_sub(vector signed char __a, vector signed char __b)
Definition: altivec.h:10963
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1508
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 64-bit integral ...
Definition: emmintrin.h:3862
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1044
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-3) values from each of the two 128-bit vectors of [8 x i16] and interl...
Definition: emmintrin.h:4659
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a)
Moves the 64-bit operand to a 128-bit integer vector, zeroing the upper bits.
Definition: emmintrin.h:4737
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:2013
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
Initializes the 16-bit values in a 128-bit vector of [8 x i16] with the specified 16-bit integer valu...
Definition: emmintrin.h:3699
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1018
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1855
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit unsigned integer...
Definition: emmintrin.h:4331
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the smaller value f...
Definition: emmintrin.h:2432
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:577
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b)
Adds lower double-precision values in both operands and returns the sum in the lower 64 bits of the r...
Definition: emmintrin.h:56
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b)
Performs an element-by-element division of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:201
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b)
Subtracts two 128-bit vectors of [2 x double].
Definition: emmintrin.h:117
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit integer vector.
Definition: emmintrin.h:4879
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:389
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4399
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2897
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors.
Definition: emmintrin.h:2743
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:813
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1198
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an unaligned memory location.
Definition: emmintrin.h:1641
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b)
Subtracts the corresponding elements of two [2 x i64] vectors.
Definition: emmintrin.h:2643
static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
Moves bytes selected by the mask from the first operand to the specified unaligned memory location.
Definition: emmintrin.h:4104
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1172
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
Initializes the 32-bit values in a 128-bit vector of [4 x i32] with the specified 32-bit integer valu...
Definition: emmintrin.h:3659
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1224
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [16 x i8] vectors,...
Definition: emmintrin.h:2222
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp)
Loads a 128-bit floating-point vector of [2 x double] from an aligned memory location.
Definition: emmintrin.h:1579
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a)
Converts the lower two integer elements of a 128-bit vector of [4 x i32] into two double-precision fl...
Definition: emmintrin.h:1338
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3171
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3095
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w)
Constructs a 128-bit floating-point vector of [2 x double], with each of the two double-precision flo...
Definition: emmintrin.h:1837
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3315
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double] initialized with the specified double-prec...
Definition: emmintrin.h:1875
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b)
Subtracts the corresponding 16-bit integer values in the operands.
Definition: emmintrin.h:2588
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:734
#define _mm_slli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2820
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b)
Divides the lower double-precision value of the first operand by the lower double-precision value of ...
Definition: emmintrin.h:181
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, __m128i __a)
Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to a memory location.
Definition: emmintrin.h:4123
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3336
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b)
Performs a bitwise OR of two 128-bit integer vectors.
Definition: emmintrin.h:2780
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:509
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vec...
Definition: emmintrin.h:288
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp)
Loads a 64-bit double-precision value to the low element of a 128-bit integer vector and clears the u...
Definition: emmintrin.h:1724
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them ...
Definition: emmintrin.h:4776
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:759
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b)
Performs a bitwise AND of two 128-bit integer vectors, using the one's complement of the values conta...
Definition: emmintrin.h:2763
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if...
Definition: emmintrin.h:3229
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count)
Right-shifts each of 32-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3114
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding 16-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3190
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the lower 16 bits of ea...
Definition: emmintrin.h:2492
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1146
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2955
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:407
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving the greater value f...
Definition: emmintrin.h:2392
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2326
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a)
Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to the upper and lower 64 bits of a...
Definition: emmintrin.h:1992
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:4947
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count)
Right-shifts each of 16-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3076
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b)
Unpacks the low-order (index 0-7) values from two 128-bit vectors of [16 x i8] and interleaves them i...
Definition: emmintrin.h:4631
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:785
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b)
Compares each of the corresponding signed 16-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3252
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:967
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x double] and interleaves them i...
Definition: emmintrin.h:4797
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit signed integer values in the input and returns the differences in the c...
Definition: emmintrin.h:2664
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:709
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:661
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b)
Unpacks the low-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them into...
Definition: emmintrin.h:4703
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2995
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a)
Extracts the sign bits of the double-precision values in the 128-bit vector of [2 x double],...
Definition: emmintrin.h:4816
#define _mm_shuffle_pd(a, b, i)
Constructs a 128-bit floating-point vector of [2 x double] from two 128-bit vector parameters of [2 x...
Definition: emmintrin.h:4846
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a)
Stores a 128-bit integer vector to a 128-bit aligned memory location.
Definition: emmintrin.h:4165
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding 32-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3209
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b)
Multiplies lower double-precision values in both operands and returns the product in the lower 64 bit...
Definition: emmintrin.h:139
void _mm_mfence(void)
Forces strong memory ordering (serialization) between load and store instructions preceding this inst...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit unsigned integer values in the input and returns the differences in th...
Definition: emmintrin.h:2725
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a)
Moves the lower 64 bits of a 128-bit integer vector to a 128-bit integer vector, zeroing the upper bi...
Definition: emmintrin.h:4755
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:467
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b)
Unpacks the high-order 64-bit elements from two 128-bit vectors of [2 x i64] and interleaves them int...
Definition: emmintrin.h:4596
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a)
Converts the low-order element of a [2 x double] vector into a 32-bit signed integer value,...
Definition: emmintrin.h:1491
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a)
Converts a vector of [4 x i32] into a vector of [4 x float].
Definition: emmintrin.h:3412
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b)
Unpacks the low-order (index 0,1) values from two 128-bit vectors of [4 x i32] and interleaves them i...
Definition: emmintrin.h:4682
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b)
Compares each of the corresponding signed 32-bit values of the 128-bit integer vectors to determine i...
Definition: emmintrin.h:3273
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b)
Converts the lower single-precision floating-point element of a 128-bit vector of [4 x float],...
Definition: emmintrin.h:1449
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a)
Calculates the square root of the each of two values stored in a 128-bit vector of [2 x double].
Definition: emmintrin.h:244
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1932
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b)
Unpacks the high-order (index 4-7) values from two 128-bit vectors of [8 x i16] and interleaves them ...
Definition: emmintrin.h:4552
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1528
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1358
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
Definition: emmintrin.h:3587
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit integer vector.
Definition: emmintrin.h:4913
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1250
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two 128-bit signed [8 x i16] vectors, producing eight interm...
Definition: emmintrin.h:2352
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [8 x i16] vectors,...
Definition: emmintrin.h:2286
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a)
Returns a vector of [4 x i32] where the lowest element is the input operand and the remaining element...
Definition: emmintrin.h:3461
#define _mm_load_pd1(dp)
Definition: emmintrin.h:1606
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count)
Right-shifts each of 64-bit values in the 128-bit integer vector operand by the specified number of b...
Definition: emmintrin.h:3133
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2840
#define _mm_insert_epi16(a, b, imm)
Constructs a 128-bit integer vector by first making a copy of the 128-bit integer vector parameter,...
Definition: emmintrin.h:4382
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:598
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b)
Converts the lower double-precision floating-point element of a 128-bit vector of [2 x double],...
Definition: emmintrin.h:1400
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a)
Converts the low-order element of a 128-bit vector of [2 x double] into a 32-bit signed integer value...
Definition: emmintrin.h:1375
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b)
Subtracts the lower double-precision value of the second operand from the lower double-precision valu...
Definition: emmintrin.h:98
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the low-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1778
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit unsigned [16 x i8] vectors,...
Definition: emmintrin.h:2265
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b)
Adds, with saturation, the corresponding elements of two 128-bit signed [8 x i16] vectors,...
Definition: emmintrin.h:2244
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b)
Subtracts corresponding 16-bit signed integer values in the input and returns the differences in the ...
Definition: emmintrin.h:2685
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b)
Subtracts the corresponding 8-bit integer values in the operands.
Definition: emmintrin.h:2570
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] with unspecified content.
Definition: emmintrin.h:1799
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b)
Unpacks the high-order (index 2,3) values from two 128-bit vectors of [4 x i32] and interleaves them ...
Definition: emmintrin.h:4575
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into tw...
Definition: emmintrin.h:1295
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 8-bit integral v...
Definition: emmintrin.h:3963
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two signed [8 x i16] vectors, saving the upper 16 bits of ea...
Definition: emmintrin.h:2452
#define _mm_extract_epi16(a, imm)
Extracts 16 bits from a 128-bit integer vector of [8 x i16], using the immediate-value parameter as a...
Definition: emmintrin.h:4358
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1096
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:556
#define _mm_shufflelo_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling four lower 16-bit elements of a 128-bit integer vect...
Definition: emmintrin.h:4462
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [2 x i64], saving the lower 64 bits of each...
Definition: emmintrin.h:2201
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b)
Multiplies the corresponding elements of two unsigned [8 x i16] vectors, saving the upper 16 bits of ...
Definition: emmintrin.h:2472
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the valu...
Definition: emmintrin.h:371
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:992
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the greater value fro...
Definition: emmintrin.h:2372
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a)
Casts a 128-bit floating-point vector of [2 x double] into a 128-bit floating-point vector of [4 x fl...
Definition: emmintrin.h:4862
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2935
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b)
Converts 16-bit signed integers from both 128-bit integer vector operands into 8-bit signed integers,...
Definition: emmintrin.h:4275
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i)
Initializes all values in a 128-bit vector of [4 x i32] with the specified 32-bit value.
Definition: emmintrin.h:3804
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count)
Right-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2975
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp)
Loads two double-precision values, in reverse order, from an aligned memory location into a 128-bit v...
Definition: emmintrin.h:1623
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1070
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:640
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a)
Casts a 128-bit floating-point vector of [4 x float] into a 128-bit floating-point vector of [2 x dou...
Definition: emmintrin.h:4896
#define _mm_bsrli_si128(a, imm)
Definition: emmintrin.h:3040
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a)
Stores the upper 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2072
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count)
Left-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2878
void _mm_lfence(void)
Forces strong memory ordering (serialization) between load instructions preceding this instruction an...
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b)
Computes the absolute differences of corresponding 8-bit integer values in two 128-bit vectors.
Definition: emmintrin.h:2552
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:941
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:841
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:2092
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a)
Moves packed double-precision values from a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1972
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [4 x i32], saving the lower 32 bits of each...
Definition: emmintrin.h:2161
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3548
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3842
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b)
Converts 32-bit signed integers from both 128-bit integer vector operands into 16-bit signed integers...
Definition: emmintrin.h:4303
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count)
Left-shifts each 16-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2859
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [16 x i8], saving the lower 8 bits of each ...
Definition: emmintrin.h:2117
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:916
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:488
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3532
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:684
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding signed 8-bit values of the 128-bit integer vectors to determine if the values in the first operand are less than those in the second operand.
Definition: emmintrin.h:3294
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a)
Casts a 128-bit integer vector into a 128-bit floating-point vector of [4 x float].
Definition: emmintrin.h:4930
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
Definition: emmintrin.h:1911
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
Definition: emmintrin.h:350
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer values.
Definition: emmintrin.h:3609
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b)
Converts a 32-bit signed integer value, in the second parameter, into a double-precision floating-point value, and stores it in the lower 64 bits of the result; the upper 64 bits are copied from the first parameter.
Definition: emmintrin.h:1423
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the corresponding elements of two [2 x i64] vectors, and returns the 64-bit products in the corresponding elements of a [2 x i64] vector.
Definition: emmintrin.h:2530
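Only the even-indexed (0 and 2) 32-bit lanes participate; the odd lanes are ignored. A minimal sketch with hypothetical values (the helper name is mine):

#include <emmintrin.h>

/* Form the full 64-bit products of elements 0 and 2; the odd-indexed
   lanes of both operands do not affect the result. */
__m128i widen_mul_example(void)
{
    __m128i a = _mm_setr_epi32(100000, 0, 200000, 0);
    __m128i b = _mm_setr_epi32(100000, 0, 300000, 0);
    return _mm_mul_epu32(a, b); /* [2 x i64]: { 10000000000, 60000000000 } */
}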
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:891
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count)
Right-shifts each of the 64-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:3152
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:866
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32], truncating the result when it is inexact.
Definition: emmintrin.h:3445
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 16-bit integral values.
Definition: emmintrin.h:3916
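The _mm_setr_* constructors take arguments in memory order (element 0 first), whereas the _mm_set_* forms list the most significant element first. A minimal sketch (the helper name is mine):

#include <emmintrin.h>

/* Element 0 holds 1 and element 7 holds 8; the same vector written with
   _mm_set_epi16 would list the arguments in the opposite order. */
__m128i ascending_words(void)
{
    return _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
}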
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp)
Loads a double-precision value into the high-order bits of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1751
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
Initializes the 8-bit values in a 128-bit vector of [16 x i8] with the specified 8-bit integer values.
Definition: emmintrin.h:3747
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q)
Initializes both values in a 128-bit vector of [2 x i64] with the specified 64-bit value.
Definition: emmintrin.h:3785
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b)
Calculates the square root of the lower double-precision value of the second operand and returns it in the lower 64 bits of the result; the upper 64 bits are copied from the first operand.
Definition: emmintrin.h:226
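Note the operand roles: the square root comes from the second operand, the untouched upper lane from the first. A minimal sketch that square-roots only the low lane of a vector (the helper name is mine):

#include <emmintrin.h>

/* Square-root only the low lane of v, leaving the high lane unchanged:
   passing v as the first operand supplies the untouched upper 64 bits. */
__m128d sqrt_low_lane(__m128d v)
{
    return _mm_sqrt_sd(v, v);
}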
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x)
Constructs a 128-bit floating-point vector of [2 x double], initialized in reverse order with the specified double-precision floating-point values.
Definition: emmintrin.h:1896
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0)
Initializes both 64-bit values in a 128-bit vector of [2 x i64] with the specified 64-bit integer values.
Definition: emmintrin.h:3631
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the lesser of the pair of values in the lower 64 bits of the result.
Definition: emmintrin.h:268
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count)
Left-shifts each 64-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:2916
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b)
Adds two 128-bit vectors of [2 x double].
Definition: emmintrin.h:75
static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a)
Stores a 128-bit floating-point vector of [2 x double] to a 128-bit aligned memory location.
Definition: emmintrin.h:4146
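_mm_stream_pd is a non-temporal store: it bypasses the cache hierarchy, which can help when writing large buffers that will not be re-read soon, and it requires a 16-byte-aligned destination. A minimal sketch (the helper name is mine; it assumes dst is 16-byte aligned and n is even):

#include <emmintrin.h>

/* Fill dst[0..n-1] with val using non-temporal stores. */
void fill_nontemporal(double *dst, double val, long n)
{
    __m128d v = _mm_set1_pd(val);
    for (long i = 0; i < n; i += 2)
        _mm_stream_pd(dst + i, v);
    _mm_sfence(); /* order the streaming stores before later stores */
}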
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a)
Stores the lower 64 bits of a 128-bit vector of [2 x double] to a memory location.
Definition: emmintrin.h:1950
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp)
Loads a double-precision floating-point value from a specified memory location and duplicates it to both vector elements of a 128-bit vector of [2 x double].
Definition: emmintrin.h:1597
#define _mm_shufflehi_epi16(a, imm)
Constructs a 128-bit integer vector by shuffling the four upper 16-bit elements of a 128-bit integer vector parameter, using the immediate-value parameter as a specifier; the lower 64 bits pass through unchanged.
Definition: emmintrin.h:4492
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b)
Performs element-by-element comparison of the two 128-bit vectors of [2 x double] and returns the vector containing the greater of each pair of values.
Definition: emmintrin.h:332
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b)
Compares corresponding elements of two 128-bit signed [8 x i16] vectors, saving the smaller value from each comparison in the corresponding element of a 128-bit result vector of [8 x i16].
Definition: emmintrin.h:2412
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a)
Returns the lower 64 bits of a 128-bit integer vector as a 64-bit integer.
Definition: emmintrin.h:4720
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b)
Stores a 128-bit integer vector to a memory location aligned on a 128-bit boundary.
Definition: emmintrin.h:3995
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b)
Multiplies two 128-bit vectors of [2 x double].
Definition: emmintrin.h:158
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] for equality.
Definition: emmintrin.h:426
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w)
Constructs a 128-bit floating-point vector of [2 x double].
Definition: emmintrin.h:1819
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a)
Converts the two signed 32-bit integer elements of a 64-bit vector of [2 x i32] into two double-precision floating-point values, returned in a 128-bit vector of [2 x double].
Definition: emmintrin.h:1545
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b)
Compares lower 64-bit double-precision values of both operands, and returns the greater of the pair of values in the lower 64 bits of the result.
Definition: emmintrin.h:312
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a)
Returns the low-order element of a 128-bit vector of [2 x double] as a double-precision floating-point value.
Definition: emmintrin.h:1562
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i_u const *__p)
Returns a vector of [2 x i64] where the lower element is taken from the lower element of the operand, and the upper element is zero.
Definition: emmintrin.h:3569
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w)
Initializes all values in a 128-bit vector of [8 x i16] with the specified 16-bit value.
Definition: emmintrin.h:3823
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q)
Initializes both values in a 128-bit integer vector with the specified 64-bit integer value.
Definition: emmintrin.h:3766
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a)
Moves the least significant 32 bits of a vector of [4 x i32] to a 32-bit signed integer value.
Definition: emmintrin.h:3496
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b)
Subtracts the corresponding 32-bit integer values in the operands.
Definition: emmintrin.h:2606
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b)
Computes the rounded averages of corresponding elements of two 128-bit unsigned [16 x i8] vectors, saving each result in the corresponding element of a 128-bit result vector of [16 x i8].
Definition: emmintrin.h:2306
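Each byte of the result is (a + b + 1) >> 1 with no intermediate overflow, which makes this a natural 50/50 image blend. A minimal sketch over one 16-pixel row (the helper name and pointers are illustrative):

#include <emmintrin.h>

/* Rounded 50/50 blend of 16 grayscale pixels. */
void blend16(unsigned char *out, const unsigned char *p, const unsigned char *q)
{
    __m128i vp = _mm_loadu_si128((const __m128i *)p);
    __m128i vq = _mm_loadu_si128((const __m128i *)q);
    _mm_storeu_si128((__m128i *)out, _mm_avg_epu8(vp, vq));
}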
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b)
Multiplies 32-bit unsigned integer values contained in the lower bits of the two 64-bit integer vectors and returns the 64-bit unsigned product.
Definition: emmintrin.h:2511
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a)
Stores two double-precision values, in reverse order, from a 128-bit vector of [2 x double] to a 16-byte aligned memory location.
Definition: emmintrin.h:2054
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b)
Unpacks the high-order (index 8-15) values from two 128-bit vectors of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
Definition: emmintrin.h:4525
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count)
Right-shifts each of the 16-bit values in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:3057
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a)
Stores a 128-bit vector of [2 x double] into an unaligned memory location.
Definition: emmintrin.h:2031
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1276
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b)
Adds the corresponding elements of two 128-bit vectors of [8 x i16], saving the lower 16 bits of each sum in the corresponding element of a 128-bit result vector of [8 x i16].
Definition: emmintrin.h:2139
#define _mm_bslli_si128(a, imm)
Left-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:2823
#define _mm_srli_si128(a, imm)
Right-shifts the 128-bit integer vector operand by the specified number of bytes.
Definition: emmintrin.h:3037
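The byte-wise shifts take an immediate count in bytes, not bits, and shift in zeros; counts of 16 or more clear the whole vector. A minimal sketch moving the upper lane into the low position (the helper name is mine):

#include <emmintrin.h>

/* Move the upper 64 bits of v down to the lower 64 bits; the upper
   half of the result is zero-filled. */
__m128i high_to_low(__m128i v)
{
    return _mm_srli_si128(v, 8); /* count is in bytes, not bits */
}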
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b)
Subtracts corresponding 8-bit unsigned integer values in the input and returns the differences in the corresponding bytes of the result; differences less than zero saturate to zero.
Definition: emmintrin.h:2705
#define _mm_shuffle_epi32(a, imm)
Constructs a 128-bit integer vector by shuffling four 32-bit elements of a 128-bit integer vector parameter, using the immediate-value parameter as a specifier.
Definition: emmintrin.h:4432
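The shuffle immediate packs four 2-bit source indices, with lane 0's selector in the low bits; the _MM_SHUFFLE macro from xmmintrin.h builds it. Two minimal sketches (helper names are mine):

#include <emmintrin.h>

/* Broadcast 32-bit element 0 to all four lanes. */
__m128i broadcast0(__m128i v)
{
    return _mm_shuffle_epi32(v, 0x00); /* selectors 0,0,0,0 */
}

/* Reverse the four 32-bit lanes. */
__m128i reverse4(__m128i v)
{
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
}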
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a)
Converts the lower two single-precision floating-point elements of a 128-bit vector of [4 x float] into two double-precision floating-point values, returned in a 128-bit vector of [2 x double].
Definition: emmintrin.h:1315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
Definition: emmintrin.h:3977
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b)
Compares the lower double-precision floating-point values in each of the two 128-bit floating-point v...
Definition: emmintrin.h:1122
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:532
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:446
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, __m128i __b)
Stores a 128-bit integer vector to an unaligned memory location.
Definition: emmintrin.h:4011
double __m128d __attribute__((__vector_size__(16), __aligned__(16)))
Definition: emmintrin.h:15
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count)
Right-shifts each 32-bit value in the 128-bit integer vector operand by the specified number of bits.
Definition: emmintrin.h:3015
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
Constructs a 128-bit integer vector, initialized in reverse order with the specified 32-bit integral values.
Definition: emmintrin.h:3885
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a)
Converts the two double-precision floating-point elements of a 128-bit vector of [2 x double] into two signed, truncated (rounded toward zero) 32-bit integer values, returned in the lower 64 bits of a 128-bit vector of [2 x i64].
Definition: emmintrin.h:1473
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a)
Converts a vector of [4 x float] into a vector of [4 x i32].
Definition: emmintrin.h:3428
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b)
Performs a bitwise exclusive OR of two 128-bit integer vectors.
Definition: emmintrin.h:2798
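XORing a register with itself yields zero in every lane, the classic zeroing idiom; _mm_setzero_si128 expresses the same thing directly. A minimal sketch (the helper name is mine):

#include <emmintrin.h>

/* v ^ v is zero in every lane; equivalent to _mm_setzero_si128(). */
__m128i zero_via_xor(__m128i v)
{
    return _mm_xor_si128(v, v);
}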
void _mm_clflush(void const *__p)
The cache line containing __p is flushed and invalidated from all caches in the coherency domain.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b)
Compares each of the corresponding double-precision values of the 128-bit vectors of [2 x double] to ...
Definition: emmintrin.h:619